diff --git a/pyproject.toml b/pyproject.toml index 13c421d8ea..805fda5971 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -106,6 +106,10 @@ windows = [ base = [ "unsloth[huggingface]", ] +eval = [ + "unsloth[huggingface]", + "lm_eval>=0.4.4", +] cu118only = [ "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.22.post7%2Bcu118-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9' and ('linux' in sys_platform)", "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.22.post7%2Bcu118-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10' and ('linux' in sys_platform)", diff --git a/studio/install_python_stack.py b/studio/install_python_stack.py index 9060b57542..1e76fb8d19 100644 --- a/studio/install_python_stack.py +++ b/studio/install_python_stack.py @@ -1922,6 +1922,56 @@ def pip_install( temp_req.unlink(missing_ok = True) +def _eval_extra_requirements(local_repo: str) -> list: + """The [eval] extra's third-party requirements from the checkout's pyproject. + + Self-referential entries (``unsloth[huggingface]``) are dropped: the local + checkout is already overlaid with --no-deps and must not be re-resolved + from PyPI over the editable install. + """ + try: + import tomllib + + with open(Path(local_repo) / "pyproject.toml", "rb") as fh: + data = tomllib.load(fh) + requirements = data["project"]["optional-dependencies"]["eval"] + if isinstance(requirements, list): + filtered = [ + requirement + for requirement in requirements + if re.split(r"[\[<>=!~; ]", str(requirement).strip(), maxsplit = 1)[0].lower() + != "unsloth" + ] + if filtered: + return filtered + except Exception: + pass + # fallback mirrors the pin in pyproject.toml + return ["lm_eval>=0.4.4"] + + +def _install_eval_extra(*, package_name: str, local_repo: str) -> None: + """Install pyproject.toml's [eval] extra (lm-eval-harness for `unsloth eval`).""" + if NO_TORCH or package_name != "unsloth": + return + _progress("eval extra") + if local_repo: + # the checkout is overlaid with --no-deps so the torch/CUDA stack is + # not re-resolved; a full `-e repo[eval]` install would resolve the + # base deps again, so install only the extra's own packages + pip_install( + "Installing unsloth[eval] extra", + "--no-cache-dir", + *_eval_extra_requirements(local_repo), + ) + else: + pip_install( + "Installing unsloth[eval] extra", + "--no-cache-dir", + "unsloth[eval]", + ) + + def download_file(url: str, dest: Path) -> None: """Download a file using urllib (no curl dependency).""" urllib.request.urlretrieve(url, dest) @@ -1977,6 +2027,8 @@ def install_python_stack() -> int: base_total += 1 # ROCm torch check (line 1526) -- all non-macOS platforms if not IS_WINDOWS: base_total += 2 # flash-attn (line 1620) + ROCm torch final (line 1705) -- Linux only + if not NO_TORCH and package_name == "unsloth": + base_total += 1 # pyproject.toml [eval] extra _TOTAL = (base_total - 1) if skip_base else base_total # 1. Try uv for faster installs (before pip upgrade -- uv venvs don't @@ -2197,6 +2249,8 @@ def _win_amd_smi_has_gpu(stdout: str) -> bool: "Manual install may be required. See: https://docs.unsloth.ai/get-started/install-and-update/amd", ) + _install_eval_extra(package_name = package_name, local_repo = local_repo) + # 3. Extra dependencies _progress("unsloth extras") pip_install( diff --git a/unsloth_cli/__init__.py b/unsloth_cli/__init__.py index 440b6276cd..c6e866f7c0 100644 --- a/unsloth_cli/__init__.py +++ b/unsloth_cli/__init__.py @@ -13,6 +13,7 @@ from unsloth_cli.commands.chat import chat from unsloth_cli.commands.connect import connect_app from unsloth_cli.commands.export import export, list_checkpoints +from unsloth_cli.commands.eval import evaluate as eval_command from unsloth_cli.commands.studio import ( run as studio_run, studio_app, @@ -76,6 +77,7 @@ def main( app.command()(inference) app.command()(chat) app.command()(export) +app.command("eval")(eval_command) app.command("list-checkpoints")(list_checkpoints) app.add_typer(studio_app, name = "studio", help = "Unsloth Studio commands.") app.add_typer( diff --git a/unsloth_cli/commands/eval.py b/unsloth_cli/commands/eval.py new file mode 100644 index 0000000000..9c91c00fd7 --- /dev/null +++ b/unsloth_cli/commands/eval.py @@ -0,0 +1,876 @@ +# SPDX-License-Identifier: AGPL-3.0-only +# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved. See /studio/LICENSE.AGPL-3.0 + +import contextlib +import json +import os +import re +import shutil +import sys +import tempfile +from pathlib import Path +from typing import List, Optional, Tuple + +import typer +import yaml + + +@contextlib.contextmanager +def _spinner(console, text): + from rich.live import Live + from rich.spinner import Spinner + with Live( + Spinner("dots", text = text, style = "cyan"), + console = console, + transient = True, + refresh_per_second = 12, + redirect_stdout = False, + redirect_stderr = False, + ): + yield + + +@contextlib.contextmanager +def _silence(): + from rich.console import Console + + sys.stdout.flush() + sys.stderr.flush() + real = os.fdopen(os.dup(1), "w", closefd = True) + saved_out, saved_err = os.dup(1), os.dup(2) + devnull_fd = os.open(os.devnull, os.O_WRONLY) + try: + os.dup2(devnull_fd, 1) + os.dup2(devnull_fd, 2) + yield Console(file = real) + finally: + sys.stdout.flush() + sys.stderr.flush() + os.dup2(saved_out, 1) + os.dup2(saved_err, 2) + os.close(saved_out) + os.close(saved_err) + os.close(devnull_fd) + real.close() + + +def _read_adapter_base(config: Path) -> Optional[str]: + # ValueError covers both JSONDecodeError and UnicodeDecodeError + try: + data = json.loads(config.read_text(encoding = "utf-8")) + except (ValueError, OSError): + return None + if not isinstance(data, dict): + return None + return data.get("base_model_name_or_path") + + +_TOKENIZER_FILES = ("tokenizer_config.json", "tokenizer.json", "tokenizer.model") +_HUB_REPO_RE = re.compile(r"[\w.\-]+/[\w.\-]+") + + +def _has_tokenizer_files(model: str) -> bool: + path = Path(model) + if path.is_dir(): + return any((path / name).exists() for name in _TOKENIZER_FILES) + if path.exists() or not _HUB_REPO_RE.fullmatch(model): + return False + try: + from huggingface_hub import list_repo_files + files = set(list_repo_files(model)) + except Exception: + return False + return any(name in files for name in _TOKENIZER_FILES) + + +def _bitsandbytes_available() -> bool: + from importlib.util import find_spec + return find_spec("bitsandbytes") is not None + + +def _lm_eval_available() -> bool: + # probe without importing: on lm-eval 0.4.4 `import lm_eval` pulls in + # transformers, which must stay unimported until unsloth has loaded + if "lm_eval" in sys.modules: + return sys.modules["lm_eval"] is not None + from importlib.util import find_spec + try: + return find_spec("lm_eval") is not None + except (ImportError, ValueError): + return False + + +def _lm_eval_version() -> tuple: + from importlib.metadata import PackageNotFoundError, version + for dist in ("lm_eval", "lm-eval"): + try: + parts = re.findall(r"\d+", version(dist))[:3] + return tuple(int(part) for part in parts) + except PackageNotFoundError: + continue + except Exception: + break + # unknown version: don't block devices the runtime may well support + return (999,) + + +def _hf_device_error(device: str) -> Optional[str]: + # lm-eval's HFLM only recognises 'cuda', canonical 'cuda:', 'mps' and + # 'mps:0'; anything else (cuda0, cuda:, cuda:01, an out-of-range index) + # silently falls back to its default device, so reject those up front + if device.startswith("cuda"): + match = re.fullmatch(r"cuda(?::(0|[1-9]\d*))?", device) + if not match: + return f"invalid --device '{device}' — use 'cuda' or 'cuda:'." + import torch + + if not torch.cuda.is_available(): + return f"--device {device} requested but CUDA is not available." + if match.group(1) is not None: + idx = int(match.group(1)) + count = torch.cuda.device_count() + if idx >= count: + return f"--device {device} requested but only {count} CUDA device(s) are available." + elif device.startswith("mps"): + if not re.fullmatch(r"mps(?::0)?", device): + return f"invalid --device '{device}' — use 'mps'." + import torch + + mps = getattr(torch.backends, "mps", None) + if not (mps and mps.is_available()): + return f"--device {device} requested but MPS is not available." + elif device != "cpu": + match = re.fullmatch(r"(npu|xpu|hpu):(\d+)", device) + if not match: + # a typo like 'cpuu' or 'cude' would silently fall back to HFLM's + # default device + return ( + f"invalid --device '{device}' — use 'cpu', 'cuda[:]', 'mps', " + "or ':'." + ) + # an unavailable or out-of-range accelerator would also silently fall + # back, so validate against the installed torch build like cuda above + kind, index = match.group(1), int(match.group(2)) + if kind in ("xpu", "hpu") and _lm_eval_version() < (0, 4, 10): + # HFLM only enumerated cuda/cpu/mps/npu before 0.4.10; xpu/hpu + # strings fell through to its silent default-device fallback + return ( + f"--device {device} needs lm-eval >= 0.4.10 — upgrade with " + "`pip install -U lm_eval`." + ) + import torch + + backend_mod = getattr(torch, kind, None) + try: + available = bool(backend_mod is not None and backend_mod.is_available()) + except Exception: + available = False + if not available: + return ( + f"--device {device} requested but {kind.upper()} is not available " + "in this torch build." + ) + try: + count = int(backend_mod.device_count()) + except Exception: + count = 0 + if index >= count: + return ( + f"--device {device} requested but only {count} {kind.upper()} " + "device(s) are available." + ) + return None + + +def _registry_names(manager) -> set: + return ( + set(getattr(manager, "all_tasks", []) or []) + | set(getattr(manager, "all_groups", []) or []) + | set(getattr(manager, "all_tags", []) or []) + ) + + +def resolve_base_model(model: str) -> Optional[str]: + path = Path(model) + if path.is_dir(): + config = path / "adapter_config.json" + return _read_adapter_base(config) if config.exists() else None + # adapter-only Hub repos carry adapter_config.json but no config.json, so + # they cannot be passed to lm-eval as `pretrained` — detect them up front + if path.exists() or not _HUB_REPO_RE.fullmatch(model): + return None + try: + from huggingface_hub import hf_hub_download + config_path = hf_hub_download(model, "adapter_config.json") + except Exception: + return None + return _read_adapter_base(Path(config_path)) + + +class _TaskYamlLoader(yaml.SafeLoader): + """safe_load that tolerates lm-eval's custom tags (!function utils.fn).""" + + +# map local tags to their raw scalar so a valid lm-eval config parses for +# name extraction; TaskManager loads the original file with its own loader +_TaskYamlLoader.add_multi_constructor( + "!", lambda loader, suffix, node: getattr(node, "value", None) +) + + +def _load_task_spec( + path: Path, + depth: int = 0, + first_include_wins: bool = False, +) -> dict: + # the task/group name may live in an included base config, which lm-eval + # resolves during indexing — mirror that (child keys override the base); + # depth-limited in case of include cycles. Current lm-eval merges include + # lists in listed order (later wins); some older releases merged in + # reverse, so callers compare both orders and reject specs whose name + # depends on it. + spec = yaml.load(path.read_text(encoding = "utf-8"), Loader = _TaskYamlLoader) or {} + includes = spec.get("include") if isinstance(spec, dict) else None + if not includes or depth >= 8: + return spec + if isinstance(includes, str): + includes = [includes] + ordered = list(reversed(includes)) if first_include_wins else list(includes) + merged: dict = {} + for include in ordered: + # lm-eval resolves relative includes against the including file's + # directory, never the current working directory + include_path = Path(include) + if not include_path.is_absolute(): + include_path = path.parent / include + try: + base = _load_task_spec(include_path, depth + 1, first_include_wins) + except (OSError, yaml.YAMLError): + continue + if isinstance(base, dict): + merged.update(base) + merged.update(spec) + return merged + + +def _sibling_defines_task(directory: Path, group_file: Path, child: str) -> bool: + # rglob: lm-eval indexes include paths recursively, so a child yaml in a + # subdirectory shadows just the same + for sibling in sorted(directory.rglob("*.yaml")): + if sibling == group_file: + continue + try: + spec = _load_task_spec(sibling) + except (OSError, yaml.YAMLError): + continue + if isinstance(spec, dict) and isinstance(spec.get("task"), str) and spec["task"] == child: + return True + return False + + +def _doc_column(key: str) -> str: + # a jinja template stringifies the value (needed e.g. for numeric answer + # columns in few-shot prompts), but jinja can't parse keys that aren't + # plain identifiers ("prompt-text", "expected answer") or that collide + # with its keywords/literals — lm-eval treats a raw column name as a + # direct lookup, so fall back to that for such keys + import keyword + if key.isidentifier() and not keyword.iskeyword(key) and key not in ("true", "false", "none"): + return "{{" + key + "}}" + return key + + +def make_jsonl_task( + data_file: Path, + input_key: str, + target_key: str, + out_dir: Path, + reserved: frozenset = frozenset(), +) -> str: + data_file = Path(data_file).resolve() + out_dir = Path(out_dir) + out_dir.mkdir(parents = True, exist_ok = True) + # a generated task must not shadow a registered task (gsm8k.jsonl vs the + # gsm8k benchmark) or an earlier dataset with the same stem + base_name = data_file.stem + task_name = base_name + counter = 2 + while task_name in reserved or (out_dir / f"{task_name}.yaml").exists(): + task_name = f"{base_name}_{counter}" + counter += 1 + if task_name != base_name: + typer.echo( + f"Note: task name '{base_name}' is taken — running dataset " + f"'{data_file.name}' as '{task_name}'." + ) + builder = "json" if data_file.suffix.lower() in {".json", ".jsonl"} else "csv" + task_spec = { + "task": task_name, + "dataset_path": builder, + "dataset_kwargs": {"data_files": str(data_file)}, + "test_split": "train", + # explicit few-shot source so --num-fewshot works on every lm-eval + # version we support (the file has a single split) + "fewshot_split": "train", + "output_type": "generate_until", + "doc_to_text": _doc_column(input_key), + "doc_to_target": _doc_column(target_key), + "generation_kwargs": {"until": ["\n"]}, + # strip surrounding whitespace so " 2" matches gold "2": lm-eval's + # regex filter runs re.findall, which with one capture group yields + # the group's text; group_select indexes those matches, not groups + "filter_list": [ + { + "name": "strip", + "filter": [ + {"function": "regex", "regex_pattern": r"^\s*(.*?)\s*$", "group_select": 0}, + {"function": "take_first"}, + ], + }, + ], + "metric_list": [ + {"metric": "exact_match", "aggregation": "mean", "higher_is_better": True}, + ], + } + (out_dir / f"{task_name}.yaml").write_text( + yaml.safe_dump(task_spec, sort_keys = False), encoding = "utf-8" + ) + return task_name + + +def resolve_tasks( + tasks: str, + input_key: str, + target_key: str, + tmp_dir: Path, + reserved: frozenset = frozenset(), +) -> Tuple[List[str], List[str]]: + include_paths: List[str] = [] + sibling_names: set = set() + yaml_names: set = set() + # (kind, value) in argument order; datasets are generated in a second + # pass so every yaml/group/child name is known first — the names a + # generated task gets must not depend on argument order + entries: List[Tuple[str, object]] = [] + + def _add_include(directory: str) -> None: + if directory not in include_paths: + include_paths.append(directory) + + for raw in tasks.split(","): + entry = raw.strip() + if not entry: + continue + suffix = Path(entry).suffix.lower() + + if suffix in {".yaml", ".yml"}: + path = Path(entry) + if not path.exists(): + raise FileNotFoundError(f"Custom task file not found: {entry}") + text = path.read_text(encoding = "utf-8") + try: + spec = _load_task_spec(path) or {} + except yaml.YAMLError as e: + raise ValueError(f"Invalid YAML in custom task file '{entry}': {e}") from e + if not isinstance(spec, dict): + raise ValueError(f"Custom task file '{entry}' must define a YAML mapping.") + if "include" in spec: + # lm-eval versions disagree on include precedence (older ones + # merged last-to-first), so a name that changes with the merge + # order cannot be trusted on either side + alt = _load_task_spec(path, first_include_wins = True) or {} + if isinstance(alt, dict) and (spec.get("task"), spec.get("group")) != ( + alt.get("task"), + alt.get("group"), + ): + raise ValueError( + f"Custom task file '{entry}' gets its task/group name from its " + "include: files, and the winner depends on the lm-eval version's " + "include order. Set 'task:' (or 'group:') in the top-level file." + ) + name = spec.get("task") + if isinstance(name, list): + # a group file (group: suite, task: [a, b]) is registered + # under its group name; its child task names are taken too, + # so later dataset entries must not generate a clashing task + for child in name: + child_name = None + if isinstance(child, str): + child_name = child + elif isinstance(child, dict) and child.get("task"): + child_name = str(child["task"]) + if not child_name: + continue + sibling_names.add(child_name) + # a string child that names a registered task AND a sibling + # yaml is ambiguous: which one runs depends on the lm-eval + # version's registry precedence + if ( + isinstance(child, str) + and child_name in reserved + and _sibling_defines_task(path.resolve().parent, path.resolve(), child_name) + ): + raise ValueError( + f"Custom task file '{entry}' lists child task '{child_name}', " + "which is both a registered lm-eval task and defined by a " + "sibling YAML in the same directory — which one runs depends " + "on the lm-eval version. Rename the sibling task." + ) + name = spec.get("group") + if not name: + raise ValueError( + f"Custom task file '{entry}' defines a task list but no 'group:' name." + ) + if not name: + raise ValueError(f"Custom task file '{entry}' is missing a 'task:' name.") + # tag: (and legacy string group:) values register alias names in + # lm-eval's index, so generated datasets must avoid them too + for alias_key in ("tag", "group"): + alias_value = spec.get(alias_key) + for alias in alias_value if isinstance(alias_value, list) else [alias_value]: + if isinstance(alias, str) and alias: + sibling_names.add(alias) + name = str(name) + if name in reserved: + raise ValueError( + f"Custom task file '{entry}' redefines '{name}', which is already a " + "registered lm-eval task — the registered one would silently win. " + "Rename the task in the YAML." + ) + if name in yaml_names: + raise ValueError(f"Duplicate task name '{name}' in --tasks.") + if "include" in spec or isinstance(spec.get("task"), list) or "!function" in text: + # include-bearing, group and !function configs reference + # sibling files (base yaml, subtasks, helper modules), so + # their directory must stay on the include path — which + # only works for .yaml, the sole extension lm-eval indexes + if suffix == ".yml": + raise ValueError( + f"Custom task file '{entry}' references sibling files " + "(include:/group/!function) but is a .yml file — lm-eval only " + "indexes .yaml files, so it would never register. Rename it " + "(and the files it references) to .yaml." + ) + _add_include(str(path.resolve().parent)) + else: + # copy just this file into the temp include dir so a broken + # sibling yaml can't take down TaskManager's include scan + # (this also normalises .yml, which lm-eval doesn't index) + custom_dir = Path(tmp_dir) / "custom" + custom_dir.mkdir(parents = True, exist_ok = True) + shutil.copy2(path, custom_dir / f"{name}.yaml") + _add_include(str(custom_dir.resolve())) + yaml_names.add(name) + entries.append(("yaml", name)) + + elif suffix in {".jsonl", ".json", ".csv"}: + path = Path(entry) + if not path.exists(): + raise FileNotFoundError(f"Dataset file not found: {entry}") + entries.append(("dataset", path)) + + else: + entries.append(("plain", entry)) + + names: List[str] = [] + for kind, value in entries: + if kind == "dataset": + gen_dir = Path(tmp_dir) / "generated" + # every yaml task, group child and earlier name counts as taken + names.append( + make_jsonl_task( + value, + input_key, + target_key, + gen_dir, + reserved | frozenset(names) | yaml_names | frozenset(sibling_names), + ) + ) + _add_include(str(gen_dir.resolve())) + else: + if value in names: + message = ( + f"Duplicate task name '{value}' in --tasks." + if kind == "yaml" + else f"Duplicate task '{value}' in --tasks." + ) + raise ValueError(message) + names.append(value) + + if not names: + raise ValueError("No tasks provided. Pass --tasks with at least one task.") + return names, include_paths + + +def _metric_number(value): + # numpy float32/int64 aren't int/float subclasses; unwrap scalars via item() + if isinstance(value, (int, float)): + return value + item = getattr(value, "item", None) + if callable(item): + try: + value = item() + except Exception: + return None + if isinstance(value, (int, float)): + return value + return None + + +def _json_default(value): + # numpy/torch scalars and arrays serialise as numbers/lists, not strings, + # so results.json agrees numerically with the in-memory results + tolist = getattr(value, "tolist", None) + if callable(tolist): + try: + return tolist() + except Exception: + pass + return str(value) + + +def _render_results(results: dict) -> None: + from rich.console import Console + from rich.table import Table + + table = Table(title = "Evaluation results") + table.add_column("Task", style = "cyan") + table.add_column("Metric") + table.add_column("Value", justify = "right") + table.add_column("± stderr", justify = "right") + + rows = dict(results.get("results", {}) or {}) + # group aggregates (mmlu, custom suites) live in a separate section + for task, metrics in (results.get("groups") or {}).items(): + rows.setdefault(task, metrics) + + for task, metrics in rows.items(): + for key, raw_value in metrics.items(): + if key == "alias" or "_stderr" in key: + continue + value = _metric_number(raw_value) + if value is None: + continue + metric, _, flt = key.partition(",") + stderr_key = f"{metric}_stderr,{flt}" if flt else f"{metric}_stderr" + stderr = _metric_number(metrics.get(stderr_key)) + stderr_str = f"{stderr:.4f}" if stderr is not None else "—" + table.add_row(task, key, f"{value:.4f}", stderr_str) + + Console().print(table) + + +def evaluate( + model: str = typer.Argument( + ..., help = "Path to a checkpoint/adapter directory or a HuggingFace model id." + ), + tasks: str = typer.Option( + ..., + "--tasks", + "-t", + help = "Comma-separated built-in task names (e.g. mmlu,gsm8k), or a path to a " + "custom .yaml task or a .jsonl/.csv dataset.", + ), + base_model: Optional[str] = typer.Option( + None, + "--base-model", + help = "Base model for a LoRA adapter. Auto-detected from adapter_config.json; " + "set this to override a moved/renamed base.", + ), + num_fewshot: Optional[int] = typer.Option( + None, "--num-fewshot", "-n", help = "Few-shot examples (default: per-task)." + ), + limit: Optional[int] = typer.Option( + None, "--limit", help = "Cap examples per task (for quick smoke tests)." + ), + batch_size: str = typer.Option("auto", "--batch-size", "-b", help = "Batch size, or 'auto'."), + max_seq_length: int = typer.Option( + 2048, "--max-seq-length", help = "Max sequence length for the model." + ), + load_in_4bit: bool = typer.Option( + True, "--load-in-4bit/--no-load-in-4bit", help = "Load the model in 4-bit." + ), + backend: str = typer.Option( + "unsloth", + "--backend", + help = "Model backend: 'unsloth' (fast kernels; needs an NVIDIA/AMD/Intel " + "GPU) or 'hf' (plain transformers; works on CPU/MPS/Mac). " + "Auto-falls back to 'hf' on Apple Silicon.", + ), + device: Optional[str] = typer.Option( + None, + "--device", + help = "Device for the hf backend (e.g. cpu, mps, cuda). Default: auto.", + ), + input_key: str = typer.Option( + "question", "--input-key", help = "Prompt field for a .jsonl/.csv dataset task." + ), + target_key: str = typer.Option( + "answer", "--target-key", help = "Answer field for a .jsonl/.csv dataset task." + ), + output_dir: Path = typer.Option( + Path("./eval_results"), "--output-dir", "-o", help = "Directory for results.json." + ), + hf_token: Optional[str] = typer.Option( + None, "--hf-token", envvar = "HF_TOKEN", help = "HuggingFace token if needed." + ), +): + """Evaluate a checkpoint or LoRA adapter using lm-eval-harness.""" + if batch_size == "auto": + bs = "auto" + else: + try: + bs = int(batch_size) + if bs <= 0: + raise ValueError + except ValueError: + typer.echo("Error: --batch-size must be a positive integer or 'auto'.", err = True) + raise typer.Exit(code = 2) + + if backend not in ("unsloth", "hf"): + typer.echo(f"Error: --backend must be 'unsloth' or 'hf', got '{backend}'.", err = True) + raise typer.Exit(code = 2) + + if num_fewshot is not None and num_fewshot < 0: + # lm-eval treats a negative count as zero-shot while recording the + # bogus value in the results metadata + typer.echo("Error: --num-fewshot must be >= 0.", err = True) + raise typer.Exit(code = 2) + + if limit is not None and limit <= 0: + # lm-eval reads values below 1 as a dataset fraction: 0 builds no + # requests and crashes, negatives take an unintended slice + typer.echo("Error: --limit must be a positive integer.", err = True) + raise typer.Exit(code = 2) + + if max_seq_length <= 0: + # HFLM treats a falsy 0 as unset (silently dropping the cap) and + # uses negatives in truncation arithmetic + typer.echo("Error: --max-seq-length must be a positive integer.", err = True) + raise typer.Exit(code = 2) + + if not _lm_eval_available(): + typer.echo( + "Error: evaluation requires lm-eval. Install it with `pip install unsloth[eval]`.", + err = True, + ) + raise typer.Exit(code = 1) + + if backend == "unsloth": + # unsloth must be imported before transformers (which lm-eval pulls + # in) or its patches don't fully apply + with _silence(): + import unsloth + + if getattr(unsloth, "DEVICE_TYPE", None) == "mlx": + typer.echo( + "Note: Apple Silicon (MLX) detected — falling back to " + "--backend hf (plain transformers)." + ) + backend = "hf" + + # a pre-loaded model object makes lm-eval single-process (rank 0 + # everywhere), so under accelerate/torchrun every worker would run + # the full task set and write results + if backend == "unsloth" and os.environ.get("WORLD_SIZE", "1") not in ("", "1"): + typer.echo( + "Error: multi-process launches (accelerate/torchrun) are not " + "supported with --backend unsloth. Use --backend hf for " + "multi-GPU evaluation.", + err = True, + ) + raise typer.Exit(code = 2) + + try: + import lm_eval + from lm_eval.models.huggingface import HFLM + from lm_eval.tasks import TaskManager + except ImportError as e: + typer.echo( + "Error: evaluation requires lm-eval. Install it with `pip install unsloth[eval]`.", + err = True, + ) + raise typer.Exit(code = 1) from e + + if hf_token: + os.environ["HF_TOKEN"] = hf_token # both backends read it from the env + + # --base-model => treat as an adapter on this base (and skip the + # local/Hub adapter_config.json lookup) + effective_base = base_model or resolve_base_model(model) + + tmp_dir = Path(tempfile.mkdtemp(prefix = "unsloth_eval_")) + try: + # a dataset or custom task named after a registered task (gsm8k.jsonl, + # task: gsm8k) must not be shadowed by the built-in benchmark, so + # collect registry names first + base_manager = None + reserved: frozenset = frozenset() + if any( + Path(e.strip()).suffix.lower() in {".jsonl", ".json", ".csv", ".yaml", ".yml"} + for e in tasks.split(",") + ): + base_manager = TaskManager() + reserved = frozenset(_registry_names(base_manager)) + + try: + task_names, include_paths = resolve_tasks( + tasks, input_key, target_key, tmp_dir, reserved = reserved + ) + except (FileNotFoundError, ValueError) as e: + typer.echo(f"Error: {e}", err = True) + raise typer.Exit(code = 2) from e + + # reuse for validation and the eval run + if include_paths: + task_manager = TaskManager(include_path = include_paths) + else: + task_manager = base_manager or TaskManager() + + registered = getattr(task_manager, "all_tasks", None) + if registered: + known = _registry_names(task_manager) + unknown = [t for t in task_names if t not in known] + if unknown: + typer.echo( + f"Error: unknown task(s): {', '.join(unknown)}. Pass a built-in " + "task name, a .yaml task file, or a .jsonl/.csv dataset.", + err = True, + ) + raise typer.Exit(code = 2) + + if num_fewshot and any((tmp_dir / "generated" / f"{t}.yaml").exists() for t in task_names): + raw_keys = [k for k in dict.fromkeys((input_key, target_key)) if _doc_column(k) == k] + if raw_keys: + # raw column lookups feed unstringified values into lm-eval's + # few-shot prompt builder, which fails on non-string data + typer.echo( + "Error: --num-fewshot needs plain-identifier column names for a " + f"dataset task; rename column(s) {', '.join(map(repr, raw_keys))} " + "or drop --num-fewshot.", + err = True, + ) + raise typer.Exit(code = 2) + typer.echo( + "Note: few-shot examples for a generated task come from the same " + "file (no held-out split)." + ) + + typer.echo(f"Running tasks: {', '.join(task_names)} (backend: {backend})") + + eval_kwargs = dict( + tasks = task_names, + num_fewshot = num_fewshot, + limit = limit, + task_manager = task_manager, + log_samples = False, + ) + + if backend == "hf": + if device is None: + import torch + if torch.cuda.is_available(): + device = "cuda" + elif getattr(torch.backends, "mps", None) and torch.backends.mps.is_available(): + device = "mps" + else: + device = "cpu" + else: + device_error = _hf_device_error(device) + if device_error: + typer.echo(f"Error: {device_error}", err = True) + raise typer.Exit(code = 2) + if bs == "auto" and not device.startswith("cuda"): + typer.echo( + "Note: batch_size 'auto' is slow on CPU/MPS — using 1 (override with --batch-size)." + ) + bs = 1 + # dict form: a comma in a path can't corrupt key=value parsing + if effective_base: + model_args = {"pretrained": effective_base, "peft": model} + # adapters that saved their own tokenizer (added tokens etc.) + # must not be scored with the base tokenizer + if _has_tokenizer_files(model): + model_args["tokenizer"] = model + typer.echo(f"Evaluating adapter '{model}' on base '{effective_base}'.") + else: + model_args = {"pretrained": model} + model_args["max_length"] = max_seq_length + if load_in_4bit and device.startswith("cuda"): + if _bitsandbytes_available(): + model_args["load_in_4bit"] = True + else: + typer.echo( + "Note: bitsandbytes is not installed — loading in full " + "precision (`pip install bitsandbytes` to enable 4-bit)." + ) + eval_kwargs.update( + model = "hf", + model_args = model_args, + batch_size = bs, + device = device, + ) + else: + from unsloth import FastLanguageModel + + load_kwargs = dict( + max_seq_length = max_seq_length, + load_in_4bit = load_in_4bit, + token = hf_token or None, + ) + if effective_base: + typer.echo(f"Loading base model '{effective_base}' with adapter '{model}'...") + with _silence(): + lmodel, tokenizer = FastLanguageModel.from_pretrained( + model_name = effective_base, **load_kwargs + ) + # adapters that saved their own tokenizer (added tokens + # etc.) must not be scored with the base tokenizer, and + # the embeddings must match its vocab before the adapter + # weights are applied or PEFT fails on a size mismatch + if _has_tokenizer_files(model): + from transformers import AutoTokenizer + + tokenizer = AutoTokenizer.from_pretrained(model) + embeddings = lmodel.get_input_embeddings() + if embeddings is not None and embeddings.weight.shape[0] != len(tokenizer): + lmodel.resize_token_embeddings(len(tokenizer)) + from peft import PeftModel + + lmodel = PeftModel.from_pretrained(lmodel, model) + else: + typer.echo(f"Loading model: {model}") + with _silence(): + lmodel, tokenizer = FastLanguageModel.from_pretrained( + model_name = model, **load_kwargs + ) + with _silence(): + FastLanguageModel.for_inference(lmodel) + lm = HFLM( + pretrained = lmodel, + tokenizer = tokenizer, + batch_size = bs, + max_length = max_seq_length, + ) + eval_kwargs["model"] = lm + + with _silence() as ui, _spinner(ui, f"Evaluating {', '.join(task_names)}…"): + results = lm_eval.simple_evaluate(**eval_kwargs) + finally: + shutil.rmtree(tmp_dir, ignore_errors = True) + + if results is None: + # lm-eval hands results only to rank 0 of a multi-process run + # (accelerate/torchrun); worker ranks get None and must exit cleanly + if os.environ.get("RANK", "0") != "0" or os.environ.get("LOCAL_RANK", "0") != "0": + return + typer.echo("Error: evaluation returned no results.", err = True) + raise typer.Exit(code = 1) + + _render_results(results) + + output_dir = Path(output_dir) + output_dir.mkdir(parents = True, exist_ok = True) + results_path = output_dir / "results.json" + results_path.write_text(json.dumps(results, indent = 2, default = _json_default), encoding = "utf-8") + typer.echo(f"Saved results to: {results_path}") diff --git a/unsloth_cli/tests/test_eval.py b/unsloth_cli/tests/test_eval.py new file mode 100644 index 0000000000..dff8b46730 --- /dev/null +++ b/unsloth_cli/tests/test_eval.py @@ -0,0 +1,1396 @@ +# SPDX-License-Identifier: AGPL-3.0-only +# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved. See /studio/LICENSE.AGPL-3.0 + +from __future__ import annotations + +import json +import os +import sys +import types +from pathlib import Path +from types import SimpleNamespace + +_REPO_ROOT = Path(__file__).resolve().parents[2] +if str(_REPO_ROOT) not in sys.path: + sys.path.insert(0, str(_REPO_ROOT)) + +import typer +import pytest +import yaml +from typer.testing import CliRunner + +import unsloth_cli.commands.eval as evalmod + + +def _eval_app(): + cli = typer.Typer() + cli.command()(evalmod.evaluate) + return cli + + +def test_resolve_base_model_reads_adapter_config(tmp_path): + (tmp_path / "adapter_config.json").write_text( + json.dumps({"base_model_name_or_path": "unsloth/Llama-3.2-1B"}) + ) + assert evalmod.resolve_base_model(str(tmp_path)) == "unsloth/Llama-3.2-1B" + + +def test_resolve_base_model_none_for_plain_dir(tmp_path): + assert evalmod.resolve_base_model(str(tmp_path)) is None + + +def test_resolve_base_model_none_for_missing_path(): + assert evalmod.resolve_base_model("/no/such/dir") is None + + +def test_resolve_base_model_none_for_non_dict_config(tmp_path): + (tmp_path / "adapter_config.json").write_text(json.dumps(["not", "a", "dict"])) + assert evalmod.resolve_base_model(str(tmp_path)) is None + + +def test_resolve_base_model_finds_hub_adapter(tmp_path, monkeypatch): + remote_config = tmp_path / "adapter_config.json" + remote_config.write_text(json.dumps({"base_model_name_or_path": "unsloth/Llama-3.2-1B"})) + + hub_mod = types.ModuleType("huggingface_hub") + + def fake_download(repo_id, filename, **kwargs): + assert repo_id == "someuser/my-lora" + assert filename == "adapter_config.json" + return str(remote_config) + + hub_mod.hf_hub_download = fake_download + monkeypatch.setitem(sys.modules, "huggingface_hub", hub_mod) + + assert evalmod.resolve_base_model("someuser/my-lora") == "unsloth/Llama-3.2-1B" + + +def test_resolve_base_model_none_when_hub_lookup_fails(monkeypatch): + hub_mod = types.ModuleType("huggingface_hub") + + def fake_download(*args, **kwargs): + raise RuntimeError("no adapter_config.json in repo") + + hub_mod.hf_hub_download = fake_download + monkeypatch.setitem(sys.modules, "huggingface_hub", hub_mod) + + assert evalmod.resolve_base_model("someuser/full-model") is None + + +def test_make_jsonl_task_generates_expected_spec(tmp_path): + data = tmp_path / "qa.jsonl" + data.write_text('{"question": "1+1?", "answer": "2"}\n') + out_dir = tmp_path / "tasks" + + name = evalmod.make_jsonl_task(data, "question", "answer", out_dir) + + assert name == "qa" + spec = yaml.safe_load((out_dir / "qa.yaml").read_text()) + assert spec["task"] == "qa" + assert spec["dataset_path"] == "json" + assert spec["dataset_kwargs"]["data_files"] == str(data.resolve()) + assert spec["doc_to_text"] == "{{question}}" + assert spec["doc_to_target"] == "{{answer}}" + assert spec["metric_list"][0]["metric"] == "exact_match" + assert spec["fewshot_split"] == "train" + + +def test_make_jsonl_task_honours_custom_keys(tmp_path): + data = tmp_path / "prompts.csv" + data.write_text("prompt,label\nhi,hello\n") + name = evalmod.make_jsonl_task(data, "prompt", "label", tmp_path / "t") + + spec = yaml.safe_load((tmp_path / "t" / "prompts.yaml").read_text()) + assert name == "prompts" + assert spec["dataset_path"] == "csv" + assert spec["doc_to_text"] == "{{prompt}}" + assert spec["doc_to_target"] == "{{label}}" + + +def test_make_jsonl_task_uses_raw_lookup_for_non_identifier_keys(tmp_path): + data = tmp_path / "weird.jsonl" + data.write_text('{"prompt-text": "1+1?", "expected answer": "2"}\n') + evalmod.make_jsonl_task(data, "prompt-text", "expected answer", tmp_path / "t") + + spec = yaml.safe_load((tmp_path / "t" / "weird.yaml").read_text()) + # jinja can't parse these keys; lm-eval resolves raw column names directly + assert spec["doc_to_text"] == "prompt-text" + assert spec["doc_to_target"] == "expected answer" + + +def test_make_jsonl_task_avoids_reserved_names(tmp_path): + data = tmp_path / "gsm8k.jsonl" + data.write_text('{"question": "q", "answer": "a"}\n') + + name = evalmod.make_jsonl_task( + data, "question", "answer", tmp_path / "t", reserved = frozenset({"gsm8k"}) + ) + + assert name == "gsm8k_2" + assert (tmp_path / "t" / "gsm8k_2.yaml").exists() + + +def test_has_tokenizer_files_checks_hub_repo(monkeypatch): + hub_mod = types.ModuleType("huggingface_hub") + hub_mod.list_repo_files = lambda repo_id: [ + "adapter_config.json", + "tokenizer_config.json", + "tokenizer.json", + ] + monkeypatch.setitem(sys.modules, "huggingface_hub", hub_mod) + + assert evalmod._has_tokenizer_files("someuser/my-lora") is True + + +def test_has_tokenizer_files_false_when_hub_listing_fails(monkeypatch): + hub_mod = types.ModuleType("huggingface_hub") + + def _fail(repo_id): + raise RuntimeError("offline") + + hub_mod.list_repo_files = _fail + monkeypatch.setitem(sys.modules, "huggingface_hub", hub_mod) + + assert evalmod._has_tokenizer_files("someuser/my-lora") is False + + +def test_resolve_tasks_builtin_names(tmp_path): + names, includes = evalmod.resolve_tasks("mmlu, gsm8k", "question", "answer", tmp_path) + assert names == ["mmlu", "gsm8k"] + assert includes == [] + + +def test_resolve_tasks_custom_yaml_copied_to_include_dir(tmp_path): + src = tmp_path / "src" + src.mkdir() + task_file = src / "custom.yaml" + task_file.write_text(yaml.safe_dump({"task": "my_task", "output_type": "generate_until"})) + # a broken sibling must not end up on the include path + (src / "broken.yaml").write_text("task: [unclosed") + tmp_dir = tmp_path / "gen" + + names, includes = evalmod.resolve_tasks(str(task_file), "question", "answer", tmp_dir) + + custom_dir = tmp_dir / "custom" + assert names == ["my_task"] + assert includes == [str(custom_dir.resolve())] + assert (custom_dir / "my_task.yaml").exists() + assert not (custom_dir / "broken.yaml").exists() + + +def test_resolve_tasks_yml_normalised_to_yaml(tmp_path): + task_file = tmp_path / "custom.yml" + task_file.write_text(yaml.safe_dump({"task": "my_task", "output_type": "generate_until"})) + tmp_dir = tmp_path / "gen" + + names, _ = evalmod.resolve_tasks(str(task_file), "question", "answer", tmp_dir) + + assert names == ["my_task"] + # lm-eval only indexes .yaml files + assert (tmp_dir / "custom" / "my_task.yaml").exists() + + +def test_resolve_tasks_include_yaml_keeps_parent_dir(tmp_path): + task_file = tmp_path / "custom.yaml" + task_file.write_text(yaml.safe_dump({"task": "my_task", "include": "base.yaml"})) + + names, includes = evalmod.resolve_tasks(str(task_file), "question", "answer", tmp_path / "gen") + + assert names == ["my_task"] + # the config references a sibling file, so its directory stays included + assert includes == [str(tmp_path.resolve())] + + +def test_resolve_tasks_yaml_with_function_tag_keeps_parent_dir(tmp_path): + task_file = tmp_path / "custom.yaml" + task_file.write_text( + "task: fn_task\noutput_type: generate_until\n" + "process_docs: !function utils.process_docs\n" + ) + tmp_dir = tmp_path / "gen" + + names, includes = evalmod.resolve_tasks(str(task_file), "question", "answer", tmp_dir) + + assert names == ["fn_task"] + # !function imports resolve relative to the yaml, so utils.py must stay + # next to it — no copy into the temp dir + assert includes == [str(tmp_path.resolve())] + assert not (tmp_dir / "custom" / "fn_task.yaml").exists() + + +def test_resolve_tasks_task_name_from_included_base(tmp_path): + (tmp_path / "base.yaml").write_text( + yaml.safe_dump({"task": "from_base", "output_type": "generate_until"}) + ) + child = tmp_path / "child.yaml" + child.write_text(yaml.safe_dump({"include": "base.yaml", "dataset_path": "json"})) + + names, includes = evalmod.resolve_tasks(str(child), "question", "answer", tmp_path / "gen") + + # lm-eval resolves include: during indexing, so the name from the base + # config counts + assert names == ["from_base"] + assert includes == [str(tmp_path.resolve())] + + +def test_resolve_tasks_rejects_yml_group_config(tmp_path): + task_file = tmp_path / "suite.yml" + task_file.write_text(yaml.safe_dump({"group": "my_suite", "task": ["task_a", "task_b"]})) + with pytest.raises(ValueError, match = "only indexes .yaml"): + evalmod.resolve_tasks(str(task_file), "question", "answer", tmp_path / "gen") + + +def test_resolve_tasks_jsonl_generates_task(tmp_path): + data = tmp_path / "qa.jsonl" + data.write_text('{"question": "q", "answer": "a"}\n') + tmp_dir = tmp_path / "gen" + + names, includes = evalmod.resolve_tasks(str(data), "question", "answer", tmp_dir) + + gen_dir = tmp_dir / "generated" + assert names == ["qa"] + assert includes == [str(gen_dir.resolve())] + assert (gen_dir / "qa.yaml").exists() + + +def test_resolve_tasks_uniquifies_colliding_dataset_stems(tmp_path): + dir_a = tmp_path / "a" + dir_b = tmp_path / "b" + dir_a.mkdir() + dir_b.mkdir() + (dir_a / "qa.jsonl").write_text('{"question": "q", "answer": "a"}\n') + (dir_b / "qa.jsonl").write_text('{"question": "q2", "answer": "a2"}\n') + gen_dir = tmp_path / "gen" + + names, _ = evalmod.resolve_tasks( + f"{dir_a / 'qa.jsonl'},{dir_b / 'qa.jsonl'}", "question", "answer", gen_dir + ) + + assert names == ["qa", "qa_2"] + spec_a = yaml.safe_load((gen_dir / "generated" / "qa.yaml").read_text()) + spec_b = yaml.safe_load((gen_dir / "generated" / "qa_2.yaml").read_text()) + assert spec_a["dataset_kwargs"]["data_files"] == str((dir_a / "qa.jsonl").resolve()) + assert spec_b["dataset_kwargs"]["data_files"] == str((dir_b / "qa.jsonl").resolve()) + assert spec_b["task"] == "qa_2" + + +def test_resolve_tasks_reserves_group_child_names_for_datasets(tmp_path): + (tmp_path / "suite.yaml").write_text( + yaml.safe_dump({"group": "suite", "task": ["qa", {"task": "qa_inline"}]}) + ) + (tmp_path / "qa.jsonl").write_text('{"question": "q", "answer": "a"}\n') + tmp_dir = tmp_path / "gen" + + names, _ = evalmod.resolve_tasks( + f"{tmp_path / 'suite.yaml'},{tmp_path / 'qa.jsonl'}", "question", "answer", tmp_dir + ) + + # the dataset must not generate a task shadowing the suite's child 'qa' + assert names == ["suite", "qa_2"] + assert (tmp_dir / "generated" / "qa_2.yaml").exists() + + +def test_resolve_tasks_invalid_yaml_raises(tmp_path): + task_file = tmp_path / "broken.yaml" + task_file.write_text("task: [unclosed") + with pytest.raises(ValueError, match = "Invalid YAML"): + evalmod.resolve_tasks(str(task_file), "question", "answer", tmp_path) + + +def test_resolve_tasks_yaml_list_raises(tmp_path): + task_file = tmp_path / "list.yaml" + task_file.write_text(yaml.safe_dump(["not", "a", "mapping"])) + with pytest.raises(ValueError, match = "YAML mapping"): + evalmod.resolve_tasks(str(task_file), "question", "answer", tmp_path) + + +def test_resolve_tasks_group_yaml_uses_group_name(tmp_path): + task_file = tmp_path / "suite.yaml" + task_file.write_text(yaml.safe_dump({"group": "my_suite", "task": ["task_a", "task_b"]})) + + names, includes = evalmod.resolve_tasks(str(task_file), "question", "answer", tmp_path) + + assert names == ["my_suite"] + assert includes == [str(tmp_path.resolve())] + + +def test_resolve_tasks_group_yaml_without_group_raises(tmp_path): + task_file = tmp_path / "suite.yaml" + task_file.write_text(yaml.safe_dump({"task": ["task_a", "task_b"]})) + with pytest.raises(ValueError, match = "no 'group:' name"): + evalmod.resolve_tasks(str(task_file), "question", "answer", tmp_path) + + +def test_resolve_tasks_yaml_rejects_registered_name(tmp_path): + task_file = tmp_path / "clash.yaml" + task_file.write_text(yaml.safe_dump({"task": "gsm8k", "output_type": "generate_until"})) + with pytest.raises(ValueError, match = "redefines 'gsm8k'"): + evalmod.resolve_tasks( + str(task_file), "question", "answer", tmp_path, reserved = frozenset({"gsm8k"}) + ) + + +def test_resolve_tasks_rejects_duplicate_yaml_names(tmp_path): + for stem in ("one", "two"): + (tmp_path / f"{stem}.yaml").write_text( + yaml.safe_dump({"task": "same_task", "output_type": "generate_until"}) + ) + with pytest.raises(ValueError, match = "Duplicate task name 'same_task'"): + evalmod.resolve_tasks( + f"{tmp_path / 'one.yaml'},{tmp_path / 'two.yaml'}", + "question", + "answer", + tmp_path / "gen", + ) + + +def test_resolve_tasks_rejects_duplicate_builtins(tmp_path): + with pytest.raises(ValueError, match = "Duplicate task 'gsm8k'"): + evalmod.resolve_tasks("gsm8k,gsm8k", "question", "answer", tmp_path) + + +def test_resolve_tasks_renames_dataset_colliding_with_yaml_name(tmp_path): + (tmp_path / "foo.yaml").write_text( + yaml.safe_dump({"task": "foo", "output_type": "generate_until"}) + ) + (tmp_path / "foo.jsonl").write_text('{"question": "q", "answer": "a"}\n') + tmp_dir = tmp_path / "gen" + + names, _ = evalmod.resolve_tasks( + f"{tmp_path / 'foo.yaml'},{tmp_path / 'foo.jsonl'}", "question", "answer", tmp_dir + ) + + # the dataset must not silently shadow (or be shadowed by) the yaml task + assert names == ["foo", "foo_2"] + assert (tmp_dir / "generated" / "foo_2.yaml").exists() + + +def _fake_torch( + monkeypatch, + cuda_available = False, + device_count = 0, + mps_available = False, + xpu_available = False, + xpu_count = 0, +): + torch_mod = types.ModuleType("torch") + torch_mod.cuda = SimpleNamespace( + is_available = lambda: cuda_available, device_count = lambda: device_count + ) + torch_mod.backends = SimpleNamespace(mps = SimpleNamespace(is_available = lambda: mps_available)) + torch_mod.xpu = SimpleNamespace( + is_available = lambda: xpu_available, device_count = lambda: xpu_count + ) + monkeypatch.setitem(sys.modules, "torch", torch_mod) + + +def test_hf_device_error_validates_cuda_strings(monkeypatch): + _fake_torch(monkeypatch, cuda_available = True, device_count = 2) + assert evalmod._hf_device_error("cuda") is None + assert evalmod._hf_device_error("cuda:0") is None + assert evalmod._hf_device_error("cuda:1") is None + # lm-eval only recognises canonical cuda:; everything else falls back + for bad in ("cuda0", "cuda:", "cuda:01", "cuda:-1", "cudax"): + assert evalmod._hf_device_error(bad) is not None, bad + assert "only 2 CUDA" in evalmod._hf_device_error("cuda:2") + + +def test_hf_device_error_validates_mps_strings(monkeypatch): + _fake_torch(monkeypatch, mps_available = True) + assert evalmod._hf_device_error("mps") is None + assert evalmod._hf_device_error("mps:0") is None + assert evalmod._hf_device_error("mps:1") is not None + _fake_torch(monkeypatch, mps_available = False) + assert "MPS is not available" in evalmod._hf_device_error("mps") + + +def test_hf_device_error_rejects_unknown_literals(monkeypatch): + _fake_torch(monkeypatch) + assert evalmod._hf_device_error("cpu") is None + # typos would silently fall back to HFLM's default device + for bad in ("cpuu", "cude", "gpu", "xpu", "npu"): + assert "invalid --device" in evalmod._hf_device_error(bad), bad + + +def test_hf_device_error_validates_indexed_accelerators(monkeypatch): + # an unavailable or out-of-range accelerator would also silently fall back + _fake_torch(monkeypatch, xpu_available = True, xpu_count = 2) + assert evalmod._hf_device_error("xpu:0") is None + assert evalmod._hf_device_error("xpu:1") is None + assert "only 2 XPU" in evalmod._hf_device_error("xpu:2") + # this torch build has no npu/hpu module at all + assert "NPU is not available" in evalmod._hf_device_error("npu:0") + assert "HPU is not available" in evalmod._hf_device_error("hpu:0") + _fake_torch(monkeypatch, xpu_available = False) + assert "XPU is not available" in evalmod._hf_device_error("xpu:0") + + +def test_metric_number_unwraps_numpy_like_scalars(): + class _FakeScalar: + def __init__(self, value): + self._value = value + + def item(self): + return self._value + + assert evalmod._metric_number(0.5) == 0.5 + assert evalmod._metric_number(3) == 3 + assert evalmod._metric_number(_FakeScalar(0.25)) == 0.25 + assert evalmod._metric_number(_FakeScalar("not a number")) is None + assert evalmod._metric_number("alias-ish string") is None + + +def test_resolve_tasks_yaml_without_task_name_raises(tmp_path): + task_file = tmp_path / "bad.yaml" + task_file.write_text(yaml.safe_dump({"output_type": "generate_until"})) + with pytest.raises(ValueError, match = "missing a 'task:' name"): + evalmod.resolve_tasks(str(task_file), "question", "answer", tmp_path) + + +def test_resolve_tasks_missing_file_raises(tmp_path): + with pytest.raises(FileNotFoundError): + evalmod.resolve_tasks("./nope.yaml", "question", "answer", tmp_path) + + +def test_resolve_tasks_empty_raises(tmp_path): + with pytest.raises(ValueError, match = "No tasks provided"): + evalmod.resolve_tasks(" , ", "question", "answer", tmp_path) + + +def test_render_results_renders_metric_row(capsys): + evalmod._render_results( + { + "results": { + "gsm8k": { + "exact_match,strict-match": 0.5, + "exact_match_stderr,strict-match": 0.05, + "alias": "gsm8k", + } + } + } + ) + out = capsys.readouterr().out + assert "gsm8k" in out + assert "0.5000" in out + assert "0.0500" in out + + +def test_render_results_includes_group_aggregates(capsys): + evalmod._render_results( + { + "results": { + "mmlu_abstract_algebra": {"acc,none": 0.30, "alias": " - abstract_algebra"}, + }, + "groups": { + "mmlu": {"acc,none": 0.45, "alias": "mmlu"}, + }, + } + ) + out = capsys.readouterr().out + assert "0.3000" in out + # the group aggregate must be shown, not just per-subtask rows + assert "0.4500" in out + + +def test_eval_missing_lm_eval_shows_hint(monkeypatch): + monkeypatch.setitem(sys.modules, "lm_eval", None) + result = CliRunner().invoke(_eval_app(), ["fake/model", "--tasks", "gsm8k"]) + assert result.exit_code == 1, result.output + assert "pip install unsloth[eval]" in result.output + + +@pytest.fixture +def fake_eval_env(monkeypatch): + calls = {} + + class _FakeFLM: + @classmethod + def from_pretrained( + cls, + model_name = None, + **kw, + ): + calls["model_name"] = model_name + model = SimpleNamespace( + name = model_name, + get_input_embeddings = lambda: SimpleNamespace( + weight = SimpleNamespace(shape = (32000, 4096)) + ), + resize_token_embeddings = lambda n: calls.setdefault("events", []).append( + ("resize", n) + ), + ) + return model, SimpleNamespace(name = "tok") + + @classmethod + def for_inference(cls, model): + calls["for_inference"] = True + return model + + class _FakeHFLM: + def __init__( + self, + pretrained = None, + tokenizer = None, + batch_size = None, + max_length = None, + ): + calls["batch_size"] = batch_size + calls["hflm_tokenizer"] = tokenizer + calls["hflm_max_length"] = max_length + + class _FakeTaskManager: + def __init__(self, include_path = None): + calls["include_path"] = include_path + self.all_tasks = ["gsm8k", "mmlu", "hellaswag"] + self.all_groups = ["mmlu"] + self.all_tags = [] + # mirror lm-eval: yaml tasks/groups under include paths get + # registered under their task or group name + for directory in include_path or []: + for spec_file in sorted(Path(directory).glob("*.yaml")): + # like lm-eval, tolerate !function tags but not broken yaml + spec = yaml.load(spec_file.read_text(), Loader = evalmod._TaskYamlLoader) + if not isinstance(spec, dict): + continue + name = spec.get("task") + if isinstance(name, list): + if spec.get("group"): + self.all_groups.append(str(spec["group"])) + elif name: + self.all_tasks.append(str(name)) + + def _simple_evaluate( + model = None, + model_args = None, + tasks = None, + **kw, + ): + calls["model"] = model + calls["model_args"] = model_args + calls["tasks"] = tasks + calls["simple_evaluate_kwargs"] = kw + return { + "results": { + "gsm8k": { + "exact_match,strict-match": 0.42, + "exact_match_stderr,strict-match": 0.01, + "alias": "gsm8k", + } + }, + "configs": {}, + } + + unsloth_mod = types.ModuleType("unsloth") + unsloth_mod.FastLanguageModel = _FakeFLM + + # deterministic device detection, no real torch needed + torch_mod = types.ModuleType("torch") + torch_mod.cuda = SimpleNamespace(is_available = lambda: False, device_count = lambda: 0) + torch_mod.backends = SimpleNamespace(mps = SimpleNamespace(is_available = lambda: False)) + + # no adapter_config.json on the fake Hub, and no network access in tests + hub_mod = types.ModuleType("huggingface_hub") + + def _no_hub_download(*args, **kwargs): + raise RuntimeError("adapter_config.json not found") + + hub_mod.hf_hub_download = _no_hub_download + + def _no_repo_files(*args, **kwargs): + raise RuntimeError("repo not found") + + hub_mod.list_repo_files = _no_repo_files + + lm_eval_mod = types.ModuleType("lm_eval") + lm_eval_mod.simple_evaluate = _simple_evaluate + models_mod = types.ModuleType("lm_eval.models") + hf_mod = types.ModuleType("lm_eval.models.huggingface") + hf_mod.HFLM = _FakeHFLM + tasks_mod = types.ModuleType("lm_eval.tasks") + tasks_mod.TaskManager = _FakeTaskManager + + for name, mod in { + "unsloth": unsloth_mod, + "torch": torch_mod, + "huggingface_hub": hub_mod, + "lm_eval": lm_eval_mod, + "lm_eval.models": models_mod, + "lm_eval.models.huggingface": hf_mod, + "lm_eval.tasks": tasks_mod, + }.items(): + monkeypatch.setitem(sys.modules, name, mod) + + # deterministic regardless of whether bitsandbytes is installed locally + monkeypatch.setattr(evalmod, "_bitsandbytes_available", lambda: True) + + return calls + + +def test_eval_success_writes_results(fake_eval_env, tmp_path): + out_dir = tmp_path / "out" + result = CliRunner().invoke( + _eval_app(), + ["fake/model", "--tasks", "gsm8k", "--output-dir", str(out_dir)], + ) + + assert result.exit_code == 0, result.output + assert "Saved results to" in result.output + assert fake_eval_env["tasks"] == ["gsm8k"] + assert fake_eval_env["hflm_max_length"] == 2048 + assert fake_eval_env["simple_evaluate_kwargs"]["task_manager"] is not None + assert fake_eval_env["simple_evaluate_kwargs"]["log_samples"] is False + assert fake_eval_env["include_path"] is None + + saved = json.loads((out_dir / "results.json").read_text()) + assert saved["results"]["gsm8k"]["exact_match,strict-match"] == 0.42 + + +def test_eval_jsonl_task_builds_task_manager(fake_eval_env, tmp_path): + data = tmp_path / "qa.jsonl" + data.write_text('{"question": "q", "answer": "a"}\n') + + result = CliRunner().invoke( + _eval_app(), + ["fake/model", "--tasks", str(data), "--output-dir", str(tmp_path / "out")], + ) + + assert result.exit_code == 0, result.output + assert fake_eval_env["tasks"] == ["qa"] + assert fake_eval_env["include_path"] is not None + + +def test_eval_mlx_falls_back_to_hf(fake_eval_env, tmp_path): + sys.modules["unsloth"].DEVICE_TYPE = "mlx" + result = CliRunner().invoke( + _eval_app(), + ["fake/model", "--tasks", "gsm8k", "--output-dir", str(tmp_path / "out")], + ) + assert result.exit_code == 0, result.output + assert "falling back" in result.output + assert fake_eval_env["model"] == "hf" + assert fake_eval_env["model_args"] == {"pretrained": "fake/model", "max_length": 2048} + assert "model_name" not in fake_eval_env + + +def test_eval_hf_backend_skips_unsloth(fake_eval_env, tmp_path): + result = CliRunner().invoke( + _eval_app(), + [ + "fake/model", + "--tasks", + "gsm8k", + "--backend", + "hf", + "--device", + "cpu", + "--output-dir", + str(tmp_path / "out"), + ], + ) + assert result.exit_code == 0, result.output + assert fake_eval_env["model"] == "hf" + assert fake_eval_env["model_args"] == {"pretrained": "fake/model", "max_length": 2048} + assert fake_eval_env["simple_evaluate_kwargs"]["device"] == "cpu" + assert "model_name" not in fake_eval_env + + +def test_eval_rejects_nonpositive_batch_size(fake_eval_env, tmp_path): + for bad in ["0", "-1", "abc"]: + result = CliRunner().invoke( + _eval_app(), + [ + "fake/model", + "--tasks", + "gsm8k", + "--backend", + "hf", + "--device", + "cpu", + "--batch-size", + bad, + "--output-dir", + str(tmp_path / "out"), + ], + ) + assert result.exit_code == 2, (bad, result.output) + assert "positive integer or 'auto'" in result.output + + +def test_eval_hf_forwards_max_seq_length(fake_eval_env, tmp_path): + result = CliRunner().invoke( + _eval_app(), + [ + "fake/model", + "--tasks", + "gsm8k", + "--backend", + "hf", + "--device", + "cpu", + "--max-seq-length", + "1024", + "--output-dir", + str(tmp_path / "out"), + ], + ) + assert result.exit_code == 0, result.output + assert fake_eval_env["model_args"] == {"pretrained": "fake/model", "max_length": 1024} + + +def test_eval_unsloth_forwards_max_seq_length_to_hflm(fake_eval_env, tmp_path): + result = CliRunner().invoke( + _eval_app(), + [ + "fake/model", + "--tasks", + "gsm8k", + "--max-seq-length", + "512", + "--output-dir", + str(tmp_path / "out"), + ], + ) + assert result.exit_code == 0, result.output + assert fake_eval_env["hflm_max_length"] == 512 + + +def test_eval_hf_local_adapter_uses_adapter_tokenizer(fake_eval_env, tmp_path): + adapter = tmp_path / "adapter" + adapter.mkdir() + (adapter / "adapter_config.json").write_text( + json.dumps({"base_model_name_or_path": "unsloth/Llama-3.2-1B"}) + ) + (adapter / "tokenizer_config.json").write_text("{}") + + result = CliRunner().invoke( + _eval_app(), + [ + str(adapter), + "--tasks", + "gsm8k", + "--backend", + "hf", + "--device", + "cpu", + "--output-dir", + str(tmp_path / "out"), + ], + ) + assert result.exit_code == 0, result.output + assert fake_eval_env["model_args"] == { + "pretrained": "unsloth/Llama-3.2-1B", + "peft": str(adapter), + "tokenizer": str(adapter), + "max_length": 2048, + } + + +def _make_local_adapter(tmp_path): + adapter = tmp_path / "adapter" + adapter.mkdir() + (adapter / "adapter_config.json").write_text( + json.dumps({"base_model_name_or_path": "unsloth/Llama-3.2-1B"}) + ) + (adapter / "tokenizer_config.json").write_text("{}") + return adapter + + +def _install_adapter_stubs(monkeypatch, fake_eval_env, tokenizer_len): + peft_mod = types.ModuleType("peft") + + class _FakePeftModel: + @staticmethod + def from_pretrained(model, adapter_path): + fake_eval_env["peft_adapter"] = adapter_path + fake_eval_env.setdefault("events", []).append(("peft", adapter_path)) + return model + + peft_mod.PeftModel = _FakePeftModel + monkeypatch.setitem(sys.modules, "peft", peft_mod) + + class _FakeTokenizer: + name = "adapter-tok" + + def __len__(self): + return tokenizer_len + + transformers_mod = types.ModuleType("transformers") + + class _FakeAutoTokenizer: + @staticmethod + def from_pretrained(path, **kwargs): + fake_eval_env["tokenizer_from"] = path + return _FakeTokenizer() + + transformers_mod.AutoTokenizer = _FakeAutoTokenizer + monkeypatch.setitem(sys.modules, "transformers", transformers_mod) + + +def test_eval_unsloth_adapter_prefers_adapter_tokenizer(fake_eval_env, tmp_path, monkeypatch): + adapter = _make_local_adapter(tmp_path) + # same vocab size as the fake base model: no resize expected + _install_adapter_stubs(monkeypatch, fake_eval_env, tokenizer_len = 32000) + + result = CliRunner().invoke( + _eval_app(), + [str(adapter), "--tasks", "gsm8k", "--output-dir", str(tmp_path / "out")], + ) + assert result.exit_code == 0, result.output + assert fake_eval_env["model_name"] == "unsloth/Llama-3.2-1B" + assert fake_eval_env["peft_adapter"] == str(adapter) + assert fake_eval_env["tokenizer_from"] == str(adapter) + assert fake_eval_env["hflm_tokenizer"].name == "adapter-tok" + assert fake_eval_env["events"] == [("peft", str(adapter))] + + +def test_eval_unsloth_adapter_resizes_embeddings_before_peft(fake_eval_env, tmp_path, monkeypatch): + adapter = _make_local_adapter(tmp_path) + # adapter tokenizer grew past the fake base vocab (32000) + _install_adapter_stubs(monkeypatch, fake_eval_env, tokenizer_len = 32005) + + result = CliRunner().invoke( + _eval_app(), + [str(adapter), "--tasks", "gsm8k", "--output-dir", str(tmp_path / "out")], + ) + assert result.exit_code == 0, result.output + # the resize must land before the adapter weights are applied + assert fake_eval_env["events"] == [("resize", 32005), ("peft", str(adapter))] + + +def test_eval_hf_honors_base_model_for_remote_adapter(fake_eval_env, tmp_path): + result = CliRunner().invoke( + _eval_app(), + [ + "someuser/my-lora", + "--tasks", + "gsm8k", + "--backend", + "hf", + "--device", + "cpu", + "--base-model", + "meta-llama/Llama-2-7b", + "--output-dir", + str(tmp_path / "out"), + ], + ) + assert result.exit_code == 0, result.output + assert fake_eval_env["model_args"] == { + "pretrained": "meta-llama/Llama-2-7b", + "peft": "someuser/my-lora", + "max_length": 2048, + } + + +def test_eval_cuda_index_keeps_auto_batch_size(fake_eval_env, tmp_path): + sys.modules["torch"].cuda = SimpleNamespace(is_available = lambda: True, device_count = lambda: 1) + result = CliRunner().invoke( + _eval_app(), + [ + "fake/model", + "--tasks", + "gsm8k", + "--backend", + "hf", + "--device", + "cuda:0", + "--output-dir", + str(tmp_path / "out"), + ], + ) + assert result.exit_code == 0, result.output + # 'auto' survives an explicit CUDA index (not downgraded to 1) + assert fake_eval_env["simple_evaluate_kwargs"]["batch_size"] == "auto" + assert fake_eval_env["model_args"]["load_in_4bit"] is True + + +def test_eval_unknown_task_errors(fake_eval_env, tmp_path): + result = CliRunner().invoke( + _eval_app(), + ["fake/model", "--tasks", "notarealtask", "--output-dir", str(tmp_path / "out")], + ) + assert result.exit_code == 2, result.output + assert "unknown task" in result.output + + +def test_eval_dataset_shadowing_builtin_is_renamed(fake_eval_env, tmp_path): + data = tmp_path / "gsm8k.jsonl" + data.write_text('{"question": "q", "answer": "a"}\n') + + result = CliRunner().invoke( + _eval_app(), + ["fake/model", "--tasks", str(data), "--output-dir", str(tmp_path / "out")], + ) + + assert result.exit_code == 0, result.output + # the built-in gsm8k benchmark must not shadow the user's dataset + assert fake_eval_env["tasks"] == ["gsm8k_2"] + assert "as 'gsm8k_2'" in result.output + + +def test_eval_custom_yaml_shadowing_builtin_errors(fake_eval_env, tmp_path): + task_file = tmp_path / "clash.yaml" + task_file.write_text(yaml.safe_dump({"task": "gsm8k", "output_type": "generate_until"})) + + result = CliRunner().invoke( + _eval_app(), + ["fake/model", "--tasks", str(task_file), "--output-dir", str(tmp_path / "out")], + ) + assert result.exit_code == 2, result.output + assert "redefines 'gsm8k'" in result.output + + +def test_eval_fewshot_with_raw_key_dataset_errors(fake_eval_env, tmp_path): + data = tmp_path / "qa.jsonl" + data.write_text('{"expected answer": "2", "question": "1+1?"}\n') + + result = CliRunner().invoke( + _eval_app(), + [ + "fake/model", + "--tasks", + str(data), + "--target-key", + "expected answer", + "--num-fewshot", + "2", + "--output-dir", + str(tmp_path / "out"), + ], + ) + assert result.exit_code == 2, result.output + assert "plain-identifier column names" in result.output + + +def test_eval_custom_yaml_survives_broken_sibling(fake_eval_env, tmp_path): + task_file = tmp_path / "good.yaml" + task_file.write_text(yaml.safe_dump({"task": "good_task", "output_type": "generate_until"})) + # the fake TaskManager (like lm-eval 0.4.4) chokes on unparseable yaml + # in an include dir; the broken sibling must never reach it + (tmp_path / "broken.yaml").write_text("task: [unclosed") + + result = CliRunner().invoke( + _eval_app(), + ["fake/model", "--tasks", str(task_file), "--output-dir", str(tmp_path / "out")], + ) + assert result.exit_code == 0, result.output + assert fake_eval_env["tasks"] == ["good_task"] + + +def test_eval_group_yaml_runs_under_group_name(fake_eval_env, tmp_path): + task_file = tmp_path / "suite.yaml" + task_file.write_text(yaml.safe_dump({"group": "my_suite", "task": ["task_a", "task_b"]})) + + result = CliRunner().invoke( + _eval_app(), + ["fake/model", "--tasks", str(task_file), "--output-dir", str(tmp_path / "out")], + ) + assert result.exit_code == 0, result.output + assert fake_eval_env["tasks"] == ["my_suite"] + + +def test_eval_unsloth_rejects_multi_process_launch(fake_eval_env, tmp_path, monkeypatch): + monkeypatch.setenv("WORLD_SIZE", "2") + result = CliRunner().invoke( + _eval_app(), + ["fake/model", "--tasks", "gsm8k", "--output-dir", str(tmp_path / "out")], + ) + assert result.exit_code == 2, result.output + assert "multi-process launches" in result.output + + +def test_eval_hf_allows_multi_process_launch(fake_eval_env, tmp_path, monkeypatch): + monkeypatch.setenv("WORLD_SIZE", "2") + result = CliRunner().invoke( + _eval_app(), + [ + "fake/model", + "--tasks", + "gsm8k", + "--backend", + "hf", + "--device", + "cpu", + "--output-dir", + str(tmp_path / "out"), + ], + ) + assert result.exit_code == 0, result.output + + +def test_eval_rejects_nonpositive_limit_and_max_seq_length(fake_eval_env, tmp_path): + for flag, bad, message in ( + ("--limit", "0", "--limit must be a positive integer"), + ("--limit", "-5", "--limit must be a positive integer"), + ("--max-seq-length", "0", "--max-seq-length must be a positive integer"), + ("--max-seq-length", "-1", "--max-seq-length must be a positive integer"), + ): + result = CliRunner().invoke( + _eval_app(), + [ + "fake/model", + "--tasks", + "gsm8k", + flag, + bad, + "--output-dir", + str(tmp_path / "out"), + ], + ) + assert result.exit_code == 2, (flag, bad, result.output) + assert message in result.output, (flag, bad, result.output) + + +def test_eval_rejects_negative_num_fewshot(fake_eval_env, tmp_path): + result = CliRunner().invoke( + _eval_app(), + [ + "fake/model", + "--tasks", + "gsm8k", + "--num-fewshot", + "-1", + "--output-dir", + str(tmp_path / "out"), + ], + ) + assert result.exit_code == 2, result.output + assert "--num-fewshot must be >= 0" in result.output + + +def test_eval_rejects_unknown_backend(fake_eval_env, tmp_path): + result = CliRunner().invoke( + _eval_app(), + [ + "fake/model", + "--tasks", + "gsm8k", + "--backend", + "hff", + "--output-dir", + str(tmp_path / "out"), + ], + ) + assert result.exit_code == 2, result.output + assert "--backend must be 'unsloth' or 'hf'" in result.output + + +def test_eval_hf_cuda_without_bnb_loads_full_precision(fake_eval_env, tmp_path, monkeypatch): + sys.modules["torch"].cuda = SimpleNamespace(is_available = lambda: True, device_count = lambda: 1) + monkeypatch.setattr(evalmod, "_bitsandbytes_available", lambda: False) + result = CliRunner().invoke( + _eval_app(), + [ + "fake/model", + "--tasks", + "gsm8k", + "--backend", + "hf", + "--device", + "cuda:0", + "--output-dir", + str(tmp_path / "out"), + ], + ) + assert result.exit_code == 0, result.output + assert "load_in_4bit" not in fake_eval_env["model_args"] + assert "bitsandbytes is not installed" in result.output + + +def test_eval_hf_hub_adapter_uses_hub_tokenizer(fake_eval_env, tmp_path, monkeypatch): + remote_config = tmp_path / "adapter_config.json" + remote_config.write_text(json.dumps({"base_model_name_or_path": "unsloth/Llama-3.2-1B"})) + + hub_mod = types.ModuleType("huggingface_hub") + hub_mod.hf_hub_download = lambda repo_id, filename, **kwargs: str(remote_config) + hub_mod.list_repo_files = lambda repo_id: ["adapter_config.json", "tokenizer.json"] + monkeypatch.setitem(sys.modules, "huggingface_hub", hub_mod) + + result = CliRunner().invoke( + _eval_app(), + [ + "someuser/my-lora", + "--tasks", + "gsm8k", + "--backend", + "hf", + "--device", + "cpu", + "--output-dir", + str(tmp_path / "out"), + ], + ) + assert result.exit_code == 0, result.output + assert fake_eval_env["model_args"] == { + "pretrained": "unsloth/Llama-3.2-1B", + "peft": "someuser/my-lora", + "tokenizer": "someuser/my-lora", + "max_length": 2048, + } + + +def test_eval_hf_rejects_cuda_when_unavailable(fake_eval_env, tmp_path): + result = CliRunner().invoke( + _eval_app(), + [ + "fake/model", + "--tasks", + "gsm8k", + "--backend", + "hf", + "--device", + "cuda", + "--output-dir", + str(tmp_path / "out"), + ], + ) + assert result.exit_code == 2, result.output + assert "CUDA is not available" in result.output + + +def test_eval_hf_rejects_out_of_range_cuda_index(fake_eval_env, tmp_path): + sys.modules["torch"].cuda = SimpleNamespace(is_available = lambda: True, device_count = lambda: 1) + result = CliRunner().invoke( + _eval_app(), + [ + "fake/model", + "--tasks", + "gsm8k", + "--backend", + "hf", + "--device", + "cuda:1", + "--output-dir", + str(tmp_path / "out"), + ], + ) + assert result.exit_code == 2, result.output + assert "only 1 CUDA device(s)" in result.output + + +def test_eval_worker_rank_exits_cleanly_on_none_results(fake_eval_env, tmp_path, monkeypatch): + monkeypatch.setenv("RANK", "1") + sys.modules["lm_eval"].simple_evaluate = lambda **kwargs: None + + result = CliRunner().invoke( + _eval_app(), + [ + "fake/model", + "--tasks", + "gsm8k", + "--backend", + "hf", + "--device", + "cpu", + "--output-dir", + str(tmp_path / "out"), + ], + ) + assert result.exit_code == 0, result.output + assert "Saved results" not in result.output + assert not (tmp_path / "out").exists() + + +def test_eval_none_results_errors_on_single_process(fake_eval_env, tmp_path, monkeypatch): + monkeypatch.delenv("RANK", raising = False) + monkeypatch.delenv("LOCAL_RANK", raising = False) + sys.modules["lm_eval"].simple_evaluate = lambda **kwargs: None + + result = CliRunner().invoke( + _eval_app(), + [ + "fake/model", + "--tasks", + "gsm8k", + "--backend", + "hf", + "--device", + "cpu", + "--output-dir", + str(tmp_path / "out"), + ], + ) + assert result.exit_code == 1, result.output + assert "no results" in result.output + + +def test_eval_hf_token_sets_env(fake_eval_env, tmp_path, monkeypatch): + monkeypatch.setenv("HF_TOKEN", "placeholder") + result = CliRunner().invoke( + _eval_app(), + [ + "fake/model", + "--tasks", + "gsm8k", + "--backend", + "hf", + "--device", + "cpu", + "--hf-token", + "hf_secret", + "--output-dir", + str(tmp_path / "out"), + ], + ) + assert result.exit_code == 0, result.output + assert os.environ.get("HF_TOKEN") == "hf_secret" + + +def test_resolve_tasks_dataset_before_group_still_avoids_child_names(tmp_path): + # argument order must not decide the generated task's name + (tmp_path / "suite.yaml").write_text( + yaml.safe_dump({"group": "suite", "task": ["qa", {"task": "qa_inline"}]}) + ) + (tmp_path / "qa.jsonl").write_text('{"question": "q", "answer": "a"}\n') + tmp_dir = tmp_path / "gen" + + names, _ = evalmod.resolve_tasks( + f"{tmp_path / 'qa.jsonl'},{tmp_path / 'suite.yaml'}", "question", "answer", tmp_dir + ) + + assert names == ["qa_2", "suite"] + assert (tmp_dir / "generated" / "qa_2.yaml").exists() + + +def test_resolve_tasks_rejects_builtin_child_shadowed_by_sibling(tmp_path): + (tmp_path / "suite.yaml").write_text(yaml.safe_dump({"group": "suite", "task": ["gsm8k"]})) + (tmp_path / "gsm8k.yaml").write_text(yaml.safe_dump({"task": "gsm8k", "dataset_path": "json"})) + with pytest.raises(ValueError, match = "depends on the lm-eval version"): + evalmod.resolve_tasks( + str(tmp_path / "suite.yaml"), + "question", + "answer", + tmp_path / "gen", + reserved = frozenset({"gsm8k"}), + ) + + +def test_resolve_tasks_allows_group_of_builtins_without_siblings(tmp_path): + # a suite that aggregates registered tasks is legitimate lm-eval usage + (tmp_path / "suite.yaml").write_text( + yaml.safe_dump({"group": "suite", "task": ["gsm8k", "mmlu"]}) + ) + + names, _ = evalmod.resolve_tasks( + str(tmp_path / "suite.yaml"), + "question", + "answer", + tmp_path / "gen", + reserved = frozenset({"gsm8k", "mmlu"}), + ) + + assert names == ["suite"] + + +def test_resolve_tasks_rejects_include_order_dependent_name(tmp_path): + # lm-eval versions disagree on include precedence, so a name that changes + # with the merge order must be rejected + (tmp_path / "a.yaml").write_text(yaml.safe_dump({"task": "name_a"})) + (tmp_path / "b.yaml").write_text(yaml.safe_dump({"task": "name_b"})) + child = tmp_path / "child.yaml" + child.write_text(yaml.safe_dump({"include": ["a.yaml", "b.yaml"], "dataset_path": "json"})) + + with pytest.raises(ValueError, match = "include order"): + evalmod.resolve_tasks(str(child), "question", "answer", tmp_path / "gen") + + +def test_resolve_tasks_accepts_local_name_over_include_conflict(tmp_path): + # a top-level task: settles the name on every lm-eval version + (tmp_path / "a.yaml").write_text(yaml.safe_dump({"task": "name_a"})) + (tmp_path / "b.yaml").write_text(yaml.safe_dump({"task": "name_b"})) + child = tmp_path / "child.yaml" + child.write_text( + yaml.safe_dump({"include": ["a.yaml", "b.yaml"], "task": "mine", "dataset_path": "json"}) + ) + + names, _ = evalmod.resolve_tasks(str(child), "question", "answer", tmp_path / "gen") + + assert names == ["mine"] + + +def test_load_task_spec_resolves_includes_against_parent_dir(tmp_path, monkeypatch): + # lm-eval resolves relative includes against the including file, never cwd + task_dir = tmp_path / "tasks" + decoy_dir = tmp_path / "decoy" + task_dir.mkdir() + decoy_dir.mkdir() + (task_dir / "base.yaml").write_text(yaml.safe_dump({"task": "right"})) + (decoy_dir / "base.yaml").write_text(yaml.safe_dump({"task": "wrong"})) + child = task_dir / "child.yaml" + child.write_text(yaml.safe_dump({"include": "base.yaml", "dataset_path": "json"})) + monkeypatch.chdir(decoy_dir) + + spec = evalmod._load_task_spec(child) + + assert spec["task"] == "right" + + +def test_json_default_preserves_numeric_scalars(): + class _FakeNumpyScalar: + def tolist(self): + return 3 + + dumped = json.dumps({"n": _FakeNumpyScalar(), "s": {1, 2}}, default = evalmod._json_default) + + parsed = json.loads(dumped) + assert parsed["n"] == 3 + assert isinstance(parsed["s"], str) + + +def test_resolve_tasks_rejects_builtin_child_shadowed_in_subdirectory(tmp_path): + # lm-eval indexes include paths recursively, so a nested sibling shadows too + (tmp_path / "suite.yaml").write_text(yaml.safe_dump({"group": "suite", "task": ["gsm8k"]})) + nested = tmp_path / "sub" + nested.mkdir() + (nested / "gsm8k.yaml").write_text(yaml.safe_dump({"task": "gsm8k", "dataset_path": "json"})) + with pytest.raises(ValueError, match = "depends on the lm-eval version"): + evalmod.resolve_tasks( + str(tmp_path / "suite.yaml"), + "question", + "answer", + tmp_path / "gen", + reserved = frozenset({"gsm8k"}), + ) + + +def test_hf_device_error_gates_xpu_hpu_on_lm_eval_version(monkeypatch): + # HFLM only enumerated xpu/hpu from 0.4.10; older versions silently fall back + _fake_torch(monkeypatch, xpu_available = True, xpu_count = 1) + monkeypatch.setattr(evalmod, "_lm_eval_version", lambda: (0, 4, 4)) + assert "needs lm-eval >= 0.4.10" in evalmod._hf_device_error("xpu:0") + assert "needs lm-eval >= 0.4.10" in evalmod._hf_device_error("hpu:0") + # npu has been enumerated since 0.4.4 + assert "NPU is not available" in evalmod._hf_device_error("npu:0") + monkeypatch.setattr(evalmod, "_lm_eval_version", lambda: (0, 4, 10)) + assert evalmod._hf_device_error("xpu:0") is None + + +def test_resolve_tasks_reserves_tag_aliases_for_datasets(tmp_path): + # a tag: alias registers under that name, so a dataset must not take it + (tmp_path / "custom.yaml").write_text( + yaml.safe_dump({"task": "foo", "tag": "qa", "dataset_path": "json"}) + ) + (tmp_path / "qa.jsonl").write_text('{"question": "q", "answer": "a"}\n') + tmp_dir = tmp_path / "gen" + + names, _ = evalmod.resolve_tasks( + f"{tmp_path / 'qa.jsonl'},{tmp_path / 'custom.yaml'}", "question", "answer", tmp_dir + ) + + assert names == ["qa_2", "foo"] + + +def test_resolve_tasks_reserves_string_group_alias_for_datasets(tmp_path): + # legacy string group: on a single task acts as a tag alias + (tmp_path / "custom.yaml").write_text( + yaml.safe_dump({"task": "foo", "group": "myalias", "dataset_path": "json"}) + ) + (tmp_path / "myalias.jsonl").write_text('{"question": "q", "answer": "a"}\n') + tmp_dir = tmp_path / "gen" + + names, _ = evalmod.resolve_tasks( + f"{tmp_path / 'custom.yaml'},{tmp_path / 'myalias.jsonl'}", "question", "answer", tmp_dir + ) + + assert names == ["foo", "myalias_2"]