diff --git a/pyproject.toml b/pyproject.toml
index 13c421d8ea..805fda5971 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -106,6 +106,10 @@ windows = [
 base = [
     "unsloth[huggingface]",
 ]
+eval = [
+    "unsloth[huggingface]",
+    "lm_eval>=0.4.4",
+]
 cu118only = [
     "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.22.post7%2Bcu118-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9' and ('linux' in sys_platform)",
     "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.22.post7%2Bcu118-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10' and ('linux' in sys_platform)",
diff --git a/studio/install_python_stack.py b/studio/install_python_stack.py
index 9060b57542..1e76fb8d19 100644
--- a/studio/install_python_stack.py
+++ b/studio/install_python_stack.py
@@ -1922,6 +1922,56 @@ def pip_install(
             temp_req.unlink(missing_ok = True)
 
 
+def _eval_extra_requirements(local_repo: str) -> list:
+    """Return the [eval] extra's third-party requirements from the checkout.
+
+    Self-referential ``unsloth`` entries (with extras, specifiers or a
+    ``unsloth@url`` direct reference) are dropped: the checkout is already
+    overlaid with --no-deps and must not be re-resolved from PyPI. Blank or
+    non-string entries are skipped; any read/parse failure (including a
+    missing ``tomllib`` module) yields the pinned fallback.
+    """
+    try:
+        import tomllib
+
+        with open(Path(local_repo) / "pyproject.toml", "rb") as fh:
+            requirements = tomllib.load(fh)["project"]["optional-dependencies"]["eval"]
+        filtered = []
+        for requirement in requirements if isinstance(requirements, list) else ():
+            text = requirement.strip() if isinstance(requirement, str) else ""
+            # the distribution name ends at the first PEP 508 delimiter
+            name = re.split(r"[\[(<>=!~;@\s]", text, maxsplit = 1)[0].lower()
+            if name and name != "unsloth":
+                filtered.append(text)
+        if filtered:
+            return filtered
+    except Exception:  # deliberate best effort: fall through to the pin
+        pass
+    return ["lm_eval>=0.4.4"]  # fallback mirrors the pin in pyproject.toml
+
+
+def _install_eval_extra(*, package_name: str, local_repo: str) -> None:
+    """Install what pyproject.toml's [eval] extra adds (lm-eval-harness for `unsloth eval`).
+
+    A no-op when NO_TORCH is set or ``package_name`` is not "unsloth".
+    """
+    if package_name != "unsloth" or NO_TORCH:
+        return
+    _progress("eval extra")
+    # A local checkout is overlaid with --no-deps so the torch/CUDA stack is
+    # not re-resolved; a full `-e repo[eval]` install would resolve the base
+    # deps again, so only the extra's own packages are installed for it.
+    if local_repo:
+        targets = _eval_extra_requirements(local_repo)
+    else:
+        targets = ["unsloth[eval]"]
+    pip_install(
+        "Installing unsloth[eval] extra",
+        "--no-cache-dir",
+        *targets,
+    )
+
+
 def download_file(url: str, dest: Path) -> None:
     """Download a file using urllib (no curl dependency)."""
     urllib.request.urlretrieve(url, dest)
@@ -1977,6 +2027,8 @@ def install_python_stack() -> int:
         base_total += 1  # ROCm torch check (line 1526) -- all non-macOS platforms
         if not IS_WINDOWS:
             base_total += 2  # flash-attn (line 1620) + ROCm torch final (line 1705) -- Linux only
+    if not NO_TORCH and package_name == "unsloth":
+        base_total += 1  # pyproject.toml [eval] extra
     _TOTAL = (base_total - 1) if skip_base else base_total
 
     # 1. Try uv for faster installs (before pip upgrade -- uv venvs don't
@@ -2197,6 +2249,8 @@ def _win_amd_smi_has_gpu(stdout: str) -> bool:
                 "Manual install may be required. See: https://docs.unsloth.ai/get-started/install-and-update/amd",
             )
 
+    _install_eval_extra(package_name = package_name, local_repo = local_repo)
+
     # 3. Extra dependencies
     _progress("unsloth extras")
     pip_install(
diff --git a/unsloth_cli/__init__.py b/unsloth_cli/__init__.py
index 440b6276cd..c6e866f7c0 100644
--- a/unsloth_cli/__init__.py
+++ b/unsloth_cli/__init__.py
@@ -13,6 +13,7 @@
 from unsloth_cli.commands.chat import chat
 from unsloth_cli.commands.connect import connect_app
 from unsloth_cli.commands.export import export, list_checkpoints
+from unsloth_cli.commands.eval import evaluate as eval_command
 from unsloth_cli.commands.studio import (
     run as studio_run,
     studio_app,
@@ -76,6 +77,7 @@ def main(
 app.command()(inference)
 app.command()(chat)
 app.command()(export)
+app.command("eval")(eval_command)
 app.command("list-checkpoints")(list_checkpoints)
 app.add_typer(studio_app, name = "studio", help = "Unsloth Studio commands.")
 app.add_typer(
diff --git a/unsloth_cli/commands/eval.py b/unsloth_cli/commands/eval.py
new file mode 100644
index 0000000000..9c91c00fd7
--- /dev/null
+++ b/unsloth_cli/commands/eval.py
@@ -0,0 +1,876 @@
+# SPDX-License-Identifier: AGPL-3.0-only
+# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved. See /studio/LICENSE.AGPL-3.0
+
+import contextlib
+import json
+import os
+import re
+import shutil
+import sys
+import tempfile
+from pathlib import Path
+from typing import List, Optional, Tuple
+
+import typer
+import yaml
+
+
+@contextlib.contextmanager
+def _spinner(console, text):
+    """Show a transient cyan "dots" spinner labelled ``text`` while the body runs."""
+    from rich.live import Live
+    from rich.spinner import Spinner
+    # stdout/stderr redirection is switched off for the live display
+    indicator = Spinner("dots", text = text, style = "cyan")
+    live_options = dict(
+        console = console, transient = True, refresh_per_second = 12,
+        redirect_stdout = False, redirect_stderr = False,
+    )
+    with Live(indicator, **live_options):
+        yield
+
+
+@contextlib.contextmanager
+def _silence():
+    """Point fds 1 and 2 at os.devnull; yield a Console on a dup of the old fd 1."""
+    from rich.console import Console
+
+    for stream in (sys.stdout, sys.stderr):
+        stream.flush()
+    real = os.fdopen(os.dup(1), "w", closefd = True)
+    saved_out, saved_err = os.dup(1), os.dup(2)
+    devnull_fd = os.open(os.devnull, os.O_WRONLY)
+    try:
+        for target_fd in (1, 2):
+            os.dup2(devnull_fd, target_fd)
+        yield Console(file = real)
+    finally:
+        for stream in (sys.stdout, sys.stderr):
+            stream.flush()
+        for saved_fd, target_fd in ((saved_out, 1), (saved_err, 2)):
+            os.dup2(saved_fd, target_fd)  # put the original descriptors back
+        for fd in (saved_out, saved_err, devnull_fd):
+            os.close(fd)
+        real.close()
+
+
+def _read_adapter_base(config: Path) -> Optional[str]:
+    """Return ``base_model_name_or_path`` from an adapter config, else None."""
+    try:
+        payload = json.loads(config.read_text(encoding = "utf-8"))
+    except (OSError, ValueError):
+        # ValueError spans JSONDecodeError and UnicodeDecodeError alike
+        return None
+    is_mapping = isinstance(payload, dict)
+    return payload.get("base_model_name_or_path") if is_mapping else None
+
+
+_TOKENIZER_FILES = ("tokenizer_config.json", "tokenizer.json", "tokenizer.model")
+_HUB_REPO_RE = re.compile(r"[\w.\-]+/[\w.\-]+")  # "<owner>/<name>" Hub repo ids
+
+
+def _has_tokenizer_files(model: str) -> bool:
+    """True if the local directory or Hub repo ``model`` has a tokenizer file."""
+    local = Path(model)
+    if local.is_dir():
+        return any((local / name).exists() for name in _TOKENIZER_FILES)
+    if not local.exists() and _HUB_REPO_RE.fullmatch(model):
+        try:
+            from huggingface_hub import list_repo_files
+            return not set(list_repo_files(model)).isdisjoint(_TOKENIZER_FILES)
+        except Exception:
+            pass  # a repo that cannot be listed counts as "no tokenizer"
+    return False
+
+
+def _bitsandbytes_available() -> bool:
+    import importlib.util as _importlib_util  # locate only; bitsandbytes is not imported
+    return _importlib_util.find_spec("bitsandbytes") is not None
+
+
+def _lm_eval_available() -> bool:
+    """Probe for lm-eval without importing it: on lm-eval 0.4.4 `import lm_eval`
+    pulls in transformers, which must stay unimported until unsloth has loaded."""
+    from importlib.util import find_spec
+    try:
+        if "lm_eval" in sys.modules:
+            return sys.modules["lm_eval"] is not None
+        return find_spec("lm_eval") is not None
+    except (ImportError, ValueError):
+        return False
+
+
+def _lm_eval_version() -> tuple:
+    """Installed lm-eval version as up to three ints, or (999,) when unknown."""
+    from importlib.metadata import PackageNotFoundError, version
+    for dist in ("lm_eval", "lm-eval"):
+        try:
+            digits = re.findall(r"\d+", version(dist))[:3]
+        except PackageNotFoundError:
+            continue
+        except Exception:
+            break
+        return tuple(int(part) for part in digits) or (999,)  # no digits: unknown
+    return (999,)  # unknown version: don't block devices the runtime may support
+
+
+def _hf_device_error(device: str) -> Optional[str]:
+    """Return an error message when HFLM could not honour ``device``, else None.
+
+    lm-eval's HFLM silently falls back to its default device for strings it
+    does not recognise (cuda0, cuda:01, npu:01) or cannot use, so reject them.
+    """
+    if device.startswith("cuda"):
+        match = re.fullmatch(r"cuda(?::(0|[1-9]\d*))?", device)
+        if not match:
+            return f"invalid --device '{device}' — use 'cuda' or 'cuda:<index>'."
+        import torch
+
+        if not torch.cuda.is_available():
+            return f"--device {device} requested but CUDA is not available."
+        if match.group(1) is not None:
+            idx = int(match.group(1))
+            count = torch.cuda.device_count()
+            if idx >= count:
+                return f"--device {device} requested but only {count} CUDA device(s) are available."
+    elif device.startswith("mps"):
+        if not re.fullmatch(r"mps(?::0)?", device):
+            return f"invalid --device '{device}' — use 'mps'."
+        import torch
+
+        mps = getattr(torch.backends, "mps", None)
+        if not (mps and mps.is_available()):
+            return f"--device {device} requested but MPS is not available."
+    elif device != "cpu":
+        # canonical index only, as for cuda: HFLM would not recognise 'npu:01'
+        match = re.fullmatch(r"(npu|xpu|hpu):(0|[1-9]\d*)", device)
+        if not match:
+            # typos like 'cpuu' or 'cude' end up here as well
+            return (
+                f"invalid --device '{device}' — use 'cpu', 'cuda[:<index>]', 'mps', "
+                "or '<npu|xpu|hpu>:<index>'."
+            )
+        # validate against the installed torch build, like cuda above
+        kind, index = match.group(1), int(match.group(2))
+        if kind in ("xpu", "hpu") and _lm_eval_version() < (0, 4, 10):
+            # before 0.4.10 HFLM only enumerated cuda/cpu/mps/npu devices
+            return (
+                f"--device {device} needs lm-eval >= 0.4.10 — upgrade with "
+                "`pip install -U lm_eval`."
+            )
+        import torch
+
+        backend_mod = getattr(torch, kind, None)
+        try:
+            available = bool(backend_mod is not None and backend_mod.is_available())
+        except Exception:
+            available = False
+        if not available:
+            return (
+                f"--device {device} requested but {kind.upper()} is not available "
+                "in this torch build."
+            )
+        try:
+            count = int(backend_mod.device_count())
+        except Exception:
+            count = 0
+        if index >= count:
+            return (
+                f"--device {device} requested but only {count} {kind.upper()} "
+                "device(s) are available."
+            )
+    return None
+
+
+def _registry_names(manager) -> set:
+    """Union of the manager's ``all_tasks``, ``all_groups`` and ``all_tags``."""
+    names: set = set()
+    for attribute in ("all_tasks", "all_groups", "all_tags"):
+        names.update(getattr(manager, attribute, []) or [])
+    return names
+
+
+def resolve_base_model(model: str) -> Optional[str]:
+    """Base model named by ``model``'s adapter_config.json, or None if there is none."""
+    location = Path(model)
+    if location.is_dir():
+        local_config = location / "adapter_config.json"
+        return _read_adapter_base(local_config) if local_config.exists() else None
+    # adapter-only Hub repos (adapter_config.json, no config.json) can't be lm-eval's `pretrained`
+    if location.exists() or _HUB_REPO_RE.fullmatch(model) is None:
+        return None
+    try:
+        from huggingface_hub import hf_hub_download
+        fetched = hf_hub_download(model, "adapter_config.json")
+    except Exception:
+        return None
+    return _read_adapter_base(Path(fetched))
+
+
+class _TaskYamlLoader(yaml.SafeLoader):
+    """SafeLoader that also accepts lm-eval's custom local tags (!function utils.fn)."""
+
+
+# Every "!"-prefixed tag is constructed as its node's raw value, which is all
+# name extraction needs; TaskManager loads the original file with its own loader.
+_TaskYamlLoader.add_multi_constructor(
+    "!", lambda _loader, _tag_suffix, node: getattr(node, "value", None)
+)
+
+
+def _load_task_spec(
+    path: Path,
+    depth: int = 0,
+    first_include_wins: bool = False,
+) -> dict:
+    """Parse a task YAML and fold in its ``include:`` bases (child keys win).
+
+    The task/group name may live in an included base config, which lm-eval
+    resolves during indexing. Current lm-eval merges include lists in listed
+    order (later wins); some older releases merged in reverse, so callers
+    compare both orders via ``first_include_wins``. Recursion stops at depth 8
+    in case of include cycles; unreadable bases and malformed include entries
+    (non-strings, or an ``include:`` that is neither str nor list) are skipped.
+    """
+    spec = yaml.load(path.read_text(encoding = "utf-8"), Loader = _TaskYamlLoader) or {}
+    includes = spec.get("include") if isinstance(spec, dict) else None
+    if isinstance(includes, str):
+        includes = [includes]
+    if not includes or not isinstance(includes, list) or depth >= 8:
+        return spec
+    merged: dict = {}
+    for include in reversed(includes) if first_include_wins else includes:
+        if not isinstance(include, str):
+            continue
+        # relative to the including file's directory; "/" keeps absolute paths
+        try:
+            base = _load_task_spec(path.parent / include, depth + 1, first_include_wins)
+        except (OSError, UnicodeDecodeError, yaml.YAMLError):
+            continue
+        if isinstance(base, dict):
+            merged.update(base)
+    merged.update(spec)
+    return merged
+
+
+def _sibling_defines_task(directory: Path, group_file: Path, child: str) -> bool:
+    """True if another .yaml under ``directory`` (searched recursively, as
+    lm-eval indexes include paths) defines task ``child``; bad files are skipped."""
+    for sibling in sorted(directory.rglob("*.yaml")):
+        if sibling == group_file:
+            continue
+        try:
+            spec = _load_task_spec(sibling)
+        except (OSError, UnicodeDecodeError, yaml.YAMLError):
+            continue
+        if isinstance(spec, dict) and isinstance(spec.get("task"), str) and spec["task"] == child:
+            return True
+    return False
+
+
+def _doc_column(key: str) -> str:
+    """Wrap ``key`` as a jinja ``{{key}}`` template when jinja can parse it.
+
+    The template stringifies the value (e.g. numeric answer columns); other keys
+    stay raw, which lm-eval treats as a direct column lookup.
+    """
+    import keyword
+    reserved_word = keyword.iskeyword(key) or key in ("true", "false", "none")
+    templatable = key.isidentifier() and not reserved_word
+    return "{{" + key + "}}" if templatable else key
+
+
+def make_jsonl_task(
+    data_file: Path,
+    input_key: str,
+    target_key: str,
+    out_dir: Path,
+    reserved: frozenset = frozenset(),
+) -> str:
+    """Write an lm-eval task YAML for a dataset file; return the task name used.
+
+    The name is the file stem, suffixed ``_2``, ``_3``, ... while it is in
+    ``reserved`` or ``out_dir`` already holds ``<name>.yaml``: a generated task
+    must not shadow a registered one (gsm8k.jsonl vs gsm8k) or an earlier dataset.
+    """
+    source = Path(data_file).resolve()
+    target_dir = Path(out_dir)
+    target_dir.mkdir(parents = True, exist_ok = True)
+    stem = source.stem
+    task_name, attempt = stem, 1
+    while task_name in reserved or (target_dir / f"{task_name}.yaml").exists():
+        attempt += 1
+        task_name = f"{stem}_{attempt}"
+    if task_name != stem:
+        typer.echo(
+            f"Note: task name '{stem}' is taken — running dataset "
+            f"'{source.name}' as '{task_name}'."
+        )
+    is_json = source.suffix.lower() in {".json", ".jsonl"}
+    # strip surrounding whitespace so " 2" matches gold "2": lm-eval's regex
+    # filter runs re.findall, which with one capture group yields the group's
+    # text; group_select indexes those matches, not groups
+    strip_filter = {
+        "name": "strip",
+        "filter": [
+            {"function": "regex", "regex_pattern": r"^\s*(.*?)\s*$", "group_select": 0},
+            {"function": "take_first"},
+        ],
+    }
+    exact_match = {"metric": "exact_match", "aggregation": "mean", "higher_is_better": True}
+    task_spec = {
+        "task": task_name,
+        "dataset_path": "json" if is_json else "csv",
+        "dataset_kwargs": {"data_files": str(source)},
+        "test_split": "train",
+        # explicit few-shot source so --num-fewshot works on every lm-eval
+        # version we support (the file has a single split)
+        "fewshot_split": "train",
+        "output_type": "generate_until",
+        "doc_to_text": _doc_column(input_key),
+        "doc_to_target": _doc_column(target_key),
+        "generation_kwargs": {"until": ["\n"]},
+        "filter_list": [strip_filter],
+        "metric_list": [exact_match],
+    }
+    rendered = yaml.safe_dump(task_spec, sort_keys = False)
+    (target_dir / f"{task_name}.yaml").write_text(rendered, encoding = "utf-8")
+    return task_name
+
+
+def resolve_tasks(
+    tasks: str,
+    input_key: str,
+    target_key: str,
+    tmp_dir: Path,
+    reserved: frozenset = frozenset(),
+) -> Tuple[List[str], List[str]]:
+    """Resolve the comma-separated ``tasks`` into (task names, include paths).
+    Raises FileNotFoundError/ValueError for bad entries. Datasets are generated in
+    a second pass, once every yaml/group/child name is known (order-independent)."""
+    include_paths: List[str] = []
+    sibling_names: set = set()
+    yaml_names: set = set()
+    entries: List[Tuple[str, object]] = []  # (kind, value) in argument order
+
+    def _add_include(directory: str) -> None:
+        if directory not in include_paths:
+            include_paths.append(directory)
+
+    for raw in tasks.split(","):
+        entry = raw.strip()
+        if not entry:
+            continue
+        suffix = Path(entry).suffix.lower()
+
+        if suffix in {".yaml", ".yml"}:
+            path = Path(entry)
+            if not path.exists():
+                raise FileNotFoundError(f"Custom task file not found: {entry}")
+            text = path.read_text(encoding = "utf-8")
+            try:
+                spec = _load_task_spec(path) or {}
+            except yaml.YAMLError as e:
+                raise ValueError(f"Invalid YAML in custom task file '{entry}': {e}") from e
+            if not isinstance(spec, dict):
+                raise ValueError(f"Custom task file '{entry}' must define a YAML mapping.")
+            if "include" in spec:
+                # lm-eval versions disagree on include precedence (older ones
+                # merged last-to-first), so a name that changes with the merge
+                # order cannot be trusted on either side
+                alt = _load_task_spec(path, first_include_wins = True) or {}
+                if isinstance(alt, dict) and (spec.get("task"), spec.get("group")) != (
+                    alt.get("task"),
+                    alt.get("group"),
+                ):
+                    raise ValueError(
+                        f"Custom task file '{entry}' gets its task/group name from its "
+                        "include: files, and the winner depends on the lm-eval version's "
+                        "include order. Set 'task:' (or 'group:') in the top-level file."
+                    )
+            name = spec.get("task")
+            if isinstance(name, list):
+                # a group file (group: suite, task: [a, b]) is registered
+                # under its group name; its child task names are taken too,
+                # so later dataset entries must not generate a clashing task
+                for child in name:
+                    child_name = None
+                    if isinstance(child, str):
+                        child_name = child
+                    elif isinstance(child, dict) and child.get("task"):
+                        child_name = str(child["task"])
+                    if not child_name:
+                        continue
+                    sibling_names.add(child_name)
+                    # a string child that names a registered task AND a sibling
+                    # yaml is ambiguous: which one runs depends on the lm-eval
+                    # version's registry precedence
+                    if (
+                        isinstance(child, str)
+                        and child_name in reserved
+                        and _sibling_defines_task(path.resolve().parent, path.resolve(), child_name)
+                    ):
+                        raise ValueError(
+                            f"Custom task file '{entry}' lists child task '{child_name}', "
+                            "which is both a registered lm-eval task and defined by a "
+                            "sibling YAML in the same directory — which one runs depends "
+                            "on the lm-eval version. Rename the sibling task."
+                        )
+                name = spec.get("group")
+                if not name:
+                    raise ValueError(
+                        f"Custom task file '{entry}' defines a task list but no 'group:' name."
+                    )
+            if not name:
+                raise ValueError(f"Custom task file '{entry}' is missing a 'task:' name.")
+            # tag: (and legacy string group:) values register alias names in
+            # lm-eval's index, so generated datasets must avoid them too
+            for alias_key in ("tag", "group"):
+                alias_value = spec.get(alias_key)
+                for alias in alias_value if isinstance(alias_value, list) else [alias_value]:
+                    if isinstance(alias, str) and alias:
+                        sibling_names.add(alias)
+            name = str(name)
+            if name in reserved:
+                raise ValueError(
+                    f"Custom task file '{entry}' redefines '{name}', which is already a "
+                    "registered lm-eval task — the registered one would silently win. "
+                    "Rename the task in the YAML."
+                )
+            if name in yaml_names:
+                raise ValueError(f"Duplicate task name '{name}' in --tasks.")
+            if "include" in spec or isinstance(spec.get("task"), list) or "!function" in text:
+                # include-bearing, group and !function configs reference
+                # sibling files (base yaml, subtasks, helper modules), so
+                # their directory must stay on the include path — which
+                # only works for .yaml, the sole extension lm-eval indexes
+                if suffix == ".yml":
+                    raise ValueError(
+                        f"Custom task file '{entry}' references sibling files "
+                        "(include:/group/!function) but is a .yml file — lm-eval only "
+                        "indexes .yaml files, so it would never register. Rename it "
+                        "(and the files it references) to .yaml."
+                    )
+                _add_include(str(path.resolve().parent))
+            else:
+                # copy just this file into the temp include dir so a broken
+                # sibling yaml can't take down TaskManager's include scan
+                # (this also normalises .yml, which lm-eval doesn't index)
+                custom_dir = Path(tmp_dir) / "custom"
+                custom_dir.mkdir(parents = True, exist_ok = True)
+                shutil.copy2(path, custom_dir / f"{name}.yaml")
+                _add_include(str(custom_dir.resolve()))
+            yaml_names.add(name)
+            entries.append(("yaml", name))
+
+        elif suffix in {".jsonl", ".json", ".csv"}:
+            path = Path(entry)
+            if not path.exists():
+                raise FileNotFoundError(f"Dataset file not found: {entry}")
+            entries.append(("dataset", path))
+
+        else:
+            entries.append(("plain", entry))
+
+    names: List[str] = []
+    for kind, value in entries:
+        if kind == "dataset":
+            gen_dir = Path(tmp_dir) / "generated"
+            # every yaml task, group child and earlier name counts as taken
+            names.append(
+                make_jsonl_task(
+                    value,
+                    input_key,
+                    target_key,
+                    gen_dir,
+                    reserved | frozenset(names) | yaml_names | frozenset(sibling_names),
+                )
+            )
+            _add_include(str(gen_dir.resolve()))
+        else:
+            if value in names:
+                message = (
+                    f"Duplicate task name '{value}' in --tasks."
+                    if kind == "yaml"
+                    else f"Duplicate task '{value}' in --tasks."
+                )
+                raise ValueError(message)
+            names.append(value)
+
+    if not names:
+        raise ValueError("No tasks provided. Pass --tasks with at least one task.")
+    return names, include_paths
+
+
+def _metric_number(value):
+    """Return ``value`` as a plain int/float, or None if it isn't numeric.
+
+    numpy float32/int64 aren't int/float subclasses, so scalars exposing a
+    callable ``item()`` are unwrapped through it first.
+    """
+    if not isinstance(value, (int, float)):
+        unwrap = getattr(value, "item", None)
+        try:
+            value = unwrap() if callable(unwrap) else None
+        except Exception:
+            return None
+    return value if isinstance(value, (int, float)) else None
+
+
+def _json_default(value):
+    """Serialisation fallback: ``value.tolist()`` if it works, else ``str(value)``.
+
+    Keeps numpy/torch scalars and arrays numeric in results.json, not strings.
+    """
+    converter = getattr(value, "tolist", None)
+    if callable(converter):
+        with contextlib.suppress(Exception):
+            return converter()
+    return str(value)
+
+
+def _render_results(results: dict) -> None:
+    """Print a rich table with one row per numeric metric of every task and group."""
+    from rich.console import Console
+    from rich.table import Table
+
+    right = {"justify": "right"}
+    table = Table(title = "Evaluation results")
+    for header, options in (
+        ("Task", {"style": "cyan"}), ("Metric", {}), ("Value", right), ("± stderr", right),
+    ):
+        table.add_column(header, **options)
+
+    # group aggregates (mmlu, custom suites) live in a separate section; a
+    # same-named entry under "results" is kept in preference to the group's
+    rows = dict(results.get("results", {}) or {})
+    extra = {k: v for k, v in (results.get("groups") or {}).items() if k not in rows}
+    rows.update(extra)
+
+    for task, metrics in rows.items():
+        for key, raw_value in metrics.items():
+            value = None if key == "alias" or "_stderr" in key else _metric_number(raw_value)
+            if value is None:
+                continue
+            metric, _, flt = key.partition(",")
+            suffix = f",{flt}" if flt else ""
+            stderr = _metric_number(metrics.get(f"{metric}_stderr{suffix}"))
+            table.add_row(task, key, f"{value:.4f}", "—" if stderr is None else f"{stderr:.4f}")
+
+    Console().print(table)
+
+
+def evaluate(
+    model: str = typer.Argument(
+        ..., help = "Path to a checkpoint/adapter directory or a HuggingFace model id."
+    ),
+    tasks: str = typer.Option(
+        ...,
+        "--tasks",
+        "-t",
+        help = "Comma-separated built-in task names (e.g. mmlu,gsm8k), or a path to a "
+        "custom .yaml task or a .jsonl/.csv dataset.",
+    ),
+    base_model: Optional[str] = typer.Option(
+        None,
+        "--base-model",
+        help = "Base model for a LoRA adapter. Auto-detected from adapter_config.json; "
+        "set this to override a moved/renamed base.",
+    ),
+    num_fewshot: Optional[int] = typer.Option(
+        None, "--num-fewshot", "-n", help = "Few-shot examples (default: per-task)."
+    ),
+    limit: Optional[int] = typer.Option(
+        None, "--limit", help = "Cap examples per task (for quick smoke tests)."
+    ),
+    batch_size: str = typer.Option("auto", "--batch-size", "-b", help = "Batch size, or 'auto'."),
+    max_seq_length: int = typer.Option(
+        2048, "--max-seq-length", help = "Max sequence length for the model."
+    ),
+    load_in_4bit: bool = typer.Option(
+        True, "--load-in-4bit/--no-load-in-4bit", help = "Load the model in 4-bit."
+    ),
+    backend: str = typer.Option(
+        "unsloth",
+        "--backend",
+        help = "Model backend: 'unsloth' (fast kernels; needs an NVIDIA/AMD/Intel "
+        "GPU) or 'hf' (plain transformers; works on CPU/MPS/Mac). "
+        "Auto-falls back to 'hf' on Apple Silicon.",
+    ),
+    device: Optional[str] = typer.Option(
+        None,
+        "--device",
+        help = "Device for the hf backend (e.g. cpu, mps, cuda). Default: auto.",
+    ),
+    input_key: str = typer.Option(
+        "question", "--input-key", help = "Prompt field for a .jsonl/.csv dataset task."
+    ),
+    target_key: str = typer.Option(
+        "answer", "--target-key", help = "Answer field for a .jsonl/.csv dataset task."
+    ),
+    output_dir: Path = typer.Option(
+        Path("./eval_results"), "--output-dir", "-o", help = "Directory for results.json."
+    ),
+    hf_token: Optional[str] = typer.Option(
+        None, "--hf-token", envvar = "HF_TOKEN", help = "HuggingFace token if needed."
+    ),
+):
+    """Evaluate a checkpoint or LoRA adapter using lm-eval-harness."""
+    if batch_size == "auto":
+        bs = "auto"
+    else:
+        try:
+            bs = int(batch_size)
+            if bs <= 0:
+                raise ValueError
+        except ValueError:
+            typer.echo("Error: --batch-size must be a positive integer or 'auto'.", err = True)
+            raise typer.Exit(code = 2)
+
+    if backend not in ("unsloth", "hf"):
+        typer.echo(f"Error: --backend must be 'unsloth' or 'hf', got '{backend}'.", err = True)
+        raise typer.Exit(code = 2)
+
+    if num_fewshot is not None and num_fewshot < 0:
+        # lm-eval treats a negative count as zero-shot while recording the
+        # bogus value in the results metadata
+        typer.echo("Error: --num-fewshot must be >= 0.", err = True)
+        raise typer.Exit(code = 2)
+
+    if limit is not None and limit <= 0:
+        # lm-eval reads values below 1 as a dataset fraction: 0 builds no
+        # requests and crashes, negatives take an unintended slice
+        typer.echo("Error: --limit must be a positive integer.", err = True)
+        raise typer.Exit(code = 2)
+
+    if max_seq_length <= 0:
+        # HFLM treats a falsy 0 as unset (silently dropping the cap) and
+        # uses negatives in truncation arithmetic
+        typer.echo("Error: --max-seq-length must be a positive integer.", err = True)
+        raise typer.Exit(code = 2)
+
+    if not _lm_eval_available():
+        typer.echo(
+            "Error: evaluation requires lm-eval. Install it with `pip install unsloth[eval]`.",
+            err = True,
+        )
+        raise typer.Exit(code = 1)
+
+    if backend == "unsloth":
+        # unsloth must be imported before transformers (which lm-eval pulls
+        # in) or its patches don't fully apply
+        with _silence():
+            import unsloth
+
+        if getattr(unsloth, "DEVICE_TYPE", None) == "mlx":
+            typer.echo(
+                "Note: Apple Silicon (MLX) detected — falling back to "
+                "--backend hf (plain transformers)."
+            )
+            backend = "hf"
+
+    # a pre-loaded model object makes lm-eval single-process (rank 0
+    # everywhere), so under accelerate/torchrun every worker would run
+    # the full task set and write results
+    if backend == "unsloth" and os.environ.get("WORLD_SIZE", "1") not in ("", "1"):
+        typer.echo(
+            "Error: multi-process launches (accelerate/torchrun) are not "
+            "supported with --backend unsloth. Use --backend hf for "
+            "multi-GPU evaluation.",
+            err = True,
+        )
+        raise typer.Exit(code = 2)
+
+    try:
+        import lm_eval
+        from lm_eval.models.huggingface import HFLM
+        from lm_eval.tasks import TaskManager
+    except ImportError as e:
+        typer.echo(
+            "Error: evaluation requires lm-eval. Install it with `pip install unsloth[eval]`.",
+            err = True,
+        )
+        raise typer.Exit(code = 1) from e
+
+    if hf_token:
+        os.environ["HF_TOKEN"] = hf_token  # both backends read it from the env
+
+    # --base-model => treat <model> as an adapter on this base (and skip the
+    # local/Hub adapter_config.json lookup)
+    effective_base = base_model or resolve_base_model(model)
+
+    tmp_dir = Path(tempfile.mkdtemp(prefix = "unsloth_eval_"))
+    try:
+        # a dataset or custom task named after a registered task (gsm8k.jsonl,
+        # task: gsm8k) must not be shadowed by the built-in benchmark, so
+        # collect registry names first
+        base_manager = None
+        reserved: frozenset = frozenset()
+        if any(
+            Path(e.strip()).suffix.lower() in {".jsonl", ".json", ".csv", ".yaml", ".yml"}
+            for e in tasks.split(",")
+        ):
+            base_manager = TaskManager()
+            reserved = frozenset(_registry_names(base_manager))
+
+        try:
+            task_names, include_paths = resolve_tasks(
+                tasks, input_key, target_key, tmp_dir, reserved = reserved
+            )
+        except (FileNotFoundError, ValueError) as e:
+            typer.echo(f"Error: {e}", err = True)
+            raise typer.Exit(code = 2) from e
+
+        # reuse for validation and the eval run
+        if include_paths:
+            task_manager = TaskManager(include_path = include_paths)
+        else:
+            task_manager = base_manager or TaskManager()
+
+        registered = getattr(task_manager, "all_tasks", None)
+        if registered:
+            known = _registry_names(task_manager)
+            unknown = [t for t in task_names if t not in known]
+            if unknown:
+                typer.echo(
+                    f"Error: unknown task(s): {', '.join(unknown)}. Pass a built-in "
+                    "task name, a .yaml task file, or a .jsonl/.csv dataset.",
+                    err = True,
+                )
+                raise typer.Exit(code = 2)
+
+        if num_fewshot and any((tmp_dir / "generated" / f"{t}.yaml").exists() for t in task_names):
+            raw_keys = [k for k in dict.fromkeys((input_key, target_key)) if _doc_column(k) == k]
+            if raw_keys:
+                # raw column lookups feed unstringified values into lm-eval's
+                # few-shot prompt builder, which fails on non-string data
+                typer.echo(
+                    "Error: --num-fewshot needs plain-identifier column names for a "
+                    f"dataset task; rename column(s) {', '.join(map(repr, raw_keys))} "
+                    "or drop --num-fewshot.",
+                    err = True,
+                )
+                raise typer.Exit(code = 2)
+            typer.echo(
+                "Note: few-shot examples for a generated task come from the same "
+                "file (no held-out split)."
+            )
+
+        typer.echo(f"Running tasks: {', '.join(task_names)} (backend: {backend})")
+
+        eval_kwargs = dict(
+            tasks = task_names,
+            num_fewshot = num_fewshot,
+            limit = limit,
+            task_manager = task_manager,
+            log_samples = False,
+        )
+
+        if backend == "hf":
+            if device is None:
+                import torch
+                if torch.cuda.is_available():
+                    device = "cuda"
+                elif getattr(torch.backends, "mps", None) and torch.backends.mps.is_available():
+                    device = "mps"
+                else:
+                    device = "cpu"
+            else:
+                device_error = _hf_device_error(device)
+                if device_error:
+                    typer.echo(f"Error: {device_error}", err = True)
+                    raise typer.Exit(code = 2)
+            if bs == "auto" and not device.startswith("cuda"):
+                typer.echo(
+                    "Note: batch_size 'auto' is slow on CPU/MPS — using 1 (override with --batch-size)."
+                )
+                bs = 1
+            # dict form: a comma in a path can't corrupt key=value parsing
+            if effective_base:
+                model_args = {"pretrained": effective_base, "peft": model}
+                # adapters that saved their own tokenizer (added tokens etc.)
+                # must not be scored with the base tokenizer
+                if _has_tokenizer_files(model):
+                    model_args["tokenizer"] = model
+                typer.echo(f"Evaluating adapter '{model}' on base '{effective_base}'.")
+            else:
+                model_args = {"pretrained": model}
+            model_args["max_length"] = max_seq_length
+            if load_in_4bit and device.startswith("cuda"):
+                if _bitsandbytes_available():
+                    model_args["load_in_4bit"] = True
+                else:
+                    typer.echo(
+                        "Note: bitsandbytes is not installed — loading in full "
+                        "precision (`pip install bitsandbytes` to enable 4-bit)."
+                    )
+            eval_kwargs.update(
+                model = "hf",
+                model_args = model_args,
+                batch_size = bs,
+                device = device,
+            )
+        else:
+            from unsloth import FastLanguageModel
+
+            load_kwargs = dict(
+                max_seq_length = max_seq_length,
+                load_in_4bit = load_in_4bit,
+                token = hf_token or None,
+            )
+            if effective_base:
+                typer.echo(f"Loading base model '{effective_base}' with adapter '{model}'...")
+                with _silence():
+                    lmodel, tokenizer = FastLanguageModel.from_pretrained(
+                        model_name = effective_base, **load_kwargs
+                    )
+                    # adapters that saved their own tokenizer (added tokens
+                    # etc.) must not be scored with the base tokenizer, and
+                    # the embeddings must match its vocab before the adapter
+                    # weights are applied or PEFT fails on a size mismatch
+                    if _has_tokenizer_files(model):
+                        from transformers import AutoTokenizer
+
+                        tokenizer = AutoTokenizer.from_pretrained(model)
+                        embeddings = lmodel.get_input_embeddings()
+                        if embeddings is not None and embeddings.weight.shape[0] != len(tokenizer):
+                            lmodel.resize_token_embeddings(len(tokenizer))
+                    from peft import PeftModel
+
+                    lmodel = PeftModel.from_pretrained(lmodel, model)
+            else:
+                typer.echo(f"Loading model: {model}")
+                with _silence():
+                    lmodel, tokenizer = FastLanguageModel.from_pretrained(
+                        model_name = model, **load_kwargs
+                    )
+            with _silence():
+                FastLanguageModel.for_inference(lmodel)
+                lm = HFLM(
+                    pretrained = lmodel,
+                    tokenizer = tokenizer,
+                    batch_size = bs,
+                    max_length = max_seq_length,
+                )
+            eval_kwargs["model"] = lm
+
+        with _silence() as ui, _spinner(ui, f"Evaluating {', '.join(task_names)}…"):
+            results = lm_eval.simple_evaluate(**eval_kwargs)
+    finally:
+        shutil.rmtree(tmp_dir, ignore_errors = True)
+
+    if results is None:
+        # lm-eval hands results only to rank 0 of a multi-process run
+        # (accelerate/torchrun); worker ranks get None and must exit cleanly
+        if os.environ.get("RANK", "0") != "0" or os.environ.get("LOCAL_RANK", "0") != "0":
+            return
+        typer.echo("Error: evaluation returned no results.", err = True)
+        raise typer.Exit(code = 1)
+
+    _render_results(results)
+
+    output_dir = Path(output_dir)
+    output_dir.mkdir(parents = True, exist_ok = True)
+    results_path = output_dir / "results.json"
+    results_path.write_text(json.dumps(results, indent = 2, default = _json_default), encoding = "utf-8")
+    typer.echo(f"Saved results to: {results_path}")
diff --git a/unsloth_cli/tests/test_eval.py b/unsloth_cli/tests/test_eval.py
new file mode 100644
index 0000000000..dff8b46730
--- /dev/null
+++ b/unsloth_cli/tests/test_eval.py
@@ -0,0 +1,1396 @@
+# SPDX-License-Identifier: AGPL-3.0-only
+# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved. See /studio/LICENSE.AGPL-3.0
+
+from __future__ import annotations
+
+import json
+import os
+import sys
+import types
+from pathlib import Path
+from types import SimpleNamespace
+
+_REPO_ROOT = Path(__file__).resolve().parents[2]
+if str(_REPO_ROOT) not in sys.path:
+    sys.path.insert(0, str(_REPO_ROOT))
+
+import typer
+import pytest
+import yaml
+from typer.testing import CliRunner
+
+import unsloth_cli.commands.eval as evalmod
+
+
+def _eval_app():
+    app = typer.Typer()  # throwaway app with `evaluate` registered as its only command
+    app.command()(evalmod.evaluate)
+    return app
+
+
+def test_resolve_base_model_reads_adapter_config(tmp_path):
+    # a local adapter directory: the base model is named inside adapter_config.json
+    adapter_config = {"base_model_name_or_path": "unsloth/Llama-3.2-1B"}
+    (tmp_path / "adapter_config.json").write_text(json.dumps(adapter_config))
+    assert evalmod.resolve_base_model(str(tmp_path)) == "unsloth/Llama-3.2-1B"
+
+
+def test_resolve_base_model_none_for_plain_dir(tmp_path):
+    assert evalmod.resolve_base_model(str(tmp_path)) is None  # nothing was written to tmp_path
+
+
+def test_resolve_base_model_none_for_missing_path():
+    assert evalmod.resolve_base_model("/no/such/dir") is None  # assumes this path does not exist
+
+
+def test_resolve_base_model_none_for_non_dict_config(tmp_path):
+    (tmp_path / "adapter_config.json").write_text(json.dumps(["not", "a", "dict"]))
+    assert evalmod.resolve_base_model(str(tmp_path)) is None  # the file parses to a list
+
+
+def test_resolve_base_model_finds_hub_adapter(tmp_path, monkeypatch):
+    # stand-in for the adapter_config.json the stubbed Hub download hands back
+    downloaded = tmp_path / "adapter_config.json"
+    downloaded.write_text(json.dumps({"base_model_name_or_path": "unsloth/Llama-3.2-1B"}))
+
+    def _download_stub(repo_id, filename, **kwargs):
+        assert (repo_id, filename) == ("someuser/my-lora", "adapter_config.json")
+        return str(downloaded)
+
+    fake_hub = types.ModuleType("huggingface_hub")
+    fake_hub.hf_hub_download = _download_stub
+    monkeypatch.setitem(sys.modules, "huggingface_hub", fake_hub)
+
+    resolved = evalmod.resolve_base_model("someuser/my-lora")
+    assert resolved == "unsloth/Llama-3.2-1B"
+
+
+def test_resolve_base_model_none_when_hub_lookup_fails(monkeypatch):
+    def _failing_download(*args, **kwargs):
+        raise RuntimeError("no adapter_config.json in repo")
+
+    # Hub stub whose download always raises
+    fake_hub = types.ModuleType("huggingface_hub")
+    fake_hub.hf_hub_download = _failing_download
+    monkeypatch.setitem(sys.modules, "huggingface_hub", fake_hub)
+
+    assert evalmod.resolve_base_model("someuser/full-model") is None
+
+
+def test_make_jsonl_task_generates_expected_spec(tmp_path):
+    dataset = tmp_path / "qa.jsonl"
+    dataset.write_text('{"question": "1+1?", "answer": "2"}\n')
+    task_dir = tmp_path / "tasks"
+
+    task_name = evalmod.make_jsonl_task(dataset, "question", "answer", task_dir)
+    assert task_name == "qa"
+
+    generated = yaml.safe_load((task_dir / "qa.yaml").read_text())
+    assert generated["task"] == "qa"
+    assert generated["dataset_path"] == "json"
+    assert generated["dataset_kwargs"]["data_files"] == str(dataset.resolve())
+    assert generated["doc_to_text"] == "{{question}}"
+    assert generated["doc_to_target"] == "{{answer}}"
+    assert generated["metric_list"][0]["metric"] == "exact_match"
+    assert generated["fewshot_split"] == "train"
+
+
+def test_make_jsonl_task_honours_custom_keys(tmp_path):
+    csv_file = tmp_path / "prompts.csv"
+    csv_file.write_text("prompt,label\nhi,hello\n")
+    task_dir = tmp_path / "t"
+
+    task_name = evalmod.make_jsonl_task(csv_file, "prompt", "label", task_dir)
+    generated = yaml.safe_load((task_dir / "prompts.yaml").read_text())
+    assert task_name == "prompts"
+    assert generated["dataset_path"] == "csv"
+    assert (generated["doc_to_text"], generated["doc_to_target"]) == ("{{prompt}}", "{{label}}")
+
+
+def test_make_jsonl_task_uses_raw_lookup_for_non_identifier_keys(tmp_path):
+    dataset, task_dir = tmp_path / "weird.jsonl", tmp_path / "t"
+    dataset.write_text('{"prompt-text": "1+1?", "expected answer": "2"}\n')
+    evalmod.make_jsonl_task(dataset, "prompt-text", "expected answer", task_dir)
+
+    # jinja cannot parse such keys, so lm-eval gets the raw column names to resolve itself
+    generated = yaml.safe_load((task_dir / "weird.yaml").read_text())
+    assert generated["doc_to_text"] == "prompt-text"
+    assert generated["doc_to_target"] == "expected answer"
+
+
+def test_make_jsonl_task_avoids_reserved_names(tmp_path):
+    dataset = tmp_path / "gsm8k.jsonl"
+    dataset.write_text('{"question": "q", "answer": "a"}\n')
+    task_dir = tmp_path / "t"
+    taken = frozenset({"gsm8k"})
+
+    task_name = evalmod.make_jsonl_task(dataset, "question", "answer", task_dir, reserved = taken)
+
+    assert task_name == "gsm8k_2"
+    assert (task_dir / "gsm8k_2.yaml").exists()
+
+
+def test_has_tokenizer_files_checks_hub_repo(monkeypatch):
+    def _list_repo_files(repo_id):
+        # a repo listing that ships tokenizer files next to the adapter config
+        return ["adapter_config.json", "tokenizer_config.json", "tokenizer.json"]
+
+    fake_hub = types.ModuleType("huggingface_hub")
+    fake_hub.list_repo_files = _list_repo_files
+    monkeypatch.setitem(sys.modules, "huggingface_hub", fake_hub)
+
+    assert evalmod._has_tokenizer_files("someuser/my-lora") is True
+
+
+def test_has_tokenizer_files_false_when_hub_listing_fails(monkeypatch):
+    def _offline_listing(repo_id):
+        raise RuntimeError("offline")
+
+    # Hub stub whose file listing always raises
+    fake_hub = types.ModuleType("huggingface_hub")
+    fake_hub.list_repo_files = _offline_listing
+    monkeypatch.setitem(sys.modules, "huggingface_hub", fake_hub)
+
+    assert evalmod._has_tokenizer_files("someuser/my-lora") is False
+
+
+def test_resolve_tasks_builtin_names(tmp_path):
+    # plain names come back in order, whitespace trimmed, with nothing to include
+    tasks, includes = evalmod.resolve_tasks("mmlu, gsm8k", "question", "answer", tmp_path)
+    assert (tasks, includes) == (["mmlu", "gsm8k"], [])
+
+
+def test_resolve_tasks_custom_yaml_copied_to_include_dir(tmp_path):
+    source_dir = tmp_path / "src"
+    source_dir.mkdir()
+    spec_path = source_dir / "custom.yaml"
+    spec_path.write_text(yaml.safe_dump({"task": "my_task", "output_type": "generate_until"}))
+    # malformed neighbour: it has to stay off the include path
+    (source_dir / "broken.yaml").write_text("task: [unclosed")
+    work_dir = tmp_path / "gen"
+    copy_dir = work_dir / "custom"
+
+    tasks, includes = evalmod.resolve_tasks(str(spec_path), "question", "answer", work_dir)
+
+    assert tasks == ["my_task"]
+    assert includes == [str(copy_dir.resolve())]
+    assert (copy_dir / "my_task.yaml").exists()
+    assert not (copy_dir / "broken.yaml").exists()
+
+
+def test_resolve_tasks_yml_normalised_to_yaml(tmp_path):
+    spec_path = tmp_path / "custom.yml"
+    spec_path.write_text(yaml.safe_dump({"task": "my_task", "output_type": "generate_until"}))
+    work_dir = tmp_path / "gen"
+
+    tasks, _includes = evalmod.resolve_tasks(str(spec_path), "question", "answer", work_dir)
+
+    assert tasks == ["my_task"]
+    # the copy must carry a .yaml suffix because lm-eval only indexes .yaml files
+    assert (work_dir / "custom" / "my_task.yaml").exists()
+
+
+def test_resolve_tasks_include_yaml_keeps_parent_dir(tmp_path):
+    spec_path = tmp_path / "custom.yaml"
+    spec_path.write_text(yaml.safe_dump({"task": "my_task", "include": "base.yaml"}))
+    work_dir = tmp_path / "gen"
+
+    tasks, includes = evalmod.resolve_tasks(str(spec_path), "question", "answer", work_dir)
+
+    # `include:` points at a sibling file, so the yaml's own directory stays included
+    assert (tasks, includes) == (["my_task"], [str(tmp_path.resolve())])
+
+
+def test_resolve_tasks_yaml_with_function_tag_keeps_parent_dir(tmp_path):
+    yaml_text = (
+        "task: fn_task\noutput_type: generate_until\n"
+        "process_docs: !function utils.process_docs\n"
+    )
+    spec_path = tmp_path / "custom.yaml"
+    spec_path.write_text(yaml_text)
+    work_dir = tmp_path / "gen"
+
+    tasks, includes = evalmod.resolve_tasks(str(spec_path), "question", "answer", work_dir)
+
+    # a !function import resolves relative to the yaml file, so the spec is
+    # used in place (next to its utils.py) rather than copied to the temp dir
+    assert (tasks, includes) == (["fn_task"], [str(tmp_path.resolve())])
+    assert not (work_dir / "custom" / "fn_task.yaml").exists()
+
+
+def test_resolve_tasks_task_name_from_included_base(tmp_path):
+    base_spec = {"task": "from_base", "output_type": "generate_until"}
+    child_spec = {"include": "base.yaml", "dataset_path": "json"}
+    (tmp_path / "base.yaml").write_text(yaml.safe_dump(base_spec))
+    child_path = tmp_path / "child.yaml"
+    child_path.write_text(yaml.safe_dump(child_spec))
+    work_dir = tmp_path / "gen"
+
+    tasks, includes = evalmod.resolve_tasks(str(child_path), "question", "answer", work_dir)
+
+    # the child names no task itself; lm-eval follows `include:` while indexing,
+    # so the name declared by base.yaml is the one that counts
+    assert (tasks, includes) == (["from_base"], [str(tmp_path.resolve())])
+
+
+def test_resolve_tasks_rejects_yml_group_config(tmp_path):
+    suite_path = tmp_path / "suite.yml"
+    suite_path.write_text(yaml.safe_dump({"group": "my_suite", "task": ["task_a", "task_b"]}))
+    with pytest.raises(ValueError, match = "only indexes .yaml"):  # group config saved as .yml
+        evalmod.resolve_tasks(str(suite_path), "question", "answer", tmp_path / "gen")
+
+
+def test_resolve_tasks_jsonl_generates_task(tmp_path):
+    dataset = tmp_path / "qa.jsonl"
+    dataset.write_text('{"question": "q", "answer": "a"}\n')
+    work_dir = tmp_path / "gen"
+    generated_dir = work_dir / "generated"
+
+    tasks, includes = evalmod.resolve_tasks(str(dataset), "question", "answer", work_dir)
+
+    # the task is named after the file stem and its spec lands under generated/
+    assert (tasks, includes) == (["qa"], [str(generated_dir.resolve())])
+    assert (generated_dir / "qa.yaml").exists()
+
+
+def test_resolve_tasks_uniquifies_colliding_dataset_stems(tmp_path):
+    # two datasets sharing the stem "qa", kept apart by their parent directories
+    first = tmp_path / "a" / "qa.jsonl"
+    second = tmp_path / "b" / "qa.jsonl"
+    first.parent.mkdir()
+    second.parent.mkdir()
+    first.write_text('{"question": "q", "answer": "a"}\n')
+    second.write_text('{"question": "q2", "answer": "a2"}\n')
+    work_dir = tmp_path / "gen"
+
+    tasks, _includes = evalmod.resolve_tasks(f"{first},{second}", "question", "answer", work_dir)
+
+    assert tasks == ["qa", "qa_2"]
+    # each generated spec points back at its own source file
+    first_spec = yaml.safe_load((work_dir / "generated" / "qa.yaml").read_text())
+    second_spec = yaml.safe_load((work_dir / "generated" / "qa_2.yaml").read_text())
+    assert first_spec["dataset_kwargs"]["data_files"] == str(first.resolve())
+    assert second_spec["dataset_kwargs"]["data_files"] == str(second.resolve())
+    assert second_spec["task"] == "qa_2"
+
+
+def test_resolve_tasks_reserves_group_child_names_for_datasets(tmp_path):
+    suite_path = tmp_path / "suite.yaml"
+    dataset = tmp_path / "qa.jsonl"
+    group_spec = {"group": "suite", "task": ["qa", {"task": "qa_inline"}]}
+    suite_path.write_text(yaml.safe_dump(group_spec))
+    dataset.write_text('{"question": "q", "answer": "a"}\n')
+    work_dir = tmp_path / "gen"
+
+    entries = f"{suite_path},{dataset}"
+    tasks, _includes = evalmod.resolve_tasks(entries, "question", "answer", work_dir)
+
+    # 'qa' is already a child of the suite, so the dataset task must not shadow it
+    assert tasks == ["suite", "qa_2"]
+    assert (work_dir / "generated" / "qa_2.yaml").exists()
+
+
+def test_resolve_tasks_invalid_yaml_raises(tmp_path):
+    broken = tmp_path / "broken.yaml"
+    broken.write_text("task: [unclosed")  # flow sequence that is never closed
+    with pytest.raises(ValueError, match = "Invalid YAML"):
+        evalmod.resolve_tasks(str(broken), "question", "answer", tmp_path)
+
+
+def test_resolve_tasks_yaml_list_raises(tmp_path):
+    list_file = tmp_path / "list.yaml"
+    list_file.write_text(yaml.safe_dump(["not", "a", "mapping"]))  # valid yaml, but a list
+    with pytest.raises(ValueError, match = "YAML mapping"):
+        evalmod.resolve_tasks(str(list_file), "question", "answer", tmp_path)
+
+
+def test_resolve_tasks_group_yaml_uses_group_name(tmp_path):
+    group_spec = {"group": "my_suite", "task": ["task_a", "task_b"]}
+    suite_path = tmp_path / "suite.yaml"
+    suite_path.write_text(yaml.safe_dump(group_spec))
+
+    tasks, includes = evalmod.resolve_tasks(str(suite_path), "question", "answer", tmp_path)
+
+    assert (tasks, includes) == (["my_suite"], [str(tmp_path.resolve())])
+
+
+def test_resolve_tasks_group_yaml_without_group_raises(tmp_path):
+    suite_path = tmp_path / "suite.yaml"
+    suite_path.write_text(yaml.safe_dump({"task": ["task_a", "task_b"]}))  # no group: key
+    with pytest.raises(ValueError, match = "no 'group:' name"):
+        evalmod.resolve_tasks(str(suite_path), "question", "answer", tmp_path)
+
+
+def test_resolve_tasks_yaml_rejects_registered_name(tmp_path):
+    clash_path = tmp_path / "clash.yaml"
+    clash_path.write_text(yaml.safe_dump({"task": "gsm8k", "output_type": "generate_until"}))
+    # the yaml declares a task under a name that is passed in as reserved
+    taken = frozenset({"gsm8k"})
+    with pytest.raises(ValueError, match = "redefines 'gsm8k'"):
+        evalmod.resolve_tasks(str(clash_path), "question", "answer", tmp_path, reserved = taken)
+
+
+def test_resolve_tasks_rejects_duplicate_yaml_names(tmp_path):
+    # two distinct files that both declare the task "same_task"
+    shared_spec = yaml.safe_dump({"task": "same_task", "output_type": "generate_until"})
+    spec_paths = [tmp_path / "one.yaml", tmp_path / "two.yaml"]
+    for spec_path in spec_paths:
+        spec_path.write_text(shared_spec)
+
+    entries = ",".join(str(spec_path) for spec_path in spec_paths)
+    work_dir = tmp_path / "gen"
+
+    with pytest.raises(ValueError, match = "Duplicate task name 'same_task'"):
+        evalmod.resolve_tasks(entries, "question", "answer", work_dir)
+
+
+def test_resolve_tasks_rejects_duplicate_builtins(tmp_path):
+    with pytest.raises(ValueError, match = "Duplicate task 'gsm8k'"):  # same name listed twice
+        evalmod.resolve_tasks("gsm8k,gsm8k", "question", "answer", tmp_path)
+
+
+def test_resolve_tasks_renames_dataset_colliding_with_yaml_name(tmp_path):
+    spec_path = tmp_path / "foo.yaml"
+    dataset = tmp_path / "foo.jsonl"
+    spec_path.write_text(yaml.safe_dump({"task": "foo", "output_type": "generate_until"}))
+    dataset.write_text('{"question": "q", "answer": "a"}\n')
+    work_dir = tmp_path / "gen"
+
+    entries = f"{spec_path},{dataset}"
+    tasks, _includes = evalmod.resolve_tasks(entries, "question", "answer", work_dir)
+
+    # neither task may silently shadow the other: the yaml keeps "foo" and
+    # the dataset is expected under a suffixed name
+    assert tasks == ["foo", "foo_2"]
+    assert (work_dir / "generated" / "foo_2.yaml").exists()
+
+
+def _fake_torch(
+    monkeypatch,
+    cuda_available = False,
+    device_count = 0,
+    mps_available = False,
+    xpu_available = False,
+    xpu_count = 0,
+):
+    # install a stub `torch` module that reports exactly the given availability
+    def _accelerator(available, count):
+        return SimpleNamespace(is_available = lambda: available, device_count = lambda: count)
+
+    stub = types.ModuleType("torch")
+    stub.cuda = _accelerator(cuda_available, device_count)
+    stub.xpu = _accelerator(xpu_available, xpu_count)
+    stub.backends = SimpleNamespace(mps = SimpleNamespace(is_available = lambda: mps_available))
+    monkeypatch.setitem(sys.modules, "torch", stub)
+
+
+def test_hf_device_error_validates_cuda_strings(monkeypatch):
+    _fake_torch(monkeypatch, cuda_available = True, device_count = 2)
+    check = evalmod._hf_device_error
+    for accepted in ("cuda", "cuda:0", "cuda:1"):
+        assert check(accepted) is None
+    # anything but the canonical cuda:<i> spelling is not recognised by lm-eval
+    for malformed in ("cuda0", "cuda:", "cuda:01", "cuda:-1", "cudax"):
+        assert check(malformed) is not None, malformed
+    assert "only 2 CUDA" in check("cuda:2")  # index beyond the two stubbed devices
+
+
+def test_hf_device_error_validates_mps_strings(monkeypatch):
+    check = evalmod._hf_device_error
+    _fake_torch(monkeypatch, mps_available = True)
+    assert check("mps") is None and check("mps:0") is None
+    assert check("mps:1") is not None  # rejected even though MPS is reported available
+    _fake_torch(monkeypatch, mps_available = False)
+    assert "MPS is not available" in check("mps")
+
+
+def test_hf_device_error_rejects_unknown_literals(monkeypatch):
+    _fake_torch(monkeypatch)
+    check = evalmod._hf_device_error
+    assert check("cpu") is None
+    for rejected in ("cpuu", "cude", "gpu", "xpu", "npu"):  # would otherwise fall back silently
+        assert "invalid --device" in check(rejected), rejected
+
+
+def test_hf_device_error_validates_indexed_accelerators(monkeypatch):
+    # unvalidated, an absent or out-of-range accelerator would silently fall back
+    check = evalmod._hf_device_error
+    _fake_torch(monkeypatch, xpu_available = True, xpu_count = 2)
+    assert check("xpu:0") is None and check("xpu:1") is None
+    assert "only 2 XPU" in check("xpu:2")
+    # the stubbed torch module defines neither `npu` nor `hpu`
+    for kind in ("NPU", "HPU"):
+        assert f"{kind} is not available" in check(f"{kind.lower()}:0")
+    _fake_torch(monkeypatch, xpu_available = False)
+    assert "XPU is not available" in check("xpu:0")
+
+
+def test_metric_number_unwraps_numpy_like_scalars():
+    class _Boxed:
+        # minimal numpy-scalar stand-in: the payload is handed out by .item()
+        def __init__(self, payload):
+            self._payload = payload
+
+        def item(self):
+            return self._payload
+
+    to_number = evalmod._metric_number
+    assert (to_number(0.5), to_number(3), to_number(_Boxed(0.25))) == (0.5, 3, 0.25)
+    assert to_number(_Boxed("not a number")) is None
+    assert to_number("alias-ish string") is None
+
+
+def test_resolve_tasks_yaml_without_task_name_raises(tmp_path):
+    nameless = tmp_path / "bad.yaml"
+    nameless.write_text(yaml.safe_dump({"output_type": "generate_until"}))  # no task: key
+    with pytest.raises(ValueError, match = "missing a 'task:' name"):
+        evalmod.resolve_tasks(str(nameless), "question", "answer", tmp_path)
+
+
+def test_resolve_tasks_missing_file_raises(tmp_path):
+    with pytest.raises(FileNotFoundError):  # assumes no ./nope.yaml in the cwd
+        evalmod.resolve_tasks("./nope.yaml", "question", "answer", tmp_path)
+
+
+def test_resolve_tasks_empty_raises(tmp_path):
+    with pytest.raises(ValueError, match = "No tasks provided"):  # only blanks and a comma
+        evalmod.resolve_tasks("  , ", "question", "answer", tmp_path)
+
+
+def test_render_results_renders_metric_row(capsys):
+    gsm8k_metrics = {
+        "exact_match,strict-match": 0.5,
+        "exact_match_stderr,strict-match": 0.05,
+        "alias": "gsm8k",
+    }
+    payload = {"results": {"gsm8k": gsm8k_metrics}}
+
+    evalmod._render_results(payload)
+    rendered = capsys.readouterr().out
+
+    # the task name, the metric value and its stderr must all be printed,
+    # the two numbers with four decimals
+    assert "gsm8k" in rendered
+    assert "0.5000" in rendered
+    assert "0.0500" in rendered
+
+
+def test_render_results_includes_group_aggregates(capsys):
+    subtask_rows = {
+        "mmlu_abstract_algebra": {"acc,none": 0.30, "alias": " - abstract_algebra"},
+    }
+    group_rows = {
+        "mmlu": {"acc,none": 0.45, "alias": "mmlu"},
+    }
+
+    evalmod._render_results({"results": subtask_rows, "groups": group_rows})
+    rendered = capsys.readouterr().out
+
+    # per-subtask score
+    assert "0.3000" in rendered
+    # group aggregate: it has to be rendered too, not only the subtask rows
+    assert "0.4500" in rendered
+
+
+def test_eval_missing_lm_eval_shows_hint(monkeypatch):
+    monkeypatch.setitem(sys.modules, "lm_eval", None)  # None entry => import fails
+    outcome = CliRunner().invoke(_eval_app(), ["fake/model", "--tasks", "gsm8k"])
+    assert outcome.exit_code == 1, outcome.output
+    assert "pip install unsloth[eval]" in outcome.output
+
+
+@pytest.fixture
+def fake_eval_env(monkeypatch):
+    calls = {}  # filled in by the fakes below and returned to the test for inspection
+
+    class _FakeFLM:  # installed as unsloth.FastLanguageModel
+        @classmethod
+        def from_pretrained(
+            cls,
+            model_name = None,
+            **kw,
+        ):
+            calls["model_name"] = model_name
+            model = SimpleNamespace(
+                name = model_name,
+                get_input_embeddings = lambda: SimpleNamespace(
+                    weight = SimpleNamespace(shape = (32000, 4096))
+                ),
+                resize_token_embeddings = lambda n: calls.setdefault("events", []).append(
+                    ("resize", n)
+                ),
+            )
+            return model, SimpleNamespace(name = "tok")  # (model, tokenizer) pair
+
+        @classmethod
+        def for_inference(cls, model):
+            calls["for_inference"] = True
+            return model
+
+    class _FakeHFLM:  # installed as lm_eval.models.huggingface.HFLM
+        def __init__(
+            self,
+            pretrained = None,
+            tokenizer = None,
+            batch_size = None,
+            max_length = None,
+        ):
+            calls["batch_size"] = batch_size
+            calls["hflm_tokenizer"] = tokenizer
+            calls["hflm_max_length"] = max_length
+
+    class _FakeTaskManager:  # installed as lm_eval.tasks.TaskManager
+        def __init__(self, include_path = None):
+            calls["include_path"] = include_path  # overwritten by each new instance
+            self.all_tasks = ["gsm8k", "mmlu", "hellaswag"]
+            self.all_groups = ["mmlu"]
+            self.all_tags = []
+            # mirror lm-eval: yaml tasks/groups under include paths get
+            # registered under their task or group name
+            for directory in include_path or []:
+                for spec_file in sorted(Path(directory).glob("*.yaml")):
+                    # like lm-eval, tolerate !function tags but not broken yaml
+                    spec = yaml.load(spec_file.read_text(), Loader = evalmod._TaskYamlLoader)
+                    if not isinstance(spec, dict):
+                        continue
+                    name = spec.get("task")
+                    if isinstance(name, list):
+                        if spec.get("group"):
+                            self.all_groups.append(str(spec["group"]))
+                    elif name:
+                        self.all_tasks.append(str(name))
+
+    def _simple_evaluate(  # installed as lm_eval.simple_evaluate
+        model = None,
+        model_args = None,
+        tasks = None,
+        **kw,
+    ):
+        calls["model"] = model
+        calls["model_args"] = model_args
+        calls["tasks"] = tasks
+        calls["simple_evaluate_kwargs"] = kw
+        return {
+            "results": {
+                "gsm8k": {
+                    "exact_match,strict-match": 0.42,
+                    "exact_match_stderr,strict-match": 0.01,
+                    "alias": "gsm8k",
+                }
+            },
+            "configs": {},
+        }
+
+    unsloth_mod = types.ModuleType("unsloth")
+    unsloth_mod.FastLanguageModel = _FakeFLM
+
+    # deterministic device detection, no real torch needed
+    torch_mod = types.ModuleType("torch")
+    torch_mod.cuda = SimpleNamespace(is_available = lambda: False, device_count = lambda: 0)
+    torch_mod.backends = SimpleNamespace(mps = SimpleNamespace(is_available = lambda: False))
+
+    # no adapter_config.json on the fake Hub, and no network access in tests
+    hub_mod = types.ModuleType("huggingface_hub")
+
+    def _no_hub_download(*args, **kwargs):
+        raise RuntimeError("adapter_config.json not found")
+
+    hub_mod.hf_hub_download = _no_hub_download
+
+    def _no_repo_files(*args, **kwargs):
+        raise RuntimeError("repo not found")
+
+    hub_mod.list_repo_files = _no_repo_files
+
+    lm_eval_mod = types.ModuleType("lm_eval")
+    lm_eval_mod.simple_evaluate = _simple_evaluate
+    models_mod = types.ModuleType("lm_eval.models")
+    hf_mod = types.ModuleType("lm_eval.models.huggingface")
+    hf_mod.HFLM = _FakeHFLM
+    tasks_mod = types.ModuleType("lm_eval.tasks")
+    tasks_mod.TaskManager = _FakeTaskManager
+
+    for name, mod in {  # monkeypatch undoes these sys.modules entries after the test
+        "unsloth": unsloth_mod,
+        "torch": torch_mod,
+        "huggingface_hub": hub_mod,
+        "lm_eval": lm_eval_mod,
+        "lm_eval.models": models_mod,
+        "lm_eval.models.huggingface": hf_mod,
+        "lm_eval.tasks": tasks_mod,
+    }.items():
+        monkeypatch.setitem(sys.modules, name, mod)
+
+    # deterministic regardless of whether bitsandbytes is installed locally
+    monkeypatch.setattr(evalmod, "_bitsandbytes_available", lambda: True)
+
+    return calls
+
+
+def test_eval_success_writes_results(fake_eval_env, tmp_path):
+    """A default run exits 0, records the expected settings and saves results.json."""
+    results_dir = tmp_path / "out"
+    argv = ["fake/model", "--tasks", "gsm8k", "--output-dir", str(results_dir)]
+    outcome = CliRunner().invoke(_eval_app(), argv)
+
+    assert outcome.exit_code == 0, outcome.output
+    assert "Saved results to" in outcome.output
+    assert fake_eval_env["tasks"] == ["gsm8k"]
+    assert fake_eval_env["hflm_max_length"] == 2048
+    evaluate_kwargs = fake_eval_env["simple_evaluate_kwargs"]
+    assert evaluate_kwargs["task_manager"] is not None
+    assert evaluate_kwargs["log_samples"] is False
+    assert fake_eval_env["include_path"] is None
+
+    results_text = (results_dir / "results.json").read_text()
+    assert json.loads(results_text)["results"]["gsm8k"]["exact_match,strict-match"] == 0.42
+
+
+def test_eval_jsonl_task_builds_task_manager(fake_eval_env, tmp_path):
+    """A ``.jsonl`` path given to ``--tasks`` runs as task ``qa`` with an include path."""
+    dataset = tmp_path / "qa.jsonl"
+    dataset.write_text('{"question": "q", "answer": "a"}\n')
+
+    argv = ["fake/model", "--tasks", str(dataset), "--output-dir", str(tmp_path / "out")]
+    outcome = CliRunner().invoke(_eval_app(), argv)
+
+    assert outcome.exit_code == 0, outcome.output
+    # the expected task name is the dataset's file stem
+    assert fake_eval_env["tasks"] == ["qa"]
+    assert fake_eval_env["include_path"] is not None
+
+
+def test_eval_mlx_falls_back_to_hf(fake_eval_env, tmp_path):
+    """With ``DEVICE_TYPE`` set to "mlx" the run reports a fallback and uses model "hf"."""
+    sys.modules["unsloth"].DEVICE_TYPE = "mlx"
+    argv = ["fake/model", "--tasks", "gsm8k", "--output-dir", str(tmp_path / "out")]
+    outcome = CliRunner().invoke(_eval_app(), argv)
+
+    assert outcome.exit_code == 0, outcome.output
+    assert "falling back" in outcome.output
+    assert fake_eval_env["model"] == "hf"
+    assert fake_eval_env["model_args"] == {"pretrained": "fake/model", "max_length": 2048}
+    assert "model_name" not in fake_eval_env
+
+
+def test_eval_hf_backend_skips_unsloth(fake_eval_env, tmp_path):
+    """``--backend hf`` hands model "hf" and the requested device to ``simple_evaluate``."""
+    argv = [
+        "fake/model",
+        "--tasks",
+        "gsm8k",
+        "--backend",
+        "hf",
+        "--device",
+        "cpu",
+        "--output-dir",
+        str(tmp_path / "out"),
+    ]
+    outcome = CliRunner().invoke(_eval_app(), argv)
+
+    assert outcome.exit_code == 0, outcome.output
+    assert fake_eval_env["model"] == "hf"
+    assert fake_eval_env["model_args"] == {"pretrained": "fake/model", "max_length": 2048}
+    assert fake_eval_env["simple_evaluate_kwargs"]["device"] == "cpu"
+    assert "model_name" not in fake_eval_env
+
+
+def test_eval_rejects_nonpositive_batch_size(fake_eval_env, tmp_path):
+    """Zero, negative and non-numeric --batch-size values are usage errors (exit 2)."""
+    for bad in ("0", "-1", "abc"):
+        argv = [
+            "fake/model",
+            "--tasks",
+            "gsm8k",
+            "--backend",
+            "hf",
+            "--device",
+            "cpu",
+            "--batch-size",
+            bad,
+            "--output-dir",
+            str(tmp_path / "out"),
+        ]
+        result = CliRunner().invoke(_eval_app(), argv)
+        # name the offending value in both failures so a broken case is identifiable
+        assert result.exit_code == 2, (bad, result.output)
+        assert "positive integer or 'auto'" in result.output, (bad, result.output)
+
+
+def test_eval_hf_forwards_max_seq_length(fake_eval_env, tmp_path):
+    """``--max-seq-length`` becomes ``max_length`` in the hf model arguments."""
+    argv = [
+        "fake/model",
+        "--tasks",
+        "gsm8k",
+        "--backend",
+        "hf",
+        "--device",
+        "cpu",
+        "--max-seq-length",
+        "1024",
+        "--output-dir",
+        str(tmp_path / "out"),
+    ]
+    outcome = CliRunner().invoke(_eval_app(), argv)
+
+    assert outcome.exit_code == 0, outcome.output
+    assert fake_eval_env["model_args"] == {"pretrained": "fake/model", "max_length": 1024}
+
+
+def test_eval_unsloth_forwards_max_seq_length_to_hflm(fake_eval_env, tmp_path):
+    """On the default backend ``--max-seq-length`` is recorded as ``hflm_max_length``."""
+    argv = [
+        "fake/model",
+        "--tasks",
+        "gsm8k",
+        "--max-seq-length",
+        "512",
+        "--output-dir",
+        str(tmp_path / "out"),
+    ]
+    outcome = CliRunner().invoke(_eval_app(), argv)
+
+    assert outcome.exit_code == 0, outcome.output
+    assert fake_eval_env["hflm_max_length"] == 512
+
+
+def test_eval_hf_local_adapter_uses_adapter_tokenizer(fake_eval_env, tmp_path):
+    """A local adapter directory is evaluated on its base model with its own tokenizer.
+
+    The adapter fixture comes from ``_make_local_adapter`` so this test and the
+    Unsloth-backend adapter tests exercise the same on-disk layout.
+    """
+    adapter = _make_local_adapter(tmp_path)
+
+    argv = [
+        str(adapter),
+        "--tasks",
+        "gsm8k",
+        "--backend",
+        "hf",
+        "--device",
+        "cpu",
+        "--output-dir",
+        str(tmp_path / "out"),
+    ]
+    result = CliRunner().invoke(_eval_app(), argv)
+
+    assert result.exit_code == 0, result.output
+    # base model from adapter_config.json; adapter path doubles as peft and tokenizer source
+    assert fake_eval_env["model_args"] == {
+        "pretrained": "unsloth/Llama-3.2-1B",
+        "peft": str(adapter),
+        "tokenizer": str(adapter),
+        "max_length": 2048,
+    }
+
+
+def _make_local_adapter(tmp_path):
+    """Create ``tmp_path/adapter`` with a minimal adapter config and tokenizer config."""
+    adapter_dir = tmp_path / "adapter"
+    adapter_dir.mkdir()
+    adapter_config = {"base_model_name_or_path": "unsloth/Llama-3.2-1B"}
+    adapter_dir.joinpath("adapter_config.json").write_text(json.dumps(adapter_config))
+    adapter_dir.joinpath("tokenizer_config.json").write_text("{}")
+    return adapter_dir
+
+
+def _install_adapter_stubs(monkeypatch, fake_eval_env, tokenizer_len):
+    """Register fake ``peft`` and ``transformers`` modules that log into ``fake_eval_env``."""
+
+    class _FakePeftModel:
+        @staticmethod
+        def from_pretrained(model, adapter_path):
+            # remember the adapter and its position in the event sequence
+            fake_eval_env["peft_adapter"] = adapter_path
+            fake_eval_env.setdefault("events", []).append(("peft", adapter_path))
+            return model
+
+    class _FakeTokenizer:
+        name = "adapter-tok"
+
+        def __len__(self):
+            return tokenizer_len
+
+    class _FakeAutoTokenizer:
+        @staticmethod
+        def from_pretrained(path, **kwargs):
+            fake_eval_env["tokenizer_from"] = path
+            return _FakeTokenizer()
+
+    peft_stub = types.ModuleType("peft")
+    peft_stub.PeftModel = _FakePeftModel
+    transformers_stub = types.ModuleType("transformers")
+    transformers_stub.AutoTokenizer = _FakeAutoTokenizer
+    monkeypatch.setitem(sys.modules, "peft", peft_stub)
+    monkeypatch.setitem(sys.modules, "transformers", transformers_stub)
+
+
+def test_eval_unsloth_adapter_prefers_adapter_tokenizer(fake_eval_env, tmp_path, monkeypatch):
+    """The adapter's tokenizer is loaded and used; matching vocab sizes need no resize."""
+    adapter_dir = _make_local_adapter(tmp_path)
+    # tokenizer length equals the fake base vocab, so only the peft event should appear
+    _install_adapter_stubs(monkeypatch, fake_eval_env, tokenizer_len = 32000)
+
+    argv = [str(adapter_dir), "--tasks", "gsm8k", "--output-dir", str(tmp_path / "out")]
+    outcome = CliRunner().invoke(_eval_app(), argv)
+    assert outcome.exit_code == 0, outcome.output
+    adapter_path = str(adapter_dir)
+    assert fake_eval_env["model_name"] == "unsloth/Llama-3.2-1B"
+    assert fake_eval_env["peft_adapter"] == adapter_path
+    assert fake_eval_env["tokenizer_from"] == adapter_path
+    assert fake_eval_env["hflm_tokenizer"].name == "adapter-tok"
+    assert fake_eval_env["events"] == [("peft", adapter_path)]
+
+
+def test_eval_unsloth_adapter_resizes_embeddings_before_peft(fake_eval_env, tmp_path, monkeypatch):
+    """A larger adapter tokenizer triggers an embedding resize ahead of the peft load."""
+    adapter_dir = _make_local_adapter(tmp_path)
+    # 32005 tokens: five more than the fake base vocab of 32000
+    _install_adapter_stubs(monkeypatch, fake_eval_env, tokenizer_len = 32005)
+
+    argv = [str(adapter_dir), "--tasks", "gsm8k", "--output-dir", str(tmp_path / "out")]
+    outcome = CliRunner().invoke(_eval_app(), argv)
+    assert outcome.exit_code == 0, outcome.output
+    # order matters: resize first, adapter weights second
+    expected_events = [("resize", 32005), ("peft", str(adapter_dir))]
+    assert fake_eval_env["events"] == expected_events
+
+
+def test_eval_hf_honors_base_model_for_remote_adapter(fake_eval_env, tmp_path):
+    """``--base-model`` becomes ``pretrained`` and the positional model becomes ``peft``."""
+    argv = [
+        "someuser/my-lora",
+        "--tasks",
+        "gsm8k",
+        "--backend",
+        "hf",
+        "--device",
+        "cpu",
+        "--base-model",
+        "meta-llama/Llama-2-7b",
+        "--output-dir",
+        str(tmp_path / "out"),
+    ]
+    outcome = CliRunner().invoke(_eval_app(), argv)
+
+    assert outcome.exit_code == 0, outcome.output
+    assert fake_eval_env["model_args"] == {
+        "pretrained": "meta-llama/Llama-2-7b",
+        "peft": "someuser/my-lora",
+        "max_length": 2048,
+    }
+
+
+def test_eval_cuda_index_keeps_auto_batch_size(fake_eval_env, tmp_path):
+    """An explicit ``cuda:0`` keeps ``batch_size`` at "auto" and 4-bit loading enabled."""
+    fake_cuda = SimpleNamespace(is_available = lambda: True, device_count = lambda: 1)
+    sys.modules["torch"].cuda = fake_cuda
+    argv = [
+        "fake/model",
+        "--tasks",
+        "gsm8k",
+        "--backend",
+        "hf",
+        "--device",
+        "cuda:0",
+        "--output-dir",
+        str(tmp_path / "out"),
+    ]
+    outcome = CliRunner().invoke(_eval_app(), argv)
+    assert outcome.exit_code == 0, outcome.output
+    # 'auto' must not be downgraded to 1 just because a CUDA index was given
+    assert fake_eval_env["simple_evaluate_kwargs"]["batch_size"] == "auto"
+    assert fake_eval_env["model_args"]["load_in_4bit"] is True
+
+
+def test_eval_unknown_task_errors(fake_eval_env, tmp_path):
+    """A task name that cannot be resolved is a usage error (exit code 2)."""
+    argv = ["fake/model", "--tasks", "notarealtask", "--output-dir", str(tmp_path / "out")]
+    outcome = CliRunner().invoke(_eval_app(), argv)
+
+    assert outcome.exit_code == 2, outcome.output
+    assert "unknown task" in outcome.output
+
+
+def test_eval_dataset_shadowing_builtin_is_renamed(fake_eval_env, tmp_path):
+    """A dataset named like a built-in task runs under a suffixed name instead."""
+    dataset = tmp_path / "gsm8k.jsonl"
+    dataset.write_text('{"question": "q", "answer": "a"}\n')
+
+    argv = ["fake/model", "--tasks", str(dataset), "--output-dir", str(tmp_path / "out")]
+    outcome = CliRunner().invoke(_eval_app(), argv)
+
+    assert outcome.exit_code == 0, outcome.output
+    # the user's dataset must not be shadowed by the built-in gsm8k benchmark
+    assert fake_eval_env["tasks"] == ["gsm8k_2"]
+    # and the output has to mention the name it was given
+    assert "as 'gsm8k_2'" in outcome.output
+
+
+def test_eval_custom_yaml_shadowing_builtin_errors(fake_eval_env, tmp_path):
+    """A custom yaml that reuses the name ``gsm8k`` is rejected with exit code 2."""
+    clash_spec = {"task": "gsm8k", "output_type": "generate_until"}
+    task_yaml = tmp_path / "clash.yaml"
+    task_yaml.write_text(yaml.safe_dump(clash_spec))
+
+    argv = ["fake/model", "--tasks", str(task_yaml), "--output-dir", str(tmp_path / "out")]
+    outcome = CliRunner().invoke(_eval_app(), argv)
+    assert outcome.exit_code == 2, outcome.output
+    assert "redefines 'gsm8k'" in outcome.output
+
+
+def test_eval_fewshot_with_raw_key_dataset_errors(fake_eval_env, tmp_path):
+    """Few-shot on a dataset whose target key contains a space is a usage error."""
+    dataset = tmp_path / "qa.jsonl"
+    dataset.write_text('{"expected answer": "2", "question": "1+1?"}\n')
+
+    argv = [
+        "fake/model",
+        "--tasks",
+        str(dataset),
+        "--target-key",
+        "expected answer",
+        "--num-fewshot",
+        "2",
+        "--output-dir",
+        str(tmp_path / "out"),
+    ]
+    outcome = CliRunner().invoke(_eval_app(), argv)
+
+    assert outcome.exit_code == 2, outcome.output
+    assert "plain-identifier column names" in outcome.output
+
+
+def test_eval_custom_yaml_survives_broken_sibling(fake_eval_env, tmp_path):
+    """An unparseable yaml next to the requested task file does not break the run."""
+    good_yaml = tmp_path / "good.yaml"
+    good_yaml.write_text(yaml.safe_dump({"task": "good_task", "output_type": "generate_until"}))
+    # the fake TaskManager (like lm-eval 0.4.4) fails on unparseable yaml inside
+    # an include dir, so the broken sibling must be kept away from it
+    tmp_path.joinpath("broken.yaml").write_text("task: [unclosed")
+
+    argv = ["fake/model", "--tasks", str(good_yaml), "--output-dir", str(tmp_path / "out")]
+    outcome = CliRunner().invoke(_eval_app(), argv)
+
+    assert outcome.exit_code == 0, outcome.output
+    assert fake_eval_env["tasks"] == ["good_task"]
+
+
+def test_eval_group_yaml_runs_under_group_name(fake_eval_env, tmp_path):
+    """A group yaml is run under its ``group`` name rather than its member tasks."""
+    suite_spec = {"group": "my_suite", "task": ["task_a", "task_b"]}
+    suite_yaml = tmp_path / "suite.yaml"
+    suite_yaml.write_text(yaml.safe_dump(suite_spec))
+
+    argv = ["fake/model", "--tasks", str(suite_yaml), "--output-dir", str(tmp_path / "out")]
+    outcome = CliRunner().invoke(_eval_app(), argv)
+    assert outcome.exit_code == 0, outcome.output
+    assert fake_eval_env["tasks"] == ["my_suite"]
+
+
+def test_eval_unsloth_rejects_multi_process_launch(fake_eval_env, tmp_path, monkeypatch):
+    """``WORLD_SIZE=2`` on the default backend is refused with exit code 2."""
+    monkeypatch.setenv("WORLD_SIZE", "2")
+    argv = ["fake/model", "--tasks", "gsm8k", "--output-dir", str(tmp_path / "out")]
+    outcome = CliRunner().invoke(_eval_app(), argv)
+
+    assert outcome.exit_code == 2, outcome.output
+    assert "multi-process launches" in outcome.output
+
+
+def test_eval_hf_allows_multi_process_launch(fake_eval_env, tmp_path, monkeypatch):
+    """``WORLD_SIZE=2`` is accepted when ``--backend hf`` is selected."""
+    monkeypatch.setenv("WORLD_SIZE", "2")
+    argv = [
+        "fake/model",
+        "--tasks",
+        "gsm8k",
+        "--backend",
+        "hf",
+        "--device",
+        "cpu",
+        "--output-dir",
+        str(tmp_path / "out"),
+    ]
+    outcome = CliRunner().invoke(_eval_app(), argv)
+
+    assert outcome.exit_code == 0, outcome.output
+
+
+def test_eval_rejects_nonpositive_limit_and_max_seq_length(fake_eval_env, tmp_path):
+    """Zero and negative ``--limit`` / ``--max-seq-length`` values exit with code 2."""
+    cases = (
+        ("--limit", "0", "--limit must be a positive integer"),
+        ("--limit", "-5", "--limit must be a positive integer"),
+        ("--max-seq-length", "0", "--max-seq-length must be a positive integer"),
+        ("--max-seq-length", "-1", "--max-seq-length must be a positive integer"),
+    )
+    for flag, bad, message in cases:
+        argv = [
+            "fake/model",
+            "--tasks",
+            "gsm8k",
+            flag,
+            bad,
+            "--output-dir",
+            str(tmp_path / "out"),
+        ]
+        outcome = CliRunner().invoke(_eval_app(), argv)
+        assert outcome.exit_code == 2, (flag, bad, outcome.output)
+        assert message in outcome.output, (flag, bad, outcome.output)
+
+
+def test_eval_rejects_negative_num_fewshot(fake_eval_env, tmp_path):
+    """A negative ``--num-fewshot`` is a usage error (exit code 2)."""
+    argv = [
+        "fake/model",
+        "--tasks",
+        "gsm8k",
+        "--num-fewshot",
+        "-1",
+        "--output-dir",
+        str(tmp_path / "out"),
+    ]
+    outcome = CliRunner().invoke(_eval_app(), argv)
+
+    assert outcome.exit_code == 2, outcome.output
+    assert "--num-fewshot must be >= 0" in outcome.output
+
+
+def test_eval_rejects_unknown_backend(fake_eval_env, tmp_path):
+    """A ``--backend`` value other than 'unsloth' or 'hf' is a usage error."""
+    argv = [
+        "fake/model",
+        "--tasks",
+        "gsm8k",
+        "--backend",
+        "hff",
+        "--output-dir",
+        str(tmp_path / "out"),
+    ]
+    outcome = CliRunner().invoke(_eval_app(), argv)
+
+    assert outcome.exit_code == 2, outcome.output
+    assert "--backend must be 'unsloth' or 'hf'" in outcome.output
+
+
+def test_eval_hf_cuda_without_bnb_loads_full_precision(fake_eval_env, tmp_path, monkeypatch):
+    """Without bitsandbytes a CUDA run omits ``load_in_4bit`` and says why in the output."""
+    fake_cuda = SimpleNamespace(is_available = lambda: True, device_count = lambda: 1)
+    sys.modules["torch"].cuda = fake_cuda
+    monkeypatch.setattr(evalmod, "_bitsandbytes_available", lambda: False)
+    argv = [
+        "fake/model",
+        "--tasks",
+        "gsm8k",
+        "--backend",
+        "hf",
+        "--device",
+        "cuda:0",
+        "--output-dir",
+        str(tmp_path / "out"),
+    ]
+    outcome = CliRunner().invoke(_eval_app(), argv)
+    assert outcome.exit_code == 0, outcome.output
+    assert "load_in_4bit" not in fake_eval_env["model_args"]
+    assert "bitsandbytes is not installed" in outcome.output
+
+
+def test_eval_hf_hub_adapter_uses_hub_tokenizer(fake_eval_env, tmp_path, monkeypatch):
+    """A Hub adapter that lists tokenizer files is also used as the tokenizer source."""
+    config_file = tmp_path / "adapter_config.json"
+    config_file.write_text(json.dumps({"base_model_name_or_path": "unsloth/Llama-3.2-1B"}))
+
+    # swap in a Hub stub that serves the config above and lists a tokenizer file
+    fake_hub = types.ModuleType("huggingface_hub")
+    fake_hub.hf_hub_download = lambda repo_id, filename, **kwargs: str(config_file)
+    fake_hub.list_repo_files = lambda repo_id: ["adapter_config.json", "tokenizer.json"]
+    monkeypatch.setitem(sys.modules, "huggingface_hub", fake_hub)
+
+    argv = [
+        "someuser/my-lora",
+        "--tasks",
+        "gsm8k",
+        "--backend",
+        "hf",
+        "--device",
+        "cpu",
+        "--output-dir",
+        str(tmp_path / "out"),
+    ]
+    outcome = CliRunner().invoke(_eval_app(), argv)
+    assert outcome.exit_code == 0, outcome.output
+    assert fake_eval_env["model_args"] == {
+        "pretrained": "unsloth/Llama-3.2-1B",
+        "peft": "someuser/my-lora",
+        "tokenizer": "someuser/my-lora",
+        "max_length": 2048,
+    }
+
+
+def test_eval_hf_rejects_cuda_when_unavailable(fake_eval_env, tmp_path):
+    """``--device cuda`` is a usage error while the fake torch reports no CUDA."""
+    argv = [
+        "fake/model",
+        "--tasks",
+        "gsm8k",
+        "--backend",
+        "hf",
+        "--device",
+        "cuda",
+        "--output-dir",
+        str(tmp_path / "out"),
+    ]
+    outcome = CliRunner().invoke(_eval_app(), argv)
+
+    assert outcome.exit_code == 2, outcome.output
+    assert "CUDA is not available" in outcome.output
+
+
+def test_eval_hf_rejects_out_of_range_cuda_index(fake_eval_env, tmp_path):
+    """``cuda:1`` is a usage error when only one CUDA device is reported."""
+    fake_cuda = SimpleNamespace(is_available = lambda: True, device_count = lambda: 1)
+    sys.modules["torch"].cuda = fake_cuda
+    argv = [
+        "fake/model",
+        "--tasks",
+        "gsm8k",
+        "--backend",
+        "hf",
+        "--device",
+        "cuda:1",
+        "--output-dir",
+        str(tmp_path / "out"),
+    ]
+    outcome = CliRunner().invoke(_eval_app(), argv)
+    assert outcome.exit_code == 2, outcome.output
+    assert "only 1 CUDA device(s)" in outcome.output
+
+
+def test_eval_worker_rank_exits_cleanly_on_none_results(fake_eval_env, tmp_path, monkeypatch):
+    """With ``RANK=1`` a ``None`` result exits 0 without saving or creating the output dir."""
+    monkeypatch.setenv("RANK", "1")
+    sys.modules["lm_eval"].simple_evaluate = lambda **kwargs: None
+
+    argv = [
+        "fake/model",
+        "--tasks",
+        "gsm8k",
+        "--backend",
+        "hf",
+        "--device",
+        "cpu",
+        "--output-dir",
+        str(tmp_path / "out"),
+    ]
+    outcome = CliRunner().invoke(_eval_app(), argv)
+
+    assert outcome.exit_code == 0, outcome.output
+    assert "Saved results" not in outcome.output
+    assert not (tmp_path / "out").exists()
+
+
+def test_eval_none_results_errors_on_single_process(fake_eval_env, tmp_path, monkeypatch):
+    """With no rank variables set, a ``None`` result is an error (exit code 1)."""
+    for env_name in ("RANK", "LOCAL_RANK"):
+        monkeypatch.delenv(env_name, raising = False)
+    sys.modules["lm_eval"].simple_evaluate = lambda **kwargs: None
+
+    argv = [
+        "fake/model",
+        "--tasks",
+        "gsm8k",
+        "--backend",
+        "hf",
+        "--device",
+        "cpu",
+        "--output-dir",
+        str(tmp_path / "out"),
+    ]
+    outcome = CliRunner().invoke(_eval_app(), argv)
+
+    assert outcome.exit_code == 1, outcome.output
+    assert "no results" in outcome.output
+
+
+def test_eval_hf_token_sets_env(fake_eval_env, tmp_path, monkeypatch):
+    """``--hf-token`` ends up in the ``HF_TOKEN`` environment variable."""
+    # presumably set via monkeypatch first so the CLI's change is rolled back at teardown
+    monkeypatch.setenv("HF_TOKEN", "placeholder")
+    argv = [
+        "fake/model",
+        "--tasks",
+        "gsm8k",
+        "--backend",
+        "hf",
+        "--device",
+        "cpu",
+        "--hf-token",
+        "hf_secret",
+        "--output-dir",
+        str(tmp_path / "out"),
+    ]
+    outcome = CliRunner().invoke(_eval_app(), argv)
+    assert outcome.exit_code == 0, outcome.output
+    assert os.environ.get("HF_TOKEN") == "hf_secret"
+
+
+def test_resolve_tasks_dataset_before_group_still_avoids_child_names(tmp_path):
+    """Argument order must not decide the generated task's name."""
+    suite_yaml = tmp_path / "suite.yaml"
+    dataset = tmp_path / "qa.jsonl"
+    suite_spec = {"group": "suite", "task": ["qa", {"task": "qa_inline"}]}
+    suite_yaml.write_text(yaml.safe_dump(suite_spec))
+    dataset.write_text('{"question": "q", "answer": "a"}\n')
+    gen_dir = tmp_path / "gen"
+
+    # the dataset is listed first, the group that already uses "qa" second
+    names, _ = evalmod.resolve_tasks(f"{dataset},{suite_yaml}", "question", "answer", gen_dir)
+
+    assert names == ["qa_2", "suite"]
+    assert (gen_dir / "generated" / "qa_2.yaml").exists()
+
+
+def test_resolve_tasks_rejects_builtin_child_shadowed_by_sibling(tmp_path):
+    """A reserved group child that a sibling yaml also defines is rejected."""
+    suite_yaml = tmp_path / "suite.yaml"
+    suite_yaml.write_text(yaml.safe_dump({"group": "suite", "task": ["gsm8k"]}))
+    sibling_yaml = tmp_path / "gsm8k.yaml"
+    sibling_yaml.write_text(yaml.safe_dump({"task": "gsm8k", "dataset_path": "json"}))
+    reserved_names = frozenset({"gsm8k"})
+    with pytest.raises(ValueError, match = "depends on the lm-eval version"):
+        evalmod.resolve_tasks(
+            str(suite_yaml), "question", "answer", tmp_path / "gen", reserved = reserved_names
+        )
+
+
+def test_resolve_tasks_allows_group_of_builtins_without_siblings(tmp_path):
+    """A suite that only aggregates registered tasks is legitimate lm-eval usage."""
+    suite_spec = {"group": "suite", "task": ["gsm8k", "mmlu"]}
+    suite_yaml = tmp_path / "suite.yaml"
+    suite_yaml.write_text(yaml.safe_dump(suite_spec))
+
+    names, _ = evalmod.resolve_tasks(
+        str(suite_yaml),
+        "question",
+        "answer",
+        tmp_path / "gen",
+        reserved = frozenset({"gsm8k", "mmlu"}),
+    )
+
+    assert names == ["suite"]
+
+
+def test_resolve_tasks_rejects_include_order_dependent_name(tmp_path):
+    """A task name that changes with the include merge order is rejected."""
+    # lm-eval versions disagree on include precedence
+    for stem in ("a", "b"):
+        (tmp_path / f"{stem}.yaml").write_text(yaml.safe_dump({"task": f"name_{stem}"}))
+    child_yaml = tmp_path / "child.yaml"
+    child_spec = {"include": ["a.yaml", "b.yaml"], "dataset_path": "json"}
+    child_yaml.write_text(yaml.safe_dump(child_spec))
+    with pytest.raises(ValueError, match = "include order"):
+        evalmod.resolve_tasks(str(child_yaml), "question", "answer", tmp_path / "gen")
+
+
+def test_resolve_tasks_accepts_local_name_over_include_conflict(tmp_path):
+    """A top-level ``task:`` key settles the name on every lm-eval version."""
+    for stem in ("a", "b"):
+        (tmp_path / f"{stem}.yaml").write_text(yaml.safe_dump({"task": f"name_{stem}"}))
+    child_spec = {"include": ["a.yaml", "b.yaml"], "task": "mine", "dataset_path": "json"}
+    child_yaml = tmp_path / "child.yaml"
+    child_yaml.write_text(yaml.safe_dump(child_spec))
+
+    resolved_names, _ = evalmod.resolve_tasks(
+        str(child_yaml), "question", "answer", tmp_path / "gen"
+    )
+    assert resolved_names == ["mine"]
+
+
+def test_load_task_spec_resolves_includes_against_parent_dir(tmp_path, monkeypatch):
+    """Relative includes resolve against the including file's directory, never the cwd."""
+    task_dir = tmp_path / "tasks"
+    decoy_dir = tmp_path / "decoy"
+    # same file name in both directories; only the task value tells them apart
+    for directory, task_name in ((task_dir, "right"), (decoy_dir, "wrong")):
+        directory.mkdir()
+        (directory / "base.yaml").write_text(yaml.safe_dump({"task": task_name}))
+    child_yaml = task_dir / "child.yaml"
+    child_yaml.write_text(yaml.safe_dump({"include": "base.yaml", "dataset_path": "json"}))
+    monkeypatch.chdir(decoy_dir)
+
+    loaded = evalmod._load_task_spec(child_yaml)
+
+    assert loaded["task"] == "right"
+
+
+def test_json_default_preserves_numeric_scalars():
+    # objects exposing tolist() keep their numeric value; a set comes back as a string
+    class _FakeNumpyScalar:
+        def tolist(self):
+            return 3
+
+    payload = {"n": _FakeNumpyScalar(), "s": {1, 2}}
+    decoded = json.loads(json.dumps(payload, default = evalmod._json_default))
+    assert decoded["n"] == 3
+    assert isinstance(decoded["s"], str)
+
+
+def test_resolve_tasks_rejects_builtin_child_shadowed_in_subdirectory(tmp_path):
+    """A shadowing definition nested in a subdirectory is rejected as well."""
+    # lm-eval indexes include paths recursively, so nesting does not hide the sibling
+    suite_yaml = tmp_path / "suite.yaml"
+    suite_yaml.write_text(yaml.safe_dump({"group": "suite", "task": ["gsm8k"]}))
+    sub_dir = tmp_path / "sub"
+    sub_dir.mkdir()
+    shadow_spec = {"task": "gsm8k", "dataset_path": "json"}
+    (sub_dir / "gsm8k.yaml").write_text(yaml.safe_dump(shadow_spec))
+    reserved_names = frozenset({"gsm8k"})
+    with pytest.raises(ValueError, match = "depends on the lm-eval version"):
+        evalmod.resolve_tasks(
+            str(suite_yaml), "question", "answer", tmp_path / "gen", reserved = reserved_names
+        )
+
+
+def test_hf_device_error_gates_xpu_hpu_on_lm_eval_version(monkeypatch):
+    """HFLM only enumerated xpu/hpu from 0.4.10; older versions silently fall back."""
+    _fake_torch(monkeypatch, xpu_available = True, xpu_count = 1)
+    monkeypatch.setattr(evalmod, "_lm_eval_version", lambda: (0, 4, 4))
+    for device in ("xpu:0", "hpu:0"):
+        assert "needs lm-eval >= 0.4.10" in evalmod._hf_device_error(device)
+    # npu is not version-gated here: it has been enumerated since 0.4.4
+    assert "NPU is not available" in evalmod._hf_device_error("npu:0")
+    monkeypatch.setattr(evalmod, "_lm_eval_version", lambda: (0, 4, 10))
+    assert evalmod._hf_device_error("xpu:0") is None
+
+
+def test_resolve_tasks_reserves_tag_aliases_for_datasets(tmp_path):
+    """A ``tag:`` alias registers under that name, so a dataset must not take it."""
+    custom_yaml = tmp_path / "custom.yaml"
+    dataset = tmp_path / "qa.jsonl"
+    custom_spec = {"task": "foo", "tag": "qa", "dataset_path": "json"}
+    custom_yaml.write_text(yaml.safe_dump(custom_spec))
+    dataset.write_text('{"question": "q", "answer": "a"}\n')
+    gen_dir = tmp_path / "gen"
+
+    task_arg = f"{dataset},{custom_yaml}"
+    names, _ = evalmod.resolve_tasks(task_arg, "question", "answer", gen_dir)
+
+    assert names == ["qa_2", "foo"]
+
+
+def test_resolve_tasks_reserves_string_group_alias_for_datasets(tmp_path):
+    # legacy string group: on a single task acts as a tag alias
+    (tmp_path / "custom.yaml").write_text(
+        yaml.safe_dump({"task": "foo", "group": "myalias", "dataset_path": "json"})
+    )
+    (tmp_path / "myalias.jsonl").write_text('{"question": "q", "answer": "a"}\n')
+    tmp_dir = tmp_path / "gen"
+
+    names, _ = evalmod.resolve_tasks(
+        f"{tmp_path / 'custom.yaml'},{tmp_path / 'myalias.jsonl'}", "question", "answer", tmp_dir
+    )
+
+    assert names == ["foo", "myalias_2"]