From 68f55d03302efdc68a802d5f1da578c615dbdd0c Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Wed, 24 Jun 2026 11:33:41 +0000
Subject: [PATCH 01/80] Auto Xet to HTTP download fallback in from_pretrained;
 share Studio's via unsloth_zoo

Hugging Face Xet downloads can hang on a blob with no progress and no exception,
and a blocked native Xet thread cannot be killed in-process. Studio already
recovers from this; the shared logic now lives in unsloth_zoo.hf_xet_fallback.

unsloth main: FastModel/FastVisionModel/FastLanguageModel.from_pretrained now warm
the repo in a killable subprocess (maybe_prefetch_hf_snapshot ->
snapshot_download_with_xet_fallback) before the in-process load, so a stalled Xet
transfer auto-recovers over HTTP and the subsequent load is a cache hit that cannot
hang. Best-effort and guarded: local paths, offline/local_files_only, and the vLLM
fast_inference path are skipped, and the prefetch is a no-op when the installed
unsloth_zoo predates the helper.

Studio: utils/hf_xet_fallback.py is now a thin shim over unsloth_zoo.hf_xet_fallback
that injects Studio's marker-aware prepare_cache_for_transport, replacing its local
copy of the watchdog + spawn-child download + Xet->HTTP retry. Call sites and the
orchestrator's DownloadStallError import are unchanged. The hf_xet_fallback unit
test is slimmed to the shim-specific behavior (the full matrix is tested in
unsloth_zoo); the GGUF and training integration tests are unchanged.
---
 studio/backend/tests/test_hf_xet_fallback.py | 330 ++------------
 studio/backend/utils/hf_xet_fallback.py      | 454 +++----------------
 unsloth/models/_utils.py                     |  68 +++
 unsloth/models/llama.py                      |  12 +
 unsloth/models/vision.py                     |  12 +
 5 files changed, 193 insertions(+), 683 deletions(-)

diff --git a/studio/backend/tests/test_hf_xet_fallback.py b/studio/backend/tests/test_hf_xet_fallback.py
index 39ecebd328..56465bb98e 100644
--- a/studio/backend/tests/test_hf_xet_fallback.py
+++ b/studio/backend/tests/test_hf_xet_fallback.py
@@ -1,18 +1,19 @@
 # SPDX-License-Identifier: AGPL-3.0-only
 # Copyright 2026-present the Unsloth AI Inc. team. All rights reserved. See /studio/LICENSE.AGPL-3.0
 
-"""Unit tests for utils.hf_xet_fallback: the no-progress watchdog, the Xet->HTTP
-transport policy, and the HF_HUB_DISABLE_XET precondition the fallback rests on.
-CPU-only, no network, no real subprocess (the per-attempt download seam is
-monkeypatched).
+"""Studio shim over the shared unsloth_zoo Xet -> HTTP stall fallback.
+
+The watchdog and transport-policy matrix (cached short-circuit, cancel, error
+propagation, the single Xet -> HTTP retry, the snapshot variant, the knobs) is
+tested once in unsloth_zoo (tests/test_hf_xet_fallback.py). Here we assert only
+the Studio-specific seam: the shim re-exports the shared API and injects Studio's
+marker-aware prepare_cache_for_transport on the HTTP retry. CPU-only, no network,
+no real subprocess (the per-attempt download seam is monkeypatched).
 """
 
 from __future__ import annotations
 
-import subprocess
 import sys
-import threading
-import time
 import types as _types
 from pathlib import Path
 
@@ -34,171 +35,48 @@
     sys.modules["structlog"] = _types.ModuleType("structlog")
 
 import huggingface_hub
-from huggingface_hub import constants as hf_constants
 
+import unsloth_zoo.hf_xet_fallback as shared
 import utils.hf_xet_fallback as xf
 
 
-# --------------------------------------------------------------------------- #
-# Watchdog: fires only on a constant-size .incomplete, sparse-aware byte total.
-# --------------------------------------------------------------------------- #
-REPO = "ztest/xet-watchdog"
-
-
-@pytest.fixture
-def hf_cache(tmp_path, monkeypatch):
-    monkeypatch.setattr(hf_constants, "HF_HUB_CACHE", str(tmp_path))
-    return tmp_path
-
-
-def _blobs_dir(root: Path, repo_id: str = REPO) -> Path:
-    d = root / f"models--{repo_id.replace('/', '--')}" / "blobs"
-    d.mkdir(parents = True, exist_ok = True)
-    return d
-
-
-def _wait(
-    predicate,
-    timeout: float = 2.0,
-    step: float = 0.02,
-) -> bool:
-    deadline = time.monotonic() + timeout
-    while time.monotonic() < deadline:
-        if predicate():
-            return True
-        time.sleep(step)
-    return predicate()
-
-
-def test_constant_incomplete_fires_stall(hf_cache):
-    blobs = _blobs_dir(hf_cache)
-    (blobs / "deadbeef.incomplete").write_bytes(b"\0" * 1024)  # never grows
-
-    calls: list[str] = []
-    stop = xf.start_watchdog(
-        repo_ids = [REPO], on_stall = calls.append, interval = 0.05, stall_timeout = 0.3
-    )
-    try:
-        assert _wait(
-            lambda: len(calls) >= 1, timeout = 3.0
-        ), "watchdog never fired on a constant-size .incomplete"
-    finally:
-        stop.set()
-    assert "stalled" in calls[0].lower()
-
-
-def test_growing_incomplete_never_stalls(hf_cache):
-    blobs = _blobs_dir(hf_cache)
-    part = blobs / "growing.incomplete"
-    part.write_bytes(b"\0" * 1024)
-
-    grow_stop = threading.Event()
-
-    def _grow():
-        size = 1024
-        while not grow_stop.wait(0.05):
-            size += 4096
-            part.write_bytes(b"\0" * size)
-
-    grower = threading.Thread(target = _grow, daemon = True)
-    grower.start()
-
-    calls: list[str] = []
-    stop = xf.start_watchdog(
-        repo_ids = [REPO], on_stall = calls.append, interval = 0.05, stall_timeout = 0.3
-    )
-    try:
-        time.sleep(1.0)  # well past stall_timeout, but bytes keep growing
-        assert calls == [], "watchdog fired despite continuous progress"
-    finally:
-        stop.set()
-        grow_stop.set()
-
-
-def test_no_incomplete_never_stalls(hf_cache):
-    blobs = _blobs_dir(hf_cache)
-    (blobs / "finalized_blob").write_bytes(b"\0" * 4096)  # no .incomplete
-
-    calls: list[str] = []
-    stop = xf.start_watchdog(
-        repo_ids = [REPO], on_stall = calls.append, interval = 0.05, stall_timeout = 0.3
-    )
-    try:
-        time.sleep(0.8)
-        assert calls == [], "watchdog fired with no active .incomplete"
-    finally:
-        stop.set()
-
-
-def test_stall_fires_at_most_once(hf_cache):
-    blobs = _blobs_dir(hf_cache)
-    (blobs / "frozen.incomplete").write_bytes(b"\0" * 2048)
-
-    calls: list[str] = []
-    stop = xf.start_watchdog(
-        repo_ids = [REPO], on_stall = calls.append, interval = 0.05, stall_timeout = 0.2
-    )
-    try:
-        assert _wait(lambda: len(calls) >= 1, timeout = 3.0)
-        time.sleep(0.6)  # keep ticking; must not fire again
-        assert len(calls) == 1, f"on_stall fired {len(calls)} times, expected exactly 1"
-    finally:
-        stop.set()
-
-
-def test_get_state_empty_cache(hf_cache):
-    assert xf.get_hf_download_state([REPO]) == (0, False)
-
-
-def test_get_state_absent_cache_root(tmp_path, monkeypatch):
-    monkeypatch.setattr(hf_constants, "HF_HUB_CACHE", str(tmp_path / "no-such-cache"))
-    assert xf.get_hf_download_state([REPO]) == (0, False)
-
+DL_REPO, FILE = "ztest/xet-dl", "model-Q4_K_XL.gguf"
 
-def test_get_state_skips_local_paths(hf_cache):
-    # Filesystem paths are not HF repo IDs and must be ignored without error.
-    assert xf.get_hf_download_state(["/abs/path", "./rel", "~user", "c:\\x"]) == (0, False)
 
+def test_shim_reexports_shared_api():
+    assert xf.DownloadStallError is shared.DownloadStallError
+    for name in (
+        "start_watchdog",
+        "get_hf_download_state",
+        "child_should_disable_xet",
+        "hf_hub_download_with_xet_fallback",
+        "snapshot_download_with_xet_fallback",
+    ):
+        assert hasattr(xf, name), f"shim missing {name}"
 
-def test_get_state_sparse_aware(hf_cache):
-    blobs = _blobs_dir(hf_cache)
-    sparse = blobs / "sparse.incomplete"
-    with open(sparse, "wb") as f:
-        f.truncate(64 * 1024 * 1024)  # large apparent size, few allocated blocks
-    st = sparse.stat()
-    if getattr(st, "st_blocks", 0) == 0:
-        pytest.skip("filesystem does not report st_blocks; sparse accounting unavailable")
-    total, has_incomplete = xf.get_hf_download_state([REPO])
-    assert has_incomplete is True
-    assert total < st.st_size, "sparse partial counted at apparent size, not allocated blocks"
 
+def test_child_should_disable_xet_truth_table():
+    assert xf.child_should_disable_xet({"disable_xet": True}) is True
+    assert xf.child_should_disable_xet({"disable_xet": False}) is False
+    assert xf.child_should_disable_xet({}) is False
 
-# --------------------------------------------------------------------------- #
-# Transport policy: cached short-circuit, cancel, error propagation, and the
-# single Xet->HTTP fallback. _run_download_attempt is faked, so no real spawn.
-# --------------------------------------------------------------------------- #
-DL_REPO, FILE = "ztest/xet-dl", "model-Q4_K_XL.gguf"
 
-
-@pytest.fixture(autouse = True)
-def _no_real_cache_hit(monkeypatch):
-    """Default: the cached probe misses; tests override it to force a hit."""
+def test_shim_injects_studio_prepare_on_http_retry(monkeypatch):
+    """A stall on Xet retries over HTTP, and the shim runs Studio's marker-aware
+    ``prepare_cache_for_transport(..., 'http')`` before the retry (not the generic
+    delete-incompletes default)."""
+    for var in ("UNSLOTH_DISABLE_XET", "UNSLOTH_STABLE_DOWNLOADS", "HF_HUB_DISABLE_XET"):
+        monkeypatch.delenv(var, raising = False)
     monkeypatch.setattr(huggingface_hub, "try_to_load_from_cache", lambda *a, **k: None)
 
+    seen_disable_xet = []
 
-class _FakeAttempt:
-    """Records calls to the download seam and returns scripted results."""
-
-    def __init__(self, results):
-        self._results = list(results)
-        self.calls = []
-
-    def __call__(
-        self,
+    def fake_attempt(
         repo_id,
-        filename,
-        token,
         *,
+        kind,
+        params,
+        token,
         repo_type,
         disable_xet,
         cancel_event,
@@ -207,146 +85,18 @@ def __call__(
         grace_period,
         on_status,
     ):
-        self.calls.append(
-            _types.SimpleNamespace(
-                repo_id = repo_id,
-                filename = filename,
-                disable_xet = disable_xet,
-                repo_type = repo_type,
-            )
-        )
-        return self._results[len(self.calls) - 1]
-
+        seen_disable_xet.append(disable_xet)
+        return ("ok", "/cache/model.gguf") if disable_xet else ("stall", None)
 
-def _install(monkeypatch, results):
-    fake = _FakeAttempt(results)
-    monkeypatch.setattr(xf, "_run_download_attempt", fake)
-    return fake
+    monkeypatch.setattr(shared, "_run_download_attempt", fake_attempt)
 
-
-def test_cached_file_short_circuits(monkeypatch, tmp_path):
-    cached = tmp_path / "cached.gguf"
-    cached.write_bytes(b"\0" * 8)
-    monkeypatch.setattr(huggingface_hub, "try_to_load_from_cache", lambda *a, **k: str(cached))
-    fake = _install(monkeypatch, [])  # must not be called
-
-    out = xf.hf_hub_download_with_xet_fallback(DL_REPO, FILE, None)
-    assert out == str(cached)
-    assert fake.calls == [], "spawned a download for an already-cached file"
-
-
-def test_cancel_before_start_raises_no_attempt(monkeypatch):
-    fake = _install(monkeypatch, [])
-    ev = threading.Event()
-    ev.set()
-    with pytest.raises(RuntimeError, match = "Cancelled"):
-        xf.hf_hub_download_with_xet_fallback(DL_REPO, FILE, None, cancel_event = ev)
-    assert fake.calls == []
-
-
-def test_nonstall_error_propagates_without_fallback(monkeypatch):
-    fake = _install(monkeypatch, [("error", "RepositoryNotFoundError: 404 not found")])
-    with pytest.raises(RuntimeError, match = "RepositoryNotFoundError"):
-        xf.hf_hub_download_with_xet_fallback(DL_REPO, FILE, None)
-    assert len(fake.calls) == 1, "deterministic error must not trigger an HTTP fallback"
-    assert fake.calls[0].disable_xet is False
-
-
-def test_immediate_success_uses_xet_only(monkeypatch):
-    prepared = []
-    monkeypatch.setattr(
-        "hub.utils.download_registry.prepare_cache_for_transport",
-        lambda *a, **k: prepared.append(a),
-    )
-    fake = _install(monkeypatch, [("ok", "/cache/model.gguf")])
-    out = xf.hf_hub_download_with_xet_fallback(DL_REPO, FILE, None)
-    assert out == "/cache/model.gguf"
-    assert len(fake.calls) == 1 and fake.calls[0].disable_xet is False
-    assert prepared == [], "no cache prep should run when Xet succeeds first try"
-
-
-def test_stall_then_http_fallback_succeeds(monkeypatch):
     prepared = []
     monkeypatch.setattr(
         "hub.utils.download_registry.prepare_cache_for_transport",
         lambda repo_type, repo_id, mode, *a, **k: prepared.append((repo_type, repo_id, mode)),
     )
-    fake = _install(monkeypatch, [("stall", None), ("ok", "/cache/model.gguf")])
 
     out = xf.hf_hub_download_with_xet_fallback(DL_REPO, FILE, None)
     assert out == "/cache/model.gguf"
-    assert len(fake.calls) == 2
-    assert fake.calls[0].disable_xet is False  # Xet first
-    assert fake.calls[1].disable_xet is True  # HTTP fallback
-    assert prepared == [("model", DL_REPO, "http")], "must prep cache for HTTP before the retry"
-
-
-def test_second_stall_raises_download_stall_error(monkeypatch):
-    monkeypatch.setattr(
-        "hub.utils.download_registry.prepare_cache_for_transport", lambda *a, **k: None
-    )
-    fake = _install(monkeypatch, [("stall", None), ("stall", None)])
-    with pytest.raises(xf.DownloadStallError):
-        xf.hf_hub_download_with_xet_fallback(DL_REPO, FILE, None)
-    assert len(fake.calls) == 2
-
-
-def test_cancelled_midattempt_raises_no_fallback(monkeypatch):
-    fake = _install(monkeypatch, [("cancelled", None)])
-    with pytest.raises(RuntimeError, match = "Cancelled"):
-        xf.hf_hub_download_with_xet_fallback(DL_REPO, FILE, None)
-    assert len(fake.calls) == 1
-
-
-def test_per_file_independent_fallback(monkeypatch):
-    """A stalled shard falls back; a sibling shard that succeeds does not."""
-    monkeypatch.setattr(
-        "hub.utils.download_registry.prepare_cache_for_transport", lambda *a, **k: None
-    )
-    fake = _install(monkeypatch, [("ok", "/a"), ("stall", None), ("ok", "/b")])
-    assert xf.hf_hub_download_with_xet_fallback(DL_REPO, "shardA.gguf", None) == "/a"
-    assert xf.hf_hub_download_with_xet_fallback(DL_REPO, "shardB.gguf", None) == "/b"
-    assert [c.disable_xet for c in fake.calls] == [False, False, True]
-
-
-# --------------------------------------------------------------------------- #
-# Precondition: HF_HUB_DISABLE_XET is read at import time, so assert its effect
-# in a FRESH interpreter (huggingface/huggingface_hub#3266 once ignored it).
-# --------------------------------------------------------------------------- #
-def _safe_path() -> str:
-    import os
-    return os.environ.get("PATH", "")
-
-
-def test_disable_xet_constant_set_in_fresh_interpreter():
-    code = (
-        "from huggingface_hub import constants as c; "
-        "import sys; sys.exit(0 if c.HF_HUB_DISABLE_XET is True else 17)"
-    )
-    proc = subprocess.run(
-        [sys.executable, "-c", code],
-        env = {"HF_HUB_DISABLE_XET": "1", "PATH": _safe_path()},
-        capture_output = True,
-        text = True,
-    )
-    assert proc.returncode == 0, (
-        f"HF_HUB_DISABLE_XET=1 did not set constants.HF_HUB_DISABLE_XET=True "
-        f"(rc={proc.returncode}): {proc.stderr}"
-    )
-
-
-def test_default_leaves_xet_enabled():
-    code = (
-        "from huggingface_hub import constants as c; "
-        "import sys; sys.exit(0 if c.HF_HUB_DISABLE_XET is False else 17)"
-    )
-    proc = subprocess.run(
-        [sys.executable, "-c", code],
-        env = {"PATH": _safe_path()},  # no HF_HUB_DISABLE_XET
-        capture_output = True,
-        text = True,
-    )
-    assert proc.returncode == 0, (
-        f"without the env var, constants.HF_HUB_DISABLE_XET was not False "
-        f"(rc={proc.returncode}): {proc.stderr}"
-    )
+    assert seen_disable_xet == [False, True]  # Xet first, then HTTP
+    assert prepared == [("model", DL_REPO, "http")], "shim must run Studio's marker-aware prep"
diff --git a/studio/backend/utils/hf_xet_fallback.py b/studio/backend/utils/hf_xet_fallback.py
index a6ba69fffc..ecc565fb02 100644
--- a/studio/backend/utils/hf_xet_fallback.py
+++ b/studio/backend/utils/hf_xet_fallback.py
@@ -1,337 +1,56 @@
 # SPDX-License-Identifier: AGPL-3.0-only
 # Copyright 2026-present the Unsloth AI Inc. team. All rights reserved. See /studio/LICENSE.AGPL-3.0
 
-"""Xet-primary HF downloads with an automatic HTTP fallback on a no-progress stall.
-
-Xet (``hf_xet``) is the fast default but can hang with no progress and no
-exception, and a blocked native thread cannot be killed. Keep Xet primary; fall
-back to plain HTTP only when the parent observes a stall. ``HF_HUB_DISABLE_XET``
-is read at import time, so the fallback runs in a fresh ``spawn`` child (not a
-thread) that sets the env before importing ``huggingface_hub``. Cached files
-short-circuit with no child; deterministic errors (401/403/404/disk-full) and
-cancellation propagate without a fallback. Mirrors the safetensors inference
-recovery in core/inference/{orchestrator,worker}.py.
+"""Studio shim over the shared Xet -> HTTP stall fallback.
+
+The no-progress watchdog, the spawn-child download, and the single Xet -> HTTP
+retry now live once in ``unsloth_zoo.hf_xet_fallback`` (so Unsloth main and Studio
+share one implementation). This module re-exports that API and injects Studio's
+marker-aware cache purge (``prepare_cache_for_transport``) so the hub download
+manager keeps its ``.transport`` marker semantics on the HTTP retry. Call sites
+(core/inference/llama_cpp.py, core/training/worker.py) and the orchestrator's
+``DownloadStallError`` import are unchanged.
 """
 
 from __future__ import annotations
 
-import multiprocessing as mp
-import os
-import queue
-import signal
-import sys
 import threading
-import time
-from typing import Any, Callable, Optional
-
-from loggers import get_logger
-
-logger = get_logger(__name__)
-
-_CTX = mp.get_context("spawn")
-
-# Defaults match the existing inference watchdog and hub shutdown deadline.
-DEFAULT_HEARTBEAT_INTERVAL = 30.0
-DEFAULT_STALL_TIMEOUT = 180.0
-DEFAULT_GRACE_PERIOD = 10.0
-_POLL_INTERVAL = 0.5
-
-
-class DownloadStallError(RuntimeError):
-    """Raised when no download progress is observed for too long.
-
-    Canonical home; orchestrator.py re-imports it so all paths share one type.
-    """
-
-
-def child_should_disable_xet(config: dict) -> bool:
-    """Single source of truth for the per-worker Xet env flip."""
-    return bool(config.get("disable_xet"))
-
-
-def get_hf_download_state(
-    repo_ids: Optional[list[str]] = None, *, repo_type: str = "model"
-) -> Optional[tuple[int, bool]]:
-    """Return ``(total_on_disk_bytes, has_incomplete)`` for the active HF cache.
-
-    Sparse-aware (st_blocks based) so a sparse Xet/``hf_transfer`` ``.incomplete``
-    is not mistaken for full-size progress. ``None`` means the state could not be
-    measured, so callers skip stall logic for that tick.
-    """
-    try:
-        from hub.utils.hf_cache_state import (
-            blob_bytes_present,
-            has_active_incomplete_blobs,
-            hf_cache_root,
-            iter_active_repo_cache_dirs,
-        )
-
-        if hf_cache_root() is None:
-            return (0, False)
-
-        total = 0
-        has_incomplete = False
-        for repo_id in repo_ids or []:
-            # Skip local paths: HF IDs never start with / . ~ or contain "\".
-            if not repo_id or repo_id.startswith(("/", ".", "~")) or "\\" in repo_id:
-                continue
-            for entry in iter_active_repo_cache_dirs(repo_type, repo_id):
-                blobs_dir = entry / "blobs"
-                if not blobs_dir.is_dir():
-                    continue
-                for blob in blobs_dir.iterdir():
-                    try:
-                        if blob.is_file():
-                            total += blob_bytes_present(blob)
-                    except OSError:
-                        pass
-            if has_active_incomplete_blobs(repo_type, repo_id):
-                has_incomplete = True
-        return (total, has_incomplete)
-    except Exception as e:
-        logger.debug("Failed to determine HF download state: %s", e)
-        return None
-
-
-def start_watchdog(
-    *,
-    repo_ids: list[str],
-    on_stall: Callable[[str], None],
-    repo_type: str = "model",
-    interval: float = DEFAULT_HEARTBEAT_INTERVAL,
-    stall_timeout: float = DEFAULT_STALL_TIMEOUT,
-    xet_disabled: bool = False,
-    on_heartbeat: Optional[Callable[[str], None]] = None,
-) -> threading.Event:
-    """Start a daemon thread that fires ``on_stall(message)`` exactly once iff a
-    ``*.incomplete`` is present AND the on-disk size is unchanged for
-    *stall_timeout* seconds. The timer resets while no ``*.incomplete`` exists, so
-    post-download init is never misread as a stall. Returns a stop event the
-    caller sets when the download phase ends.
-    """
-    stop = threading.Event()
-    transport = "https" if xet_disabled else "xet"
-    fired = False
-
-    def _beat() -> None:
-        nonlocal fired
-        state = get_hf_download_state(repo_ids, repo_type = repo_type)
-        last_size = state[0] if state is not None else 0
-        last_change = time.monotonic()
-
-        while not stop.wait(interval):
-            state = get_hf_download_state(repo_ids, repo_type = repo_type)
-            now = time.monotonic()
-
-            if state is None:
-                if on_heartbeat is not None:
-                    on_heartbeat(f"Downloading ({transport} transport)...")
-                continue
-
-            current_size, has_incomplete = state
-            if current_size != last_size:
-                last_size = current_size
-                last_change = now
-
-            # Reset unless .incomplete confirms an active download, so model init
-            # and lock waits are not counted as a stall.
-            if not has_incomplete:
-                last_change = now
-            elif now - last_change >= stall_timeout:
-                if not fired:
-                    fired = True
-                    on_stall(
-                        f"Download appears stalled ({transport} transport) "
-                        f"-- no progress for {int(now - last_change)}s"
-                    )
-                return
-
-            if on_heartbeat is not None:
-                on_heartbeat(f"Downloading ({transport} transport)...")
-
-    threading.Thread(target = _beat, daemon = True, name = "hf-xet-watchdog").start()
-    return stop
-
-
-def _download_child_entry(
-    *,
-    repo_id: str,
-    filename: str,
-    token: Optional[str],
-    repo_type: str,
-    disable_xet: bool,
-    result_queue: Any,
-) -> None:
-    """Spawn-child entrypoint: download one file and report the result.
-
-    Top-level and picklable. Sets the Xet env BEFORE importing huggingface_hub,
-    forms its own process group so the parent can kill the whole transfer, and
-    never logs the token or signed URLs.
-    """
-    # Die with Studio on Linux (this mp child gets no parent-set preexec_fn).
-    try:
-        from utils.process_lifetime import bind_current_process_to_parent_lifetime
-        bind_current_process_to_parent_lifetime()
-    except Exception:
-        pass
-
-    if hasattr(os, "setsid"):
-        try:
-            os.setsid()
-        except OSError:
-            pass
-
-    if disable_xet:
-        os.environ["HF_HUB_DISABLE_XET"] = "1"
-        # Keep the HTTP writer sequential and resumable (hf_transfer leaves sparse
-        # partials a sequential resume cannot safely continue).
-        os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "0"
-    os.environ.setdefault("HF_HUB_DISABLE_PROGRESS_BARS", "1")
-
-    # Test-only fault injection (never set in production): stall the Xet attempt
-    # so the watchdog + HTTP fallback can be exercised against a real repo.
-    if not disable_xet and os.environ.get("UNSLOTH_HF_XET_FORCE_STALL") == "1":
-        import time as _t
-        try:
-            from huggingface_hub.constants import HF_HUB_CACHE
-
-            blobs = os.path.join(HF_HUB_CACHE, "models--" + repo_id.replace("/", "--"), "blobs")
-            os.makedirs(blobs, exist_ok = True)
-            with open(os.path.join(blobs, "xet-force-stall.incomplete"), "wb") as fh:
-                fh.write(b"\0" * 4096)
-        except OSError:
-            pass
-        while True:
-            _t.sleep(3600)
-
-    try:
-        from huggingface_hub import hf_hub_download
-        path = hf_hub_download(
-            repo_id = repo_id,
-            filename = filename,
-            repo_type = repo_type,
-            token = token,
-        )
-        result_queue.put({"ok": True, "path": path})
-    except BaseException as e:  # noqa: BLE001 - report every failure to the parent
-        error = f"{type(e).__name__}: {e}"
-        try:
-            from hub.utils.download_registry import scrub_secrets
-            error = scrub_secrets(error, hf_token = token)
-        except Exception:
-            pass
-        result_queue.put({"ok": False, "error": error})
-
-
-def _terminate_process_group(proc: "mp.process.BaseProcess", grace_period: float) -> None:
-    """Kill *proc* and its whole process group (Xet may spawn helper procs).
-
-    The child calls ``os.setsid()`` so its pgid equals its pid; signal via
-    ``os.killpg(pid, ...)`` -- NOT ``getpgid``, which before the child becomes a
-    group leader resolves to OUR group. SIGTERM, then SIGKILL after *grace_period*.
-    """
-    pid = proc.pid
-
-    def _signal_group(sig: int) -> None:
-        if pid is not None and hasattr(os, "killpg"):
-            try:
-                os.killpg(pid, sig)
-                return
-            except (ProcessLookupError, PermissionError, OSError):
-                pass
-        # Windows or pre-setsid: best effort on the single process.
-        try:
-            proc.terminate() if sig != getattr(signal, "SIGKILL", -9) else proc.kill()
-        except Exception:
-            pass
-
-    _signal_group(getattr(signal, "SIGTERM", signal.SIGINT))
-    proc.join(timeout = grace_period)
-    if proc.is_alive():
-        _signal_group(getattr(signal, "SIGKILL", signal.SIGTERM))
-        proc.join(timeout = 5.0)
-
-
-def _run_download_attempt(
-    repo_id: str,
-    filename: str,
-    token: Optional[str],
-    *,
-    repo_type: str,
-    disable_xet: bool,
-    cancel_event: Optional[threading.Event],
-    stall_timeout: float,
-    interval: float,
-    grace_period: float,
-    on_status: Optional[Callable[[str], None]],
-) -> tuple[str, Optional[str]]:
-    """Run one download in a spawn child supervised by the no-progress watchdog.
-
-    Returns ``("ok", path)``, ``("stall", None)``, ``("cancelled", None)``, or
-    ``("error", message)``. This is the seam tests monkeypatch to avoid spawning.
-    """
-    result_queue: Any = _CTX.Queue()
-    proc = _CTX.Process(
-        target = _download_child_entry,
-        kwargs = dict(
-            repo_id = repo_id,
-            filename = filename,
-            token = token,
-            repo_type = repo_type,
-            disable_xet = disable_xet,
-            result_queue = result_queue,
-        ),
-        daemon = True,
-    )
-    proc.start()
-    from utils.process_lifetime import adopt_pid
-
-    adopt_pid(proc.pid)  # bind to parent lifetime (Windows job / sweep)
-
-    stalled = threading.Event()
-    stop_watchdog = start_watchdog(
-        repo_ids = [repo_id],
-        on_stall = lambda msg: stalled.set(),
-        repo_type = repo_type,
-        interval = interval,
-        stall_timeout = stall_timeout,
-        xet_disabled = disable_xet,
-        on_heartbeat = on_status,
-    )
-
-    result: Optional[dict] = None
-    try:
-        while proc.is_alive():
-            if cancel_event is not None and cancel_event.is_set():
-                _terminate_process_group(proc, grace_period)
-                return ("cancelled", None)
-            if stalled.is_set():
-                _terminate_process_group(proc, grace_period)
-                return ("stall", None)
-            try:
-                result = result_queue.get(timeout = _POLL_INTERVAL)
-                break
-            except queue.Empty:
-                continue
-        else:
-            # Process exited; drain any result it enqueued.
-            try:
-                result = result_queue.get_nowait()
-            except queue.Empty:
-                result = None
-    finally:
-        stop_watchdog.set()
-        proc.join(timeout = grace_period)
-
-    if result is None:
-        return (
-            "error",
-            f"download process for '{repo_id}/{filename}' exited "
-            f"(code={proc.exitcode}) without a result",
-        )
-    if result.get("ok"):
-        return ("ok", result["path"])
-    return ("error", result.get("error") or "unknown download error")
+from typing import Callable, Optional
+
+from unsloth_zoo.hf_xet_fallback import (
+    DEFAULT_GRACE_PERIOD,
+    DEFAULT_HEARTBEAT_INTERVAL,
+    DEFAULT_STALL_TIMEOUT,
+    DownloadStallError,
+    child_should_disable_xet,
+    get_hf_download_state,
+    snapshot_download_with_xet_fallback,
+    start_watchdog,
+)
+from unsloth_zoo.hf_xet_fallback import (
+    hf_hub_download_with_xet_fallback as _shared_hf_hub_download_with_xet_fallback,
+)
+
+__all__ = [
+    "DEFAULT_GRACE_PERIOD",
+    "DEFAULT_HEARTBEAT_INTERVAL",
+    "DEFAULT_STALL_TIMEOUT",
+    "DownloadStallError",
+    "child_should_disable_xet",
+    "get_hf_download_state",
+    "start_watchdog",
+    "hf_hub_download_with_xet_fallback",
+    "snapshot_download_with_xet_fallback",
+]
+
+
+def _studio_prepare_for_http(repo_type: str, repo_id: str) -> None:
+    """Make the partial safe for an HTTP resume using Studio's marker-aware purge,
+    so the download manager's ``.transport`` marker accounting stays consistent
+    (vs the generic delete-incompletes default in unsloth_zoo)."""
+    from hub.utils.download_registry import prepare_cache_for_transport
+
+    prepare_cache_for_transport(repo_type, repo_id, "http")
 
 
 def hf_hub_download_with_xet_fallback(
@@ -341,75 +60,24 @@ def hf_hub_download_with_xet_fallback(
     *,
     cancel_event: Optional[threading.Event] = None,
     repo_type: str = "model",
+    revision: Optional[str] = None,
     stall_timeout: float = DEFAULT_STALL_TIMEOUT,
     interval: float = DEFAULT_HEARTBEAT_INTERVAL,
     grace_period: float = DEFAULT_GRACE_PERIOD,
     on_status: Optional[Callable[[str], None]] = None,
 ) -> str:
-    """Download a single file with Xet primary and HTTP as a stall-only fallback.
-
-    Returns the local cache path. Raises ``RuntimeError("Cancelled")`` if
-    *cancel_event* is set, re-raises a deterministic child error unchanged (no
-    fallback), and raises ``DownloadStallError`` only if BOTH transports stall.
-    """
-    # Finalized blob already cached: return it with no child and no network.
-    try:
-        from huggingface_hub import try_to_load_from_cache
-        cached = try_to_load_from_cache(repo_id, filename, repo_type = repo_type)
-        if isinstance(cached, str) and os.path.exists(cached):
-            return cached
-    except Exception as e:
-        logger.debug("Cached probe failed for %s/%s: %s", repo_id, filename, e)
-
-    if cancel_event is not None and cancel_event.is_set():
-        raise RuntimeError("Cancelled")
-
-    disable_xet = False
-    for attempt in range(2):
-        if disable_xet:
-            # Purge a non-HTTP partial before resuming over HTTP: an HTTP resume
-            # over a sparse Xet/hf_transfer partial silently corrupts the blob.
-            try:
-                from hub.utils.download_registry import prepare_cache_for_transport
-                prepare_cache_for_transport(repo_type, repo_id, "http")
-            except Exception as e:
-                logger.debug("prepare_cache_for_transport failed for %s: %s", repo_id, e)
-
-        kind, payload = _run_download_attempt(
-            repo_id,
-            filename,
-            token,
-            repo_type = repo_type,
-            disable_xet = disable_xet,
-            cancel_event = cancel_event,
-            stall_timeout = stall_timeout,
-            interval = interval,
-            grace_period = grace_period,
-            on_status = on_status,
-        )
-
-        if kind == "ok":
-            return payload  # type: ignore[return-value]
-        if kind == "cancelled":
-            raise RuntimeError("Cancelled")
-        if kind == "error":
-            # Deterministic failure: the other transport would fail identically.
-            raise RuntimeError(payload)
-        # kind == "stall"
-        if attempt == 0 and not disable_xet:
-            logger.warning(
-                "Download stalled for '%s/%s' -- retrying with HF_HUB_DISABLE_XET=1",
-                repo_id,
-                filename,
-            )
-            if on_status is not None:
-                on_status(f"{repo_id}/{filename}: Xet stalled, retrying over HTTP")
-            disable_xet = True
-            continue
-        raise DownloadStallError(
-            f"Download stalled for '{repo_id}/{filename}' even with "
-            f"HF_HUB_DISABLE_XET=1 -- check your network connection"
-        )
-
-    # Unreachable: the loop either returns or raises on each attempt.
-    raise DownloadStallError(f"Download failed for '{repo_id}/{filename}'")
+    """Single-file download with the shared Xet -> HTTP stall fallback, using
+    Studio's marker-aware cache prep on the HTTP retry."""
+    return _shared_hf_hub_download_with_xet_fallback(
+        repo_id,
+        filename,
+        token,
+        cancel_event = cancel_event,
+        repo_type = repo_type,
+        revision = revision,
+        stall_timeout = stall_timeout,
+        interval = interval,
+        grace_period = grace_period,
+        on_status = on_status,
+        prepare_for_http_fn = _studio_prepare_for_http,
+    )
diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py
index 7a056cef82..992380219e 100644
--- a/unsloth/models/_utils.py
+++ b/unsloth/models/_utils.py
@@ -83,6 +83,7 @@
     "verify_fp8_support_if_applicable",
     "_get_inference_mode_context_manager",
     "hf_login",
+    "maybe_prefetch_hf_snapshot",
     "is_moe_model",
     "get_moe_target_parameters",
     "make_fast_generate_wrapper",
@@ -824,6 +825,73 @@ def _run_temporary_patches(phase):
 TORCHAO_MSG = "Error: torchao not found, please install with `pip install torchao`"
 
 
+def maybe_prefetch_hf_snapshot(
+    model_name,
+    token = None,
+    *,
+    revision = None,
+    cache_dir = None,
+    local_files_only = False,
+    fast_inference = False,
+):
+    """Warm the Hugging Face cache for a remote repo before the in-process load.
+
+    Xet downloads can hang on a blob with no progress and no exception, and a
+    blocked native Xet thread cannot be killed in-process. We pull the whole
+    snapshot first in a killable subprocess that automatically falls back from
+    Xet to plain HTTP on a no-progress stall (unsloth_zoo.hf_xet_fallback); the
+    from_pretrained that follows is then a cache hit and cannot stall on Xet.
+
+    Best-effort: a deterministic failure (missing repo, auth, disk) is left for
+    from_pretrained to surface canonically; only a both-transports-stalled
+    DownloadStallError is raised here, to avoid a silent in-process hang.
+    """
+    try:
+        from unsloth_zoo.hf_xet_fallback import (
+            snapshot_download_with_xet_fallback,
+            DownloadStallError,
+        )
+    except Exception:
+        # Older unsloth_zoo without the helper: skip warming, load normally.
+        return
+
+    if not isinstance(model_name, str) or not model_name:
+        return
+    # A local directory / file path has nothing to download.
+    if os.path.isdir(model_name) or os.path.exists(model_name):
+        return
+    # Offline / cache-only: never reach out.
+    if local_files_only:
+        return
+    if any(
+        os.environ.get(flag, "0").lower() in ("1", "true", "yes", "on")
+        for flag in ("HF_HUB_OFFLINE", "TRANSFORMERS_OFFLINE")
+    ):
+        return
+    # vLLM has its own download path; leave it untouched.
+    if fast_inference:
+        return
+
+    try:
+        snapshot_download_with_xet_fallback(
+            model_name,
+            token = token,
+            revision = revision,
+            cache_dir = cache_dir,
+        )
+    except DownloadStallError:
+        # Both Xet and HTTP stalled: surface a clear network error instead of
+        # letting the in-process load hang on the same stall.
+        raise
+    except Exception as exception:
+        logger.warning_once(
+            f"Unsloth: Could not pre-download {model_name} "
+            f"({type(exception).__name__}: {exception}); continuing with the normal load."
+        )
+    return
+pass
+
+
 # Ignore logging messages
 class HideLoggingMessage(logging.Filter):
     __slots__ = ("text",)
diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py
index e8584caf7d..ec31198eba 100644
--- a/unsloth/models/llama.py
+++ b/unsloth/models/llama.py
@@ -2373,6 +2373,18 @@ def from_pretrained(
         # For debugging - we use a download counter to see if environments are not breaking or if HF is down
         get_statistics(kwargs.get("local_files_only", False))
 
+        # Pre-download the repo in a killable subprocess that falls back from Xet
+        # to HTTP on a no-progress stall, so the in-process load below is a cache
+        # hit and cannot hang on a stalled Xet transfer.
+        maybe_prefetch_hf_snapshot(
+            model_name,
+            token = token,
+            revision = revision,
+            cache_dir = kwargs.get("cache_dir"),
+            local_files_only = kwargs.get("local_files_only", False),
+            fast_inference = fast_inference,
+        )
+
         if dtype is None:
             dtype = torch.float16 if not SUPPORTS_BFLOAT16 else torch.bfloat16
         elif dtype == torch.bfloat16 and not SUPPORTS_BFLOAT16:
diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py
index 0bd70c6d41..1bdd8d038e 100644
--- a/unsloth/models/vision.py
+++ b/unsloth/models/vision.py
@@ -758,6 +758,18 @@ def from_pretrained(
         # For debugging - we use a download counter to see if environments are not breaking or if HF is down
         get_statistics(kwargs.get("local_files_only", False))
 
+        # Pre-download the repo in a killable subprocess that falls back from Xet
+        # to HTTP on a no-progress stall, so the in-process load below is a cache
+        # hit and cannot hang on a stalled Xet transfer.
+        maybe_prefetch_hf_snapshot(
+            model_name,
+            token = token,
+            revision = kwargs.get("revision"),
+            cache_dir = kwargs.get("cache_dir"),
+            local_files_only = kwargs.get("local_files_only", False),
+            fast_inference = fast_inference,
+        )
+
         if dtype is None:
             dtype = torch.float16 if not SUPPORTS_BFLOAT16 else torch.bfloat16
         elif os.environ.get("UNSLOTH_FORCE_FLOAT32", "0") == "1":

From 18d277c9069a9bc05eefc5f99f806f9cf36527e4 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 24 Jun 2026 11:41:29 +0000
Subject: [PATCH 02/80] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/backend/utils/hf_xet_fallback.py | 1 -
 unsloth/models/_utils.py                | 3 ++-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/studio/backend/utils/hf_xet_fallback.py b/studio/backend/utils/hf_xet_fallback.py
index ecc565fb02..78e70ff076 100644
--- a/studio/backend/utils/hf_xet_fallback.py
+++ b/studio/backend/utils/hf_xet_fallback.py
@@ -49,7 +49,6 @@ def _studio_prepare_for_http(repo_type: str, repo_id: str) -> None:
     so the download manager's ``.transport`` marker accounting stays consistent
     (vs the generic delete-incompletes default in unsloth_zoo)."""
     from hub.utils.download_registry import prepare_cache_for_transport
-
     prepare_cache_for_transport(repo_type, repo_id, "http")
 
 
diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py
index 992380219e..6b7decce8c 100644
--- a/unsloth/models/_utils.py
+++ b/unsloth/models/_utils.py
@@ -889,7 +889,8 @@ def maybe_prefetch_hf_snapshot(
             f"({type(exception).__name__}: {exception}); continuing with the normal load."
         )
     return
-pass
+
+
 
 
 # Ignore logging messages

From f0c1d4794f56ee08f9fa5608fc2e1b69a57ec019 Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Wed, 24 Jun 2026 11:59:20 +0000
Subject: [PATCH 03/80] Address review: guard Studio shim import, inject prep
 on snapshot, prune prefetch

- Studio shim: guard the unsloth_zoo.hf_xet_fallback import and raise an actionable
  RuntimeError if the installed unsloth_zoo predates the shared helper, instead of a
  bare ModuleNotFoundError at Studio startup.
- Studio shim: wrap snapshot_download_with_xet_fallback too (not just the single-file
  path) so its HTTP retry uses Studio's marker-aware prepare_cache_for_transport.
- unsloth main: maybe_prefetch_hf_snapshot now passes a conservative ignore_patterns
  list so prewarming a mixed-format repo does not pull ONNX, TF, Flax, CoreML, GGUF,
  or training-state files that the Transformers load never reads (an ignore list,
  not an allowlist, so no needed file is dropped).
- Add a Studio shim test for the snapshot prep injection.
---
 studio/backend/tests/test_hf_xet_fallback.py | 16 +++++++
 studio/backend/utils/hf_xet_fallback.py      | 46 ++++++++++++++------
 unsloth/models/_utils.py                     | 12 +++++
 3 files changed, 60 insertions(+), 14 deletions(-)

diff --git a/studio/backend/tests/test_hf_xet_fallback.py b/studio/backend/tests/test_hf_xet_fallback.py
index 56465bb98e..1026d92dbb 100644
--- a/studio/backend/tests/test_hf_xet_fallback.py
+++ b/studio/backend/tests/test_hf_xet_fallback.py
@@ -100,3 +100,19 @@ def fake_attempt(
     assert out == "/cache/model.gguf"
     assert seen_disable_xet == [False, True]  # Xet first, then HTTP
     assert prepared == [("model", DL_REPO, "http")], "shim must run Studio's marker-aware prep"
+
+
+def test_shim_snapshot_injects_studio_prepare(monkeypatch):
+    """The snapshot wrapper forwards Studio's marker-aware prep, like the file wrapper."""
+    captured = {}
+
+    def fake_snapshot(repo_id, **kwargs):
+        captured["repo_id"] = repo_id
+        captured["prepare_for_http_fn"] = kwargs.get("prepare_for_http_fn")
+        return "/tmp/snap-dir"
+
+    monkeypatch.setattr(xf, "_shared_snapshot_download_with_xet_fallback", fake_snapshot)
+    out = xf.snapshot_download_with_xet_fallback("org/model")
+    assert out == "/tmp/snap-dir"
+    assert captured["repo_id"] == "org/model"
+    assert captured["prepare_for_http_fn"] is xf._studio_prepare_for_http
diff --git a/studio/backend/utils/hf_xet_fallback.py b/studio/backend/utils/hf_xet_fallback.py
index ecc565fb02..c63c2036e5 100644
--- a/studio/backend/utils/hf_xet_fallback.py
+++ b/studio/backend/utils/hf_xet_fallback.py
@@ -15,21 +15,32 @@
 from __future__ import annotations
 
 import threading
-from typing import Callable, Optional
+from typing import Any, Callable, Optional
 
-from unsloth_zoo.hf_xet_fallback import (
-    DEFAULT_GRACE_PERIOD,
-    DEFAULT_HEARTBEAT_INTERVAL,
-    DEFAULT_STALL_TIMEOUT,
-    DownloadStallError,
-    child_should_disable_xet,
-    get_hf_download_state,
-    snapshot_download_with_xet_fallback,
-    start_watchdog,
-)
-from unsloth_zoo.hf_xet_fallback import (
-    hf_hub_download_with_xet_fallback as _shared_hf_hub_download_with_xet_fallback,
-)
+try:
+    from unsloth_zoo.hf_xet_fallback import (
+        DEFAULT_GRACE_PERIOD,
+        DEFAULT_HEARTBEAT_INTERVAL,
+        DEFAULT_STALL_TIMEOUT,
+        DownloadStallError,
+        child_should_disable_xet,
+        get_hf_download_state,
+        start_watchdog,
+    )
+    from unsloth_zoo.hf_xet_fallback import (
+        hf_hub_download_with_xet_fallback as _shared_hf_hub_download_with_xet_fallback,
+        snapshot_download_with_xet_fallback as _shared_snapshot_download_with_xet_fallback,
+    )
+except ModuleNotFoundError as exc:
+    # The shared helper lives in a newer unsloth_zoo; fail with an actionable
+    # message instead of a bare ModuleNotFoundError at Studio startup.
+    if exc.name == "unsloth_zoo.hf_xet_fallback":
+        raise RuntimeError(
+            "Unsloth Studio requires an unsloth_zoo that provides "
+            "unsloth_zoo.hf_xet_fallback. Upgrade unsloth_zoo alongside unsloth "
+            "(pip install -U unsloth_zoo)."
+        ) from exc
+    raise
 
 __all__ = [
     "DEFAULT_GRACE_PERIOD",
@@ -81,3 +92,10 @@ def hf_hub_download_with_xet_fallback(
         on_status = on_status,
         prepare_for_http_fn = _studio_prepare_for_http,
     )
+
+
+def snapshot_download_with_xet_fallback(repo_id: str, **kwargs: Any) -> str:
+    """Whole-repo download with the shared Xet -> HTTP stall fallback, using Studio's
+    marker-aware cache prep on the HTTP retry (same injection as the single-file path)."""
+    kwargs.setdefault("prepare_for_http_fn", _studio_prepare_for_http)
+    return _shared_snapshot_download_with_xet_fallback(repo_id, **kwargs)
diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py
index 992380219e..01219da412 100644
--- a/unsloth/models/_utils.py
+++ b/unsloth/models/_utils.py
@@ -825,6 +825,17 @@ def _run_temporary_patches(phase):
 TORCHAO_MSG = "Error: torchao not found, please install with `pip install torchao`"
 
 
+# Artifacts a Transformers/PEFT load never consumes -- skip them when prewarming
+# so a mixed-format repo does not pull ONNX/TF/Flax/CoreML/GGUF/training state.
+# Conservative ignore list (not an allowlist) so no file a load needs is dropped.
+_PREFETCH_IGNORE_PATTERNS = (
+    "*.onnx", "onnx/*", "*.h5", "*.msgpack", "*.tflite",
+    "coreml/*", "*.mlpackage/*", "*.mlmodel", "*.gguf",
+    "optimizer.*", "scheduler.*", "rng_state*", "trainer_state.json",
+    "events.out.tfevents*", "checkpoint-*/*",
+)
+
+
 def maybe_prefetch_hf_snapshot(
     model_name,
     token = None,
@@ -878,6 +889,7 @@ def maybe_prefetch_hf_snapshot(
             token = token,
             revision = revision,
             cache_dir = cache_dir,
+            ignore_patterns = list(_PREFETCH_IGNORE_PATTERNS),
         )
     except DownloadStallError:
         # Both Xet and HTTP stalled: surface a clear network error instead of

From 4d4e45aa036e3c119c99e555d7342c4fce4b5e90 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 24 Jun 2026 12:02:40 +0000
Subject: [PATCH 04/80] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 unsloth/models/_utils.py | 21 +++++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py
index 15bdbcf9a1..a817a57417 100644
--- a/unsloth/models/_utils.py
+++ b/unsloth/models/_utils.py
@@ -829,10 +829,21 @@ def _run_temporary_patches(phase):
 # so a mixed-format repo does not pull ONNX/TF/Flax/CoreML/GGUF/training state.
 # Conservative ignore list (not an allowlist) so no file a load needs is dropped.
 _PREFETCH_IGNORE_PATTERNS = (
-    "*.onnx", "onnx/*", "*.h5", "*.msgpack", "*.tflite",
-    "coreml/*", "*.mlpackage/*", "*.mlmodel", "*.gguf",
-    "optimizer.*", "scheduler.*", "rng_state*", "trainer_state.json",
-    "events.out.tfevents*", "checkpoint-*/*",
+    "*.onnx",
+    "onnx/*",
+    "*.h5",
+    "*.msgpack",
+    "*.tflite",
+    "coreml/*",
+    "*.mlpackage/*",
+    "*.mlmodel",
+    "*.gguf",
+    "optimizer.*",
+    "scheduler.*",
+    "rng_state*",
+    "trainer_state.json",
+    "events.out.tfevents*",
+    "checkpoint-*/*",
 )
 
 
@@ -903,8 +914,6 @@ def maybe_prefetch_hf_snapshot(
     return
 
 
-
-
 # Ignore logging messages
 class HideLoggingMessage(logging.Filter):
     __slots__ = ("text",)

From 0c10ffc45d56be17b2131a82ada606541f437bf5 Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Wed, 24 Jun 2026 13:07:47 +0000
Subject: [PATCH 05/80] Prewarm PEFT adapter repos and respect checkpoint
 subfolders

Warm the adapter repo with the Xet->HTTP stall fallback before
PeftModel.from_pretrained at both adapter-load sites in loader.py, so a
stalled Xet transfer of a saved LoRA cannot hang the in-process load the
same way it can for a base model.

Thread subfolder through maybe_prefetch_hf_snapshot and drop the
checkpoint-*/* ignore when the caller loads from a checkpoint-* subfolder,
so prewarming never skips the exact files the load needs.
---
 unsloth/models/_utils.py | 13 ++++++++++++-
 unsloth/models/llama.py  |  1 +
 unsloth/models/loader.py | 23 +++++++++++++++++++++++
 unsloth/models/vision.py |  1 +
 4 files changed, 37 insertions(+), 1 deletion(-)

diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py
index 15bdbcf9a1..6fb831a57d 100644
--- a/unsloth/models/_utils.py
+++ b/unsloth/models/_utils.py
@@ -844,6 +844,7 @@ def maybe_prefetch_hf_snapshot(
     cache_dir = None,
     local_files_only = False,
     fast_inference = False,
+    subfolder = None,
 ):
     """Warm the Hugging Face cache for a remote repo before the in-process load.
 
@@ -883,13 +884,23 @@ def maybe_prefetch_hf_snapshot(
     if fast_inference:
         return
 
+    # A checkpoint-* subfolder is exactly what "checkpoint-*/*" would drop, so
+    # do not ignore it when the caller is explicitly loading from that subfolder.
+    ignore_patterns = [
+        pattern for pattern in _PREFETCH_IGNORE_PATTERNS
+        if not (
+            pattern == "checkpoint-*/*"
+            and isinstance(subfolder, str)
+            and subfolder.startswith("checkpoint-")
+        )
+    ]
     try:
         snapshot_download_with_xet_fallback(
             model_name,
             token = token,
             revision = revision,
             cache_dir = cache_dir,
-            ignore_patterns = list(_PREFETCH_IGNORE_PATTERNS),
+            ignore_patterns = ignore_patterns,
         )
     except DownloadStallError:
         # Both Xet and HTTP stalled: surface a clear network error instead of
diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py
index ec31198eba..3d1f023eb2 100644
--- a/unsloth/models/llama.py
+++ b/unsloth/models/llama.py
@@ -2383,6 +2383,7 @@ def from_pretrained(
             cache_dir = kwargs.get("cache_dir"),
             local_files_only = kwargs.get("local_files_only", False),
             fast_inference = fast_inference,
+            subfolder = kwargs.get("subfolder"),
         )
 
         if dtype is None:
diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py
index f629899f0c..2bf70d1661 100644
--- a/unsloth/models/loader.py
+++ b/unsloth/models/loader.py
@@ -102,6 +102,7 @@
     _is_family_text_decoder,
     _apply_text_only_key_mapping,
     set_task_config_attr,
+    maybe_prefetch_hf_snapshot,
 )
 
 # Single source of truth is unsloth_zoo.model_lists. Re-exported so callers
@@ -862,6 +863,17 @@ def from_pretrained(
         if is_peft:
             # From https://github.com/huggingface/peft/issues/184
             # Now add PEFT adapters
+            # Warm the adapter repo first: PeftModel.from_pretrained downloads it
+            # in-process and can hang on a stalled Xet transfer like the base model.
+            maybe_prefetch_hf_snapshot(
+                old_model_name,
+                token = token,
+                revision = revision,
+                cache_dir = kwargs.get("cache_dir"),
+                local_files_only = local_files_only,
+                fast_inference = fast_inference,
+                subfolder = kwargs.get("subfolder"),
+            )
             model = PeftModel.from_pretrained(
                 model,
                 old_model_name,
@@ -1777,6 +1789,17 @@ def _patched_car(
 
                 _LoraModel._create_and_replace = _patched_car
 
+            # Warm the adapter repo first: PeftModel.from_pretrained downloads it
+            # in-process and can hang on a stalled Xet transfer like the base model.
+            maybe_prefetch_hf_snapshot(
+                old_model_name,
+                token = token,
+                revision = revision,
+                cache_dir = kwargs.get("cache_dir"),
+                local_files_only = local_files_only,
+                fast_inference = fast_inference,
+                subfolder = kwargs.get("subfolder"),
+            )
             try:
                 model = PeftModel.from_pretrained(
                     model,
diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py
index 1bdd8d038e..3f36dea958 100644
--- a/unsloth/models/vision.py
+++ b/unsloth/models/vision.py
@@ -768,6 +768,7 @@ def from_pretrained(
             cache_dir = kwargs.get("cache_dir"),
             local_files_only = kwargs.get("local_files_only", False),
             fast_inference = fast_inference,
+            subfolder = kwargs.get("subfolder"),
         )
 
         if dtype is None:

From ec9c6e9e5b51f432fb57e36f05059e0b86e61240 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 24 Jun 2026 13:09:08 +0000
Subject: [PATCH 06/80] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 unsloth/models/_utils.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py
index b935572eed..356e2507b0 100644
--- a/unsloth/models/_utils.py
+++ b/unsloth/models/_utils.py
@@ -898,7 +898,8 @@ def maybe_prefetch_hf_snapshot(
     # A checkpoint-* subfolder is exactly what "checkpoint-*/*" would drop, so
     # do not ignore it when the caller is explicitly loading from that subfolder.
     ignore_patterns = [
-        pattern for pattern in _PREFETCH_IGNORE_PATTERNS
+        pattern
+        for pattern in _PREFETCH_IGNORE_PATTERNS
         if not (
             pattern == "checkpoint-*/*"
             and isinstance(subfolder, str)

From 7f6e137c6fbf82ae49ee9d1148cc3b2294d3335b Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Wed, 24 Jun 2026 23:49:21 +0000
Subject: [PATCH 07/80] Guard Studio HTTP prep and detect home-relative local
 paths (Gemini review)

- _studio_prepare_for_http: wrap the marker-aware purge so a purge failure
  (locked file, missing dir) is logged at debug rather than aborting the
  HTTP retry. The shared orchestrator already guards the hook, so this is
  belt-and-braces.
- maybe_prefetch_hf_snapshot: expand ~ before the on-disk check and treat an
  absolute / ~ / explicitly-relative / Windows path that is not on disk yet as
  local, so it is not misread as a Hub repo id and does not emit a confusing
  pre-download warning.
---
 studio/backend/utils/hf_xet_fallback.py | 20 +++++++++++++++++---
 unsloth/models/_utils.py                | 15 +++++++++++++--
 2 files changed, 30 insertions(+), 5 deletions(-)

diff --git a/studio/backend/utils/hf_xet_fallback.py b/studio/backend/utils/hf_xet_fallback.py
index 68c5884ef8..700d0831f7 100644
--- a/studio/backend/utils/hf_xet_fallback.py
+++ b/studio/backend/utils/hf_xet_fallback.py
@@ -58,9 +58,23 @@
 def _studio_prepare_for_http(repo_type: str, repo_id: str) -> None:
     """Make the partial safe for an HTTP resume using Studio's marker-aware purge,
     so the download manager's ``.transport`` marker accounting stays consistent
-    (vs the generic delete-incompletes default in unsloth_zoo)."""
-    from hub.utils.download_registry import prepare_cache_for_transport
-    prepare_cache_for_transport(repo_type, repo_id, "http")
+    (vs the generic delete-incompletes default in unsloth_zoo).
+
+    The shared orchestrator already wraps this hook, but guard it here too so a
+    purge failure (locked file, missing dir) is logged rather than aborting the
+    HTTP retry that is the whole point of the fallback."""
+    try:
+        from hub.utils.download_registry import prepare_cache_for_transport
+        prepare_cache_for_transport(repo_type, repo_id, "http")
+    except Exception as exc:
+        try:
+            from loggers import get_logger
+            get_logger(__name__).debug(
+                "Studio prepare_cache_for_transport failed for %s: %s", repo_id, exc
+            )
+        except ModuleNotFoundError as logger_exc:
+            if logger_exc.name != "loggers":
+                raise
 
 
 def hf_hub_download_with_xet_fallback(
diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py
index 356e2507b0..ae2e1f6f1c 100644
--- a/unsloth/models/_utils.py
+++ b/unsloth/models/_utils.py
@@ -880,8 +880,19 @@ def maybe_prefetch_hf_snapshot(
 
     if not isinstance(model_name, str) or not model_name:
         return
-    # A local directory / file path has nothing to download.
-    if os.path.isdir(model_name) or os.path.exists(model_name):
+    # A local directory / file path has nothing to download. Expand ~ first, since
+    # os.path.exists does not, so a home-relative path is detected as local.
+    model_path = os.path.expanduser(model_name)
+    if os.path.isdir(model_path) or os.path.exists(model_path):
+        return
+    # A path that looks local but is not on disk yet (e.g. a not-created output
+    # dir) is still not a Hub repo id ("org/name"); leave it for from_pretrained
+    # to surface canonically rather than trying to download it.
+    if (
+        os.path.isabs(model_path)
+        or model_name.startswith(("~", "./", "../", ".\\", "..\\"))
+        or "\\" in model_name
+    ):
         return
     # Offline / cache-only: never reach out.
     if local_files_only:

From eb6954b57e55c30a1ddcf0187e5a43da95c22240 Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Thu, 25 Jun 2026 08:20:04 +0000
Subject: [PATCH 08/80] Forward force_download through the prefetch (review 3)

force_download=True previously bypassed the killable prefetch: the warm-cache
short-circuit made it a no-op, and Transformers then re-fetched in-process over
the same Xet path the prefetch exists to avoid. Thread force_download through
maybe_prefetch_hf_snapshot and every call site so a forced reload also runs in
the killable child.
---
 unsloth/models/_utils.py | 2 ++
 unsloth/models/llama.py  | 1 +
 unsloth/models/loader.py | 2 ++
 unsloth/models/vision.py | 1 +
 4 files changed, 6 insertions(+)

diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py
index ae2e1f6f1c..5131ea7c34 100644
--- a/unsloth/models/_utils.py
+++ b/unsloth/models/_utils.py
@@ -856,6 +856,7 @@ def maybe_prefetch_hf_snapshot(
     local_files_only = False,
     fast_inference = False,
     subfolder = None,
+    force_download = False,
 ):
     """Warm the Hugging Face cache for a remote repo before the in-process load.
 
@@ -924,6 +925,7 @@ def maybe_prefetch_hf_snapshot(
             revision = revision,
             cache_dir = cache_dir,
             ignore_patterns = ignore_patterns,
+            force_download = force_download,
         )
     except DownloadStallError:
         # Both Xet and HTTP stalled: surface a clear network error instead of
diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py
index 3d1f023eb2..a588d35d2b 100644
--- a/unsloth/models/llama.py
+++ b/unsloth/models/llama.py
@@ -2384,6 +2384,7 @@ def from_pretrained(
             local_files_only = kwargs.get("local_files_only", False),
             fast_inference = fast_inference,
             subfolder = kwargs.get("subfolder"),
+            force_download = kwargs.get("force_download", False),
         )
 
         if dtype is None:
diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py
index 2bf70d1661..e5ea73091c 100644
--- a/unsloth/models/loader.py
+++ b/unsloth/models/loader.py
@@ -873,6 +873,7 @@ def from_pretrained(
                 local_files_only = local_files_only,
                 fast_inference = fast_inference,
                 subfolder = kwargs.get("subfolder"),
+                force_download = kwargs.get("force_download", False),
             )
             model = PeftModel.from_pretrained(
                 model,
@@ -1799,6 +1800,7 @@ def _patched_car(
                 local_files_only = local_files_only,
                 fast_inference = fast_inference,
                 subfolder = kwargs.get("subfolder"),
+                force_download = kwargs.get("force_download", False),
             )
             try:
                 model = PeftModel.from_pretrained(
diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py
index 3f36dea958..d4c9383e1a 100644
--- a/unsloth/models/vision.py
+++ b/unsloth/models/vision.py
@@ -769,6 +769,7 @@ def from_pretrained(
             local_files_only = kwargs.get("local_files_only", False),
             fast_inference = fast_inference,
             subfolder = kwargs.get("subfolder"),
+            force_download = kwargs.get("force_download", False),
         )
 
         if dtype is None:

From 2bc08b5c3ffe139a564bab4db0cdd5b410a4e987 Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Thu, 25 Jun 2026 08:28:27 +0000
Subject: [PATCH 09/80] Degrade Studio gracefully when the shared zoo helper is
 absent (review 3)

Studio's shim hard-imported unsloth_zoo.hf_xet_fallback at startup, so a Studio
install whose unsloth_zoo predates the shared helper crashed at boot. Instead of
raising, fall back to thin stubs: plain HF snapshot/file downloads with the
no-progress watchdog disabled (the same best-effort posture core Unsloth uses in
from_pretrained), a DownloadStallError mirror, and no-op watchdog/state helpers.
Studio boots and downloads work; automatic Xet -> HTTP recovery returns as soon
as unsloth_zoo is upgraded. These are thin stubs, not a second copy of the
orchestration. Verified against the installed older unsloth_zoo and added a
degraded-path test (5 passed).
---
 studio/backend/tests/test_hf_xet_fallback.py | 45 ++++++++++
 studio/backend/utils/hf_xet_fallback.py      | 91 ++++++++++++++++++--
 2 files changed, 127 insertions(+), 9 deletions(-)

diff --git a/studio/backend/tests/test_hf_xet_fallback.py b/studio/backend/tests/test_hf_xet_fallback.py
index 1026d92dbb..40904cf5ed 100644
--- a/studio/backend/tests/test_hf_xet_fallback.py
+++ b/studio/backend/tests/test_hf_xet_fallback.py
@@ -116,3 +116,48 @@ def fake_snapshot(repo_id, **kwargs):
     assert out == "/tmp/snap-dir"
     assert captured["repo_id"] == "org/model"
     assert captured["prepare_for_http_fn"] is xf._studio_prepare_for_http
+
+
+def test_degrades_gracefully_without_shared_helper(monkeypatch):
+    """On an older unsloth_zoo that lacks the shared helper, the shim must still
+    import (Studio boots) and provide stub API that does plain HF downloads with
+    the watchdog disabled, instead of crashing at startup."""
+    import importlib
+
+    class _BlockShared:
+        def find_spec(self, name, path = None, target = None):
+            if name == "unsloth_zoo.hf_xet_fallback":
+                raise ModuleNotFoundError(f"No module named '{name}'", name = name)
+            return None
+
+    finder = _BlockShared()
+    saved_shared = sys.modules.pop("unsloth_zoo.hf_xet_fallback", None)
+    saved_shim = sys.modules.pop("utils.hf_xet_fallback", None)
+    sys.meta_path.insert(0, finder)
+    try:
+        degraded = importlib.import_module("utils.hf_xet_fallback")
+
+        # Boots without raising and mirrors the shared API surface.
+        assert issubclass(degraded.DownloadStallError, RuntimeError)
+        assert degraded.child_should_disable_xet({"disable_xet": True}) is True
+        assert degraded.get_hf_download_state(["x"]) is None  # unmeasurable
+        event = degraded.start_watchdog(repo_ids = ["x"], on_stall = lambda m: None)
+        assert hasattr(event, "set") and not event.is_set()  # never fires
+
+        # Downloads fall back to plain huggingface_hub (no watchdog, no crash).
+        called = {}
+
+        def _fake_snapshot(repo_id, **kwargs):
+            called["repo_id"] = repo_id
+            return "/snap-dir"
+
+        monkeypatch.setattr(huggingface_hub, "snapshot_download", _fake_snapshot)
+        assert degraded.snapshot_download_with_xet_fallback("org/model") == "/snap-dir"
+        assert called["repo_id"] == "org/model"
+    finally:
+        sys.meta_path.remove(finder)
+        sys.modules.pop("utils.hf_xet_fallback", None)
+        if saved_shared is not None:
+            sys.modules["unsloth_zoo.hf_xet_fallback"] = saved_shared
+        if saved_shim is not None:
+            sys.modules["utils.hf_xet_fallback"] = saved_shim
diff --git a/studio/backend/utils/hf_xet_fallback.py b/studio/backend/utils/hf_xet_fallback.py
index 700d0831f7..22fced62bb 100644
--- a/studio/backend/utils/hf_xet_fallback.py
+++ b/studio/backend/utils/hf_xet_fallback.py
@@ -32,15 +32,88 @@
         snapshot_download_with_xet_fallback as _shared_snapshot_download_with_xet_fallback,
     )
 except ModuleNotFoundError as exc:
-    # The shared helper lives in a newer unsloth_zoo; fail with an actionable
-    # message instead of a bare ModuleNotFoundError at Studio startup.
-    if exc.name == "unsloth_zoo.hf_xet_fallback":
-        raise RuntimeError(
-            "Unsloth Studio requires an unsloth_zoo that provides "
-            "unsloth_zoo.hf_xet_fallback. Upgrade unsloth_zoo alongside unsloth "
-            "(pip install -U unsloth_zoo)."
-        ) from exc
-    raise
+    if exc.name != "unsloth_zoo.hf_xet_fallback":
+        raise
+
+    # The shared helper lives in a newer unsloth_zoo. Rather than crash Studio at
+    # startup on an older (but dependency-satisfying) unsloth_zoo, degrade
+    # gracefully: plain HF downloads with the no-progress stall watchdog disabled
+    # -- the same best-effort posture core Unsloth uses in from_pretrained. The
+    # automatic Xet -> HTTP recovery returns as soon as unsloth_zoo is upgraded.
+    # These are thin stubs, not a second copy of the orchestration.
+    import logging as _logging
+
+    _logging.getLogger(__name__).warning(
+        "unsloth_zoo.hf_xet_fallback not found; the Xet stall watchdog is "
+        "disabled. Upgrade unsloth_zoo to re-enable automatic Xet -> HTTP "
+        "download recovery."
+    )
+
+    DEFAULT_HEARTBEAT_INTERVAL = 30.0
+    DEFAULT_STALL_TIMEOUT = 180.0
+    DEFAULT_GRACE_PERIOD = 10.0
+
+    class DownloadStallError(RuntimeError):
+        """Stub mirror of the shared type so callers and ``except`` clauses still
+        resolve when the shared helper is unavailable (it is simply never raised
+        in degraded mode, since there is no watchdog to detect a stall)."""
+
+    def child_should_disable_xet(config: dict) -> bool:
+        return bool(config.get("disable_xet"))
+
+    def get_hf_download_state(*args: Any, **kwargs: Any) -> None:
+        return None  # unmeasurable -> the (absent) watchdog never fires
+
+    def start_watchdog(**kwargs: Any) -> "threading.Event":
+        return threading.Event()  # never set; no stall detection in degraded mode
+
+    def _shared_hf_hub_download_with_xet_fallback(
+        repo_id: str,
+        filename: str,
+        token: Optional[str],
+        *,
+        repo_type: str = "model",
+        revision: Optional[str] = None,
+        cache_dir: Optional[str] = None,
+        force_download: bool = False,
+        **_ignored: Any,
+    ) -> str:
+        from huggingface_hub import hf_hub_download
+
+        return hf_hub_download(
+            repo_id = repo_id,
+            filename = filename,
+            token = token,
+            repo_type = repo_type,
+            revision = revision,
+            cache_dir = cache_dir,
+            force_download = force_download,
+        )
+
+    def _shared_snapshot_download_with_xet_fallback(
+        repo_id: str,
+        *,
+        revision: Optional[str] = None,
+        token: Optional[str] = None,
+        repo_type: str = "model",
+        cache_dir: Optional[str] = None,
+        allow_patterns: Optional[Any] = None,
+        ignore_patterns: Optional[Any] = None,
+        force_download: bool = False,
+        **_ignored: Any,
+    ) -> str:
+        from huggingface_hub import snapshot_download
+
+        return snapshot_download(
+            repo_id = repo_id,
+            repo_type = repo_type,
+            revision = revision,
+            token = token,
+            cache_dir = cache_dir,
+            allow_patterns = allow_patterns,
+            ignore_patterns = ignore_patterns,
+            force_download = force_download,
+        )
 
 __all__ = [
     "DEFAULT_GRACE_PERIOD",

From ed16544cf8d6d1487649303f9f86eb5dfccf7a87 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 25 Jun 2026 08:28:54 +0000
Subject: [PATCH 10/80] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/backend/tests/test_hf_xet_fallback.py | 7 ++++++-
 studio/backend/utils/hf_xet_fallback.py      | 3 +--
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/studio/backend/tests/test_hf_xet_fallback.py b/studio/backend/tests/test_hf_xet_fallback.py
index 40904cf5ed..4f6987b262 100644
--- a/studio/backend/tests/test_hf_xet_fallback.py
+++ b/studio/backend/tests/test_hf_xet_fallback.py
@@ -125,7 +125,12 @@ def test_degrades_gracefully_without_shared_helper(monkeypatch):
     import importlib
 
     class _BlockShared:
-        def find_spec(self, name, path = None, target = None):
+        def find_spec(
+            self,
+            name,
+            path = None,
+            target = None,
+        ):
             if name == "unsloth_zoo.hf_xet_fallback":
                 raise ModuleNotFoundError(f"No module named '{name}'", name = name)
             return None
diff --git a/studio/backend/utils/hf_xet_fallback.py b/studio/backend/utils/hf_xet_fallback.py
index 22fced62bb..82c8b1a46d 100644
--- a/studio/backend/utils/hf_xet_fallback.py
+++ b/studio/backend/utils/hf_xet_fallback.py
@@ -79,7 +79,6 @@ def _shared_hf_hub_download_with_xet_fallback(
         **_ignored: Any,
     ) -> str:
         from huggingface_hub import hf_hub_download
-
         return hf_hub_download(
             repo_id = repo_id,
             filename = filename,
@@ -103,7 +102,6 @@ def _shared_snapshot_download_with_xet_fallback(
         **_ignored: Any,
     ) -> str:
         from huggingface_hub import snapshot_download
-
         return snapshot_download(
             repo_id = repo_id,
             repo_type = repo_type,
@@ -115,6 +113,7 @@ def _shared_snapshot_download_with_xet_fallback(
             force_download = force_download,
         )
 
+
 __all__ = [
     "DEFAULT_GRACE_PERIOD",
     "DEFAULT_HEARTBEAT_INTERVAL",

From 1fda803fe6f82a10a4ab5355d121f2280430a15d Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Thu, 25 Jun 2026 08:54:50 +0000
Subject: [PATCH 11/80] Fix forced double-download, guard diffusion path, honor
 cancel in degraded shim (review 4)

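- maybe_prefetch_hf_snapshot now returns whether it warmed the cache, and the
  language, vision and PEFT adapter call sites clear force_download after a
  successful warm. force_download=True previously warmed the cache in the
  killable child and then re-forced the same download in-process over Xet,
  defeating the protection. (The new diffusion call site does not capture the
  return value in this patch, so it does not clear force_download.)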
- Add the prefetch guard to the diffusion FastModel path (diffusion.py), which
  loaded weights via model_cls.from_pretrained without the Xet->HTTP recovery the
  vision/language paths get.
- Degraded Studio shim (older unsloth_zoo) now honors cancel_event: it raises
  before starting and after finishing the plain HF download instead of dropping
  cancellation via **kwargs.
- Prewarm skips redundant *.bin when the repo also ships safetensors (best-effort
  model_info), so a dual-format repo is not double-downloaded before Transformers
  picks safetensors. Threaded use_safetensors through the call sites.
- Studio shim test imports the shared helper optionally so the degraded-path test
  runs on an unsloth_zoo without it; added a degraded cancellation test.
---
 studio/backend/tests/test_hf_xet_fallback.py | 27 +++++-
 studio/backend/utils/hf_xet_fallback.py      | 23 +++++-
 unsloth/models/_utils.py                     | 86 +++++++++++++++-----
 unsloth/models/diffusion.py                  | 18 +++-
 unsloth/models/llama.py                      |  7 +-
 unsloth/models/loader.py                     | 14 +++-
 unsloth/models/vision.py                     |  7 +-
 7 files changed, 155 insertions(+), 27 deletions(-)

diff --git a/studio/backend/tests/test_hf_xet_fallback.py b/studio/backend/tests/test_hf_xet_fallback.py
index 40904cf5ed..067435594e 100644
--- a/studio/backend/tests/test_hf_xet_fallback.py
+++ b/studio/backend/tests/test_hf_xet_fallback.py
@@ -36,14 +36,27 @@
 
 import huggingface_hub
 
-import unsloth_zoo.hf_xet_fallback as shared
+try:
+    import unsloth_zoo.hf_xet_fallback as shared
+except ModuleNotFoundError as exc:
+    # The degraded-path test must still run on an unsloth_zoo without the helper.
+    if exc.name != "unsloth_zoo.hf_xet_fallback":
+        raise
+    shared = None
+
 import utils.hf_xet_fallback as xf
 
 
 DL_REPO, FILE = "ztest/xet-dl", "model-Q4_K_XL.gguf"
 
 
+def _requires_shared():
+    if shared is None:
+        pytest.skip("unsloth_zoo.hf_xet_fallback is not installed in this environment")
+
+
 def test_shim_reexports_shared_api():
+    _requires_shared()
     assert xf.DownloadStallError is shared.DownloadStallError
     for name in (
         "start_watchdog",
@@ -65,6 +78,7 @@ def test_shim_injects_studio_prepare_on_http_retry(monkeypatch):
     """A stall on Xet retries over HTTP, and the shim runs Studio's marker-aware
     ``prepare_cache_for_transport(..., 'http')`` before the retry (not the generic
     delete-incompletes default)."""
+    _requires_shared()
     for var in ("UNSLOTH_DISABLE_XET", "UNSLOTH_STABLE_DOWNLOADS", "HF_HUB_DISABLE_XET"):
         monkeypatch.delenv(var, raising = False)
     monkeypatch.setattr(huggingface_hub, "try_to_load_from_cache", lambda *a, **k: None)
@@ -154,6 +168,17 @@ def _fake_snapshot(repo_id, **kwargs):
         monkeypatch.setattr(huggingface_hub, "snapshot_download", _fake_snapshot)
         assert degraded.snapshot_download_with_xet_fallback("org/model") == "/snap-dir"
         assert called["repo_id"] == "org/model"
+
+        # Cancellation contract is preserved even in degraded mode: an already-set
+        # cancel_event must abort before starting the plain HF download.
+        import threading as _threading
+
+        cancelled = _threading.Event()
+        cancelled.set()
+        called.clear()
+        with pytest.raises(RuntimeError, match = "Cancelled"):
+            degraded.snapshot_download_with_xet_fallback("org/model", cancel_event = cancelled)
+        assert "repo_id" not in called, "degraded download ran despite cancellation"
     finally:
         sys.meta_path.remove(finder)
         sys.modules.pop("utils.hf_xet_fallback", None)
diff --git a/studio/backend/utils/hf_xet_fallback.py b/studio/backend/utils/hf_xet_fallback.py
index 22fced62bb..f381ce7e8b 100644
--- a/studio/backend/utils/hf_xet_fallback.py
+++ b/studio/backend/utils/hf_xet_fallback.py
@@ -67,6 +67,9 @@ def get_hf_download_state(*args: Any, **kwargs: Any) -> None:
     def start_watchdog(**kwargs: Any) -> "threading.Event":
         return threading.Event()  # never set; no stall detection in degraded mode
 
+    def _degraded_cancelled(cancel_event: "Optional[threading.Event]") -> bool:
+        return cancel_event is not None and cancel_event.is_set()
+
     def _shared_hf_hub_download_with_xet_fallback(
         repo_id: str,
         filename: str,
@@ -76,11 +79,17 @@ def _shared_hf_hub_download_with_xet_fallback(
         revision: Optional[str] = None,
         cache_dir: Optional[str] = None,
         force_download: bool = False,
+        cancel_event: "Optional[threading.Event]" = None,
         **_ignored: Any,
     ) -> str:
+        # No subprocess to interrupt mid-call here, but keep the cancellation
+        # contract: do not start, and do not return, a download once cancelled.
+        if _degraded_cancelled(cancel_event):
+            raise RuntimeError("Cancelled")
+
         from huggingface_hub import hf_hub_download
 
-        return hf_hub_download(
+        path = hf_hub_download(
             repo_id = repo_id,
             filename = filename,
             token = token,
@@ -89,6 +98,9 @@ def _shared_hf_hub_download_with_xet_fallback(
             cache_dir = cache_dir,
             force_download = force_download,
         )
+        if _degraded_cancelled(cancel_event):
+            raise RuntimeError("Cancelled")
+        return path
 
     def _shared_snapshot_download_with_xet_fallback(
         repo_id: str,
@@ -100,11 +112,15 @@ def _shared_snapshot_download_with_xet_fallback(
         allow_patterns: Optional[Any] = None,
         ignore_patterns: Optional[Any] = None,
         force_download: bool = False,
+        cancel_event: "Optional[threading.Event]" = None,
         **_ignored: Any,
     ) -> str:
+        if _degraded_cancelled(cancel_event):
+            raise RuntimeError("Cancelled")
+
         from huggingface_hub import snapshot_download
 
-        return snapshot_download(
+        path = snapshot_download(
             repo_id = repo_id,
             repo_type = repo_type,
             revision = revision,
@@ -114,6 +130,9 @@ def _shared_snapshot_download_with_xet_fallback(
             ignore_patterns = ignore_patterns,
             force_download = force_download,
         )
+        if _degraded_cancelled(cancel_event):
+            raise RuntimeError("Cancelled")
+        return path
 
 __all__ = [
     "DEFAULT_GRACE_PERIOD",
diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py
index 5131ea7c34..587d86c4aa 100644
--- a/unsloth/models/_utils.py
+++ b/unsloth/models/_utils.py
@@ -847,6 +847,50 @@ def _run_temporary_patches(phase):
 )
 
 
+def _prefetch_ignore_patterns(
+    model_name,
+    *,
+    token = None,
+    revision = None,
+    subfolder = None,
+    use_safetensors = None,
+):
+    """ignore_patterns for the prewarm snapshot: the static skip list, minus the
+    checkpoint guard when loading from a checkpoint-* subfolder, plus *.bin when
+    the repo also ships safetensors (Transformers prefers safetensors, so pulling
+    the .bin copies just to discard them doubles the very download we optimize)."""
+    # A checkpoint-* subfolder is exactly what "checkpoint-*/*" would drop, so
+    # do not ignore it when the caller is explicitly loading from that subfolder.
+    ignore_patterns = [
+        pattern
+        for pattern in _PREFETCH_IGNORE_PATTERNS
+        if not (
+            pattern == "checkpoint-*/*"
+            and isinstance(subfolder, str)
+            and subfolder.startswith("checkpoint-")
+        )
+    ]
+    # Skip .bin only when the caller has not explicitly asked for it and the repo
+    # actually ships safetensors to load instead. Best-effort: any failure leaves
+    # both formats eligible (correct, just less efficient).
+    if use_safetensors is not False:
+        try:
+            from huggingface_hub import HfApi
+
+            siblings = HfApi().model_info(
+                model_name, revision = revision, token = token,
+            ).siblings or []
+            has_safetensors = any(
+                sibling.rfilename.endswith((".safetensors", ".safetensors.index.json"))
+                for sibling in siblings
+            )
+            if has_safetensors:
+                ignore_patterns.extend(("*.bin", "*.bin.index.json"))
+        except Exception:
+            pass
+    return ignore_patterns
+
+
 def maybe_prefetch_hf_snapshot(
     model_name,
     token = None,
@@ -857,6 +901,7 @@ def maybe_prefetch_hf_snapshot(
     fast_inference = False,
     subfolder = None,
     force_download = False,
+    use_safetensors = None,
 ):
     """Warm the Hugging Face cache for a remote repo before the in-process load.
 
@@ -866,6 +911,12 @@ def maybe_prefetch_hf_snapshot(
     Xet to plain HTTP on a no-progress stall (unsloth_zoo.hf_xet_fallback); the
     from_pretrained that follows is then a cache hit and cannot stall on Xet.
 
+    Returns True iff the snapshot was warmed in the killable subprocess, so the
+    caller can clear force_download for the in-process load (else a forced reload
+    would re-download over the very Xet path this avoids). Returns False when
+    warming was skipped (local path / offline / local_files_only / fast_inference,
+    or an older unsloth_zoo) or failed.
+
     Best-effort: a deterministic failure (missing repo, auth, disk) is left for
     from_pretrained to surface canonically; only a both-transports-stalled
     DownloadStallError is raised here, to avoid a silent in-process hang.
@@ -877,15 +928,15 @@ def maybe_prefetch_hf_snapshot(
         )
     except Exception:
         # Older unsloth_zoo without the helper: skip warming, load normally.
-        return
+        return False
 
     if not isinstance(model_name, str) or not model_name:
-        return
+        return False
     # A local directory / file path has nothing to download. Expand ~ first, since
     # os.path.exists does not, so a home-relative path is detected as local.
     model_path = os.path.expanduser(model_name)
     if os.path.isdir(model_path) or os.path.exists(model_path):
-        return
+        return False
     # A path that looks local but is not on disk yet (e.g. a not-created output
     # dir) is still not a Hub repo id ("org/name"); leave it for from_pretrained
     # to surface canonically rather than trying to download it.
@@ -894,30 +945,26 @@ def maybe_prefetch_hf_snapshot(
         or model_name.startswith(("~", "./", "../", ".\\", "..\\"))
         or "\\" in model_name
     ):
-        return
+        return False
     # Offline / cache-only: never reach out.
     if local_files_only:
-        return
+        return False
     if any(
         os.environ.get(flag, "0").lower() in ("1", "true", "yes", "on")
         for flag in ("HF_HUB_OFFLINE", "TRANSFORMERS_OFFLINE")
     ):
-        return
+        return False
     # vLLM has its own download path; leave it untouched.
     if fast_inference:
-        return
+        return False
 
-    # A checkpoint-* subfolder is exactly what "checkpoint-*/*" would drop, so
-    # do not ignore it when the caller is explicitly loading from that subfolder.
-    ignore_patterns = [
-        pattern
-        for pattern in _PREFETCH_IGNORE_PATTERNS
-        if not (
-            pattern == "checkpoint-*/*"
-            and isinstance(subfolder, str)
-            and subfolder.startswith("checkpoint-")
-        )
-    ]
+    ignore_patterns = _prefetch_ignore_patterns(
+        model_name,
+        token = token,
+        revision = revision,
+        subfolder = subfolder,
+        use_safetensors = use_safetensors,
+    )
     try:
         snapshot_download_with_xet_fallback(
             model_name,
@@ -927,6 +974,7 @@ def maybe_prefetch_hf_snapshot(
             ignore_patterns = ignore_patterns,
             force_download = force_download,
         )
+        return True
     except DownloadStallError:
         # Both Xet and HTTP stalled: surface a clear network error instead of
         # letting the in-process load hang on the same stall.
@@ -936,7 +984,7 @@ def maybe_prefetch_hf_snapshot(
             f"Unsloth: Could not pre-download {model_name} "
             f"({type(exception).__name__}: {exception}); continuing with the normal load."
         )
-    return
+        return False
 
 
 # Ignore logging messages
diff --git a/unsloth/models/diffusion.py b/unsloth/models/diffusion.py
index 12596b432e..3ae52bb769 100644
--- a/unsloth/models/diffusion.py
+++ b/unsloth/models/diffusion.py
@@ -24,7 +24,7 @@
 import torch
 from transformers import AutoConfig, AutoProcessor, AutoTokenizer
 
-from ._utils import is_bfloat16_supported
+from ._utils import is_bfloat16_supported, maybe_prefetch_hf_snapshot
 from .llama import logger
 
 __all__ = ["FastDiffusionModel", "DIFFUSION_MODEL_TYPES", "is_diffusion_model_type"]
@@ -152,6 +152,22 @@ def from_pretrained(
                 os.environ.get("HF_HUB_OFFLINE", "0") == "1"
                 or os.environ.get("TRANSFORMERS_OFFLINE", "0") == "1"
             )
+
+        # Pre-download the repo in a killable subprocess that falls back from Xet
+        # to HTTP on a no-progress stall, so the config + weight loads below are
+        # cache hits and cannot hang on a stalled Xet transfer.
+        maybe_prefetch_hf_snapshot(
+            model_name,
+            token = token,
+            revision = revision,
+            cache_dir = kwargs.get("cache_dir"),
+            local_files_only = local_files_only,
+            fast_inference = False,
+            subfolder = kwargs.get("subfolder"),
+            force_download = kwargs.get("force_download", False),
+            use_safetensors = kwargs.get("use_safetensors"),
+        )
+
         config = _load_diffusion_config(
             model_name,
             token,
diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py
index a588d35d2b..7d93188c0a 100644
--- a/unsloth/models/llama.py
+++ b/unsloth/models/llama.py
@@ -2376,7 +2376,7 @@ def from_pretrained(
         # Pre-download the repo in a killable subprocess that falls back from Xet
         # to HTTP on a no-progress stall, so the in-process load below is a cache
         # hit and cannot hang on a stalled Xet transfer.
-        maybe_prefetch_hf_snapshot(
+        _prefetched = maybe_prefetch_hf_snapshot(
             model_name,
             token = token,
             revision = revision,
@@ -2385,7 +2385,12 @@ def from_pretrained(
             fast_inference = fast_inference,
             subfolder = kwargs.get("subfolder"),
             force_download = kwargs.get("force_download", False),
+            use_safetensors = kwargs.get("use_safetensors"),
         )
+        # The killable child already did the forced download; clear the flag so the
+        # in-process load reuses that warm cache instead of re-forcing over Xet.
+        if _prefetched and kwargs.get("force_download", False):
+            kwargs["force_download"] = False
 
         if dtype is None:
             dtype = torch.float16 if not SUPPORTS_BFLOAT16 else torch.bfloat16
diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py
index e5ea73091c..2c5e1027bf 100644
--- a/unsloth/models/loader.py
+++ b/unsloth/models/loader.py
@@ -865,7 +865,7 @@ def from_pretrained(
             # Now add PEFT adapters
             # Warm the adapter repo first: PeftModel.from_pretrained downloads it
             # in-process and can hang on a stalled Xet transfer like the base model.
-            maybe_prefetch_hf_snapshot(
+            _prefetched = maybe_prefetch_hf_snapshot(
                 old_model_name,
                 token = token,
                 revision = revision,
@@ -874,7 +874,12 @@ def from_pretrained(
                 fast_inference = fast_inference,
                 subfolder = kwargs.get("subfolder"),
                 force_download = kwargs.get("force_download", False),
+                use_safetensors = kwargs.get("use_safetensors"),
             )
+            # The killable child already did the forced download; clear the flag so
+            # the in-process load reuses that warm cache instead of re-forcing.
+            if _prefetched and kwargs.get("force_download", False):
+                kwargs["force_download"] = False
             model = PeftModel.from_pretrained(
                 model,
                 old_model_name,
@@ -1792,7 +1797,7 @@ def _patched_car(
 
             # Warm the adapter repo first: PeftModel.from_pretrained downloads it
             # in-process and can hang on a stalled Xet transfer like the base model.
-            maybe_prefetch_hf_snapshot(
+            _prefetched = maybe_prefetch_hf_snapshot(
                 old_model_name,
                 token = token,
                 revision = revision,
@@ -1801,7 +1806,12 @@ def _patched_car(
                 fast_inference = fast_inference,
                 subfolder = kwargs.get("subfolder"),
                 force_download = kwargs.get("force_download", False),
+                use_safetensors = kwargs.get("use_safetensors"),
             )
+            # The killable child already did the forced download; clear the flag so
+            # the in-process load reuses that warm cache instead of re-forcing.
+            if _prefetched and kwargs.get("force_download", False):
+                kwargs["force_download"] = False
             try:
                 model = PeftModel.from_pretrained(
                     model,
diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py
index d4c9383e1a..3f3dc9047a 100644
--- a/unsloth/models/vision.py
+++ b/unsloth/models/vision.py
@@ -761,7 +761,7 @@ def from_pretrained(
         # Pre-download the repo in a killable subprocess that falls back from Xet
         # to HTTP on a no-progress stall, so the in-process load below is a cache
         # hit and cannot hang on a stalled Xet transfer.
-        maybe_prefetch_hf_snapshot(
+        _prefetched = maybe_prefetch_hf_snapshot(
             model_name,
             token = token,
             revision = kwargs.get("revision"),
@@ -770,7 +770,12 @@ def from_pretrained(
             fast_inference = fast_inference,
             subfolder = kwargs.get("subfolder"),
             force_download = kwargs.get("force_download", False),
+            use_safetensors = kwargs.get("use_safetensors"),
         )
+        # The killable child already did the forced download; clear the flag so the
+        # in-process load reuses that warm cache instead of re-forcing over Xet.
+        if _prefetched and kwargs.get("force_download", False):
+            kwargs["force_download"] = False
 
         if dtype is None:
             dtype = torch.float16 if not SUPPORTS_BFLOAT16 else torch.bfloat16

From 0387885e98ef89ff8a84cabc9ba374241d3425a5 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 25 Jun 2026 08:56:21 +0000
Subject: [PATCH 12/80] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 unsloth/models/_utils.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py
index 587d86c4aa..8146dfb82b 100644
--- a/unsloth/models/_utils.py
+++ b/unsloth/models/_utils.py
@@ -877,9 +877,16 @@ def _prefetch_ignore_patterns(
         try:
             from huggingface_hub import HfApi
 
-            siblings = HfApi().model_info(
-                model_name, revision = revision, token = token,
-            ).siblings or []
+            siblings = (
+                HfApi()
+                .model_info(
+                    model_name,
+                    revision = revision,
+                    token = token,
+                )
+                .siblings
+                or []
+            )
             has_safetensors = any(
                 sibling.rfilename.endswith((".safetensors", ".safetensors.index.json"))
                 for sibling in siblings

From e229d389c55aa4ef987d82003b6bb3d47a7e2a0e Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Thu, 25 Jun 2026 10:05:58 +0000
Subject: [PATCH 13/80] Diffusion prefetch ordering/cache_dir and degraded
 heartbeats (review 4 P2)

- Diffusion: defer the prefetch until after the config/class validation, so a
  non-diffusion (or unsupported) repo fails on config metadata alone instead of
  first snapshotting multi-GB weights; and thread cache_dir through the config,
  weight and processor/tokenizer loads so the warmed cache is the one they read
  (previously the prefetch warmed a custom cache the loads ignored).
- Degraded Studio shim: start_watchdog now emits periodic heartbeats (stall
  detection still disabled) so the orchestrator's inactivity deadline is not
  tripped during a long load on an older unsloth_zoo.
- Extended the degraded-path test to assert heartbeats are emitted.
---
 studio/backend/tests/test_hf_xet_fallback.py | 17 +++++++++
 studio/backend/utils/hf_xet_fallback.py      | 28 +++++++++++++-
 unsloth/models/diffusion.py                  | 40 ++++++++++++--------
 3 files changed, 68 insertions(+), 17 deletions(-)

diff --git a/studio/backend/tests/test_hf_xet_fallback.py b/studio/backend/tests/test_hf_xet_fallback.py
index ffb3b86ca7..ce4bd3b035 100644
--- a/studio/backend/tests/test_hf_xet_fallback.py
+++ b/studio/backend/tests/test_hf_xet_fallback.py
@@ -163,6 +163,23 @@ def find_spec(
         event = degraded.start_watchdog(repo_ids = ["x"], on_stall = lambda m: None)
         assert hasattr(event, "set") and not event.is_set()  # never fires
 
+        # Degraded mode still emits heartbeats so the orchestrator's inactivity
+        # deadline is not tripped during a long load.
+        import time as _time
+
+        beats = []
+        hb_stop = degraded.start_watchdog(
+            repo_ids = ["x"], on_stall = lambda m: None,
+            on_heartbeat = beats.append, interval = 0.02,
+        )
+        try:
+            deadline = _time.monotonic() + 2.0
+            while not beats and _time.monotonic() < deadline:
+                _time.sleep(0.02)
+            assert beats, "degraded watchdog emitted no heartbeat"
+        finally:
+            hb_stop.set()
+
         # Downloads fall back to plain huggingface_hub (no watchdog, no crash).
         called = {}
 
diff --git a/studio/backend/utils/hf_xet_fallback.py b/studio/backend/utils/hf_xet_fallback.py
index 0d1fe43d4b..e94d091907 100644
--- a/studio/backend/utils/hf_xet_fallback.py
+++ b/studio/backend/utils/hf_xet_fallback.py
@@ -64,8 +64,32 @@ def child_should_disable_xet(config: dict) -> bool:
     def get_hf_download_state(*args: Any, **kwargs: Any) -> None:
         return None  # unmeasurable -> the (absent) watchdog never fires
 
-    def start_watchdog(**kwargs: Any) -> "threading.Event":
-        return threading.Event()  # never set; no stall detection in degraded mode
+    def start_watchdog(
+        *,
+        on_heartbeat: "Optional[Callable[[str], None]]" = None,
+        interval: float = DEFAULT_HEARTBEAT_INTERVAL,
+        xet_disabled: bool = False,
+        **kwargs: Any,
+    ) -> "threading.Event":
+        # No stall detection without the shared helper, but keep emitting heartbeat
+        # statuses so the orchestrator's inactivity deadline is not tripped during a
+        # legitimately long load/download in this degraded mode.
+        stop = threading.Event()
+        if on_heartbeat is None:
+            return stop
+        transport = "https" if xet_disabled else "xet"
+
+        def _beat() -> None:
+            while not stop.wait(interval):
+                try:
+                    on_heartbeat(f"Downloading ({transport} transport)...")
+                except Exception:
+                    pass
+
+        threading.Thread(
+            target = _beat, daemon = True, name = "hf-xet-degraded-heartbeat",
+        ).start()
+        return stop
 
     def _degraded_cancelled(cancel_event: "Optional[threading.Event]") -> bool:
         return cancel_event is not None and cancel_event.is_set()
diff --git a/unsloth/models/diffusion.py b/unsloth/models/diffusion.py
index 3ae52bb769..28c50a5835 100644
--- a/unsloth/models/diffusion.py
+++ b/unsloth/models/diffusion.py
@@ -79,7 +79,7 @@ def _resolve_diffusion_model_class(config):
     )
 
 
-def _load_diffusion_config(model_name, token, trust_remote_code, revision, local_files_only):
+def _load_diffusion_config(model_name, token, trust_remote_code, revision, local_files_only, cache_dir = None):
     """Load the config, aliasing the legacy ``diffusion_gemma`` model_type to the ``diffusion_gemma4``
     classes current transformers ships. AutoConfig raises on the legacy type; catch that, rewrite the
     type/arch names in-memory, and rebuild."""
@@ -90,6 +90,7 @@ def _load_diffusion_config(model_name, token, trust_remote_code, revision, local
             trust_remote_code = trust_remote_code,
             revision = revision,
             local_files_only = local_files_only,
+            cache_dir = cache_dir,
         )
     except ValueError as e:
         if "diffusion_gemma" not in str(e):
@@ -103,6 +104,7 @@ def _load_diffusion_config(model_name, token, trust_remote_code, revision, local
             token = token,
             revision = revision,
             local_files_only = local_files_only,
+            cache_dir = cache_dir,
         )
         with open(cfg_path, encoding = "utf-8") as f:
             cd = json.load(f)
@@ -153,20 +155,7 @@ def from_pretrained(
                 or os.environ.get("TRANSFORMERS_OFFLINE", "0") == "1"
             )
 
-        # Pre-download the repo in a killable subprocess that falls back from Xet
-        # to HTTP on a no-progress stall, so the config + weight loads below are
-        # cache hits and cannot hang on a stalled Xet transfer.
-        maybe_prefetch_hf_snapshot(
-            model_name,
-            token = token,
-            revision = revision,
-            cache_dir = kwargs.get("cache_dir"),
-            local_files_only = local_files_only,
-            fast_inference = False,
-            subfolder = kwargs.get("subfolder"),
-            force_download = kwargs.get("force_download", False),
-            use_safetensors = kwargs.get("use_safetensors"),
-        )
+        cache_dir = kwargs.get("cache_dir")
 
         config = _load_diffusion_config(
             model_name,
@@ -174,6 +163,7 @@ def from_pretrained(
             trust_remote_code,
             revision,
             local_files_only,
+            cache_dir = cache_dir,
         )
         model_type = getattr(config, "model_type", None)
         if not is_diffusion_model_type(model_type):
@@ -184,6 +174,23 @@ def from_pretrained(
 
         model_cls = _resolve_diffusion_model_class(config)
 
+        # The repo is confirmed a diffusion model: now pre-download it in a
+        # killable subprocess that falls back from Xet to HTTP on a no-progress
+        # stall, so the weight load below is a cache hit and cannot hang. Done
+        # after validation so a non-diffusion repo fails on config metadata alone,
+        # without first pulling multi-GB weights.
+        maybe_prefetch_hf_snapshot(
+            model_name,
+            token = token,
+            revision = revision,
+            cache_dir = cache_dir,
+            local_files_only = local_files_only,
+            fast_inference = False,
+            subfolder = kwargs.get("subfolder"),
+            force_download = kwargs.get("force_download", False),
+            use_safetensors = kwargs.get("use_safetensors"),
+        )
+
         load_kwargs = dict(
             dtype = dtype,
             device_map = device_map,
@@ -192,6 +199,7 @@ def from_pretrained(
             attn_implementation = attn_implementation,
             revision = revision,
             local_files_only = local_files_only,
+            cache_dir = cache_dir,
         )
 
         # Optional bitsandbytes quant. The MoE experts (3D Parameters) are not nn.Linear so bnb skips
@@ -238,6 +246,7 @@ def from_pretrained(
                 trust_remote_code = trust_remote_code,
                 revision = revision,
                 local_files_only = local_files_only,
+                cache_dir = cache_dir,
             )
         except Exception:
             tokenizer = AutoTokenizer.from_pretrained(
@@ -246,6 +255,7 @@ def from_pretrained(
                 trust_remote_code = trust_remote_code,
                 revision = revision,
                 local_files_only = local_files_only,
+                cache_dir = cache_dir,
             )
 
         return model, tokenizer

From 5a951962138f039701c93992cec3017aa62de14b Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 25 Jun 2026 10:06:26 +0000
Subject: [PATCH 14/80] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/backend/tests/test_hf_xet_fallback.py | 6 ++++--
 studio/backend/utils/hf_xet_fallback.py      | 4 +++-
 unsloth/models/diffusion.py                  | 9 ++++++++-
 3 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/studio/backend/tests/test_hf_xet_fallback.py b/studio/backend/tests/test_hf_xet_fallback.py
index ce4bd3b035..561e2ab7b0 100644
--- a/studio/backend/tests/test_hf_xet_fallback.py
+++ b/studio/backend/tests/test_hf_xet_fallback.py
@@ -169,8 +169,10 @@ def find_spec(
 
         beats = []
         hb_stop = degraded.start_watchdog(
-            repo_ids = ["x"], on_stall = lambda m: None,
-            on_heartbeat = beats.append, interval = 0.02,
+            repo_ids = ["x"],
+            on_stall = lambda m: None,
+            on_heartbeat = beats.append,
+            interval = 0.02,
         )
         try:
             deadline = _time.monotonic() + 2.0
diff --git a/studio/backend/utils/hf_xet_fallback.py b/studio/backend/utils/hf_xet_fallback.py
index e94d091907..fb98737125 100644
--- a/studio/backend/utils/hf_xet_fallback.py
+++ b/studio/backend/utils/hf_xet_fallback.py
@@ -87,7 +87,9 @@ def _beat() -> None:
                     pass
 
         threading.Thread(
-            target = _beat, daemon = True, name = "hf-xet-degraded-heartbeat",
+            target = _beat,
+            daemon = True,
+            name = "hf-xet-degraded-heartbeat",
         ).start()
         return stop
 
diff --git a/unsloth/models/diffusion.py b/unsloth/models/diffusion.py
index 28c50a5835..d22b1dda71 100644
--- a/unsloth/models/diffusion.py
+++ b/unsloth/models/diffusion.py
@@ -79,7 +79,14 @@ def _resolve_diffusion_model_class(config):
     )
 
 
-def _load_diffusion_config(model_name, token, trust_remote_code, revision, local_files_only, cache_dir = None):
+def _load_diffusion_config(
+    model_name,
+    token,
+    trust_remote_code,
+    revision,
+    local_files_only,
+    cache_dir = None,
+):
     """Load the config, aliasing the legacy ``diffusion_gemma`` model_type to the ``diffusion_gemma4``
     classes current transformers ships. AutoConfig raises on the legacy type; catch that, rewrite the
     type/arch names in-memory, and rebuild."""

From 4d02717b40f823d89cc3548ccf4e032eca346e62 Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Thu, 25 Jun 2026 14:50:48 +0000
Subject: [PATCH 15/80] Load tokenizers/processors from the cache the prefetch
 warmed (review)

When a caller passes a custom cache_dir, the prefetch warms it (including
tokenizer/processor files), but the tokenizer/processor loads did not receive
cache_dir, so they read a different cache and could start an in-process Hub/Xet
download after the killable prewarm. Forward the custom cache_dir to
load_correct_tokenizer (llama path; only when set, to keep its own default
otherwise) and to the AutoProcessor/AutoTokenizer loads (vision path; None stays
the HF default), so the warm cache is the one they read.
---
 unsloth/models/llama.py  | 8 ++++++++
 unsloth/models/vision.py | 2 ++
 2 files changed, 10 insertions(+)

diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py
index 7d93188c0a..6c55fd74e6 100644
--- a/unsloth/models/llama.py
+++ b/unsloth/models/llama.py
@@ -2687,6 +2687,13 @@ def from_pretrained(
 
         # Counteract saved tokenizers
         tokenizer_name = model_name if tokenizer_name is None else tokenizer_name
+        # When the caller set a custom cache_dir, the prefetch warmed it (incl.
+        # tokenizer files); route the tokenizer load there too so it reuses that
+        # warm cache instead of doing its own in-process Hub/Xet download. With no
+        # custom cache_dir, keep load_correct_tokenizer's own default.
+        _tokenizer_cache_kwargs = {}
+        if kwargs.get("cache_dir") is not None:
+            _tokenizer_cache_kwargs["cache_dir"] = kwargs["cache_dir"]
         tokenizer = load_correct_tokenizer(
             tokenizer_name = tokenizer_name,
             model_max_length = max_position_embeddings,
@@ -2694,6 +2701,7 @@ def from_pretrained(
             token = token,
             trust_remote_code = trust_remote_code,
             fix_tokenizer = fix_tokenizer,
+            **_tokenizer_cache_kwargs,
         )
 
         model, tokenizer = patch_tokenizer(model, tokenizer)
diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py
index 3f3dc9047a..58372b6eb0 100644
--- a/unsloth/models/vision.py
+++ b/unsloth/models/vision.py
@@ -1202,6 +1202,7 @@ def from_pretrained(
                     language = whisper_language,
                     task = whisper_task,
                     trust_remote_code = trust_remote_code,
+                    cache_dir = kwargs.get("cache_dir"),
                 )
             except Exception as e:
                 _processor_load_error = e
@@ -1213,6 +1214,7 @@ def from_pretrained(
                     padding_side = "left",
                     token = token,
                     trust_remote_code = trust_remote_code,
+                    cache_dir = kwargs.get("cache_dir"),
                 )
             except Exception as e:
                 _processor_load_error = e

From 1caea304c3979c178389d058cb5b0a952a1b5f7c Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Thu, 25 Jun 2026 15:29:39 +0000
Subject: [PATCH 16/80] Scope the prefetch .bin skip and forward cache_dir to
 fallback loads

- _utils.py: only treat a repo as "ships safetensors instead of .bin" when the
  safetensors live in the location the load actually reads (the requested
  subfolder, or the repo root). A mixed-layout repo whose loaded subfolder ships
  only .bin while safetensors sit elsewhere no longer has its .bin weights
  stripped from the prewarm snapshot. Adds _in_requested_load_scope.
- loader.py: forward cache_dir / subfolder / local_files_only into both
  PeftModel.from_pretrained sites so the in-process adapter load reuses the
  cache the prefetch warmed instead of the default cache.
- vision.py: thread cache_dir through the VLM fallback processor/tokenizer/
  config downloads (_construct_vlm_processor_fallback and the AutoTokenizer /
  PreTrainedTokenizerFast fallbacks) for the same reason. get_auto_processor is
  local-path only and never downloads, so it is left unchanged.
---
 unsloth/models/_utils.py | 20 ++++++++++++++++++++
 unsloth/models/loader.py | 22 ++++++++++++++++++++++
 unsloth/models/vision.py | 11 +++++++++--
 3 files changed, 51 insertions(+), 2 deletions(-)

diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py
index 8146dfb82b..f10025773c 100644
--- a/unsloth/models/_utils.py
+++ b/unsloth/models/_utils.py
@@ -847,6 +847,22 @@ def _run_temporary_patches(phase):
 )
 
 
+def _in_requested_load_scope(filename, subfolder):
+    """True if a repo-relative *filename* belongs to the location being loaded.
+
+    The load reads from *subfolder* when one is given, otherwise the repo root.
+    Used so the ".bin is redundant when safetensors exist" decision is keyed off
+    the files that load actually uses, not any safetensors elsewhere in the repo
+    (a subfolder that ships only .bin must keep its .bin even when an unrelated
+    subfolder ships safetensors).
+    """
+    filename = filename.replace("\\", "/")
+    if isinstance(subfolder, str) and subfolder.strip("/"):
+        return filename.startswith(subfolder.strip("/") + "/")
+    # Root load: only files at the repo root (no directory component).
+    return "/" not in filename
+
+
 def _prefetch_ignore_patterns(
     model_name,
     *,
@@ -887,8 +903,12 @@ def _prefetch_ignore_patterns(
                 .siblings
                 or []
             )
+            # Only count safetensors that the load will actually read (same
+            # subfolder / root), so a .bin-only subfolder is not stripped of its
+            # weights because some other path in the repo ships safetensors.
             has_safetensors = any(
                 sibling.rfilename.endswith((".safetensors", ".safetensors.index.json"))
+                and _in_requested_load_scope(sibling.rfilename, subfolder)
                 for sibling in siblings
             )
             if has_safetensors:
diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py
index 2c5e1027bf..7c102fda65 100644
--- a/unsloth/models/loader.py
+++ b/unsloth/models/loader.py
@@ -880,6 +880,16 @@ def from_pretrained(
             # the in-process load reuses that warm cache instead of re-forcing.
             if _prefetched and kwargs.get("force_download", False):
                 kwargs["force_download"] = False
+            # Read the adapter from the same place the prefetch warmed: forward
+            # cache_dir / subfolder when set and local_files_only when on, while
+            # leaving PEFT's own defaults untouched otherwise.
+            peft_load_kwargs = {}
+            if kwargs.get("cache_dir") is not None:
+                peft_load_kwargs["cache_dir"] = kwargs["cache_dir"]
+            if kwargs.get("subfolder") is not None:
+                peft_load_kwargs["subfolder"] = kwargs["subfolder"]
+            if local_files_only:
+                peft_load_kwargs["local_files_only"] = True
             model = PeftModel.from_pretrained(
                 model,
                 old_model_name,
@@ -887,6 +897,7 @@ def from_pretrained(
                 revision = revision,
                 is_trainable = True,
                 trust_remote_code = trust_remote_code,
+                **peft_load_kwargs,
             )
             # Patch it as well!
             model = dispatch_model.patch_peft_model(model, use_gradient_checkpointing)
@@ -1812,6 +1823,16 @@ def _patched_car(
             # the in-process load reuses that warm cache instead of re-forcing.
             if _prefetched and kwargs.get("force_download", False):
                 kwargs["force_download"] = False
+            # Read the adapter from the same place the prefetch warmed: forward
+            # cache_dir / subfolder when set and local_files_only when on, while
+            # leaving PEFT's own defaults untouched otherwise.
+            peft_load_kwargs = {}
+            if kwargs.get("cache_dir") is not None:
+                peft_load_kwargs["cache_dir"] = kwargs["cache_dir"]
+            if kwargs.get("subfolder") is not None:
+                peft_load_kwargs["subfolder"] = kwargs["subfolder"]
+            if local_files_only:
+                peft_load_kwargs["local_files_only"] = True
             try:
                 model = PeftModel.from_pretrained(
                     model,
@@ -1820,6 +1841,7 @@ def _patched_car(
                     revision = revision,
                     is_trainable = True,
                     trust_remote_code = trust_remote_code,
+                    **peft_load_kwargs,
                 )
             finally:
                 # Always restore original PEFT method, even if loading fails
diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py
index 58372b6eb0..36bfd3736a 100644
--- a/unsloth/models/vision.py
+++ b/unsloth/models/vision.py
@@ -466,7 +466,7 @@ def _missing_torchvision_error(error = None):
     return False
 
 
-def _construct_vlm_processor_fallback(tokenizer_name, model_type, token, trust_remote_code):
+def _construct_vlm_processor_fallback(tokenizer_name, model_type, token, trust_remote_code, cache_dir = None):
     """Construct a VLM processor manually when AutoProcessor.from_pretrained fails.
 
     Some VLMs (e.g., LFM2.5-VL) have tokenizer_class entries that AutoTokenizer
@@ -483,6 +483,7 @@ def _construct_vlm_processor_fallback(tokenizer_name, model_type, token, trust_r
             tokenizer_name,
             token = token,
             trust_remote_code = trust_remote_code,
+            cache_dir = cache_dir,
         )
         # Load tokenizer via PreTrainedTokenizerFast (bypasses tokenizer_class check)
         tok = PreTrainedTokenizerFast.from_pretrained(
@@ -490,12 +491,13 @@ def _construct_vlm_processor_fallback(tokenizer_name, model_type, token, trust_r
             padding_side = "left",
             token = token,
             trust_remote_code = trust_remote_code,
+            cache_dir = cache_dir,
         )
         # Read tokenizer_config.json for model-specific special tokens
         try:
             from huggingface_hub import hf_hub_download
 
-            config_path = hf_hub_download(tokenizer_name, "tokenizer_config.json", token = token)
+            config_path = hf_hub_download(tokenizer_name, "tokenizer_config.json", token = token, cache_dir = cache_dir)
             with open(config_path, "r", encoding = "utf-8") as f:
                 tok_config = json.load(f)
             # Set model-specific special tokens and their IDs
@@ -525,6 +527,7 @@ def _construct_vlm_processor_fallback(tokenizer_name, model_type, token, trust_r
                     tokenizer_name,
                     token = token,
                     trust_remote_code = trust_remote_code,
+                    cache_dir = cache_dir,
                 )
                 proc_class_name = PROCESSOR_MAPPING_NAMES.get(config.model_type)
             except Exception:
@@ -1238,6 +1241,7 @@ def from_pretrained(
                 model_type_arch,
                 token,
                 trust_remote_code,
+                cache_dir = kwargs.get("cache_dir"),
             )
             if _fallback is not None:
                 tokenizer = _fallback
@@ -1302,6 +1306,7 @@ def from_pretrained(
                     padding_side = "left",
                     token = token,
                     trust_remote_code = trust_remote_code,
+                    cache_dir = kwargs.get("cache_dir"),
                 )
                 model, _fallback_tok = patch_tokenizer(model, _fallback_tok)
                 # Re-attach as processor wrapper if original was a processor
@@ -1327,6 +1332,7 @@ def from_pretrained(
                     padding_side = "left",
                     token = token,
                     trust_remote_code = trust_remote_code,
+                    cache_dir = kwargs.get("cache_dir"),
                 )
             except Exception:
                 try:
@@ -1336,6 +1342,7 @@ def from_pretrained(
                         padding_side = "left",
                         token = token,
                         trust_remote_code = trust_remote_code,
+                        cache_dir = kwargs.get("cache_dir"),
                     )
                 except Exception:
                     del model

From a816c77d91c946ef7911d92d269084cbc445ca67 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 25 Jun 2026 15:31:44 +0000
Subject: [PATCH 17/80] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 unsloth/models/vision.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py
index 36bfd3736a..d308a49335 100644
--- a/unsloth/models/vision.py
+++ b/unsloth/models/vision.py
@@ -466,7 +466,13 @@ def _missing_torchvision_error(error = None):
     return False
 
 
-def _construct_vlm_processor_fallback(tokenizer_name, model_type, token, trust_remote_code, cache_dir = None):
+def _construct_vlm_processor_fallback(
+    tokenizer_name,
+    model_type,
+    token,
+    trust_remote_code,
+    cache_dir = None,
+):
     """Construct a VLM processor manually when AutoProcessor.from_pretrained fails.
 
     Some VLMs (e.g., LFM2.5-VL) have tokenizer_class entries that AutoTokenizer
@@ -497,7 +503,9 @@ def _construct_vlm_processor_fallback(tokenizer_name, model_type, token, trust_r
         try:
             from huggingface_hub import hf_hub_download
 
-            config_path = hf_hub_download(tokenizer_name, "tokenizer_config.json", token = token, cache_dir = cache_dir)
+            config_path = hf_hub_download(
+                tokenizer_name, "tokenizer_config.json", token = token, cache_dir = cache_dir
+            )
             with open(config_path, "r", encoding = "utf-8") as f:
                 tok_config = json.load(f)
             # Set model-specific special tokens and their IDs

From fc88b76a2dd833f31f8f84228064d4aaa2f97dc8 Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Fri, 26 Jun 2026 03:39:26 +0000
Subject: [PATCH 18/80] Fix Studio shim CI: degrade when unsloth_zoo is absent,
 clear lint blocker

The Studio shim broke every CI job that imports it without unsloth_zoo
installed (the Python test matrix, wheel build, and all Studio inference E2E
jobs, which start a backend that imports the shim). Two problems:

- The degradation guard only handled a missing submodule
  (name == "unsloth_zoo.hf_xet_fallback"). When unsloth_zoo is not installed at
  all, `import unsloth_zoo.hf_xet_fallback` raises ModuleNotFoundError with
  name == "unsloth_zoo" (the first missing component), so the guard re-raised and
  crashed import. Broaden it to degrade for both names. The original Studio file
  had no unsloth_zoo dependency, so those environments regressed.
- verify_import_hoist flagged HOISTED-IMPORT-UNUSED on DownloadStallError: it was
  bound by both a `from ... import` and a class def (the degraded branch). Import
  the shared module under a single used alias and bind every public name by
  assignment in both branches, so no name is import+def at module level.

Also:
- tests: broaden the test module's own import guard to the same two cases and add
  a regression test that the shim degrades when unsloth_zoo is entirely absent
  (the exact gap that broke CI).
- _utils.py: scope maybe_prefetch_hf_snapshot to the requested subfolder via
  allow_patterns instead of warming the whole repo, which also scopes the stall
  protection to the weights the load reads.
---
 studio/backend/tests/test_hf_xet_fallback.py | 49 ++++++++++++++++++--
 studio/backend/utils/hf_xet_fallback.py      | 49 +++++++++++---------
 unsloth/models/_utils.py                     |  8 ++++
 3 files changed, 82 insertions(+), 24 deletions(-)

diff --git a/studio/backend/tests/test_hf_xet_fallback.py b/studio/backend/tests/test_hf_xet_fallback.py
index 561e2ab7b0..6885389420 100644
--- a/studio/backend/tests/test_hf_xet_fallback.py
+++ b/studio/backend/tests/test_hf_xet_fallback.py
@@ -37,10 +37,13 @@
 import huggingface_hub
 
 try:
-    import unsloth_zoo.hf_xet_fallback as shared
+    import unsloth_zoo.hf_xet_fallback as _shared_mod
+    shared = _shared_mod
 except ModuleNotFoundError as exc:
-    # The degraded-path test must still run on an unsloth_zoo without the helper.
-    if exc.name != "unsloth_zoo.hf_xet_fallback":
+    # The degraded-path test must still collect when unsloth_zoo lacks the helper
+    # OR is not installed at all (a Studio-only test/build env without the heavy
+    # ML package) -- the same two cases the shim itself degrades for.
+    if exc.name not in ("unsloth_zoo", "unsloth_zoo.hf_xet_fallback"):
         raise
     shared = None
 
@@ -210,3 +213,43 @@ def _fake_snapshot(repo_id, **kwargs):
             sys.modules["unsloth_zoo.hf_xet_fallback"] = saved_shared
         if saved_shim is not None:
             sys.modules["utils.hf_xet_fallback"] = saved_shim
+
+
+def test_degrades_when_unsloth_zoo_entirely_absent():
+    """Studio-only test/build environments do not install unsloth_zoo at all. The
+    shim must degrade for that case too: ``import unsloth_zoo.hf_xet_fallback``
+    then raises ModuleNotFoundError(name='unsloth_zoo') (the top-level package),
+    not name='unsloth_zoo.hf_xet_fallback'. Regression guard for the shim re-raising
+    and breaking every Studio import that transitively pulls it in."""
+    import importlib
+
+    class _BlockZoo:
+        def find_spec(self, name, path = None, target = None):
+            # Simulate the whole package being absent: the first missing component
+            # is 'unsloth_zoo', so that is what ModuleNotFoundError.name carries.
+            if name == "unsloth_zoo" or name.startswith("unsloth_zoo."):
+                raise ModuleNotFoundError("No module named 'unsloth_zoo'", name = "unsloth_zoo")
+            return None
+
+    finder = _BlockZoo()
+    saved = {
+        k: v for k, v in list(sys.modules.items())
+        if k == "unsloth_zoo" or k.startswith("unsloth_zoo.")
+    }
+    for k in saved:
+        del sys.modules[k]
+    saved_shim = sys.modules.pop("utils.hf_xet_fallback", None)
+    sys.meta_path.insert(0, finder)
+    try:
+        degraded = importlib.import_module("utils.hf_xet_fallback")
+        # Boots without raising and exposes the stub API.
+        assert issubclass(degraded.DownloadStallError, RuntimeError)
+        assert degraded.get_hf_download_state(["x"]) is None
+        event = degraded.start_watchdog(repo_ids = ["x"], on_stall = lambda m: None)
+        assert hasattr(event, "set") and not event.is_set()
+    finally:
+        sys.meta_path.remove(finder)
+        sys.modules.pop("utils.hf_xet_fallback", None)
+        sys.modules.update(saved)
+        if saved_shim is not None:
+            sys.modules["utils.hf_xet_fallback"] = saved_shim
diff --git a/studio/backend/utils/hf_xet_fallback.py b/studio/backend/utils/hf_xet_fallback.py
index fb98737125..d0cf022210 100644
--- a/studio/backend/utils/hf_xet_fallback.py
+++ b/studio/backend/utils/hf_xet_fallback.py
@@ -18,29 +18,36 @@
 from typing import Any, Callable, Optional
 
 try:
-    from unsloth_zoo.hf_xet_fallback import (
-        DEFAULT_GRACE_PERIOD,
-        DEFAULT_HEARTBEAT_INTERVAL,
-        DEFAULT_STALL_TIMEOUT,
-        DownloadStallError,
-        child_should_disable_xet,
-        get_hf_download_state,
-        start_watchdog,
-    )
-    from unsloth_zoo.hf_xet_fallback import (
-        hf_hub_download_with_xet_fallback as _shared_hf_hub_download_with_xet_fallback,
-        snapshot_download_with_xet_fallback as _shared_snapshot_download_with_xet_fallback,
-    )
+    import unsloth_zoo.hf_xet_fallback as _shared
+    _shared_available = True
 except ModuleNotFoundError as exc:
-    if exc.name != "unsloth_zoo.hf_xet_fallback":
+    # Degrade when the shared helper is unavailable: either unsloth_zoo is too old
+    # to ship hf_xet_fallback, or unsloth_zoo is not importable at all (a
+    # Studio-only test/build environment without the heavy ML package). Re-raise
+    # any other missing module, which would indicate a real bug inside the helper.
+    if exc.name not in ("unsloth_zoo", "unsloth_zoo.hf_xet_fallback"):
         raise
-
-    # The shared helper lives in a newer unsloth_zoo. Rather than crash Studio at
-    # startup on an older (but dependency-satisfying) unsloth_zoo, degrade
-    # gracefully: plain HF downloads with the no-progress stall watchdog disabled
-    # -- the same best-effort posture core Unsloth uses in from_pretrained. The
-    # automatic Xet -> HTTP recovery returns as soon as unsloth_zoo is upgraded.
-    # These are thin stubs, not a second copy of the orchestration.
+    _shared_available = False
+
+if _shared_available:
+    # Bind the shared API by assignment (not `from ... import`) so each public name
+    # has a single, unambiguous module-level binding shared between this branch and
+    # the degraded one below.
+    DEFAULT_GRACE_PERIOD = _shared.DEFAULT_GRACE_PERIOD
+    DEFAULT_HEARTBEAT_INTERVAL = _shared.DEFAULT_HEARTBEAT_INTERVAL
+    DEFAULT_STALL_TIMEOUT = _shared.DEFAULT_STALL_TIMEOUT
+    DownloadStallError = _shared.DownloadStallError
+    child_should_disable_xet = _shared.child_should_disable_xet
+    get_hf_download_state = _shared.get_hf_download_state
+    start_watchdog = _shared.start_watchdog
+    _shared_hf_hub_download_with_xet_fallback = _shared.hf_hub_download_with_xet_fallback
+    _shared_snapshot_download_with_xet_fallback = _shared.snapshot_download_with_xet_fallback
+else:
+    # Rather than crash Studio at startup, degrade gracefully: plain HF downloads
+    # with the no-progress stall watchdog disabled -- the same best-effort posture
+    # core Unsloth uses in from_pretrained. Automatic Xet -> HTTP recovery returns
+    # as soon as unsloth_zoo is upgraded. These are thin stubs, not a second copy
+    # of the orchestration.
     import logging as _logging
 
     _logging.getLogger(__name__).warning(
diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py
index f10025773c..aa2d541c9a 100644
--- a/unsloth/models/_utils.py
+++ b/unsloth/models/_utils.py
@@ -992,12 +992,20 @@ def maybe_prefetch_hf_snapshot(
         subfolder = subfolder,
         use_safetensors = use_safetensors,
     )
+    # When loading from a subfolder, warm only that subfolder instead of the whole
+    # repo: a from_pretrained(..., subfolder=X) resolves every file under X/, so the
+    # rest is wasted bandwidth and disk. This also scopes the stall protection to the
+    # weights the load actually reads.
+    allow_patterns = None
+    if isinstance(subfolder, str) and subfolder.strip("/"):
+        allow_patterns = [f"{subfolder.strip('/')}/*"]
     try:
         snapshot_download_with_xet_fallback(
             model_name,
             token = token,
             revision = revision,
             cache_dir = cache_dir,
+            allow_patterns = allow_patterns,
             ignore_patterns = ignore_patterns,
             force_download = force_download,
         )

From 872ca5e03e8db43c8f6e5ff291e7e34de5a5e99b Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 26 Jun 2026 03:40:11 +0000
Subject: [PATCH 19/80] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/backend/tests/test_hf_xet_fallback.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/studio/backend/tests/test_hf_xet_fallback.py b/studio/backend/tests/test_hf_xet_fallback.py
index 6885389420..71da7a6655 100644
--- a/studio/backend/tests/test_hf_xet_fallback.py
+++ b/studio/backend/tests/test_hf_xet_fallback.py
@@ -224,7 +224,12 @@ def test_degrades_when_unsloth_zoo_entirely_absent():
     import importlib
 
     class _BlockZoo:
-        def find_spec(self, name, path = None, target = None):
+        def find_spec(
+            self,
+            name,
+            path = None,
+            target = None,
+        ):
             # Simulate the whole package being absent: the first missing component
             # is 'unsloth_zoo', so that is what ModuleNotFoundError.name carries.
             if name == "unsloth_zoo" or name.startswith("unsloth_zoo."):
@@ -233,7 +238,8 @@ def find_spec(self, name, path = None, target = None):
 
     finder = _BlockZoo()
     saved = {
-        k: v for k, v in list(sys.modules.items())
+        k: v
+        for k, v in list(sys.modules.items())
         if k == "unsloth_zoo" or k.startswith("unsloth_zoo.")
     }
     for k in saved:

From 4e8242734382bd754ed8c97ed33ba4161979d6d1 Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Fri, 26 Jun 2026 04:24:59 +0000
Subject: [PATCH 20/80] Degrade the Studio shim on ImportError too, not just
 ModuleNotFoundError

The Studio inference E2E jobs install unsloth_zoo but not torch (they serve
GGUF via llama.cpp and never need torch). Importing unsloth_zoo then runs its
package __init__, which raises ImportError("Pytorch is not installed") -- not a
ModuleNotFoundError. The shim only caught ModuleNotFoundError, so that
propagated and crashed the Studio server at startup (the orchestrator imports
DownloadStallError from the shim). The original Studio file never imported
unsloth_zoo, so a torch-less install worked.

Catch ImportError (which also covers ModuleNotFoundError) and degrade for any
import failure of the shared helper: absent/old unsloth_zoo, or unsloth_zoo
present but failing to import because torch is missing. Studio then boots with
plain HF downloads instead of crashing. The cause is logged. Stash the
exception in a module-level name first, since Python deletes the `as` target
when the except block exits.

tests: broaden the test module's own guard the same way and add a regression
test that the shim degrades when importing the helper raises ImportError
(torch-less install), the exact case that broke the inference E2E jobs.
---
 studio/backend/tests/test_hf_xet_fallback.py | 49 +++++++++++++++++---
 studio/backend/utils/hf_xet_fallback.py      | 27 +++++++----
 2 files changed, 60 insertions(+), 16 deletions(-)

diff --git a/studio/backend/tests/test_hf_xet_fallback.py b/studio/backend/tests/test_hf_xet_fallback.py
index 71da7a6655..9d9e694a9b 100644
--- a/studio/backend/tests/test_hf_xet_fallback.py
+++ b/studio/backend/tests/test_hf_xet_fallback.py
@@ -39,12 +39,10 @@
 try:
     import unsloth_zoo.hf_xet_fallback as _shared_mod
     shared = _shared_mod
-except ModuleNotFoundError as exc:
-    # The degraded-path test must still collect when unsloth_zoo lacks the helper
-    # OR is not installed at all (a Studio-only test/build env without the heavy
-    # ML package) -- the same two cases the shim itself degrades for.
-    if exc.name not in ("unsloth_zoo", "unsloth_zoo.hf_xet_fallback"):
-        raise
+except ImportError:
+    # The degraded-path test must still collect when unsloth_zoo lacks the helper,
+    # is not installed at all, or is installed but fails to import because torch is
+    # absent -- the same ImportError cases the shim itself degrades for.
     shared = None
 
 import utils.hf_xet_fallback as xf
@@ -259,3 +257,42 @@ def find_spec(
         sys.modules.update(saved)
         if saved_shim is not None:
             sys.modules["utils.hf_xet_fallback"] = saved_shim
+
+
+def test_degrades_when_shared_helper_import_raises_importerror():
+    """unsloth_zoo can be installed yet fail to import because a heavy dependency
+    it initializes at package import (torch) is missing -- a llama.cpp/GGUF-only
+    Studio install has no torch. That raises ImportError (not ModuleNotFoundError),
+    so the shim must degrade for it too rather than crash the Studio server at
+    startup. Regression guard for the inference E2E jobs."""
+    import importlib
+
+    class _BlockWithImportError:
+        def find_spec(self, name, path = None, target = None):
+            if name == "unsloth_zoo.hf_xet_fallback":
+                # Mirror unsloth_zoo/__init__ raising on a torch-less install: a
+                # plain ImportError with no .name, surfaced while importing the
+                # submodule's parent package.
+                raise ImportError("Unsloth: Pytorch is not installed.")
+            return None
+
+    finder = _BlockWithImportError()
+    saved_shared = sys.modules.pop("unsloth_zoo.hf_xet_fallback", None)
+    saved_zoo = sys.modules.pop("unsloth_zoo", None)
+    saved_shim = sys.modules.pop("utils.hf_xet_fallback", None)
+    sys.meta_path.insert(0, finder)
+    try:
+        degraded = importlib.import_module("utils.hf_xet_fallback")
+        assert issubclass(degraded.DownloadStallError, RuntimeError)
+        assert degraded.get_hf_download_state(["x"]) is None
+        event = degraded.start_watchdog(repo_ids = ["x"], on_stall = lambda m: None)
+        assert hasattr(event, "set") and not event.is_set()
+    finally:
+        sys.meta_path.remove(finder)
+        sys.modules.pop("utils.hf_xet_fallback", None)
+        if saved_shared is not None:
+            sys.modules["unsloth_zoo.hf_xet_fallback"] = saved_shared
+        if saved_zoo is not None:
+            sys.modules["unsloth_zoo"] = saved_zoo
+        if saved_shim is not None:
+            sys.modules["utils.hf_xet_fallback"] = saved_shim
diff --git a/studio/backend/utils/hf_xet_fallback.py b/studio/backend/utils/hf_xet_fallback.py
index d0cf022210..ce06e75b43 100644
--- a/studio/backend/utils/hf_xet_fallback.py
+++ b/studio/backend/utils/hf_xet_fallback.py
@@ -17,16 +17,22 @@
 import threading
 from typing import Any, Callable, Optional
 
+_shared_import_error = None
 try:
     import unsloth_zoo.hf_xet_fallback as _shared
     _shared_available = True
-except ModuleNotFoundError as exc:
-    # Degrade when the shared helper is unavailable: either unsloth_zoo is too old
-    # to ship hf_xet_fallback, or unsloth_zoo is not importable at all (a
-    # Studio-only test/build environment without the heavy ML package). Re-raise
-    # any other missing module, which would indicate a real bug inside the helper.
-    if exc.name not in ("unsloth_zoo", "unsloth_zoo.hf_xet_fallback"):
-        raise
+except ImportError as exc:
+    # The `as exc` name is unbound when the except block exits, so stash the cause
+    # in a module-level name the degraded branch below can still log.
+    _shared_import_error = exc
+    # Degrade whenever the shared helper cannot be imported. Real cases in Studio
+    # deployments: unsloth_zoo is absent or too old to ship hf_xet_fallback
+    # (ModuleNotFoundError), or unsloth_zoo IS installed but importing it raises
+    # ImportError because a heavy dependency it initializes at package import (e.g.
+    # torch) is missing -- a llama.cpp/GGUF-only Studio install has no torch. In
+    # every case Studio must still boot with plain HF downloads instead of crashing
+    # the server on import; a genuinely broken helper degrades here too, which is
+    # the intended best-effort posture (the cause is logged below).
     _shared_available = False
 
 if _shared_available:
@@ -51,9 +57,10 @@
     import logging as _logging
 
     _logging.getLogger(__name__).warning(
-        "unsloth_zoo.hf_xet_fallback not found; the Xet stall watchdog is "
-        "disabled. Upgrade unsloth_zoo to re-enable automatic Xet -> HTTP "
-        "download recovery."
+        "unsloth_zoo.hf_xet_fallback unavailable (%s); the Xet stall watchdog is "
+        "disabled. Install/upgrade unsloth_zoo (and its torch dependency) to "
+        "re-enable automatic Xet -> HTTP download recovery.",
+        _shared_import_error,
     )
 
     DEFAULT_HEARTBEAT_INTERVAL = 30.0

From 27b32b504bc6126c7fe75c35d65eb617b9cbb630 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 26 Jun 2026 04:26:22 +0000
Subject: [PATCH 21/80] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/backend/tests/test_hf_xet_fallback.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/studio/backend/tests/test_hf_xet_fallback.py b/studio/backend/tests/test_hf_xet_fallback.py
index 9d9e694a9b..0b4cbd802a 100644
--- a/studio/backend/tests/test_hf_xet_fallback.py
+++ b/studio/backend/tests/test_hf_xet_fallback.py
@@ -268,7 +268,12 @@ def test_degrades_when_shared_helper_import_raises_importerror():
     import importlib
 
     class _BlockWithImportError:
-        def find_spec(self, name, path = None, target = None):
+        def find_spec(
+            self,
+            name,
+            path = None,
+            target = None,
+        ):
             if name == "unsloth_zoo.hf_xet_fallback":
                 # Mirror unsloth_zoo/__init__ raising on a torch-less install: a
                 # plain ImportError with no .name, surfaced while importing the

From 163bc81bd175b00bfbf83e86e51a20186db73d59 Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Fri, 26 Jun 2026 05:18:45 +0000
Subject: [PATCH 22/80] Degrade/recover the Studio shim when unsloth_zoo GPU
 init fails

A third torch-less/GPU-less failure mode crashed the Studio inference E2E jobs
(CPU-only runners). unsloth_zoo's package __init__ runs torch/GPU device
detection, which raises NotImplementedError("...You need a GPU.") on a host with
no accelerator -- not an ImportError, so the previous guard let it propagate and
crash the server at startup (the orchestrator imports DownloadStallError from the
shim).

The download helper needs no device init, so on any import failure retry under
unsloth_zoo's documented light path (UNSLOTH_ZOO_DISABLE_GPU_INIT=1), which skips
the torch/GPU init. That lets the real helper load on CPU-only Studio hosts; the
first attempt keeps full init unchanged on a GPU host, and the env override is
restored after the import so it does not leak. If even the light retry fails
(unsloth_zoo absent/broken), degrade to plain HF downloads instead of crashing.

tests: broaden the test module guard to any import failure and add a regression
test that the shim retries under the light env (records [None, "1"]), restores
the env, and degrades when both attempts raise NotImplementedError.
---
 studio/backend/tests/test_hf_xet_fallback.py | 54 ++++++++++++++++++--
 studio/backend/utils/hf_xet_fallback.py      | 40 ++++++++++-----
 2 files changed, 78 insertions(+), 16 deletions(-)

diff --git a/studio/backend/tests/test_hf_xet_fallback.py b/studio/backend/tests/test_hf_xet_fallback.py
index 0b4cbd802a..5c8502ee12 100644
--- a/studio/backend/tests/test_hf_xet_fallback.py
+++ b/studio/backend/tests/test_hf_xet_fallback.py
@@ -39,10 +39,11 @@
 try:
     import unsloth_zoo.hf_xet_fallback as _shared_mod
     shared = _shared_mod
-except ImportError:
+except Exception:  # noqa: BLE001
     # The degraded-path test must still collect when unsloth_zoo lacks the helper,
-    # is not installed at all, or is installed but fails to import because torch is
-    # absent -- the same ImportError cases the shim itself degrades for.
+    # is not installed at all, fails to import because torch is absent (ImportError),
+    # or fails because the host has no GPU (NotImplementedError from its device
+    # init) -- the same failure cases the shim itself degrades for.
     shared = None
 
 import utils.hf_xet_fallback as xf
@@ -301,3 +302,50 @@ def find_spec(
             sys.modules["unsloth_zoo"] = saved_zoo
         if saved_shim is not None:
             sys.modules["utils.hf_xet_fallback"] = saved_shim
+
+
+def test_retries_under_light_gpu_init_when_import_fails(monkeypatch):
+    """unsloth_zoo's package __init__ runs torch/GPU device detection that raises
+    NotImplementedError on a GPU-less host (CPU GGUF Studio). The shim must retry
+    the import under UNSLOTH_ZOO_DISABLE_GPU_INIT=1 (its light path) -- which is
+    what lets the real helper load on CPU-only hosts -- then restore the env. If
+    even the retry fails, it degrades instead of crashing the server."""
+    import importlib
+    import os
+
+    monkeypatch.delenv("UNSLOTH_ZOO_DISABLE_GPU_INIT", raising = False)
+    seen_env = []
+
+    class _GpuGatedBlocker:
+        def find_spec(self, name, path = None, target = None):
+            if name == "unsloth_zoo.hf_xet_fallback":
+                # Record the env each import attempt sees; raise the no-GPU error
+                # both times so the shim ends up degrading (the recovery-succeeds
+                # path is covered by real unsloth_zoo on a CPU host in CI).
+                seen_env.append(os.environ.get("UNSLOTH_ZOO_DISABLE_GPU_INIT"))
+                raise NotImplementedError("Unsloth cannot find any torch accelerator")
+            return None
+
+    finder = _GpuGatedBlocker()
+    saved = {
+        k: v for k, v in list(sys.modules.items())
+        if k == "unsloth_zoo" or k.startswith("unsloth_zoo.")
+    }
+    for k in saved:
+        del sys.modules[k]
+    saved_shim = sys.modules.pop("utils.hf_xet_fallback", None)
+    sys.meta_path.insert(0, finder)
+    try:
+        degraded = importlib.import_module("utils.hf_xet_fallback")
+        # First attempt without the light env, then a retry with it set.
+        assert seen_env == [None, "1"], seen_env
+        # Both attempts raised -> Studio still boots in degraded mode.
+        assert issubclass(degraded.DownloadStallError, RuntimeError)
+        # The env override must not leak past the import.
+        assert os.environ.get("UNSLOTH_ZOO_DISABLE_GPU_INIT") is None
+    finally:
+        sys.meta_path.remove(finder)
+        sys.modules.pop("utils.hf_xet_fallback", None)
+        sys.modules.update(saved)
+        if saved_shim is not None:
+            sys.modules["utils.hf_xet_fallback"] = saved_shim
diff --git a/studio/backend/utils/hf_xet_fallback.py b/studio/backend/utils/hf_xet_fallback.py
index ce06e75b43..d9fbcea87d 100644
--- a/studio/backend/utils/hf_xet_fallback.py
+++ b/studio/backend/utils/hf_xet_fallback.py
@@ -21,19 +21,33 @@
 try:
     import unsloth_zoo.hf_xet_fallback as _shared
     _shared_available = True
-except ImportError as exc:
-    # The `as exc` name is unbound when the except block exits, so stash the cause
-    # in a module-level name the degraded branch below can still log.
-    _shared_import_error = exc
-    # Degrade whenever the shared helper cannot be imported. Real cases in Studio
-    # deployments: unsloth_zoo is absent or too old to ship hf_xet_fallback
-    # (ModuleNotFoundError), or unsloth_zoo IS installed but importing it raises
-    # ImportError because a heavy dependency it initializes at package import (e.g.
-    # torch) is missing -- a llama.cpp/GGUF-only Studio install has no torch. In
-    # every case Studio must still boot with plain HF downloads instead of crashing
-    # the server on import; a genuinely broken helper degrades here too, which is
-    # the intended best-effort posture (the cause is logged below).
-    _shared_available = False
+except Exception as _exc:  # noqa: BLE001 - any import failure must degrade, not crash
+    # unsloth_zoo runs torch/GPU device detection in its package __init__, which
+    # raises on a Studio host without torch (ImportError) or without a GPU
+    # (NotImplementedError) -- a CPU / llama.cpp GGUF-only deployment. The download
+    # helper needs none of that, so retry with unsloth_zoo's documented light import
+    # path (UNSLOTH_ZOO_DISABLE_GPU_INIT) before giving up. The first attempt above
+    # keeps full device init unchanged on a normal GPU host; a failed import is
+    # dropped from sys.modules, so the retry re-runs __init__ on the light path.
+    _shared_import_error = _exc
+    import os as _os
+
+    _prev_gpu_init = _os.environ.get("UNSLOTH_ZOO_DISABLE_GPU_INIT")
+    _os.environ["UNSLOTH_ZOO_DISABLE_GPU_INIT"] = "1"
+    try:
+        import unsloth_zoo.hf_xet_fallback as _shared
+        _shared_available = True
+        _shared_import_error = None
+    except Exception as _exc2:  # noqa: BLE001
+        # unsloth_zoo is absent/too old, or genuinely broken: degrade so Studio
+        # still boots with plain HF downloads instead of crashing on import.
+        _shared_import_error = _exc2
+        _shared_available = False
+    finally:
+        if _prev_gpu_init is None:
+            _os.environ.pop("UNSLOTH_ZOO_DISABLE_GPU_INIT", None)
+        else:
+            _os.environ["UNSLOTH_ZOO_DISABLE_GPU_INIT"] = _prev_gpu_init
 
 if _shared_available:
     # Bind the shared API by assignment (not `from ... import`) so each public name

From 9253c7bd5f63a4c7114f40f1b4ee578bcb7a1e2d Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 26 Jun 2026 05:21:48 +0000
Subject: [PATCH 23/80] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/backend/tests/test_hf_xet_fallback.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/studio/backend/tests/test_hf_xet_fallback.py b/studio/backend/tests/test_hf_xet_fallback.py
index 5c8502ee12..2abdb22e86 100644
--- a/studio/backend/tests/test_hf_xet_fallback.py
+++ b/studio/backend/tests/test_hf_xet_fallback.py
@@ -317,7 +317,12 @@ def test_retries_under_light_gpu_init_when_import_fails(monkeypatch):
     seen_env = []
 
     class _GpuGatedBlocker:
-        def find_spec(self, name, path = None, target = None):
+        def find_spec(
+            self,
+            name,
+            path = None,
+            target = None,
+        ):
             if name == "unsloth_zoo.hf_xet_fallback":
                 # Record the env each import attempt sees; raise the no-GPU error
                 # both times so the shim ends up degrading (the recovery-succeeds
@@ -328,7 +333,8 @@ def find_spec(self, name, path = None, target = None):
 
     finder = _GpuGatedBlocker()
     saved = {
-        k: v for k, v in list(sys.modules.items())
+        k: v
+        for k, v in list(sys.modules.items())
         if k == "unsloth_zoo" or k.startswith("unsloth_zoo.")
     }
     for k in saved:

From 2313f555e148768c30a388ec714b350f295bf064 Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Fri, 26 Jun 2026 06:14:34 +0000
Subject: [PATCH 24/80] Fix the GPU-init degradation test to intercept the
 parent import

test_retries_under_light_gpu_init_when_import_fails matched the meta_path finder
on "unsloth_zoo.hf_xet_fallback", but in CI unsloth_zoo is not installed, so
`import unsloth_zoo.hf_xet_fallback` fails on the parent "unsloth_zoo" import
before the submodule is ever reached -- the finder was only ever asked about
"unsloth_zoo", its name check never matched, and seen_env stayed empty. The real
GPU-less crash is in unsloth_zoo's package __init__ (the parent import), so match
"unsloth_zoo" itself. The finder is first on meta_path, so it intercepts the
parent import whether or not unsloth_zoo is installed, making the test behave
identically in both environments.
---
 studio/backend/tests/test_hf_xet_fallback.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/studio/backend/tests/test_hf_xet_fallback.py b/studio/backend/tests/test_hf_xet_fallback.py
index 2abdb22e86..5c42134587 100644
--- a/studio/backend/tests/test_hf_xet_fallback.py
+++ b/studio/backend/tests/test_hf_xet_fallback.py
@@ -323,7 +323,11 @@ def find_spec(
             path = None,
             target = None,
         ):
-            if name == "unsloth_zoo.hf_xet_fallback":
+            # The real crash is in unsloth_zoo's package __init__ (device
+            # detection), i.e. the PARENT import fails before the submodule is
+            # reached -- so intercept "unsloth_zoo" itself. This works whether or
+            # not unsloth_zoo is installed (the finder is first on meta_path).
+            if name == "unsloth_zoo":
                 # Record the env each import attempt sees; raise the no-GPU error
                 # both times so the shim ends up degrading (the recovery-succeeds
                 # path is covered by real unsloth_zoo on a CPU host in CI).

From 9c601c6898f4e9948294ff2a224880ca4ab85406 Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Fri, 26 Jun 2026 07:01:10 +0000
Subject: [PATCH 25/80] Treat use_safetensors as a format allowlist in the
 prefetch (Codex P2)

The prewarm snapshot only added the *.bin ignore after finding an in-scope
safetensors file, keying entirely off has_safetensors. Two cases were wrong:

- use_safetensors=True against a bin-only repo/subfolder found no safetensors, so
  *.bin was not ignored and the prefetch pulled multi-GB of PyTorch weights that
  the safetensors-only load then rejects.
- use_safetensors=False never ignored *.safetensors, so a mixed-format repo
  downloaded safetensors the .bin load never reads.

Treat an explicit use_safetensors as an allowlist: True skips *.bin /
*.bin.index.json outright (no model_info call needed), False skips *.safetensors /
*.safetensors.index.json. use_safetensors=None keeps the auto heuristic (skip *.bin
only when in-scope safetensors are also shipped, since Transformers prefers them).
---
 unsloth/models/_utils.py | 28 +++++++++++++++++++++-------
 1 file changed, 21 insertions(+), 7 deletions(-)

diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py
index aa2d541c9a..19cf5fc8c3 100644
--- a/unsloth/models/_utils.py
+++ b/unsloth/models/_utils.py
@@ -872,9 +872,13 @@ def _prefetch_ignore_patterns(
     use_safetensors = None,
 ):
     """ignore_patterns for the prewarm snapshot: the static skip list, minus the
-    checkpoint guard when loading from a checkpoint-* subfolder, plus *.bin when
-    the repo also ships safetensors (Transformers prefers safetensors, so pulling
-    the .bin copies just to discard them doubles the very download we optimize)."""
+    checkpoint guard when loading from a checkpoint-* subfolder, minus the weight
+    format the load will not read. An explicit use_safetensors is treated as a format
+    allowlist (True -> skip *.bin, False -> skip *.safetensors) so a bin-only repo is
+    not pulled in full just to be rejected by a safetensors-only load. use_safetensors
+    is None (auto) skips *.bin only when in-scope safetensors are also shipped, since
+    Transformers prefers them (pulling the other format just to discard it doubles the
+    very download we optimize)."""
     # A checkpoint-* subfolder is exactly what "checkpoint-*/*" would drop, so
     # do not ignore it when the caller is explicitly loading from that subfolder.
     ignore_patterns = [
@@ -886,10 +890,20 @@ def _prefetch_ignore_patterns(
             and subfolder.startswith("checkpoint-")
         )
     ]
-    # Skip .bin only when the caller has not explicitly asked for it and the repo
-    # actually ships safetensors to load instead. Best-effort: any failure leaves
-    # both formats eligible (correct, just less efficient).
-    if use_safetensors is not False:
+    # Drop the weight format the load will not read. Transformers reads exactly one
+    # format, so prefetching the other doubles the very download we optimize.
+    if use_safetensors is True:
+        # Explicit safetensors: the load never reads .bin, so skip it outright -- even
+        # for a bin-only repo, where the load fails anyway, do not pull multi-GB of
+        # PyTorch weights first. No model_info call needed for an explicit request.
+        ignore_patterns.extend(("*.bin", "*.bin.index.json"))
+    elif use_safetensors is False:
+        # Explicit .bin: the load never reads safetensors, so skip them.
+        ignore_patterns.extend(("*.safetensors", "*.safetensors.index.json"))
+    else:
+        # Auto (use_safetensors is None): skip .bin only once in-scope safetensors are
+        # confirmed to load instead, since Transformers prefers them. Best-effort: any
+        # failure leaves both formats eligible (correct, just less efficient).
         try:
             from huggingface_hub import HfApi
 

From ba2aa9f93c4698bb26db6aca92f836d85086c152 Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Fri, 26 Jun 2026 07:35:06 +0000
Subject: [PATCH 26/80] Fix three prefetch/load consistency issues from review
 (Codex P2)

loader.py: drop local_files_only from peft_load_kwargs. The merge with #6554
added an explicit local_files_only = local_files_only to both
PeftModel.from_pretrained calls, so also expanding it via **peft_load_kwargs
raised TypeError: got multiple values for keyword argument 'local_files_only'
whenever an adapter was loaded offline. Keep the explicit argument; forward only
cache_dir / subfolder through peft_load_kwargs.

llama.py: stop forwarding revision to the prefetch. FastLlamaModel.from_pretrained
consumes revision as a named parameter and never forwards it to the downstream
AutoConfig / AutoModel.from_pretrained (and model_name may be a remapped
prequantized repo where the caller's revision does not exist). Warming a specific
revision therefore predownloaded a snapshot the load never reads while leaving the
actual default-branch transfer unprotected. Warm the same default branch the load
resolves.

diffusion.py: forward an explicit use_safetensors to the real load. The prefetch
filters .bin / .safetensors by use_safetensors, but load_kwargs omitted it, so on a
mixed-format repo the in-process load could pick the other format and start an
unprotected Xet download after the warmup fetched a different one. Pass it through
when explicitly set (None/auto already matches the prefetch heuristic).
---
 unsloth/models/diffusion.py |  7 +++++++
 unsloth/models/llama.py     |  8 ++++++--
 unsloth/models/loader.py    | 12 ++++--------
 3 files changed, 17 insertions(+), 10 deletions(-)

diff --git a/unsloth/models/diffusion.py b/unsloth/models/diffusion.py
index d22b1dda71..0010b57aaa 100644
--- a/unsloth/models/diffusion.py
+++ b/unsloth/models/diffusion.py
@@ -208,6 +208,13 @@ def from_pretrained(
             local_files_only = local_files_only,
             cache_dir = cache_dir,
         )
+        # Honor an explicit weight format on the real load too, so it reads the same
+        # format the prefetch warmed. The prefetch filters .bin / .safetensors by
+        # use_safetensors; without forwarding it the load could pick the other format
+        # on a mixed-format repo and start an unprotected in-process Xet download.
+        # use_safetensors=None (auto) already matches the prefetch's auto heuristic.
+        if kwargs.get("use_safetensors") is not None:
+            load_kwargs["use_safetensors"] = kwargs["use_safetensors"]
 
         # Optional bitsandbytes quant. The MoE experts (3D Parameters) are not nn.Linear so bnb skips
         # them; only attention + dense MLP Linears quantize, lm_head/embeddings stay full precision.
diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py
index 76b29f412e..cd5ebd999c 100644
--- a/unsloth/models/llama.py
+++ b/unsloth/models/llama.py
@@ -2375,11 +2375,15 @@ def from_pretrained(
 
         # Pre-download the repo in a killable subprocess that falls back from Xet
         # to HTTP on a no-progress stall, so the in-process load below is a cache
-        # hit and cannot hang on a stalled Xet transfer.
+        # hit and cannot hang on a stalled Xet transfer. revision is intentionally
+        # not forwarded: the base-model load below resolves model_name (possibly a
+        # remapped prequantized repo, where the caller's revision does not exist) on
+        # its default branch without a revision, so warming a specific revision would
+        # predownload a snapshot the load never reads and leave the real transfer
+        # unprotected.
         _prefetched = maybe_prefetch_hf_snapshot(
             model_name,
             token = token,
-            revision = revision,
             cache_dir = kwargs.get("cache_dir"),
             local_files_only = kwargs.get("local_files_only", False),
             fast_inference = fast_inference,
diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py
index f0dab7b8e8..421d42fb4a 100644
--- a/unsloth/models/loader.py
+++ b/unsloth/models/loader.py
@@ -883,15 +883,13 @@ def from_pretrained(
             if _prefetched and kwargs.get("force_download", False):
                 kwargs["force_download"] = False
             # Read the adapter from the same place the prefetch warmed: forward
-            # cache_dir / subfolder when set and local_files_only when on, while
-            # leaving PEFT's own defaults untouched otherwise.
+            # cache_dir / subfolder when set (local_files_only is already passed
+            # explicitly below), while leaving PEFT's own defaults untouched otherwise.
             peft_load_kwargs = {}
             if kwargs.get("cache_dir") is not None:
                 peft_load_kwargs["cache_dir"] = kwargs["cache_dir"]
             if kwargs.get("subfolder") is not None:
                 peft_load_kwargs["subfolder"] = kwargs["subfolder"]
-            if local_files_only:
-                peft_load_kwargs["local_files_only"] = True
             model = PeftModel.from_pretrained(
                 model,
                 old_model_name,
@@ -1835,15 +1833,13 @@ def _patched_car(
             if _prefetched and kwargs.get("force_download", False):
                 kwargs["force_download"] = False
             # Read the adapter from the same place the prefetch warmed: forward
-            # cache_dir / subfolder when set and local_files_only when on, while
-            # leaving PEFT's own defaults untouched otherwise.
+            # cache_dir / subfolder when set (local_files_only is already passed
+            # explicitly below), while leaving PEFT's own defaults untouched otherwise.
             peft_load_kwargs = {}
             if kwargs.get("cache_dir") is not None:
                 peft_load_kwargs["cache_dir"] = kwargs["cache_dir"]
             if kwargs.get("subfolder") is not None:
                 peft_load_kwargs["subfolder"] = kwargs["subfolder"]
-            if local_files_only:
-                peft_load_kwargs["local_files_only"] = True
             try:
                 model = PeftModel.from_pretrained(
                     model,

From 6a25b5d911f4f5ceec750c79c171fba6a01fe68b Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Fri, 26 Jun 2026 08:13:42 +0000
Subject: [PATCH 27/80] Tighten prefetch heuristics from review (Codex P2)

_utils.py: only count model-weights safetensors when deciding the .bin prefetch is
redundant. The auto use_safetensors path treated any in-scope .safetensors as proof
Transformers would not read .bin, but a repo can ship an adapter_model.safetensors
sidecar while its real weights are pytorch_model.bin. That made the prefetch skip the
needed .bin, and a caller clearing force_download after the apparent success then loaded
the real weights in-process without the Xet fallback (or reused stale .bin). Exclude
adapter / sidecar safetensors (adapter_*) via _is_model_weight_safetensors; real
model.safetensors and diffusers diffusion_pytorch_model.safetensors still count.

loader.py: do not forward the base subfolder to the adapter load. subfolder targets the
base checkpoint, but it was being passed to both the adapter prefetch and
PeftModel.from_pretrained, so an adapter-at-root / base-in-subfolder setup looked for
the adapter under old_model_name/<subfolder> and failed to find it. Warm and load the
adapter repo at its root; cache_dir is still forwarded.
---
 unsloth/models/_utils.py | 25 +++++++++++++++++++++----
 unsloth/models/loader.py | 22 ++++++++++------------
 2 files changed, 31 insertions(+), 16 deletions(-)

diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py
index 19cf5fc8c3..579cb3d078 100644
--- a/unsloth/models/_utils.py
+++ b/unsloth/models/_utils.py
@@ -863,6 +863,21 @@ def _in_requested_load_scope(filename, subfolder):
     return "/" not in filename
 
 
+def _is_model_weight_safetensors(filename):
+    """True if a repo-relative *filename* is a model-weights safetensors file rather than
+    a PEFT adapter / sidecar (e.g. ``adapter_model.safetensors``).
+
+    Only a real model-weights safetensors proves the ``.bin`` full-model weights are
+    redundant. A repo can ship an ``adapter_model.safetensors`` sidecar while its actual
+    weights are ``pytorch_model.bin``; counting the sidecar would wrongly skip the needed
+    ``.bin`` and leave the in-process load to fetch it without the Xet fallback.
+    """
+    name = filename.replace("\\", "/").rsplit("/", 1)[-1]
+    if not name.endswith((".safetensors", ".safetensors.index.json")):
+        return False
+    return not name.startswith("adapter_")
+
+
 def _prefetch_ignore_patterns(
     model_name,
     *,
@@ -917,11 +932,13 @@ def _prefetch_ignore_patterns(
                 .siblings
                 or []
             )
-            # Only count safetensors that the load will actually read (same
-            # subfolder / root), so a .bin-only subfolder is not stripped of its
-            # weights because some other path in the repo ships safetensors.
+            # Only count model-weights safetensors that the load will actually read
+            # (same subfolder / root, and not an adapter / sidecar), so a .bin-only
+            # subfolder is not stripped of its weights because some other path in the
+            # repo ships safetensors, nor because an adapter_model.safetensors sidecar
+            # sits next to real pytorch_model.bin weights.
             has_safetensors = any(
-                sibling.rfilename.endswith((".safetensors", ".safetensors.index.json"))
+                _is_model_weight_safetensors(sibling.rfilename)
                 and _in_requested_load_scope(sibling.rfilename, subfolder)
                 for sibling in siblings
             )
diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py
index 421d42fb4a..6e79d2dbfa 100644
--- a/unsloth/models/loader.py
+++ b/unsloth/models/loader.py
@@ -874,7 +874,6 @@ def from_pretrained(
                 cache_dir = kwargs.get("cache_dir"),
                 local_files_only = local_files_only,
                 fast_inference = fast_inference,
-                subfolder = kwargs.get("subfolder"),
                 force_download = kwargs.get("force_download", False),
                 use_safetensors = kwargs.get("use_safetensors"),
             )
@@ -882,14 +881,14 @@ def from_pretrained(
             # the in-process load reuses that warm cache instead of re-forcing.
             if _prefetched and kwargs.get("force_download", False):
                 kwargs["force_download"] = False
-            # Read the adapter from the same place the prefetch warmed: forward
-            # cache_dir / subfolder when set (local_files_only is already passed
-            # explicitly below), while leaving PEFT's own defaults untouched otherwise.
+            # Read the adapter from the same place the prefetch warmed: forward cache_dir
+            # when set (local_files_only is already passed explicitly below). subfolder is
+            # NOT forwarded -- it targets the base checkpoint, and an adapter typically
+            # lives at the repo root, so forwarding it would make PeftModel look under
+            # old_model_name/<subfolder> and miss a root adapter.
             peft_load_kwargs = {}
             if kwargs.get("cache_dir") is not None:
                 peft_load_kwargs["cache_dir"] = kwargs["cache_dir"]
-            if kwargs.get("subfolder") is not None:
-                peft_load_kwargs["subfolder"] = kwargs["subfolder"]
             model = PeftModel.from_pretrained(
                 model,
                 old_model_name,
@@ -1824,7 +1823,6 @@ def _patched_car(
                 cache_dir = kwargs.get("cache_dir"),
                 local_files_only = local_files_only,
                 fast_inference = fast_inference,
-                subfolder = kwargs.get("subfolder"),
                 force_download = kwargs.get("force_download", False),
                 use_safetensors = kwargs.get("use_safetensors"),
             )
@@ -1832,14 +1830,14 @@ def _patched_car(
             # the in-process load reuses that warm cache instead of re-forcing.
             if _prefetched and kwargs.get("force_download", False):
                 kwargs["force_download"] = False
-            # Read the adapter from the same place the prefetch warmed: forward
-            # cache_dir / subfolder when set (local_files_only is already passed
-            # explicitly below), while leaving PEFT's own defaults untouched otherwise.
+            # Read the adapter from the same place the prefetch warmed: forward cache_dir
+            # when set (local_files_only is already passed explicitly below). subfolder is
+            # NOT forwarded -- it targets the base checkpoint, and an adapter typically
+            # lives at the repo root, so forwarding it would make PeftModel look under
+            # old_model_name/<subfolder> and miss a root adapter.
             peft_load_kwargs = {}
             if kwargs.get("cache_dir") is not None:
                 peft_load_kwargs["cache_dir"] = kwargs["cache_dir"]
-            if kwargs.get("subfolder") is not None:
-                peft_load_kwargs["subfolder"] = kwargs["subfolder"]
             try:
                 model = PeftModel.from_pretrained(
                     model,

From 480bfae07d3d6072e82cc59df2253e61de4b9de7 Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Fri, 26 Jun 2026 08:52:09 +0000
Subject: [PATCH 28/80] Protect adapter / TF / Flax / custom-cache_dir loads in
 the prefetch (Codex)

loader.py: warm the PEFT adapter repo even under fast_inference. The adapter is loaded
in-process by PeftModel.from_pretrained, not by vLLM, so forwarding fast_inference=True
made maybe_prefetch_hf_snapshot return early and left the adapter to an unprotected
in-process Xet download. Pass fast_inference=False for the adapter prefetch at both PEFT
sites (vLLM still owns the base model's download path).

_utils.py: keep TF / Flax weights when those load modes are requested. The static
prefetch ignore list always dropped *.h5 / *.msgpack, but a caller passing from_tf=True /
from_flax=True makes Transformers read those as the actual weights. Gate the *.h5 /
*.msgpack ignores on from_tf / from_flax (threaded through maybe_prefetch_hf_snapshot from
the FastModel / FastLanguageModel from_pretrained kwargs) so the only needed weight file
is warmed instead of skipped.

tokenizer_utils.py: preserve an explicit cache_dir in _load_correct_tokenizer. On a normal
(non-Colab/Kaggle) host it reset cache_dir to None unconditionally, so a caller-supplied
cache_dir that the snapshot prefetch warmed was ignored and the tokenizer load probed the
default cache instead. Only fall back to None for the default "huggingface_tokenizers_cache"
sentinel; keep an explicit cache_dir so the tokenizer loads from the warmed directory.
---
 unsloth/models/_utils.py   | 21 +++++++++++++++++----
 unsloth/models/llama.py    |  2 ++
 unsloth/models/loader.py   | 12 ++++++++++--
 unsloth/models/vision.py   |  2 ++
 unsloth/tokenizer_utils.py |  7 ++++++-
 5 files changed, 37 insertions(+), 7 deletions(-)

diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py
index 579cb3d078..7546ca86e5 100644
--- a/unsloth/models/_utils.py
+++ b/unsloth/models/_utils.py
@@ -885,6 +885,8 @@ def _prefetch_ignore_patterns(
     revision = None,
     subfolder = None,
     use_safetensors = None,
+    from_tf = False,
+    from_flax = False,
 ):
     """ignore_patterns for the prewarm snapshot: the static skip list, minus the
     checkpoint guard when loading from a checkpoint-* subfolder, minus the weight
@@ -893,16 +895,23 @@ def _prefetch_ignore_patterns(
     not pulled in full just to be rejected by a safetensors-only load. use_safetensors
     is None (auto) skips *.bin only when in-scope safetensors are also shipped, since
     Transformers prefers them (pulling the other format just to discard it doubles the
-    very download we optimize)."""
+    very download we optimize). from_tf / from_flax keep the TF (*.h5) / Flax
+    (*.msgpack) weights, which Transformers reads as the actual weights under those load
+    modes; dropping them unconditionally would leave the only needed weight file unwarmed."""
     # A checkpoint-* subfolder is exactly what "checkpoint-*/*" would drop, so
     # do not ignore it when the caller is explicitly loading from that subfolder.
+    # from_tf / from_flax loads read *.h5 / *.msgpack as the weights, so keep them.
     ignore_patterns = [
         pattern
         for pattern in _PREFETCH_IGNORE_PATTERNS
         if not (
-            pattern == "checkpoint-*/*"
-            and isinstance(subfolder, str)
-            and subfolder.startswith("checkpoint-")
+            (
+                pattern == "checkpoint-*/*"
+                and isinstance(subfolder, str)
+                and subfolder.startswith("checkpoint-")
+            )
+            or (from_tf and pattern == "*.h5")
+            or (from_flax and pattern == "*.msgpack")
         )
     ]
     # Drop the weight format the load will not read. Transformers reads exactly one
@@ -960,6 +969,8 @@ def maybe_prefetch_hf_snapshot(
     subfolder = None,
     force_download = False,
     use_safetensors = None,
+    from_tf = False,
+    from_flax = False,
 ):
     """Warm the Hugging Face cache for a remote repo before the in-process load.
 
@@ -1022,6 +1033,8 @@ def maybe_prefetch_hf_snapshot(
         revision = revision,
         subfolder = subfolder,
         use_safetensors = use_safetensors,
+        from_tf = from_tf,
+        from_flax = from_flax,
     )
     # When loading from a subfolder, warm only that subfolder instead of the whole
     # repo: a from_pretrained(..., subfolder=X) resolves every file under X/, so the
diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py
index cd5ebd999c..2dcd9bdaae 100644
--- a/unsloth/models/llama.py
+++ b/unsloth/models/llama.py
@@ -2390,6 +2390,8 @@ def from_pretrained(
             subfolder = kwargs.get("subfolder"),
             force_download = kwargs.get("force_download", False),
             use_safetensors = kwargs.get("use_safetensors"),
+            from_tf = kwargs.get("from_tf", False),
+            from_flax = kwargs.get("from_flax", False),
         )
         # The killable child already did the forced download; clear the flag so the
         # in-process load reuses that warm cache instead of re-forcing over Xet.
diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py
index 6e79d2dbfa..04c893e51a 100644
--- a/unsloth/models/loader.py
+++ b/unsloth/models/loader.py
@@ -873,7 +873,11 @@ def from_pretrained(
                 revision = revision,
                 cache_dir = kwargs.get("cache_dir"),
                 local_files_only = local_files_only,
-                fast_inference = fast_inference,
+                # The adapter is loaded in-process by PeftModel.from_pretrained below,
+                # not by vLLM, so warm it even under fast_inference (vLLM only owns the
+                # base model's download path; the adapter would otherwise still hit an
+                # unprotected in-process Xet transfer).
+                fast_inference = False,
                 force_download = kwargs.get("force_download", False),
                 use_safetensors = kwargs.get("use_safetensors"),
             )
@@ -1822,7 +1826,11 @@ def _patched_car(
                 revision = revision,
                 cache_dir = kwargs.get("cache_dir"),
                 local_files_only = local_files_only,
-                fast_inference = fast_inference,
+                # The adapter is loaded in-process by PeftModel.from_pretrained below,
+                # not by vLLM, so warm it even under fast_inference (vLLM only owns the
+                # base model's download path; the adapter would otherwise still hit an
+                # unprotected in-process Xet transfer).
+                fast_inference = False,
                 force_download = kwargs.get("force_download", False),
                 use_safetensors = kwargs.get("use_safetensors"),
             )
diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py
index dbdcfbed17..734b14c30a 100644
--- a/unsloth/models/vision.py
+++ b/unsloth/models/vision.py
@@ -818,6 +818,8 @@ def from_pretrained(
             subfolder = kwargs.get("subfolder"),
             force_download = kwargs.get("force_download", False),
             use_safetensors = kwargs.get("use_safetensors"),
+            from_tf = kwargs.get("from_tf", False),
+            from_flax = kwargs.get("from_flax", False),
         )
         # The killable child already did the forced download; clear the flag so the
         # in-process load reuses that warm cache instead of re-forcing over Xet.
diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py
index 93dfa9b2ad..41bcb106a5 100644
--- a/unsloth/tokenizer_utils.py
+++ b/unsloth/tokenizer_utils.py
@@ -563,8 +563,13 @@ def _load_correct_tokenizer(
         # /tmp of Kaggle seems has a 80GB limit!
         # Let's utilize them
         cache_dir = os.path.join(KAGGLE_TMP, cache_dir)
-    else:
+    elif cache_dir == "huggingface_tokenizers_cache":
+        # The default tokenizer-cache name is only meaningful on Colab/Kaggle; on a
+        # normal host fall back to the HF default cache.
         cache_dir = None
+    # else: an explicit caller-supplied cache_dir is preserved, so the tokenizer loads
+    # from the same directory the snapshot prefetch warmed instead of probing the default
+    # cache and risking an unprotected in-process Hub/Xet transfer on a miss.
 
     # Try loading the slow tokenizer. If it fails, then try Fast only
     # Mainly to solve Deepseek models with no tokenizer.model file

From d0e4f3ffe5bfb17c424d83efc41b295b87614293 Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Fri, 26 Jun 2026 10:24:47 +0000
Subject: [PATCH 29/80] Warm adapter and root tokenizer files correctly in the
 prefetch (Codex P2)

loader.py: do not pass the base model's use_safetensors into the adapter prefetch. It
selects the BASE weight format, but the adapter ships its own (usually
adapter_model.safetensors), and PeftModel.from_pretrained below is not given
use_safetensors. With use_safetensors=False a safetensors-only adapter had its weights
skipped by the prefetch, so the PEFT load fetched them in-process without the Xet stall
fallback. Leave the adapter prefetch on auto so its own format is warmed (both PEFT sites).

_utils.py: when loading weights from a subfolder, also warm the repo-root tokenizer /
config files. The prefetch restricted allow_patterns to subfolder/*, but the tokenizer /
processor load reads tokenizer / config files from the repo root, so repos that keep them
at the root missed the warmed cache and could still download them in-process over Xet. Add
the root auxiliary files (exact filenames, so only root-level matches) alongside subfolder/*.
---
 unsloth/models/_utils.py | 35 ++++++++++++++++++++++++++++++-----
 unsloth/models/loader.py | 12 ++++++++++--
 2 files changed, 40 insertions(+), 7 deletions(-)

diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py
index 7546ca86e5..e72d149abb 100644
--- a/unsloth/models/_utils.py
+++ b/unsloth/models/_utils.py
@@ -847,6 +847,28 @@ def _run_temporary_patches(phase):
 )
 
 
+# Repo-root tokenizer / config / processor files that a from_pretrained reads from the
+# root even when the weights load from a subfolder. Exact filenames (no wildcard) so they
+# match only root-level files, never same-named files inside another subfolder.
+_ROOT_AUX_PREFETCH_PATTERNS = (
+    "config.json",
+    "generation_config.json",
+    "tokenizer_config.json",
+    "tokenizer.json",
+    "tokenizer.model",
+    "special_tokens_map.json",
+    "added_tokens.json",
+    "vocab.json",
+    "vocab.txt",
+    "merges.txt",
+    "spiece.model",
+    "chat_template.jinja",
+    "chat_template.json",
+    "preprocessor_config.json",
+    "processor_config.json",
+)
+
+
 def _in_requested_load_scope(filename, subfolder):
     """True if a repo-relative *filename* belongs to the location being loaded.
 
@@ -1036,13 +1058,16 @@ def maybe_prefetch_hf_snapshot(
         from_tf = from_tf,
         from_flax = from_flax,
     )
-    # When loading from a subfolder, warm only that subfolder instead of the whole
-    # repo: a from_pretrained(..., subfolder=X) resolves every file under X/, so the
-    # rest is wasted bandwidth and disk. This also scopes the stall protection to the
-    # weights the load actually reads.
+    # When loading from a subfolder, warm that subfolder instead of the whole repo: a
+    # from_pretrained(..., subfolder=X) resolves every weight file under X/, so the rest
+    # is wasted bandwidth and disk. Also warm the repo-ROOT tokenizer / config files: the
+    # tokenizer / processor load reads those from the root even when the weights live in a
+    # subfolder, so a subfolder-only prefetch would leave them to an unprotected in-process
+    # download. The root patterns are exact filenames (no wildcard), so they match only
+    # root-level files, not same-named files deeper in the repo.
     allow_patterns = None
     if isinstance(subfolder, str) and subfolder.strip("/"):
-        allow_patterns = [f"{subfolder.strip('/')}/*"]
+        allow_patterns = [f"{subfolder.strip('/')}/*", *_ROOT_AUX_PREFETCH_PATTERNS]
     try:
         snapshot_download_with_xet_fallback(
             model_name,
diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py
index 04c893e51a..da0d78db19 100644
--- a/unsloth/models/loader.py
+++ b/unsloth/models/loader.py
@@ -879,7 +879,11 @@ def from_pretrained(
                 # unprotected in-process Xet transfer).
                 fast_inference = False,
                 force_download = kwargs.get("force_download", False),
-                use_safetensors = kwargs.get("use_safetensors"),
+                # Do not inherit the base model's use_safetensors: it selects the BASE
+                # weight format, but the adapter has its own (usually
+                # adapter_model.safetensors). Passing use_safetensors=False here would skip
+                # a safetensors-only adapter's weights, leaving PeftModel.from_pretrained to
+                # fetch them in-process. Leave it as auto so the adapter's format is warmed.
             )
             # The killable child already did the forced download; clear the flag so
             # the in-process load reuses that warm cache instead of re-forcing.
@@ -1832,7 +1836,11 @@ def _patched_car(
                 # unprotected in-process Xet transfer).
                 fast_inference = False,
                 force_download = kwargs.get("force_download", False),
-                use_safetensors = kwargs.get("use_safetensors"),
+                # Do not inherit the base model's use_safetensors: it selects the BASE
+                # weight format, but the adapter has its own (usually
+                # adapter_model.safetensors). Passing use_safetensors=False here would skip
+                # a safetensors-only adapter's weights, leaving PeftModel.from_pretrained to
+                # fetch them in-process. Leave it as auto so the adapter's format is warmed.
             )
             # The killable child already did the forced download; clear the flag so
             # the in-process load reuses that warm cache instead of re-forcing.

From 4c11447479421b057ea5ea93f0bee4946a4f56f9 Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Sat, 27 Jun 2026 02:57:41 +0000
Subject: [PATCH 30/80] Prefetch a distinct tokenizer repo; require the
 unsloth_zoo with the helper

When tokenizer_name points at a separate remote repo, the base prefetch only
warmed model_name, so load_correct_tokenizer (llama) / the processor load
(vision) still fetched that tokenizer repo in-process, unprotected by the
Xet->HTTP fallback -- a stalled Xet tokenizer download could still hang
from_pretrained. Both base loaders now warm the distinct tokenizer repo through
the same killable subprocess before the in-process load.

maybe_prefetch_hf_snapshot gains a tokenizer_only flag: it allow-lists exactly
the tokenizer / config / vocab files (so a tokenizer repo that also ships weights
is not pulled in full) and skips the ignore-pattern computation, which avoids the
model_info network call for a repo whose weights are never fetched.

pyproject: raise the unsloth_zoo floor to 2026.6.8, the release that first ships
unsloth_zoo.hf_xet_fallback. With the prior >=2026.6.7 an environment could
upgrade unsloth while keeping a helper-less zoo, silently skipping every
pre-download and falling back to the in-process Xet path the change is meant to
guard. CI installs unsloth_zoo from git and unsloth with --no-deps, so the pin is
metadata only there and the bump does not affect CI resolution.
---
 pyproject.toml           |  6 +++---
 unsloth/models/_utils.py | 13 +++++++++++--
 unsloth/models/llama.py  | 17 +++++++++++++++++
 unsloth/models/vision.py | 16 ++++++++++++++++
 4 files changed, 47 insertions(+), 5 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 13c421d8ea..f61212d0af 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -73,7 +73,7 @@ triton = [
 ]
 
 huggingfacenotorch = [
-    "unsloth_zoo>=2026.6.7",
+    "unsloth_zoo>=2026.6.8",
     "wheel>=0.42.0",
     "packaging",
     "numpy",
@@ -94,7 +94,7 @@ huggingfacenotorch = [
 ]
 huggingface = [
     "unsloth[huggingfacenotorch]",
-    "unsloth_zoo>=2026.6.7",
+    "unsloth_zoo>=2026.6.8",
     "torchvision",
     "unsloth[triton]",
 ]
@@ -584,7 +584,7 @@ colab-ampere-torch220 = [
     "flash-attn>=2.6.3 ; ('linux' in sys_platform)",
 ]
 colab-new = [
-    "unsloth_zoo>=2026.6.7",
+    "unsloth_zoo>=2026.6.8",
     "packaging",
     "tyro",
     "transformers>=4.51.3,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0,!=4.54.0,!=4.55.0,!=4.55.1,!=4.57.0,!=4.57.4,!=4.57.5,!=5.0.0,!=5.1.0,<=5.5.0",
diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py
index e72d149abb..2de79bb581 100644
--- a/unsloth/models/_utils.py
+++ b/unsloth/models/_utils.py
@@ -993,6 +993,7 @@ def maybe_prefetch_hf_snapshot(
     use_safetensors = None,
     from_tf = False,
     from_flax = False,
+    tokenizer_only = False,
 ):
     """Warm the Hugging Face cache for a remote repo before the in-process load.
 
@@ -1049,7 +1050,10 @@ def maybe_prefetch_hf_snapshot(
     if fast_inference:
         return False
 
-    ignore_patterns = _prefetch_ignore_patterns(
+    # A tokenizer-only warm allow-lists the exact tokenizer / config files below, so the
+    # weight-format ignore list is moot -- and skipping it avoids the model_info network
+    # call its auto branch would otherwise make for a repo whose weights we never fetch.
+    ignore_patterns = None if tokenizer_only else _prefetch_ignore_patterns(
         model_name,
         token = token,
         revision = revision,
@@ -1066,7 +1070,12 @@ def maybe_prefetch_hf_snapshot(
     # download. The root patterns are exact filenames (no wildcard), so they match only
     # root-level files, not same-named files deeper in the repo.
     allow_patterns = None
-    if isinstance(subfolder, str) and subfolder.strip("/"):
+    if tokenizer_only:
+        # A distinct tokenizer repo: warm only its tokenizer / config / vocab files. Restrict
+        # to those exact root filenames so we never pull weights, even if that repo also
+        # happens to ship them (the weights are not what the tokenizer load reads).
+        allow_patterns = list(_ROOT_AUX_PREFETCH_PATTERNS)
+    elif isinstance(subfolder, str) and subfolder.strip("/"):
         allow_patterns = [f"{subfolder.strip('/')}/*", *_ROOT_AUX_PREFETCH_PATTERNS]
     try:
         snapshot_download_with_xet_fallback(
diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py
index 2dcd9bdaae..35619d9de1 100644
--- a/unsloth/models/llama.py
+++ b/unsloth/models/llama.py
@@ -2398,6 +2398,23 @@ def from_pretrained(
         if _prefetched and kwargs.get("force_download", False):
             kwargs["force_download"] = False
 
+        # A distinct tokenizer repo is loaded separately below (load_correct_tokenizer),
+        # which would otherwise fetch it in-process, unprotected by the Xet->HTTP fallback,
+        # so a stalled Xet tokenizer download could still hang from_pretrained. Warm just its
+        # tokenizer / config files through the same killable subprocess. fast_inference is not
+        # forwarded: the tokenizer loads in-process regardless of the vLLM weight path.
+        if (
+            isinstance(tokenizer_name, str) and tokenizer_name
+            and tokenizer_name != model_name
+        ):
+            maybe_prefetch_hf_snapshot(
+                tokenizer_name,
+                token = token,
+                cache_dir = kwargs.get("cache_dir"),
+                local_files_only = kwargs.get("local_files_only", False),
+                tokenizer_only = True,
+            )
+
         if dtype is None:
             dtype = torch.float16 if not SUPPORTS_BFLOAT16 else torch.bfloat16
         elif dtype == torch.bfloat16 and not SUPPORTS_BFLOAT16:
diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py
index 734b14c30a..d797d1d609 100644
--- a/unsloth/models/vision.py
+++ b/unsloth/models/vision.py
@@ -826,6 +826,22 @@ def from_pretrained(
         if _prefetched and kwargs.get("force_download", False):
             kwargs["force_download"] = False
 
+        # A distinct tokenizer / processor repo is loaded separately below, which would
+        # otherwise fetch it in-process, unprotected by the Xet->HTTP fallback, so a stalled
+        # Xet download of those files could still hang from_pretrained. Warm just its
+        # tokenizer / config files through the same killable subprocess.
+        if (
+            isinstance(tokenizer_name, str) and tokenizer_name
+            and tokenizer_name != model_name
+        ):
+            maybe_prefetch_hf_snapshot(
+                tokenizer_name,
+                token = token,
+                cache_dir = kwargs.get("cache_dir"),
+                local_files_only = kwargs.get("local_files_only", False),
+                tokenizer_only = True,
+            )
+
         if dtype is None:
             dtype = torch.float16 if not SUPPORTS_BFLOAT16 else torch.bfloat16
         elif os.environ.get("UNSLOTH_FORCE_FLOAT32", "0") == "1":

From 2f998a97d675f26d8834517251f6348131c95a86 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Sat, 27 Jun 2026 02:58:21 +0000
Subject: [PATCH 31/80] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 unsloth/models/_utils.py | 20 ++++++++++++--------
 unsloth/models/llama.py  |  5 +----
 unsloth/models/vision.py |  5 +----
 3 files changed, 14 insertions(+), 16 deletions(-)

diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py
index 2de79bb581..792206c737 100644
--- a/unsloth/models/_utils.py
+++ b/unsloth/models/_utils.py
@@ -1053,14 +1053,18 @@ def maybe_prefetch_hf_snapshot(
     # A tokenizer-only warm allow-lists the exact tokenizer / config files below, so the
     # weight-format ignore list is moot -- and skipping it avoids the model_info network
     # call its auto branch would otherwise make for a repo whose weights we never fetch.
-    ignore_patterns = None if tokenizer_only else _prefetch_ignore_patterns(
-        model_name,
-        token = token,
-        revision = revision,
-        subfolder = subfolder,
-        use_safetensors = use_safetensors,
-        from_tf = from_tf,
-        from_flax = from_flax,
+    ignore_patterns = (
+        None
+        if tokenizer_only
+        else _prefetch_ignore_patterns(
+            model_name,
+            token = token,
+            revision = revision,
+            subfolder = subfolder,
+            use_safetensors = use_safetensors,
+            from_tf = from_tf,
+            from_flax = from_flax,
+        )
     )
     # When loading from a subfolder, warm that subfolder instead of the whole repo: a
     # from_pretrained(..., subfolder=X) resolves every weight file under X/, so the rest
diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py
index 35619d9de1..104f21645b 100644
--- a/unsloth/models/llama.py
+++ b/unsloth/models/llama.py
@@ -2403,10 +2403,7 @@ def from_pretrained(
         # so a stalled Xet tokenizer download could still hang from_pretrained. Warm just its
         # tokenizer / config files through the same killable subprocess. fast_inference is not
         # forwarded: the tokenizer loads in-process regardless of the vLLM weight path.
-        if (
-            isinstance(tokenizer_name, str) and tokenizer_name
-            and tokenizer_name != model_name
-        ):
+        if isinstance(tokenizer_name, str) and tokenizer_name and tokenizer_name != model_name:
             maybe_prefetch_hf_snapshot(
                 tokenizer_name,
                 token = token,
diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py
index d797d1d609..6db49c1716 100644
--- a/unsloth/models/vision.py
+++ b/unsloth/models/vision.py
@@ -830,10 +830,7 @@ def from_pretrained(
         # otherwise fetch it in-process, unprotected by the Xet->HTTP fallback, so a stalled
         # Xet download of those files could still hang from_pretrained. Warm just its
         # tokenizer / config files through the same killable subprocess.
-        if (
-            isinstance(tokenizer_name, str) and tokenizer_name
-            and tokenizer_name != model_name
-        ):
+        if isinstance(tokenizer_name, str) and tokenizer_name and tokenizer_name != model_name:
             maybe_prefetch_hf_snapshot(
                 tokenizer_name,
                 token = token,

From 8138bf70d306c9e7347eda35bed81f07727d3230 Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Sat, 27 Jun 2026 06:01:24 +0000
Subject: [PATCH 32/80] Cover vLLM tokenizers, TF/Flax warmups, and remote-code
 assets in the prefetch

Same-repo tokenizer / processor under vLLM: with fast_inference=True and no
distinct tokenizer_name, the base prefetch was skipped and the tokenizer-repo
prewarm did not run, yet load_correct_tokenizer / auto_processor still load the
tokenizer/processor from model_name in-process, exposed to a stalled Xet download.
Both base loaders now warm the effective tokenizer repo (tokenizer_name or
model_name) whenever it differs from model_name OR fast_inference skipped the base
warm, so the in-process tokenizer/processor load is always covered.

TF/Flax warmups: _prefetch_ignore_patterns now drops *.safetensors and *.bin when
from_tf / from_flax is set. Such a load reads only the *.h5 / *.msgpack weights, but the
auto branch previously left the PyTorch formats eligible and warmed multi-GB of weights
the load discards, risking a disk-full on a mixed-format repo.

Remote-code and tiktoken assets: _ROOT_AUX_PREFETCH_PATTERNS now includes the
custom-code entry points a trust_remote_code load fetches from the repo root
(configuration_*.py, modeling_*.py, tokenization_*.py, processing_*.py,
image_processing_*.py, feature_extraction_*.py, video_processing_*.py) and tiktoken
vocab assets (*.tiktoken, e.g. Qwen). This covers both a subfolder remote-code load
(root code files) and a split-repo custom tokenizer (tokenizer_only warm), which the
fixed tokenizer/config allowlist previously left to an unprotected in-process fetch.
---
 unsloth/models/_utils.py | 27 ++++++++++++++++++++++++---
 unsloth/models/llama.py  | 23 ++++++++++++++++-------
 unsloth/models/vision.py | 22 ++++++++++++++++------
 3 files changed, 56 insertions(+), 16 deletions(-)

diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py
index 792206c737..73b81080e2 100644
--- a/unsloth/models/_utils.py
+++ b/unsloth/models/_utils.py
@@ -866,6 +866,18 @@ def _run_temporary_patches(phase):
     "chat_template.json",
     "preprocessor_config.json",
     "processor_config.json",
+    # Custom-code entry points a trust_remote_code config / model / tokenizer / processor
+    # load fetches from the repo root. Each carries a literal prefix, so it stays effectively
+    # root-anchored, and matches nothing on a non-remote-code repo (harmless there).
+    "configuration_*.py",
+    "modeling_*.py",
+    "tokenization_*.py",
+    "processing_*.py",
+    "image_processing_*.py",
+    "feature_extraction_*.py",
+    "video_processing_*.py",
+    # tiktoken vocab assets (e.g. Qwen's qwen.tiktoken) a custom tokenizer can require.
+    "*.tiktoken",
 )
 
 
@@ -938,7 +950,15 @@ def _prefetch_ignore_patterns(
     ]
     # Drop the weight format the load will not read. Transformers reads exactly one
     # format, so prefetching the other doubles the very download we optimize.
-    if use_safetensors is True:
+    if from_tf or from_flax:
+        # TF (*.h5) / Flax (*.msgpack) loads read those as the weights and never touch the
+        # PyTorch formats. The static list above already keeps the requested h5 / msgpack;
+        # drop safetensors and .bin outright so the auto branch does not warm multi-GB of
+        # PyTorch weights the load will discard (and risk a disk-full on a mixed-format repo).
+        ignore_patterns.extend((
+            "*.safetensors", "*.safetensors.index.json", "*.bin", "*.bin.index.json",
+        ))
+    elif use_safetensors is True:
         # Explicit safetensors: the load never reads .bin, so skip it outright -- even
         # for a bin-only repo, where the load fails anyway, do not pull multi-GB of
         # PyTorch weights first. No model_info call needed for an explicit request.
@@ -1071,8 +1091,9 @@ def maybe_prefetch_hf_snapshot(
     # is wasted bandwidth and disk. Also warm the repo-ROOT tokenizer / config files: the
     # tokenizer / processor load reads those from the root even when the weights live in a
     # subfolder, so a subfolder-only prefetch would leave them to an unprotected in-process
-    # download. The root patterns are exact filenames (no wildcard), so they match only
-    # root-level files, not same-named files deeper in the repo.
+    # download. Also warm the custom-code / tiktoken assets a trust_remote_code load fetches
+    # from the root. The root patterns are exact filenames or literal-prefixed globs (e.g.
+    # modeling_*.py), so they stay anchored to repo-root files in practice.
     allow_patterns = None
     if tokenizer_only:
         # A distinct tokenizer repo: warm only its tokenizer / config / vocab files. Restrict
diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py
index 104f21645b..5e35c633de 100644
--- a/unsloth/models/llama.py
+++ b/unsloth/models/llama.py
@@ -2398,14 +2398,23 @@ def from_pretrained(
         if _prefetched and kwargs.get("force_download", False):
             kwargs["force_download"] = False
 
-        # A distinct tokenizer repo is loaded separately below (load_correct_tokenizer),
-        # which would otherwise fetch it in-process, unprotected by the Xet->HTTP fallback,
-        # so a stalled Xet tokenizer download could still hang from_pretrained. Warm just its
-        # tokenizer / config files through the same killable subprocess. fast_inference is not
-        # forwarded: the tokenizer loads in-process regardless of the vLLM weight path.
-        if isinstance(tokenizer_name, str) and tokenizer_name and tokenizer_name != model_name:
+        # The tokenizer loads in-process below (load_correct_tokenizer) regardless of the
+        # vLLM weight path, so a stalled Xet download of its tokenizer / config files could
+        # still hang from_pretrained. Warm that repo's tokenizer files through the same
+        # killable subprocess. The base prefetch above already covered them when it warmed
+        # model_name itself, so only warm here when the tokenizer comes from a different repo,
+        # or when fast_inference skipped the base warm entirely.
+        _tokenizer_repo = (
+            tokenizer_name if (isinstance(tokenizer_name, str) and tokenizer_name) else model_name
+        )
+        _warm_tokenizer_repo = (
+            isinstance(_tokenizer_repo, str)
+            and bool(_tokenizer_repo)
+            and (_tokenizer_repo != model_name or fast_inference)
+        )
+        if _warm_tokenizer_repo:
             maybe_prefetch_hf_snapshot(
-                tokenizer_name,
+                _tokenizer_repo,
                 token = token,
                 cache_dir = kwargs.get("cache_dir"),
                 local_files_only = kwargs.get("local_files_only", False),
diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py
index 6db49c1716..f095243a77 100644
--- a/unsloth/models/vision.py
+++ b/unsloth/models/vision.py
@@ -826,13 +826,23 @@ def from_pretrained(
         if _prefetched and kwargs.get("force_download", False):
             kwargs["force_download"] = False
 
-        # A distinct tokenizer / processor repo is loaded separately below, which would
-        # otherwise fetch it in-process, unprotected by the Xet->HTTP fallback, so a stalled
-        # Xet download of those files could still hang from_pretrained. Warm just its
-        # tokenizer / config files through the same killable subprocess.
-        if isinstance(tokenizer_name, str) and tokenizer_name and tokenizer_name != model_name:
+        # The tokenizer / processor loads in-process below (auto_processor / AutoTokenizer)
+        # regardless of the vLLM weight path, so a stalled Xet download of its tokenizer /
+        # processor / config files could still hang from_pretrained. Warm that repo's files
+        # through the same killable subprocess. The base prefetch above already covered them
+        # when it warmed model_name itself, so only warm here when the tokenizer / processor
+        # comes from a different repo, or when fast_inference skipped the base warm entirely.
+        _tokenizer_repo = (
+            tokenizer_name if (isinstance(tokenizer_name, str) and tokenizer_name) else model_name
+        )
+        _warm_tokenizer_repo = (
+            isinstance(_tokenizer_repo, str)
+            and bool(_tokenizer_repo)
+            and (_tokenizer_repo != model_name or fast_inference)
+        )
+        if _warm_tokenizer_repo:
             maybe_prefetch_hf_snapshot(
-                tokenizer_name,
+                _tokenizer_repo,
                 token = token,
                 cache_dir = kwargs.get("cache_dir"),
                 local_files_only = kwargs.get("local_files_only", False),

From ac44e2f2e8530c6f64222138fbddbcb8e454d6f4 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Sat, 27 Jun 2026 06:02:02 +0000
Subject: [PATCH 33/80] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 unsloth/models/_utils.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py
index 73b81080e2..826a841427 100644
--- a/unsloth/models/_utils.py
+++ b/unsloth/models/_utils.py
@@ -955,9 +955,14 @@ def _prefetch_ignore_patterns(
         # PyTorch formats. The static list above already keeps the requested h5 / msgpack;
         # drop safetensors and .bin outright so the auto branch does not warm multi-GB of
         # PyTorch weights the load will discard (and risk a disk-full on a mixed-format repo).
-        ignore_patterns.extend((
-            "*.safetensors", "*.safetensors.index.json", "*.bin", "*.bin.index.json",
-        ))
+        ignore_patterns.extend(
+            (
+                "*.safetensors",
+                "*.safetensors.index.json",
+                "*.bin",
+                "*.bin.index.json",
+            )
+        )
     elif use_safetensors is True:
         # Explicit safetensors: the load never reads .bin, so skip it outright -- even
         # for a bin-only repo, where the load fails anyway, do not pull multi-GB of

From 4b45d62997c665fe64f3b73e39bc9f6b3f5a8b47 Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Sat, 27 Jun 2026 06:45:43 +0000
Subject: [PATCH 34/80] Keep unsloth_zoo floor at 2026.6.7 until 2026.6.8 is
 published

Raising the floor to unsloth_zoo>=2026.6.8 broke the security-audit
pip scan-packages :: hf-stack shard, which resolves the pyproject dependencies
with a real pip download against PyPI: 2026.6.8 is not published yet (the release
carrying hf_xet_fallback lands first as a separate step), so the spec cannot be
satisfied and the scan fails. The test / lint CI installs unsloth_zoo from git and
unsloth with --no-deps, so it never surfaced this.

Revert the floor to >=2026.6.7. The prefetch already degrades gracefully when the
helper module is absent (it skips warming and loads in-process), so this is safe;
the floor will be raised to 2026.6.8 in a follow-up once that release is on PyPI.
---
 pyproject.toml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index f61212d0af..13c421d8ea 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -73,7 +73,7 @@ triton = [
 ]
 
 huggingfacenotorch = [
-    "unsloth_zoo>=2026.6.8",
+    "unsloth_zoo>=2026.6.7",
     "wheel>=0.42.0",
     "packaging",
     "numpy",
@@ -94,7 +94,7 @@ huggingfacenotorch = [
 ]
 huggingface = [
     "unsloth[huggingfacenotorch]",
-    "unsloth_zoo>=2026.6.8",
+    "unsloth_zoo>=2026.6.7",
     "torchvision",
     "unsloth[triton]",
 ]
@@ -584,7 +584,7 @@ colab-ampere-torch220 = [
     "flash-attn>=2.6.3 ; ('linux' in sys_platform)",
 ]
 colab-new = [
-    "unsloth_zoo>=2026.6.8",
+    "unsloth_zoo>=2026.6.7",
     "packaging",
     "tyro",
     "transformers>=4.51.3,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0,!=4.54.0,!=4.55.0,!=4.55.1,!=4.57.0,!=4.57.4,!=4.57.5,!=5.0.0,!=5.1.0,<=5.5.0",

From 6d01b7e33f98cb62b673a8a7412e96f24efafe4c Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Sat, 27 Jun 2026 07:11:44 +0000
Subject: [PATCH 35/80] Validate before prefetching; warm weights when vLLM is
 unavailable; skip checkpoint blobs

llama.py: the base weight prefetch now runs AFTER the AutoConfig load and the
MODEL_FOR_CAUSAL_LM_MAPPING model-class check, not before them. An unsupported or
incompatible repo now fails on its small config fetch (as it did before this
feature) instead of first pulling multi-GB of weights that the validation error
then discards.

vision.py: the weight prefetch is skipped under fast_inference only when vLLM is
actually available. fast_inference=True on a host without vLLM falls through to the
in-process HF load (fast_inference_setup flips the flag to False later), so the
weights must still be warmed through the killable subprocess rather than left to an
unprotected in-process Xet download. The same-repo tokenizer/processor warm uses the
resolved availability too. llama.py already resolves vLLM availability before its
prefetch, so it needs no change.

_utils.py: the prefetch ignore list now drops *.pt / *.pth / *.ckpt. from_pretrained
reads safetensors / .bin (or h5 / msgpack under from_tf / from_flax), not original
training checkpoints, so a repo that ships those alongside HF weights no longer warms
multi-GB of unused blobs.
---
 unsloth/models/_utils.py |  6 +++
 unsloth/models/llama.py  | 96 ++++++++++++++++++++--------------------
 unsloth/models/vision.py | 11 ++++-
 3 files changed, 63 insertions(+), 50 deletions(-)

diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py
index 826a841427..42be54c274 100644
--- a/unsloth/models/_utils.py
+++ b/unsloth/models/_utils.py
@@ -838,6 +838,12 @@ def _run_temporary_patches(phase):
     "*.mlpackage/*",
     "*.mlmodel",
     "*.gguf",
+    # Original / training checkpoint formats that from_pretrained does not read (it loads
+    # safetensors / .bin, or h5 / msgpack under from_tf / from_flax). Skip them so the warm
+    # does not pull multi-GB of unused weights for a repo that ships them alongside HF weights.
+    "*.pt",
+    "*.pth",
+    "*.ckpt",
     "optimizer.*",
     "scheduler.*",
     "rng_state*",
diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py
index 5e35c633de..0ef75b4cce 100644
--- a/unsloth/models/llama.py
+++ b/unsloth/models/llama.py
@@ -2373,54 +2373,6 @@ def from_pretrained(
         # For debugging - we use a download counter to see if environments are not breaking or if HF is down
         get_statistics(kwargs.get("local_files_only", False))
 
-        # Pre-download the repo in a killable subprocess that falls back from Xet
-        # to HTTP on a no-progress stall, so the in-process load below is a cache
-        # hit and cannot hang on a stalled Xet transfer. revision is intentionally
-        # not forwarded: the base-model load below resolves model_name (possibly a
-        # remapped prequantized repo, where the caller's revision does not exist) on
-        # its default branch without a revision, so warming a specific revision would
-        # predownload a snapshot the load never reads and leave the real transfer
-        # unprotected.
-        _prefetched = maybe_prefetch_hf_snapshot(
-            model_name,
-            token = token,
-            cache_dir = kwargs.get("cache_dir"),
-            local_files_only = kwargs.get("local_files_only", False),
-            fast_inference = fast_inference,
-            subfolder = kwargs.get("subfolder"),
-            force_download = kwargs.get("force_download", False),
-            use_safetensors = kwargs.get("use_safetensors"),
-            from_tf = kwargs.get("from_tf", False),
-            from_flax = kwargs.get("from_flax", False),
-        )
-        # The killable child already did the forced download; clear the flag so the
-        # in-process load reuses that warm cache instead of re-forcing over Xet.
-        if _prefetched and kwargs.get("force_download", False):
-            kwargs["force_download"] = False
-
-        # The tokenizer loads in-process below (load_correct_tokenizer) regardless of the
-        # vLLM weight path, so a stalled Xet download of its tokenizer / config files could
-        # still hang from_pretrained. Warm that repo's tokenizer files through the same
-        # killable subprocess. The base prefetch above already covered them when it warmed
-        # model_name itself, so only warm here when the tokenizer comes from a different repo,
-        # or when fast_inference skipped the base warm entirely.
-        _tokenizer_repo = (
-            tokenizer_name if (isinstance(tokenizer_name, str) and tokenizer_name) else model_name
-        )
-        _warm_tokenizer_repo = (
-            isinstance(_tokenizer_repo, str)
-            and bool(_tokenizer_repo)
-            and (_tokenizer_repo != model_name or fast_inference)
-        )
-        if _warm_tokenizer_repo:
-            maybe_prefetch_hf_snapshot(
-                _tokenizer_repo,
-                token = token,
-                cache_dir = kwargs.get("cache_dir"),
-                local_files_only = kwargs.get("local_files_only", False),
-                tokenizer_only = True,
-            )
-
         if dtype is None:
             dtype = torch.float16 if not SUPPORTS_BFLOAT16 else torch.bfloat16
         elif dtype == torch.bfloat16 and not SUPPORTS_BFLOAT16:
@@ -2468,6 +2420,54 @@ def from_pretrained(
 
         preferred_attn_impl = resolve_attention_implementation(model_function, model_config)
 
+        # Pre-download the repo in a killable subprocess that falls back from Xet to HTTP on a
+        # no-progress stall, so the in-process weight load below is a cache hit and cannot hang
+        # on a stalled Xet transfer. This runs AFTER the AutoConfig load + model-class check
+        # above, so an unsupported / incompatible repo fails on its small config fetch without
+        # first pulling multi-GB of weights. revision is intentionally not forwarded: the
+        # base-model load below resolves model_name (possibly a remapped prequantized repo,
+        # where the caller's revision does not exist) on its default branch without a revision,
+        # so warming a specific revision would predownload a snapshot the load never reads.
+        _prefetched = maybe_prefetch_hf_snapshot(
+            model_name,
+            token = token,
+            cache_dir = kwargs.get("cache_dir"),
+            local_files_only = kwargs.get("local_files_only", False),
+            fast_inference = fast_inference,
+            subfolder = kwargs.get("subfolder"),
+            force_download = kwargs.get("force_download", False),
+            use_safetensors = kwargs.get("use_safetensors"),
+            from_tf = kwargs.get("from_tf", False),
+            from_flax = kwargs.get("from_flax", False),
+        )
+        # The killable child already did the forced download; clear the flag so the
+        # in-process load reuses that warm cache instead of re-forcing over Xet.
+        if _prefetched and kwargs.get("force_download", False):
+            kwargs["force_download"] = False
+
+        # The tokenizer loads in-process below (load_correct_tokenizer) regardless of the
+        # vLLM weight path, so a stalled Xet download of its tokenizer / config files could
+        # still hang from_pretrained. Warm that repo's tokenizer files through the same
+        # killable subprocess. The base prefetch above already covered them when it warmed
+        # model_name itself, so only warm here when the tokenizer comes from a different repo,
+        # or when fast_inference skipped the base warm entirely.
+        _tokenizer_repo = (
+            tokenizer_name if (isinstance(tokenizer_name, str) and tokenizer_name) else model_name
+        )
+        _warm_tokenizer_repo = (
+            isinstance(_tokenizer_repo, str)
+            and bool(_tokenizer_repo)
+            and (_tokenizer_repo != model_name or fast_inference)
+        )
+        if _warm_tokenizer_repo:
+            maybe_prefetch_hf_snapshot(
+                _tokenizer_repo,
+                token = token,
+                cache_dir = kwargs.get("cache_dir"),
+                local_files_only = kwargs.get("local_files_only", False),
+                tokenizer_only = True,
+            )
+
         has_rope_scaling = False
         try:
             with open(inspect.getfile(model_function), "r", encoding = "utf-8") as file:
diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py
index f095243a77..34b996adaa 100644
--- a/unsloth/models/vision.py
+++ b/unsloth/models/vision.py
@@ -805,6 +805,13 @@ def from_pretrained(
         # For debugging - we use a download counter to see if environments are not breaking or if HF is down
         get_statistics(kwargs.get("local_files_only", False))
 
+        # vLLM owns the weight download only when it is actually available; if
+        # fast_inference was requested but vLLM is missing, the load falls through to the
+        # in-process HF path (fast_inference_setup flips the flag to False below), so the
+        # weights must still be warmed here rather than left to an unprotected in-process
+        # Xet download. Resolve availability now so the prefetch skip reflects the real path.
+        _vllm_owns_weights = fast_inference and is_vLLM_available()
+
         # Pre-download the repo in a killable subprocess that falls back from Xet
         # to HTTP on a no-progress stall, so the in-process load below is a cache
         # hit and cannot hang on a stalled Xet transfer.
@@ -814,7 +821,7 @@ def from_pretrained(
             revision = kwargs.get("revision"),
             cache_dir = kwargs.get("cache_dir"),
             local_files_only = kwargs.get("local_files_only", False),
-            fast_inference = fast_inference,
+            fast_inference = _vllm_owns_weights,
             subfolder = kwargs.get("subfolder"),
             force_download = kwargs.get("force_download", False),
             use_safetensors = kwargs.get("use_safetensors"),
@@ -838,7 +845,7 @@ def from_pretrained(
         _warm_tokenizer_repo = (
             isinstance(_tokenizer_repo, str)
             and bool(_tokenizer_repo)
-            and (_tokenizer_repo != model_name or fast_inference)
+            and (_tokenizer_repo != model_name or _vllm_owns_weights)
         )
         if _warm_tokenizer_repo:
             maybe_prefetch_hf_snapshot(

From 92800b9eb63e07870745804c7ce7ecce7e23bf38 Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Sat, 27 Jun 2026 12:44:46 +0000
Subject: [PATCH 36/80] Narrow the cache prewarm to what each load actually
 reads (adapter, root, diffusion)

maybe_prefetch_hf_snapshot warms the whole repo before the in-process load so the load is a
cache hit and cannot hang on a stalled Xet transfer. Warming more than the load reads is wasted
bandwidth and disk, and can fill a disk before a small load finishes. Three review findings,
each narrowing a specific load path without ever under-fetching (the warm still covers every
file the load reads, so nothing falls back to an unprotected in-process Xet download).

Adapter loads (loader.py, both PEFT branches): a PeftModel.from_pretrained reads only
adapter_config.json + adapter_model.* (sharded or not), plus the root tokenizer / config it may
also load. A new adapter_only mode restricts the warm to those, so an adapter repo that ALSO
publishes merged / full-model weights no longer pulls multi-GB of weights PeftModel never reads.

Root loads (vision.py, llama.py): a bare from_pretrained(model_name) with no subfolder resolves
only the ROOT weight files, so a new weights_at_root mode drops weights nested in subdirectories
(fp16/, experimental/, alternate-checkpoint dirs) the root load never reads. Only weight files
are excluded (via */*.safetensors and */*.bin; Hugging Face's fnmatch-based filter lets * span /,
so these patterns match weight files at any nesting depth, while a root model.safetensors with
no / is kept); a subdir config the load might still consult
stays warmed. Not applied when a subfolder is requested (that branch warms the subfolder) nor to
diffusion.

Diffusion loads (diffusion.py): the pipeline / config / processor loads resolve the repo ROOT
(model_index.json plus every component subfolder), not a single subfolder, so subfolder is no
longer forwarded to the diffusion prewarm. Forwarding it narrowed the warm to one subfolder and
left the other components (unet/, vae/, text_encoder/) to an unprotected in-process Xet download
while pulling subfolder weights the load never reads.

Tests (tests/test_prefetch_snapshot_scope.py, pure CPU, no network): assert the allow / ignore
patterns each mode forwards, run them through the Hugging Face filter over a sample repo listing
to confirm subdir / merged weights are dropped while root weights, config, tokenizer, and a
sharded adapter survive, and that a requested subfolder takes precedence over weights_at_root. A
fake unsloth_zoo.hf_xet_fallback is injected so the test runs regardless of the installed
unsloth_zoo version.
---
 tests/test_prefetch_snapshot_scope.py | 175 ++++++++++++++++++++++++++
 unsloth/models/_utils.py              |  63 ++++++++--
 unsloth/models/diffusion.py           |   6 +-
 unsloth/models/llama.py               |   4 +
 unsloth/models/loader.py              |   8 ++
 unsloth/models/vision.py              |   4 +
 6 files changed, 247 insertions(+), 13 deletions(-)
 create mode 100644 tests/test_prefetch_snapshot_scope.py

diff --git a/tests/test_prefetch_snapshot_scope.py b/tests/test_prefetch_snapshot_scope.py
new file mode 100644
index 0000000000..6ef4f80e9c
--- /dev/null
+++ b/tests/test_prefetch_snapshot_scope.py
@@ -0,0 +1,175 @@
+"""Pure-CPU, no-network unit tests for the prefetch snapshot scoping in
+unsloth/models/_utils.py.
+
+maybe_prefetch_hf_snapshot warms the HF cache before the in-process load so the load is a cache
+hit and cannot hang on a stalled Xet transfer. The warm must download AT LEAST what the load
+reads (else the missing file falls to an unprotected in-process Xet fetch) but should not pull
+weights the load never reads. These tests lock the allow_patterns / ignore_patterns each mode
+hands snapshot_download_with_xet_fallback (Codex #6638: adapter-only, weights-at-root, subfolder).
+No network, no subprocess: the zoo downloader is monkeypatched to capture its kwargs.
+"""
+
+import fnmatch
+import sys
+import types
+
+import pytest
+
+from unsloth.models import _utils as U
+
+
+def _filter(names, allow_patterns, ignore_patterns):
+    """Mirror Hugging Face filter_repo_objects: keep a name if it matches any allow pattern
+    (or allow is None), then drop it if it matches any ignore pattern. fnmatch '*' spans '/'
+    exactly as HF's matcher does, so this reproduces the real selection over a sample file list."""
+    kept = []
+    for name in names:
+        if allow_patterns is not None and not any(fnmatch.fnmatch(name, p) for p in allow_patterns):
+            continue
+        if ignore_patterns and any(fnmatch.fnmatch(name, p) for p in ignore_patterns):
+            continue
+        kept.append(name)
+    return kept
+
+
+@pytest.fixture
+def capture(monkeypatch):
+    """Call maybe_prefetch_hf_snapshot with a fake repo id and capture the allow/ignore patterns
+    it forwards to the zoo downloader. A fake unsloth_zoo.hf_xet_fallback module is injected into
+    sys.modules so the test is independent of the installed unsloth_zoo version (the published
+    package may predate the helper, which maybe_prefetch_hf_snapshot then imports lazily). Offline
+    env vars are cleared so the warm is not skipped."""
+    monkeypatch.delenv("HF_HUB_OFFLINE", raising = False)
+    monkeypatch.delenv("TRANSFORMERS_OFFLINE", raising = False)
+
+    state = {}
+
+    def fake_download(repo_id, **kw):
+        state["repo_id"] = repo_id
+        state["allow_patterns"] = kw.get("allow_patterns")
+        state["ignore_patterns"] = kw.get("ignore_patterns")
+        return "/tmp/fake-snapshot"
+
+    fake_module = types.ModuleType("unsloth_zoo.hf_xet_fallback")
+    fake_module.snapshot_download_with_xet_fallback = fake_download
+    fake_module.DownloadStallError = type("DownloadStallError", (RuntimeError,), {})
+    monkeypatch.setitem(sys.modules, "unsloth_zoo.hf_xet_fallback", fake_module)
+
+    def run(**call_kwargs):
+        state.clear()
+        ok = U.maybe_prefetch_hf_snapshot("some-org/some-repo", **call_kwargs)
+        return ok, state
+
+    return run
+
+
+# A representative repo file listing: root weights + tokenizer/config, plus an alternate-precision
+# subdir, an adapter, a checkpoint dir, and merged full-model weights an adapter repo might ship.
+_SAMPLE_FILES = [
+    "config.json",
+    "tokenizer.json",
+    "tokenizer_config.json",
+    "model-00001-of-00002.safetensors",
+    "model-00002-of-00002.safetensors",
+    "model.safetensors.index.json",
+    "pytorch_model.bin",
+    "fp16/model.safetensors",
+    "experimental/model-00001-of-00002.safetensors",
+    "checkpoint-500/model.safetensors",
+    "adapter_config.json",
+    "adapter_model.safetensors",
+]
+
+
+def test_weights_at_root_excludes_subdir_weights(capture):
+    """A bare root load reads only root weight files, so weights nested in subdirs (fp16/,
+    experimental/, checkpoint-500/) must be ignored while the root weights stay warmed. An
+    explicit use_safetensors avoids the auto branch's model_info network call."""
+    ok, st = capture(weights_at_root = True, use_safetensors = True)
+    assert ok is True
+    assert st["allow_patterns"] is None          # the warm stays otherwise unfiltered
+    ig = st["ignore_patterns"]
+    assert "*/*.safetensors" in ig and "*/*.bin" in ig
+    kept = _filter(_SAMPLE_FILES, st["allow_patterns"], ig)
+    # Root weights + config/tokenizer survive; subdir weights are dropped.
+    assert "model-00001-of-00002.safetensors" in kept
+    assert "model.safetensors.index.json" in kept
+    assert "config.json" in kept
+    assert "fp16/model.safetensors" not in kept
+    assert "experimental/model-00001-of-00002.safetensors" not in kept
+    assert "checkpoint-500/model.safetensors" not in kept
+
+
+def test_adapter_only_excludes_merged_weights(capture):
+    """An adapter warm reads only adapter_config.json + adapter_model.* (plus root tokenizer /
+    config); a repo that also ships merged full-model weights must not pull them."""
+    ok, st = capture(adapter_only = True)
+    assert ok is True
+    assert st["ignore_patterns"] is None          # the exact allowlist makes the format filter moot
+    allow = st["allow_patterns"]
+    assert "adapter_config.json" in allow and "adapter_model*" in allow
+    kept = _filter(_SAMPLE_FILES, allow, st["ignore_patterns"])
+    # The adapter's own files + the root aux files are warmed.
+    assert "adapter_config.json" in kept
+    assert "adapter_model.safetensors" in kept
+    assert "config.json" in kept and "tokenizer.json" in kept
+    # The merged / full-model weights are NOT pulled.
+    assert "model-00001-of-00002.safetensors" not in kept
+    assert "pytorch_model.bin" not in kept
+    assert "fp16/model.safetensors" not in kept
+
+
+def test_adapter_only_warms_sharded_adapter(capture):
+    """A sharded adapter (adapter_model-00001-of-00002.safetensors) is still covered by the
+    adapter_model* glob, so a large adapter is not left to an in-process Xet fetch."""
+    _, st = capture(adapter_only = True)
+    sharded = [
+        "adapter_config.json",
+        "adapter_model-00001-of-00002.safetensors",
+        "adapter_model-00002-of-00002.safetensors",
+        "adapter_model.safetensors.index.json",
+    ]
+    kept = _filter(sharded, st["allow_patterns"], st["ignore_patterns"])
+    assert set(kept) == set(sharded)
+
+
+def test_tokenizer_only_warms_only_aux_files(capture):
+    """A distinct tokenizer repo warms only its tokenizer / config / vocab files, never weights."""
+    _, st = capture(tokenizer_only = True)
+    assert st["ignore_patterns"] is None
+    assert st["allow_patterns"] == list(U._ROOT_AUX_PREFETCH_PATTERNS)
+    kept = _filter(_SAMPLE_FILES, st["allow_patterns"], st["ignore_patterns"])
+    assert "tokenizer.json" in kept and "config.json" in kept
+    assert "model-00001-of-00002.safetensors" not in kept
+    assert "adapter_model.safetensors" not in kept
+
+
+def test_subfolder_warms_subfolder_plus_root_aux(capture):
+    """A subfolder load warms that subfolder's weights plus the root tokenizer / config; the
+    root weights and OTHER subfolders are skipped."""
+    _, st = capture(subfolder = "fp16")
+    allow = st["allow_patterns"]
+    assert "fp16/*" in allow
+    assert all(p in allow for p in U._ROOT_AUX_PREFETCH_PATTERNS)
+    kept = _filter(_SAMPLE_FILES, allow, st["ignore_patterns"])
+    assert "fp16/model.safetensors" in kept
+    assert "config.json" in kept
+    assert "experimental/model-00001-of-00002.safetensors" not in kept
+
+
+def test_subfolder_takes_precedence_over_weights_at_root(capture):
+    """weights_at_root is a root-load assertion; when a subfolder IS requested the subfolder
+    branch wins (the load reads that subfolder), so the warm is the subfolder, not a
+    root-with-subdir-weights-excluded warm."""
+    _, st = capture(subfolder = "fp16", weights_at_root = True)
+    assert "fp16/*" in st["allow_patterns"]
+    kept = _filter(_SAMPLE_FILES, st["allow_patterns"], st["ignore_patterns"])
+    assert "fp16/model.safetensors" in kept
+
+
+def test_local_dir_is_not_warmed(capture, tmp_path):
+    """A local directory path has nothing to download: the warm is skipped (returns False)."""
+    d = tmp_path / "local-model"
+    d.mkdir()
+    ok = U.maybe_prefetch_hf_snapshot(str(d), weights_at_root = True)
+    assert ok is False
diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py
index 0b5e706762..0aeab14f31 100644
--- a/unsloth/models/_utils.py
+++ b/unsloth/models/_utils.py
@@ -968,6 +968,29 @@ def _run_temporary_patches(phase):
 )
 
 
+# Exactly the files a PEFT adapter load (PeftModel.from_pretrained) reads: its config and its
+# weight files (adapter_model.safetensors / .bin, sharded or not). "adapter_model*" is a glob so
+# a sharded adapter is still covered; the merged / full-model weights an adapter repo may ALSO
+# publish (model*.safetensors, pytorch_model*.bin) match none of these and so are not pulled.
+_ADAPTER_PREFETCH_PATTERNS = (
+    "adapter_config.json",
+    "adapter_model*",
+)
+
+
+# Weight files that live in a SUBDIRECTORY, not the repo root. A bare from_pretrained(model_name)
+# (no subfolder) resolves only root weight files, so for such a root load these subdir weights are
+# unread; ignoring them keeps a repo's alternate-precision / experimental weight dirs (fp16/,
+# experimental/) from being pulled by the otherwise-unfiltered warm. Hugging Face's fnmatch "*"
+# spans "/", so "*/*.safetensors" matches any nested .safetensors while a root "model.safetensors"
+# (no "/") is kept. Only applied when the caller asserts a root-only load (weights_at_root), never
+# to a diffusion pipeline warm whose component weights DO live in subfolders.
+_SUBDIR_WEIGHT_IGNORE_PATTERNS = (
+    "*/*.safetensors",
+    "*/*.bin",
+)
+
+
 def _in_requested_load_scope(filename, subfolder):
     """True if a repo-relative *filename* belongs to the location being loaded.
 
@@ -1106,6 +1129,8 @@ def maybe_prefetch_hf_snapshot(
     from_tf = False,
     from_flax = False,
     tokenizer_only = False,
+    adapter_only = False,
+    weights_at_root = False,
 ):
     """Warm the Hugging Face cache for a remote repo before the in-process load.
 
@@ -1162,12 +1187,12 @@ def maybe_prefetch_hf_snapshot(
     if fast_inference:
         return False
 
-    # A tokenizer-only warm allow-lists the exact tokenizer / config files below, so the
-    # weight-format ignore list is moot -- and skipping it avoids the model_info network
-    # call its auto branch would otherwise make for a repo whose weights we never fetch.
+    # A tokenizer-only or adapter-only warm allow-lists the exact files the load reads below, so
+    # the weight-format ignore list is moot -- and skipping it avoids the model_info network call
+    # its auto branch would otherwise make for a repo whose full weights we never fetch.
     ignore_patterns = (
         None
-        if tokenizer_only
+        if tokenizer_only or adapter_only
         else _prefetch_ignore_patterns(
             model_name,
             token = token,
@@ -1178,22 +1203,36 @@ def maybe_prefetch_hf_snapshot(
             from_flax = from_flax,
         )
     )
-    # When loading from a subfolder, warm that subfolder instead of the whole repo: a
-    # from_pretrained(..., subfolder=X) resolves every weight file under X/, so the rest
-    # is wasted bandwidth and disk. Also warm the repo-ROOT tokenizer / config files: the
-    # tokenizer / processor load reads those from the root even when the weights live in a
-    # subfolder, so a subfolder-only prefetch would leave them to an unprotected in-process
-    # download. Also warm the custom-code / tiktoken assets a trust_remote_code load fetches
-    # from the root. The root patterns are exact filenames or literal-prefixed globs (e.g.
-    # modeling_*.py), so they stay anchored to repo-root files in practice.
+    # Narrow the warm to exactly what the in-process load reads, so a repo that ships extra
+    # weights (alternate checkpoints, merged full models, alternate precisions) is not pulled in
+    # full. Every branch still warms the repo-ROOT tokenizer / config / custom-code assets a
+    # tokenizer / processor / trust_remote_code load reads from the root, so those never fall to
+    # an unprotected in-process download. The root patterns are exact filenames or literal-prefixed
+    # globs (e.g. modeling_*.py), so they stay anchored to repo-root files in practice.
     allow_patterns = None
     if tokenizer_only:
         # A distinct tokenizer repo: warm only its tokenizer / config / vocab files. Restrict
         # to those exact root filenames so we never pull weights, even if that repo also
         # happens to ship them (the weights are not what the tokenizer load reads).
         allow_patterns = list(_ROOT_AUX_PREFETCH_PATTERNS)
+    elif adapter_only:
+        # A PEFT adapter load reads only adapter_config.json + adapter_model.* (plus the root
+        # tokenizer / config it may also load). Restrict to those so an adapter repo that ALSO
+        # publishes merged / full-model weights does not pull multi-GB of weights PeftModel never
+        # reads (and risk filling disk before a small adapter loads).
+        allow_patterns = [*_ADAPTER_PREFETCH_PATTERNS, *_ROOT_AUX_PREFETCH_PATTERNS]
     elif isinstance(subfolder, str) and subfolder.strip("/"):
+        # Loading from a subfolder: a from_pretrained(..., subfolder=X) resolves every weight
+        # file under X/, so warm that subfolder (plus the root aux files) and skip the rest.
         allow_patterns = [f"{subfolder.strip('/')}/*", *_ROOT_AUX_PREFETCH_PATTERNS]
+    elif weights_at_root:
+        # A bare from_pretrained(model_name) (no subfolder) reads only the ROOT weight files.
+        # Keep the warm otherwise unfiltered (config, tokenizer, root weights) but drop weights
+        # nested in subdirectories (fp16/, experimental/, alternate-checkpoint dirs) the root
+        # load never reads. Only weight files are excluded, so a subdir's config the load might
+        # still consult stays warmed. Not applied to diffusion (its component weights live in
+        # subfolders); see FastDiffusionModel's call, which leaves weights_at_root False.
+        ignore_patterns = [*(ignore_patterns or []), *_SUBDIR_WEIGHT_IGNORE_PATTERNS]
     try:
         snapshot_download_with_xet_fallback(
             model_name,
diff --git a/unsloth/models/diffusion.py b/unsloth/models/diffusion.py
index 0010b57aaa..738b2bfe09 100644
--- a/unsloth/models/diffusion.py
+++ b/unsloth/models/diffusion.py
@@ -186,6 +186,11 @@ def from_pretrained(
         # stall, so the weight load below is a cache hit and cannot hang. Done
         # after validation so a non-diffusion repo fails on config metadata alone,
         # without first pulling multi-GB weights.
+        # subfolder is deliberately NOT forwarded: the pipeline / config / processor loads below
+        # resolve the repo ROOT (the whole DiffusionPipeline -- model_index.json plus every
+        # component subfolder), not a single subfolder, so narrowing the warm to one subfolder
+        # would leave the other components (unet/, vae/, text_encoder/) to an unprotected
+        # in-process Xet download while pulling subfolder weights the load never reads.
         maybe_prefetch_hf_snapshot(
             model_name,
             token = token,
@@ -193,7 +198,6 @@ def from_pretrained(
             cache_dir = cache_dir,
             local_files_only = local_files_only,
             fast_inference = False,
-            subfolder = kwargs.get("subfolder"),
             force_download = kwargs.get("force_download", False),
             use_safetensors = kwargs.get("use_safetensors"),
         )
diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py
index 89696c3e3b..1511cd12c0 100644
--- a/unsloth/models/llama.py
+++ b/unsloth/models/llama.py
@@ -2439,6 +2439,10 @@ def from_pretrained(
             use_safetensors = kwargs.get("use_safetensors"),
             from_tf = kwargs.get("from_tf", False),
             from_flax = kwargs.get("from_flax", False),
+            # A bare from_pretrained(model_name) reads only the ROOT weight files, so skip
+            # weights nested in subdirs (fp16/, experimental/) the load never reads. Ignored
+            # when a subfolder is set (that branch narrows the warm to the subfolder instead).
+            weights_at_root = True,
         )
         # The killable child already did the forced download; clear the flag so the
         # in-process load reuses that warm cache instead of re-forcing over Xet.
diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py
index 7afaa81b32..091a7eacfa 100644
--- a/unsloth/models/loader.py
+++ b/unsloth/models/loader.py
@@ -885,6 +885,10 @@ def from_pretrained(
                 # adapter_model.safetensors). Passing use_safetensors=False here would skip
                 # a safetensors-only adapter's weights, leaving PeftModel.from_pretrained to
                 # fetch them in-process. Leave it as auto so the adapter's format is warmed.
+                # Restrict the warm to the adapter's own files (adapter_config.json +
+                # adapter_model.*) plus the root tokenizer / config: a repo that also publishes
+                # merged full-model weights must not pull them just to load a small adapter.
+                adapter_only = True,
             )
             # The killable child already did the forced download; clear the flag so
             # the in-process load reuses that warm cache instead of re-forcing.
@@ -1843,6 +1847,10 @@ def _patched_car(
                 # adapter_model.safetensors). Passing use_safetensors=False here would skip
                 # a safetensors-only adapter's weights, leaving PeftModel.from_pretrained to
                 # fetch them in-process. Leave it as auto so the adapter's format is warmed.
+                # Restrict the warm to the adapter's own files (adapter_config.json +
+                # adapter_model.*) plus the root tokenizer / config: a repo that also publishes
+                # merged full-model weights must not pull them just to load a small adapter.
+                adapter_only = True,
             )
             # The killable child already did the forced download; clear the flag so
             # the in-process load reuses that warm cache instead of re-forcing.
diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py
index c348a6d414..bbe21b9191 100644
--- a/unsloth/models/vision.py
+++ b/unsloth/models/vision.py
@@ -827,6 +827,10 @@ def from_pretrained(
             use_safetensors = kwargs.get("use_safetensors"),
             from_tf = kwargs.get("from_tf", False),
             from_flax = kwargs.get("from_flax", False),
+            # A bare from_pretrained(model_name) reads only the ROOT weight files, so skip
+            # weights nested in subdirs (fp16/, experimental/) the load never reads. Ignored
+            # when a subfolder is set (that branch narrows the warm to the subfolder instead).
+            weights_at_root = True,
         )
         # The killable child already did the forced download; clear the flag so the
         # in-process load reuses that warm cache instead of re-forcing over Xet.

From e3e139e9766d78a3515e4d9b1bdd762987991114 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Sat, 27 Jun 2026 12:45:14 +0000
Subject: [PATCH 37/80] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 tests/test_prefetch_snapshot_scope.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test_prefetch_snapshot_scope.py b/tests/test_prefetch_snapshot_scope.py
index 6ef4f80e9c..810a2004f9 100644
--- a/tests/test_prefetch_snapshot_scope.py
+++ b/tests/test_prefetch_snapshot_scope.py
@@ -87,7 +87,7 @@ def test_weights_at_root_excludes_subdir_weights(capture):
     explicit use_safetensors avoids the auto branch's model_info network call."""
     ok, st = capture(weights_at_root = True, use_safetensors = True)
     assert ok is True
-    assert st["allow_patterns"] is None          # the warm stays otherwise unfiltered
+    assert st["allow_patterns"] is None  # the warm stays otherwise unfiltered
     ig = st["ignore_patterns"]
     assert "*/*.safetensors" in ig and "*/*.bin" in ig
     kept = _filter(_SAMPLE_FILES, st["allow_patterns"], ig)
@@ -105,7 +105,7 @@ def test_adapter_only_excludes_merged_weights(capture):
     config); a repo that also ships merged full-model weights must not pull them."""
     ok, st = capture(adapter_only = True)
     assert ok is True
-    assert st["ignore_patterns"] is None          # the exact allowlist makes the format filter moot
+    assert st["ignore_patterns"] is None  # the exact allowlist makes the format filter moot
     allow = st["allow_patterns"]
     assert "adapter_config.json" in allow and "adapter_model*" in allow
     kept = _filter(_SAMPLE_FILES, allow, st["ignore_patterns"])

From 844c92bfc0b37fb84b69df1b0357d172d2531cf4 Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Sun, 28 Jun 2026 03:06:25 +0000
Subject: [PATCH 38/80] Forward cache_dir to the VLM processor fallback; make
 the adapter and variant prewarms format-aware

Three more review findings on the cache prewarm.

vision.py: the VLM processor fallback get_auto_processor(...) was called without cache_dir, while
the primary AutoProcessor.from_pretrained right above it forwards kwargs["cache_dir"]. With a
custom cache_dir the fallback probed the default HF cache instead of the warmed one, so processor
files for these VLM fallback cases could still be downloaded in-process over Hub / Xet (or missed
on an offline retry) despite the warm cache. It now forwards cache_dir like the primary load.

_utils.py (variant): the auto weight-format branch dropped *.bin once in-scope safetensors were
present, without accounting for a requested variant. For a variant load (variant="fp16") that needs
pytorch_model.fp16.bin while the repo root ships a default (non-variant) model.safetensors, the
variant .bin was dropped from the warm, leaving the real load to fetch it over unprotected Xet. The
warm now forwards variant, and the auto branch treats only a VARIANT-matching safetensors
(model.fp16.safetensors) as proof that the variant .bin is redundant; with no variant safetensors
the .bin stays warmed. variant is forwarded from the vision / language / diffusion loads (diffusion
variants are common).

_utils.py (adapter format): the adapter_only allowlist (adapter_model*) warmed BOTH
adapter_model.safetensors and adapter_model.bin for a mixed-format adapter repo, while
PeftModel.from_pretrained reads only one (safetensors when present). The adapter warm now selects
the format the load will read: an explicit use_safetensors wins; otherwise a best-effort model_info
prefers safetensors when the repo ships it, dropping the redundant .bin. Any failure leaves both
formats eligible, so a .bin-only adapter is never under-warmed into an in-process fetch.

Tests (pure CPU, no network) cover a variant load keeping vs. dropping the .bin, and the adapter
warm preferring safetensors, keeping a .bin-only adapter, and honoring an explicit use_safetensors.
The model_info network call is neutralized by default in the test fixture.
---
 tests/test_prefetch_snapshot_scope.py | 95 +++++++++++++++++++++++++++
 unsloth/models/_utils.py              | 49 +++++++++++++-
 unsloth/models/diffusion.py           |  3 +
 unsloth/models/llama.py               |  4 ++
 unsloth/models/vision.py              |  5 ++
 5 files changed, 155 insertions(+), 1 deletion(-)

diff --git a/tests/test_prefetch_snapshot_scope.py b/tests/test_prefetch_snapshot_scope.py
index 810a2004f9..85a87758c1 100644
--- a/tests/test_prefetch_snapshot_scope.py
+++ b/tests/test_prefetch_snapshot_scope.py
@@ -55,6 +55,17 @@ def fake_download(repo_id, **kw):
     fake_module.DownloadStallError = type("DownloadStallError", (RuntimeError,), {})
     monkeypatch.setitem(sys.modules, "unsloth_zoo.hf_xet_fallback", fake_module)
 
+    # Neutralize the model_info network call (adapter format selection / use_safetensors auto
+    # branch) by default so the pure-CPU tests never reach the Hub. A best-effort failure leaves
+    # both weight formats eligible; tests that exercise format selection install their own.
+    import huggingface_hub
+
+    class _NoNetworkApi:
+        def model_info(self, *a, **k):
+            raise RuntimeError("no network in test")
+
+    monkeypatch.setattr(huggingface_hub, "HfApi", _NoNetworkApi)
+
     def run(**call_kwargs):
         state.clear()
         ok = U.maybe_prefetch_hf_snapshot("some-org/some-repo", **call_kwargs)
@@ -173,3 +184,87 @@ def test_local_dir_is_not_warmed(capture, tmp_path):
     d.mkdir()
     ok = U.maybe_prefetch_hf_snapshot(str(d), weights_at_root = True)
     assert ok is False
+
+
+def _install_fake_model_info(monkeypatch, filenames):
+    """Make HfApi().model_info(...).siblings report *filenames*, with no network."""
+    import huggingface_hub
+
+    class _Sib:
+        def __init__(self, name):
+            self.rfilename = name
+
+    class _Info:
+        def __init__(self, names):
+            self.siblings = [_Sib(n) for n in names]
+
+    class _Api:
+        def model_info(self, *a, **k):
+            return _Info(filenames)
+
+    monkeypatch.setattr(huggingface_hub, "HfApi", _Api)
+
+
+# ----- Finding P: variant-aware weight-format selection -----
+
+def test_variant_keeps_bin_when_only_default_safetensors(monkeypatch):
+    """With variant='fp16' requested, a DEFAULT model.safetensors must not prove the variant
+    pytorch_model.fp16.bin redundant: dropping it would leave the variant load to fetch the .bin
+    in-process over Xet. The .bin stays warmed (Codex #6638)."""
+    _install_fake_model_info(monkeypatch, ["model.safetensors", "pytorch_model.fp16.bin"])
+    ig = U._prefetch_ignore_patterns("org/repo", variant = "fp16")
+    assert "*.bin" not in ig
+    # No variant: the default safetensors DOES make .bin redundant (existing behavior).
+    ig_default = U._prefetch_ignore_patterns("org/repo")
+    assert "*.bin" in ig_default
+
+
+def test_variant_drops_bin_when_variant_safetensors_present(monkeypatch):
+    """When a variant-matching safetensors (model.fp16.safetensors) is shipped, the variant load
+    reads it and the variant .bin is redundant, so .bin is dropped from the warm."""
+    _install_fake_model_info(monkeypatch, ["model.fp16.safetensors", "pytorch_model.fp16.bin"])
+    ig = U._prefetch_ignore_patterns("org/repo", variant = "fp16")
+    assert "*.bin" in ig
+
+
+# ----- Finding Q: adapter weight-format selection -----
+
+def test_adapter_only_prefers_safetensors_over_bin(capture, monkeypatch):
+    """A mixed-format adapter repo (adapter_model.safetensors AND adapter_model.bin) warms only
+    the safetensors PeftModel.from_pretrained reads, not both formats (Codex #6638)."""
+    _install_fake_model_info(
+        monkeypatch, ["adapter_config.json", "adapter_model.safetensors", "adapter_model.bin"]
+    )
+    _, st = capture(adapter_only = True)
+    ig = st["ignore_patterns"]
+    assert ig is not None and "adapter_model*.bin" in ig
+    kept = _filter(
+        ["adapter_config.json", "adapter_model.safetensors", "adapter_model.bin"],
+        st["allow_patterns"], ig,
+    )
+    assert "adapter_model.safetensors" in kept
+    assert "adapter_model.bin" not in kept
+
+
+def test_adapter_only_bin_only_keeps_bin(capture, monkeypatch):
+    """A .bin-only adapter repo must keep adapter_model.bin -- never under-warm it into an
+    in-process Xet fetch (best-effort: no safetensors found -> both formats eligible)."""
+    _install_fake_model_info(monkeypatch, ["adapter_config.json", "adapter_model.bin"])
+    _, st = capture(adapter_only = True)
+    kept = _filter(
+        ["adapter_config.json", "adapter_model.bin"], st["allow_patterns"], st["ignore_patterns"]
+    )
+    assert "adapter_model.bin" in kept
+
+
+def test_adapter_only_explicit_use_safetensors_false_keeps_bin(capture):
+    """An explicit use_safetensors=False forces the .bin form without a model_info call."""
+    _, st = capture(adapter_only = True, use_safetensors = False)
+    ig = st["ignore_patterns"]
+    assert ig is not None and "adapter_model*.safetensors" in ig
+    kept = _filter(
+        ["adapter_config.json", "adapter_model.safetensors", "adapter_model.bin"],
+        st["allow_patterns"], ig,
+    )
+    assert "adapter_model.bin" in kept
+    assert "adapter_model.safetensors" not in kept
diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py
index 0aeab14f31..812dbb2ea5 100644
--- a/unsloth/models/_utils.py
+++ b/unsloth/models/_utils.py
@@ -1022,6 +1022,35 @@ def _is_model_weight_safetensors(filename):
     return not name.startswith("adapter_")
 
 
+def _filename_has_variant(filename, variant):
+    """True if a weight *filename* belongs to the requested *variant* (variant="fp16" matches
+    ``model.fp16.safetensors`` / ``model-00001-of-00002.fp16.safetensors``). Transformers inserts
+    the variant token right before the file extension, so the basename carries ``.{variant}.`` as
+    an infix. Only meaningful when a variant is requested; callers gate on *variant* being truthy."""
+    base = filename.replace("\\", "/").rsplit("/", 1)[-1]
+    return f".{variant}." in base
+
+
+def _adapter_repo_has_safetensors(model_name, *, token = None, revision = None):
+    """Best-effort: does the adapter repo ship a safetensors adapter weight
+    (``adapter_model.safetensors`` or a sharded ``adapter_model*.safetensors``)?
+    PeftModel.from_pretrained prefers safetensors, so when one is present the ``.bin`` form is
+    redundant and can be dropped from the warm. Any failure returns False so BOTH formats stay
+    eligible -- never under-warm a ``.bin``-only adapter into an in-process Xet fetch."""
+    try:
+        from huggingface_hub import HfApi
+        siblings = (
+            HfApi().model_info(model_name, revision = revision, token = token).siblings or []
+        )
+        return any(
+            sibling.rfilename.replace("\\", "/").rsplit("/", 1)[-1].startswith("adapter_model")
+            and sibling.rfilename.endswith(".safetensors")
+            for sibling in siblings
+        )
+    except Exception:
+        return False
+
+
 def _prefetch_ignore_patterns(
     model_name,
     *,
@@ -1031,6 +1060,7 @@ def _prefetch_ignore_patterns(
     use_safetensors = None,
     from_tf = False,
     from_flax = False,
+    variant = None,
 ):
     """ignore_patterns for the prewarm snapshot: the static skip list, minus the
     checkpoint guard when loading from a checkpoint-* subfolder, minus the weight
@@ -1102,10 +1132,14 @@ def _prefetch_ignore_patterns(
             # (same subfolder / root, and not an adapter / sidecar), so a .bin-only
             # subfolder is not stripped of its weights because some other path in the
             # repo ships safetensors, nor because an adapter_model.safetensors sidecar
-            # sits next to real pytorch_model.bin weights.
+            # sits next to real pytorch_model.bin weights. When a variant is requested
+            # (variant="fp16"), only a variant-matching safetensors (model.fp16.safetensors)
+            # proves the variant's .bin (pytorch_model.fp16.bin) is redundant: counting the
+            # default-format safetensors would drop the variant .bin the load actually reads.
             has_safetensors = any(
                 _is_model_weight_safetensors(sibling.rfilename)
                 and _in_requested_load_scope(sibling.rfilename, subfolder)
+                and (not variant or _filename_has_variant(sibling.rfilename, variant))
                 for sibling in siblings
             )
             if has_safetensors:
@@ -1131,6 +1165,7 @@ def maybe_prefetch_hf_snapshot(
     tokenizer_only = False,
     adapter_only = False,
     weights_at_root = False,
+    variant = None,
 ):
     """Warm the Hugging Face cache for a remote repo before the in-process load.
 
@@ -1201,6 +1236,7 @@ def maybe_prefetch_hf_snapshot(
             use_safetensors = use_safetensors,
             from_tf = from_tf,
             from_flax = from_flax,
+            variant = variant,
         )
     )
     # Narrow the warm to exactly what the in-process load reads, so a repo that ships extra
@@ -1221,6 +1257,17 @@ def maybe_prefetch_hf_snapshot(
         # publishes merged / full-model weights does not pull multi-GB of weights PeftModel never
         # reads (and risk filling disk before a small adapter loads).
         allow_patterns = [*_ADAPTER_PREFETCH_PATTERNS, *_ROOT_AUX_PREFETCH_PATTERNS]
+        # An adapter ships its weights in ONE format and PeftModel.from_pretrained reads ONE
+        # (safetensors when present), so an adapter repo carrying both adapter_model.safetensors
+        # and adapter_model.bin must not warm both. Pick the format the load will read: an
+        # explicit use_safetensors wins; otherwise prefer safetensors when the repo ships it
+        # (best-effort model_info; any failure keeps both, never under-warming a .bin-only adapter).
+        if use_safetensors is False:
+            ignore_patterns = ["adapter_model*.safetensors", "adapter_model*.safetensors.index.json"]
+        elif use_safetensors is True or _adapter_repo_has_safetensors(
+            model_name, token = token, revision = revision
+        ):
+            ignore_patterns = ["adapter_model*.bin", "adapter_model*.bin.index.json"]
     elif isinstance(subfolder, str) and subfolder.strip("/"):
         # Loading from a subfolder: a from_pretrained(..., subfolder=X) resolves every weight
         # file under X/, so warm that subfolder (plus the root aux files) and skip the rest.
diff --git a/unsloth/models/diffusion.py b/unsloth/models/diffusion.py
index 738b2bfe09..2e8c2d31d0 100644
--- a/unsloth/models/diffusion.py
+++ b/unsloth/models/diffusion.py
@@ -200,6 +200,9 @@ def from_pretrained(
             fast_inference = False,
             force_download = kwargs.get("force_download", False),
             use_safetensors = kwargs.get("use_safetensors"),
+            # Diffusion variants (variant="fp16") are common: forward it so the warm's format
+            # auto-pick keeps the variant weights the pipeline load actually reads.
+            variant = kwargs.get("variant"),
         )
 
         load_kwargs = dict(
diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py
index 1511cd12c0..2fe8c397df 100644
--- a/unsloth/models/llama.py
+++ b/unsloth/models/llama.py
@@ -2443,6 +2443,10 @@ def from_pretrained(
             # weights nested in subdirs (fp16/, experimental/) the load never reads. Ignored
             # when a subfolder is set (that branch narrows the warm to the subfolder instead).
             weights_at_root = True,
+            # A variant load (variant="fp16") reads model.fp16.* -- forward it so the warm's
+            # format auto-pick keeps the variant .bin instead of dropping it for a default
+            # safetensors the variant load never reads.
+            variant = kwargs.get("variant"),
         )
         # The killable child already did the forced download; clear the flag so the
         # in-process load reuses that warm cache instead of re-forcing over Xet.
diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py
index bbe21b9191..3888f7b0c8 100644
--- a/unsloth/models/vision.py
+++ b/unsloth/models/vision.py
@@ -831,6 +831,10 @@ def from_pretrained(
             # weights nested in subdirs (fp16/, experimental/) the load never reads. Ignored
             # when a subfolder is set (that branch narrows the warm to the subfolder instead).
             weights_at_root = True,
+            # A variant load (variant="fp16") reads model.fp16.* -- forward it so the warm's
+            # format auto-pick keeps the variant .bin instead of dropping it for a default
+            # safetensors the variant load never reads.
+            variant = kwargs.get("variant"),
         )
         # The killable child already did the forced download; clear the flag so the
         # in-process load reuses that warm cache instead of re-forcing over Xet.
@@ -1316,6 +1320,7 @@ def _acquire_processor(lfo):
                             padding_side = "left",
                             token = token,
                             trust_remote_code = trust_remote_code,
+                            cache_dir = kwargs.get("cache_dir"),
                             local_files_only = lfo,
                         )
                     except Exception:

From cafa5ab7c301fcf986bc76275ac524f79ccbb971 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Sun, 28 Jun 2026 03:06:54 +0000
Subject: [PATCH 39/80] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 tests/test_prefetch_snapshot_scope.py |  8 ++++++--
 unsloth/models/_utils.py              | 16 +++++++++++-----
 2 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/tests/test_prefetch_snapshot_scope.py b/tests/test_prefetch_snapshot_scope.py
index 85a87758c1..f72cdaf888 100644
--- a/tests/test_prefetch_snapshot_scope.py
+++ b/tests/test_prefetch_snapshot_scope.py
@@ -207,6 +207,7 @@ def model_info(self, *a, **k):
 
 # ----- Finding P: variant-aware weight-format selection -----
 
+
 def test_variant_keeps_bin_when_only_default_safetensors(monkeypatch):
     """With variant='fp16' requested, a DEFAULT model.safetensors must not prove the variant
     pytorch_model.fp16.bin redundant: dropping it would leave the variant load to fetch the .bin
@@ -229,6 +230,7 @@ def test_variant_drops_bin_when_variant_safetensors_present(monkeypatch):
 
 # ----- Finding Q: adapter weight-format selection -----
 
+
 def test_adapter_only_prefers_safetensors_over_bin(capture, monkeypatch):
     """A mixed-format adapter repo (adapter_model.safetensors AND adapter_model.bin) warms only
     the safetensors PeftModel.from_pretrained reads, not both formats (Codex #6638)."""
@@ -240,7 +242,8 @@ def test_adapter_only_prefers_safetensors_over_bin(capture, monkeypatch):
     assert ig is not None and "adapter_model*.bin" in ig
     kept = _filter(
         ["adapter_config.json", "adapter_model.safetensors", "adapter_model.bin"],
-        st["allow_patterns"], ig,
+        st["allow_patterns"],
+        ig,
     )
     assert "adapter_model.safetensors" in kept
     assert "adapter_model.bin" not in kept
@@ -264,7 +267,8 @@ def test_adapter_only_explicit_use_safetensors_false_keeps_bin(capture):
     assert ig is not None and "adapter_model*.safetensors" in ig
     kept = _filter(
         ["adapter_config.json", "adapter_model.safetensors", "adapter_model.bin"],
-        st["allow_patterns"], ig,
+        st["allow_patterns"],
+        ig,
     )
     assert "adapter_model.bin" in kept
     assert "adapter_model.safetensors" not in kept
diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py
index 812dbb2ea5..7f967401b8 100644
--- a/unsloth/models/_utils.py
+++ b/unsloth/models/_utils.py
@@ -1031,7 +1031,12 @@ def _filename_has_variant(filename, variant):
     return f".{variant}." in base
 
 
-def _adapter_repo_has_safetensors(model_name, *, token = None, revision = None):
+def _adapter_repo_has_safetensors(
+    model_name,
+    *,
+    token = None,
+    revision = None,
+):
     """Best-effort: does the adapter repo ship a safetensors adapter weight
     (``adapter_model.safetensors`` or a sharded ``adapter_model*.safetensors``)?
     PeftModel.from_pretrained prefers safetensors, so when one is present the ``.bin`` form is
@@ -1039,9 +1044,7 @@ def _adapter_repo_has_safetensors(model_name, *, token = None, revision = None):
     eligible -- never under-warm a ``.bin``-only adapter into an in-process Xet fetch."""
     try:
         from huggingface_hub import HfApi
-        siblings = (
-            HfApi().model_info(model_name, revision = revision, token = token).siblings or []
-        )
+        siblings = HfApi().model_info(model_name, revision = revision, token = token).siblings or []
         return any(
             sibling.rfilename.replace("\\", "/").rsplit("/", 1)[-1].startswith("adapter_model")
             and sibling.rfilename.endswith(".safetensors")
@@ -1263,7 +1266,10 @@ def maybe_prefetch_hf_snapshot(
         # explicit use_safetensors wins; otherwise prefer safetensors when the repo ships it
         # (best-effort model_info; any failure keeps both, never under-warming a .bin-only adapter).
         if use_safetensors is False:
-            ignore_patterns = ["adapter_model*.safetensors", "adapter_model*.safetensors.index.json"]
+            ignore_patterns = [
+                "adapter_model*.safetensors",
+                "adapter_model*.safetensors.index.json",
+            ]
         elif use_safetensors is True or _adapter_repo_has_safetensors(
             model_name, token = token, revision = revision
         ):

From 6a0bb93b61202104bf0a3bc4e65ba5ad6e652c8d Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Sun, 28 Jun 2026 03:56:55 +0000
Subject: [PATCH 40/80] Tighten prefetch variant detection, tokenizer warm
 coverage, and cached-repo probing

Address the latest review round on the from_pretrained prefetch wiring:

- Recognize sharded variant weights. Transformers writes a sharded variant as
  model.fp16-00001-of-00002.safetensors (the variant token sits before the
  -NNNNN-of-NNNNN suffix), so the variant detector now matches both the .fp16.
  single-file infix and the .fp16- sharded infix. A sharded variant safetensors is
  therefore detected and its redundant .bin is dropped from the warm rather than
  both formats being fetched.
- Warm the SentencePiece / vocab / processor files real tokenizers load
  (spm.model, normalizer.json, tokenizer.model.v3, video_preprocessor_config.json,
  and a named additional_chat_templates/<name>.jinja), so a separate-repo tokenizer
  or processor load does not fall through to an in-process Xet fetch for them.
- Skip the weight-format model_info round-trip when the repo is already cached. A
  cached load downloads nothing, so the format-drop optimization is moot and
  probing the Hub would only add a network hop to an otherwise offline-capable
  load. The cache hint comes from a cheap local try_to_load_from_cache lookup; a
  wrong "cached" guess merely keeps both formats (over-warm, never under-warm).
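- Soften the diffusion variant comment: forwarding variant keeps the warm from
  dropping a variant .bin for a non-variant safetensors; when component weights
  live in subfolders the auto branch finds no root safetensors and keeps both
  formats anyway.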

Tests: sharded-variant detection and the redundant-.bin drop; tokenizer vocab
files are warmed by a tokenizer-only prefetch; a cached repo skips the format
model_info probe and keeps both formats.
---
 tests/test_prefetch_snapshot_scope.py | 60 +++++++++++++++++++++++++++
 unsloth/models/_utils.py              | 57 ++++++++++++++++++++-----
 unsloth/models/diffusion.py           |  6 ++-
 3 files changed, 111 insertions(+), 12 deletions(-)

diff --git a/tests/test_prefetch_snapshot_scope.py b/tests/test_prefetch_snapshot_scope.py
index f72cdaf888..05932c0512 100644
--- a/tests/test_prefetch_snapshot_scope.py
+++ b/tests/test_prefetch_snapshot_scope.py
@@ -228,6 +228,66 @@ def test_variant_drops_bin_when_variant_safetensors_present(monkeypatch):
     assert "*.bin" in ig
 
 
+def test_filename_has_variant_matches_single_and_sharded():
+    """The variant detector matches both the single-file (.fp16.) and SHARDED (.fp16-) infixes and
+    rejects the default (non-variant) names (gemini #6638)."""
+    assert U._filename_has_variant("model.fp16.safetensors", "fp16") is True
+    assert U._filename_has_variant("model.fp16-00001-of-00002.safetensors", "fp16") is True
+    assert U._filename_has_variant("diffusion_pytorch_model.fp16.safetensors", "fp16") is True
+    assert U._filename_has_variant("model.safetensors", "fp16") is False
+    assert U._filename_has_variant("model-00001-of-00002.safetensors", "fp16") is False
+
+
+def test_variant_drops_bin_for_sharded_variant_safetensors(monkeypatch):
+    """A SHARDED variant safetensors (model.fp16-00001-of-00002.safetensors) is recognized, so its
+    redundant variant .bin is dropped rather than both formats warmed (gemini #6638)."""
+    _install_fake_model_info(monkeypatch, [
+        "model.fp16-00001-of-00002.safetensors",
+        "model.fp16-00002-of-00002.safetensors",
+        "pytorch_model.fp16-00001-of-00002.bin",
+    ])
+    ig = U._prefetch_ignore_patterns("org/repo", variant = "fp16")
+    assert "*.bin" in ig
+
+
+def test_tokenizer_only_warms_extra_vocab_files(capture):
+    """tokenizer_only must warm the SentencePiece / vocab / processor files real tokenizers load
+    (spm.model, normalizer.json, video_preprocessor_config.json, tokenizer.model.v3, and a named
+    additional_chat_templates/<name>.jinja) so a separate-repo tokenizer / processor load does not
+    fetch them in-process over Xet (Codex #6638)."""
+    _, st = capture(tokenizer_only = True)
+    allow = st["allow_patterns"]
+    for name in ("spm.model", "normalizer.json", "video_preprocessor_config.json", "tokenizer.model.v3"):
+        assert name in allow, name
+    sample = ["spm.model", "normalizer.json", "video_preprocessor_config.json",
+              "tokenizer.model.v3", "additional_chat_templates/custom.jinja"]
+    kept = _filter(sample, allow, st["ignore_patterns"])
+    assert set(kept) == set(sample)
+
+
+def test_cached_repo_skips_format_model_info(capture, monkeypatch):
+    """When the repo is already cached (config.json present locally), the weight-format model_info
+    probe is skipped -- a cached, offline-capable load must not add a Hub round-trip. Both formats
+    stay eligible (over-warm is safe; the cached warm downloads nothing) (Codex #6638)."""
+    import huggingface_hub
+
+    monkeypatch.setattr(
+        huggingface_hub, "try_to_load_from_cache", lambda *a, **k: "/cache/config.json"
+    )
+    called = {"n": 0}
+
+    class _Api:
+        def model_info(self, *a, **k):
+            called["n"] += 1
+            raise RuntimeError("model_info must not be called for a cached repo")
+
+    monkeypatch.setattr(huggingface_hub, "HfApi", _Api)
+    # weights_at_root with use_safetensors=None would normally hit the auto model_info branch.
+    _, st = capture(weights_at_root = True)
+    assert called["n"] == 0
+    assert "*.bin" not in (st["ignore_patterns"] or [])  # auto .bin-drop skipped, both formats kept
+
+
 # ----- Finding Q: adapter weight-format selection -----
 
 
diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py
index 7f967401b8..aa4be5cf20 100644
--- a/unsloth/models/_utils.py
+++ b/unsloth/models/_utils.py
@@ -949,10 +949,20 @@ def _run_temporary_patches(phase):
     "vocab.txt",
     "merges.txt",
     "spiece.model",
+    # Additional SentencePiece / vocab files that are real from_pretrained load targets
+    # (VOCAB_FILES_NAMES entries) but are NOT covered by the names above: spm.model
+    # (DeBERTa-v2), normalizer.json (Whisper), tokenizer.model.v3 (Mistral versioned SP).
+    "spm.model",
+    "normalizer.json",
+    "tokenizer.model.v3",
     "chat_template.jinja",
     "chat_template.json",
+    # A non-default chat_template="<name>" load fetches additional_chat_templates/<name>.jinja.
+    "additional_chat_templates/*.jinja",
     "preprocessor_config.json",
     "processor_config.json",
+    # Video processors (e.g. Qwen2.5-VL video) read a dedicated video_preprocessor_config.json.
+    "video_preprocessor_config.json",
     # Custom-code entry points a trust_remote_code config / model / tokenizer / processor
     # load fetches from the repo root. Each carries a literal prefix, so it stays effectively
     # root-anchored, and matches nothing on a non-remote-code repo (harmless there).
@@ -1023,12 +1033,15 @@ def _is_model_weight_safetensors(filename):
 
 
 def _filename_has_variant(filename, variant):
-    """True if a weight *filename* belongs to the requested *variant* (variant="fp16" matches
-    ``model.fp16.safetensors`` / ``model-00001-of-00002.fp16.safetensors``). Transformers inserts
-    the variant token right before the file extension, so the basename carries ``.{variant}.`` as
-    an infix. Only meaningful when a variant is requested; callers gate on *variant* being truthy."""
+    """True if a weight *filename* belongs to the requested *variant* (variant="fp16"). Transformers
+    inserts the variant token right before the extension, so a single-file weight carries it as a
+    ``.{variant}.`` infix (``model.fp16.safetensors``), while a SHARDED weight carries it as a
+    ``.{variant}-`` infix before the ``-NNNNN-of-NNNNN`` suffix
+    (``model.fp16-00001-of-00002.safetensors``). Both shapes are matched so a sharded variant
+    safetensors is recognized -- else its redundant ``.bin`` would not be dropped. Only meaningful
+    when a variant is requested; callers gate on *variant* being truthy."""
     base = filename.replace("\\", "/").rsplit("/", 1)[-1]
-    return f".{variant}." in base
+    return f".{variant}." in base or f".{variant}-" in base
 
 
 def _adapter_repo_has_safetensors(
@@ -1064,6 +1077,7 @@ def _prefetch_ignore_patterns(
     from_tf = False,
     from_flax = False,
     variant = None,
+    skip_format_probe = False,
 ):
     """ignore_patterns for the prewarm snapshot: the static skip list, minus the
     checkpoint guard when loading from a checkpoint-* subfolder, minus the weight
@@ -1114,6 +1128,10 @@ def _prefetch_ignore_patterns(
     elif use_safetensors is False:
         # Explicit .bin: the load never reads safetensors, so skip them.
         ignore_patterns.extend(("*.safetensors", "*.safetensors.index.json"))
+    elif skip_format_probe:
+        # The repo is already cached, so nothing downloads and the format-drop optimization is
+        # moot: skip the model_info network call and leave both formats eligible.
+        pass
     else:
         # Auto (use_safetensors is None): skip .bin only once in-scope safetensors are
         # confirmed to load instead, since Transformers prefers them. Best-effort: any
@@ -1225,9 +1243,26 @@ def maybe_prefetch_hf_snapshot(
     if fast_inference:
         return False
 
-    # A tokenizer-only or adapter-only warm allow-lists the exact files the load reads below, so
-    # the weight-format ignore list is moot -- and skipping it avoids the model_info network call
-    # its auto branch would otherwise make for a repo whose full weights we never fetch.
+    # Skip the weight-format model_info round-trips below when the repo is already on disk: a cached
+    # repo downloads nothing (the guarded warm short-circuits on the cache regardless of which format
+    # the ignore list would drop), so probing the Hub to choose a format would only add a network hop
+    # to an otherwise offline-capable cached load. try_to_load_from_cache is a cheap LOCAL lookup; a
+    # wrong "cached" guess merely keeps BOTH formats (over-warm, never under-warm). Adapter repos key
+    # off adapter_config.json, everything else off config.json.
+    skip_format_probe = False
+    try:
+        from huggingface_hub import try_to_load_from_cache
+        _probe_name = "adapter_config.json" if adapter_only else "config.json"
+        skip_format_probe = isinstance(
+            try_to_load_from_cache(model_name, _probe_name, cache_dir = cache_dir), str
+        )
+    except Exception:
+        skip_format_probe = False
+
+    # A tokenizer-only warm allow-lists the exact tokenizer / config files below, so the weight-
+    # format ignore list is moot -- and skipping it avoids the model_info network call the auto
+    # branch would otherwise make. (An adapter-only warm sets its own format ignore further down,
+    # gated on skip_format_probe so a cached adapter needs no model_info.)
     ignore_patterns = (
         None
         if tokenizer_only or adapter_only
@@ -1240,6 +1275,7 @@ def maybe_prefetch_hf_snapshot(
             from_tf = from_tf,
             from_flax = from_flax,
             variant = variant,
+            skip_format_probe = skip_format_probe,
         )
     )
     # Narrow the warm to exactly what the in-process load reads, so a repo that ships extra
@@ -1270,8 +1306,9 @@ def maybe_prefetch_hf_snapshot(
                 "adapter_model*.safetensors",
                 "adapter_model*.safetensors.index.json",
             ]
-        elif use_safetensors is True or _adapter_repo_has_safetensors(
-            model_name, token = token, revision = revision
+        elif use_safetensors is True or (
+            not skip_format_probe
+            and _adapter_repo_has_safetensors(model_name, token = token, revision = revision)
         ):
             ignore_patterns = ["adapter_model*.bin", "adapter_model*.bin.index.json"]
     elif isinstance(subfolder, str) and subfolder.strip("/"):
diff --git a/unsloth/models/diffusion.py b/unsloth/models/diffusion.py
index 2e8c2d31d0..5806959122 100644
--- a/unsloth/models/diffusion.py
+++ b/unsloth/models/diffusion.py
@@ -200,8 +200,10 @@ def from_pretrained(
             fast_inference = False,
             force_download = kwargs.get("force_download", False),
             use_safetensors = kwargs.get("use_safetensors"),
-            # Diffusion variants (variant="fp16") are common: forward it so the warm's format
-            # auto-pick keeps the variant weights the pipeline load actually reads.
+            # Diffusion variants (variant="fp16") are common: forward it so the warm never drops a
+            # variant .bin for a non-variant safetensors. (When component weights live in
+            # subfolders the auto branch finds no root safetensors and keeps both formats anyway;
+            # forwarding variant simply makes that correct rather than incidental.)
             variant = kwargs.get("variant"),
         )
 

From b8010f79fa18901bdef4a22c684d74cc4f284446 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Sun, 28 Jun 2026 03:57:43 +0000
Subject: [PATCH 41/80] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 tests/test_prefetch_snapshot_scope.py | 29 +++++++++++++++++++--------
 1 file changed, 21 insertions(+), 8 deletions(-)

diff --git a/tests/test_prefetch_snapshot_scope.py b/tests/test_prefetch_snapshot_scope.py
index 05932c0512..7f01103d55 100644
--- a/tests/test_prefetch_snapshot_scope.py
+++ b/tests/test_prefetch_snapshot_scope.py
@@ -241,11 +241,14 @@ def test_filename_has_variant_matches_single_and_sharded():
 def test_variant_drops_bin_for_sharded_variant_safetensors(monkeypatch):
     """A SHARDED variant safetensors (model.fp16-00001-of-00002.safetensors) is recognized, so its
     redundant variant .bin is dropped rather than both formats warmed (gemini #6638)."""
-    _install_fake_model_info(monkeypatch, [
-        "model.fp16-00001-of-00002.safetensors",
-        "model.fp16-00002-of-00002.safetensors",
-        "pytorch_model.fp16-00001-of-00002.bin",
-    ])
+    _install_fake_model_info(
+        monkeypatch,
+        [
+            "model.fp16-00001-of-00002.safetensors",
+            "model.fp16-00002-of-00002.safetensors",
+            "pytorch_model.fp16-00001-of-00002.bin",
+        ],
+    )
     ig = U._prefetch_ignore_patterns("org/repo", variant = "fp16")
     assert "*.bin" in ig
 
@@ -257,10 +260,20 @@ def test_tokenizer_only_warms_extra_vocab_files(capture):
     fetch them in-process over Xet (Codex #6638)."""
     _, st = capture(tokenizer_only = True)
     allow = st["allow_patterns"]
-    for name in ("spm.model", "normalizer.json", "video_preprocessor_config.json", "tokenizer.model.v3"):
+    for name in (
+        "spm.model",
+        "normalizer.json",
+        "video_preprocessor_config.json",
+        "tokenizer.model.v3",
+    ):
         assert name in allow, name
-    sample = ["spm.model", "normalizer.json", "video_preprocessor_config.json",
-              "tokenizer.model.v3", "additional_chat_templates/custom.jinja"]
+    sample = [
+        "spm.model",
+        "normalizer.json",
+        "video_preprocessor_config.json",
+        "tokenizer.model.v3",
+        "additional_chat_templates/custom.jinja",
+    ]
     kept = _filter(sample, allow, st["ignore_patterns"])
     assert set(kept) == set(sample)
 

From c3d674ba28d606d0442fbbee0a867bd5c44d6e5c Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Sun, 28 Jun 2026 04:59:55 +0000
Subject: [PATCH 42/80] Tighten prefetch format selection, tokenizer vocab
 coverage, and the vLLM tokenizer warm

Address the latest Codex review round on the from_pretrained prefetch wiring:

- Do not let a cached config.json skip the weight-format probe. AutoConfig caches
  config.json before this helper runs in the Llama / diffusion paths, so a config-based
  "already cached" guess left BOTH weight formats eligible and over-fetched both
  multi-GB sets even when no weights were cached. The auto branch consults model_info
  again to drop the redundant .bin for a safetensors repo; the format-probe optimization
  that keyed off config.json is removed.
- Do not count a training-state optimizer.safetensors / scheduler.safetensors sidecar as
  model safetensors. A repo whose real weights are pytorch_model.bin alongside an
  optimizer.safetensors now keeps its .bin instead of dropping it, so the in-process
  load does not have to fetch the only weights over Xet without the fallback.
- Warm the slow-tokenizer SentencePiece / BPE vocab files AutoTokenizer fetches first
  (sentencepiece.bpe.model for XLM-R / mBART / CamemBERT, source.spm / target.spm for
  Marian, bpe.codes for FSMT / XLM, vocab.bpe), so a separate tokenizer load does not
  fall through to an in-process Xet fetch for them.
- On the vLLM fast-inference path, do not warm the tokenizer for the pre-remap
  model_name. fast_inference_setup may rewrite a *-unsloth-bnb-4bit name to the
  *-bnb-4bit repo the load actually reads, and vLLM downloads that repo -- including its
  tokenizer -- before the in-process tokenizer load, so the pre-remap warm targeted the
  wrong repo. Only a SEPARATE explicit tokenizer repo is warmed here now.

Tests: the format probe still drops .bin when config.json is cached but weights are not;
optimizer.safetensors does not drop .bin while model.safetensors still does;
_is_model_weight_safetensors is unit-tested directly (model weights count, adapter and
trainer-state sidecars do not); the slow SentencePiece / BPE vocab files are warmed by a
tokenizer-only prefetch.
---
 tests/test_prefetch_snapshot_scope.py | 62 +++++++++++++++++-----
 unsloth/models/_utils.py              | 74 ++++++++++++++-------------
 unsloth/models/vision.py              | 17 +++---
 3 files changed, 97 insertions(+), 56 deletions(-)

diff --git a/tests/test_prefetch_snapshot_scope.py b/tests/test_prefetch_snapshot_scope.py
index 7f01103d55..470d83e8b0 100644
--- a/tests/test_prefetch_snapshot_scope.py
+++ b/tests/test_prefetch_snapshot_scope.py
@@ -278,27 +278,61 @@ def test_tokenizer_only_warms_extra_vocab_files(capture):
     assert set(kept) == set(sample)
 
 
-def test_cached_repo_skips_format_model_info(capture, monkeypatch):
-    """When the repo is already cached (config.json present locally), the weight-format model_info
-    probe is skipped -- a cached, offline-capable load must not add a Hub round-trip. Both formats
-    stay eligible (over-warm is safe; the cached warm downloads nothing) (Codex #6638)."""
+def test_format_probe_runs_even_when_config_cached(capture, monkeypatch):
+    """A cached config.json must NOT skip the weight-format probe: AutoConfig caches config.json
+    before this helper runs (Llama / diffusion), so a config-based "cached" guess would leave both
+    formats eligible and over-fetch both multi-GB weight sets even when no weights are cached. The
+    auto branch still consults model_info and drops the redundant .bin for a safetensors repo
+    (Codex #6638)."""
     import huggingface_hub
 
+    # Pretend config.json is locally cached (the AutoConfig side effect). This must not gate the probe.
     monkeypatch.setattr(
         huggingface_hub, "try_to_load_from_cache", lambda *a, **k: "/cache/config.json"
     )
-    called = {"n": 0}
+    _install_fake_model_info(monkeypatch, ["model.safetensors", "pytorch_model.bin"])
+    _, st = capture(weights_at_root = True)
+    ig = st["ignore_patterns"] or []
+    assert "*.bin" in ig  # redundant .bin dropped because real model safetensors is present
 
-    class _Api:
-        def model_info(self, *a, **k):
-            called["n"] += 1
-            raise RuntimeError("model_info must not be called for a cached repo")
 
-    monkeypatch.setattr(huggingface_hub, "HfApi", _Api)
-    # weights_at_root with use_safetensors=None would normally hit the auto model_info branch.
-    _, st = capture(weights_at_root = True)
-    assert called["n"] == 0
-    assert "*.bin" not in (st["ignore_patterns"] or [])  # auto .bin-drop skipped, both formats kept
+def test_optimizer_safetensors_does_not_drop_bin(monkeypatch):
+    """A training-state optimizer.safetensors sidecar must NOT count as model safetensors: a repo
+    whose real weights are pytorch_model.bin alongside an optimizer.safetensors must keep its .bin,
+    else the in-process load fetches the only weights over Xet without the fallback (Codex #6638)."""
+    _install_fake_model_info(monkeypatch, ["pytorch_model.bin", "optimizer.safetensors"])
+    ig = U._prefetch_ignore_patterns("org/repo")
+    assert "*.bin" not in ig  # .bin is the only real weight -> not dropped
+
+
+def test_model_safetensors_still_drops_bin(monkeypatch):
+    """Control for the optimizer case: a real model.safetensors next to pytorch_model.bin still
+    drops the redundant .bin (the sidecar exclusion must not over-trigger) (Codex #6638)."""
+    _install_fake_model_info(monkeypatch, ["model.safetensors", "pytorch_model.bin", "optimizer.safetensors"])
+    ig = U._prefetch_ignore_patterns("org/repo")
+    assert "*.bin" in ig
+
+
+def test_is_model_weight_safetensors_classification():
+    """Direct unit coverage: real model weights count, adapter / trainer-state sidecars do not."""
+    assert U._is_model_weight_safetensors("model.safetensors") is True
+    assert U._is_model_weight_safetensors("model-00001-of-00002.safetensors") is True
+    assert U._is_model_weight_safetensors("model.safetensors.index.json") is True
+    assert U._is_model_weight_safetensors("consolidated.safetensors") is True
+    assert U._is_model_weight_safetensors("adapter_model.safetensors") is False
+    assert U._is_model_weight_safetensors("optimizer.safetensors") is False
+    assert U._is_model_weight_safetensors("scheduler.safetensors") is False
+    assert U._is_model_weight_safetensors("rng_state_0.safetensors") is False
+
+
+def test_tokenizer_only_warms_slow_sentencepiece_vocab(capture):
+    """tokenizer_only must warm the slow-tokenizer SentencePiece / BPE vocab files AutoTokenizer
+    fetches first (sentencepiece.bpe.model for XLM-R / mBART, source.spm / target.spm for Marian,
+    bpe.codes / vocab.bpe), so they are not left to an in-process Xet fetch (Codex #6638)."""
+    _, st = capture(tokenizer_only = True)
+    allow = st["allow_patterns"]
+    for name in ("sentencepiece.bpe.model", "source.spm", "target.spm", "bpe.codes", "vocab.bpe"):
+        assert name in allow, name
 
 
 # ----- Finding Q: adapter weight-format selection -----
diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py
index aa4be5cf20..865eec0154 100644
--- a/unsloth/models/_utils.py
+++ b/unsloth/models/_utils.py
@@ -951,10 +951,18 @@ def _run_temporary_patches(phase):
     "spiece.model",
     # Additional SentencePiece / vocab files that are real from_pretrained load targets
     # (VOCAB_FILES_NAMES entries) but are NOT covered by the names above: spm.model
-    # (DeBERTa-v2), normalizer.json (Whisper), tokenizer.model.v3 (Mistral versioned SP).
+    # (DeBERTa-v2), normalizer.json (Whisper), tokenizer.model.v3 (Mistral versioned SP),
+    # sentencepiece.bpe.model (XLM-R / mBART / CamemBERT -- Unsloth tries the slow tokenizer
+    # first, which fetches these), source.spm / target.spm (Marian), bpe.codes (FSMT / XLM),
+    # vocab.bpe (GPT-2-style merges asset).
     "spm.model",
     "normalizer.json",
     "tokenizer.model.v3",
+    "sentencepiece.bpe.model",
+    "source.spm",
+    "target.spm",
+    "bpe.codes",
+    "vocab.bpe",
     "chat_template.jinja",
     "chat_template.json",
     # A non-default chat_template="<name>" load fetches additional_chat_templates/<name>.jinja.
@@ -1017,19 +1025,36 @@ def _in_requested_load_scope(filename, subfolder):
     return "/" not in filename
 
 
+# Training-state files that carry a .safetensors suffix but are NOT loadable model weights. A
+# Trainer checkpoint can ship optimizer.safetensors / scheduler.safetensors next to the real
+# pytorch_model.bin, and counting one as "model safetensors present" would drop the needed .bin.
+_NON_MODEL_WEIGHT_STEMS = frozenset({
+    "optimizer", "scheduler", "scaler", "rng_state", "training_args",
+})
+
+
 def _is_model_weight_safetensors(filename):
     """True if a repo-relative *filename* is a model-weights safetensors file rather than
-    a PEFT adapter / sidecar (e.g. ``adapter_model.safetensors``).
+    a PEFT adapter / sidecar (e.g. ``adapter_model.safetensors``) or a trainer-state file
+    (``optimizer.safetensors``).
 
     Only a real model-weights safetensors proves the ``.bin`` full-model weights are
-    redundant. A repo can ship an ``adapter_model.safetensors`` sidecar while its actual
-    weights are ``pytorch_model.bin``; counting the sidecar would wrongly skip the needed
-    ``.bin`` and leave the in-process load to fetch it without the Xet fallback.
+    redundant. A repo can ship an ``adapter_model.safetensors`` sidecar -- or an
+    ``optimizer.safetensors`` training-state file -- while its actual weights are
+    ``pytorch_model.bin``; counting those would wrongly skip the needed ``.bin`` and leave the
+    in-process load to fetch it without the Xet fallback.
     """
     name = filename.replace("\\", "/").rsplit("/", 1)[-1]
     if not name.endswith((".safetensors", ".safetensors.index.json")):
         return False
-    return not name.startswith("adapter_")
+    if name.startswith("adapter_"):
+        return False
+    # Stem before the first dot: "optimizer.safetensors" -> "optimizer", "rng_state_0..." -> caught
+    # by the prefix test; a real weight ("model", "model-00001-of-00002", "consolidated") is kept.
+    stem = name.split(".", 1)[0].lower()
+    if stem in _NON_MODEL_WEIGHT_STEMS or stem.startswith("rng_state"):
+        return False
+    return True
 
 
 def _filename_has_variant(filename, variant):
@@ -1077,7 +1102,6 @@ def _prefetch_ignore_patterns(
     from_tf = False,
     from_flax = False,
     variant = None,
-    skip_format_probe = False,
 ):
     """ignore_patterns for the prewarm snapshot: the static skip list, minus the
     checkpoint guard when loading from a checkpoint-* subfolder, minus the weight
@@ -1128,10 +1152,6 @@ def _prefetch_ignore_patterns(
     elif use_safetensors is False:
         # Explicit .bin: the load never reads safetensors, so skip them.
         ignore_patterns.extend(("*.safetensors", "*.safetensors.index.json"))
-    elif skip_format_probe:
-        # The repo is already cached, so nothing downloads and the format-drop optimization is
-        # moot: skip the model_info network call and leave both formats eligible.
-        pass
     else:
         # Auto (use_safetensors is None): skip .bin only once in-scope safetensors are
         # confirmed to load instead, since Transformers prefers them. Best-effort: any
@@ -1243,26 +1263,12 @@ def maybe_prefetch_hf_snapshot(
     if fast_inference:
         return False
 
-    # Skip the weight-format model_info round-trips below when the repo is already on disk: a cached
-    # repo downloads nothing (the guarded warm short-circuits on the cache regardless of which format
-    # the ignore list would drop), so probing the Hub to choose a format would only add a network hop
-    # to an otherwise offline-capable cached load. try_to_load_from_cache is a cheap LOCAL lookup; a
-    # wrong "cached" guess merely keeps BOTH formats (over-warm, never under-warm). Adapter repos key
-    # off adapter_config.json, everything else off config.json.
-    skip_format_probe = False
-    try:
-        from huggingface_hub import try_to_load_from_cache
-        _probe_name = "adapter_config.json" if adapter_only else "config.json"
-        skip_format_probe = isinstance(
-            try_to_load_from_cache(model_name, _probe_name, cache_dir = cache_dir), str
-        )
-    except Exception:
-        skip_format_probe = False
-
-    # A tokenizer-only warm allow-lists the exact tokenizer / config files below, so the weight-
-    # format ignore list is moot -- and skipping it avoids the model_info network call the auto
-    # branch would otherwise make. (An adapter-only warm sets its own format ignore further down,
-    # gated on skip_format_probe so a cached adapter needs no model_info.)
+    # A tokenizer-only or adapter-only warm allow-lists the exact files the load reads below, so the
+    # weight-format ignore list is moot -- and skipping it avoids the model_info network call the
+    # auto branch would otherwise make. (An adapter-only warm sets its own format ignore further
+    # down.) The format probe keys off an ACTUAL weight file, not config.json: AutoConfig caches
+    # config.json before this helper runs in the Llama / diffusion paths, so a config-based "cached"
+    # guess would skip the .bin-drop even when no weights are cached and over-fetch both formats.
     ignore_patterns = (
         None
         if tokenizer_only or adapter_only
@@ -1275,7 +1281,6 @@ def maybe_prefetch_hf_snapshot(
             from_tf = from_tf,
             from_flax = from_flax,
             variant = variant,
-            skip_format_probe = skip_format_probe,
         )
     )
     # Narrow the warm to exactly what the in-process load reads, so a repo that ships extra
@@ -1306,9 +1311,8 @@ def maybe_prefetch_hf_snapshot(
                 "adapter_model*.safetensors",
                 "adapter_model*.safetensors.index.json",
             ]
-        elif use_safetensors is True or (
-            not skip_format_probe
-            and _adapter_repo_has_safetensors(model_name, token = token, revision = revision)
+        elif use_safetensors is True or _adapter_repo_has_safetensors(
+            model_name, token = token, revision = revision
         ):
             ignore_patterns = ["adapter_model*.bin", "adapter_model*.bin.index.json"]
     elif isinstance(subfolder, str) and subfolder.strip("/"):
diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py
index 3888f7b0c8..8835e3151e 100644
--- a/unsloth/models/vision.py
+++ b/unsloth/models/vision.py
@@ -841,19 +841,22 @@ def from_pretrained(
         if _prefetched and kwargs.get("force_download", False):
             kwargs["force_download"] = False
 
-        # The tokenizer / processor loads in-process below (auto_processor / AutoTokenizer)
-        # regardless of the vLLM weight path, so a stalled Xet download of its tokenizer /
-        # processor / config files could still hang from_pretrained. Warm that repo's files
-        # through the same killable subprocess. The base prefetch above already covered them
-        # when it warmed model_name itself, so only warm here when the tokenizer / processor
-        # comes from a different repo, or when fast_inference skipped the base warm entirely.
+        # The tokenizer / processor loads in-process below (auto_processor / AutoTokenizer), so a
+        # stalled Xet download of its tokenizer / processor / config files could still hang
+        # from_pretrained. Warm a SEPARATE tokenizer repo (explicit tokenizer_name) through the
+        # killable subprocess. When the tokenizer comes from model_name itself, it is already
+        # covered: the base prefetch above warmed model_name, and on the vLLM path
+        # (_vllm_owns_weights) vLLM downloads its model repo -- including the tokenizer -- before
+        # the in-process tokenizer load. We must NOT warm model_name here on the vLLM path: that
+        # warm runs before fast_inference_setup may rewrite a "*-unsloth-bnb-4bit" name to the
+        # "*-bnb-4bit" repo the load actually reads, so it would warm the wrong (pre-remap) repo.
         _tokenizer_repo = (
             tokenizer_name if (isinstance(tokenizer_name, str) and tokenizer_name) else model_name
         )
         _warm_tokenizer_repo = (
             isinstance(_tokenizer_repo, str)
             and bool(_tokenizer_repo)
-            and (_tokenizer_repo != model_name or _vllm_owns_weights)
+            and _tokenizer_repo != model_name
         )
         if _warm_tokenizer_repo:
             maybe_prefetch_hf_snapshot(

From be49ab1bcaa7267ecf25f19c7d80dc9302914c63 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Sun, 28 Jun 2026 05:00:38 +0000
Subject: [PATCH 43/80] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 tests/test_prefetch_snapshot_scope.py |  4 +++-
 unsloth/models/_utils.py              | 12 +++++++++---
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/tests/test_prefetch_snapshot_scope.py b/tests/test_prefetch_snapshot_scope.py
index 470d83e8b0..2685fd732a 100644
--- a/tests/test_prefetch_snapshot_scope.py
+++ b/tests/test_prefetch_snapshot_scope.py
@@ -308,7 +308,9 @@ def test_optimizer_safetensors_does_not_drop_bin(monkeypatch):
 def test_model_safetensors_still_drops_bin(monkeypatch):
     """Control for the optimizer case: a real model.safetensors next to pytorch_model.bin still
     drops the redundant .bin (the sidecar exclusion must not over-trigger) (Codex #6638)."""
-    _install_fake_model_info(monkeypatch, ["model.safetensors", "pytorch_model.bin", "optimizer.safetensors"])
+    _install_fake_model_info(
+        monkeypatch, ["model.safetensors", "pytorch_model.bin", "optimizer.safetensors"]
+    )
     ig = U._prefetch_ignore_patterns("org/repo")
     assert "*.bin" in ig
 
diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py
index 865eec0154..0ce88fb520 100644
--- a/unsloth/models/_utils.py
+++ b/unsloth/models/_utils.py
@@ -1028,9 +1028,15 @@ def _in_requested_load_scope(filename, subfolder):
 # Training-state files that carry a .safetensors suffix but are NOT loadable model weights. A
 # Trainer checkpoint can ship optimizer.safetensors / scheduler.safetensors next to the real
 # pytorch_model.bin, and counting one as "model safetensors present" would drop the needed .bin.
-_NON_MODEL_WEIGHT_STEMS = frozenset({
-    "optimizer", "scheduler", "scaler", "rng_state", "training_args",
-})
+_NON_MODEL_WEIGHT_STEMS = frozenset(
+    {
+        "optimizer",
+        "scheduler",
+        "scaler",
+        "rng_state",
+        "training_args",
+    }
+)
 
 
 def _is_model_weight_safetensors(filename):

From 4710764f1f046f3d2fb27e6b7759a6e0d6d279fc Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Sun, 28 Jun 2026 05:41:40 +0000
Subject: [PATCH 44/80] Warm any trust_remote_code module, not just the
 modeling_*.py convention

The aux prefetch list warmed remote-code entry points by prefix glob (modeling_*.py,
tokenization_*.py, ...), but a trust_remote_code auto_map can point at an arbitrarily
named module (modeling.py, tokenization.py, my_code.py) that does not use those prefixes.
The prefix globs then miss the file, so a subfolder / tokenizer-only warm reports success
while the in-process AutoConfig / AutoModel / tokenizer load fetches the code file over
Xet without the stall fallback.

Warm every *.py instead. Python files are tiny, a non-remote-code repo typically ships
none (harmless), and HF fnmatch "*" spans "/" so nested helper modules are covered too.

Tests: the aux warm keeps an arbitrarily named auto_map module (modeling.py,
tokenization.py, my_custom_code.py), not only the modeling_*.py convention.
---
 tests/test_prefetch_snapshot_scope.py | 19 +++++++++++++++++++
 unsloth/models/_utils.py              | 17 +++++++----------
 2 files changed, 26 insertions(+), 10 deletions(-)

diff --git a/tests/test_prefetch_snapshot_scope.py b/tests/test_prefetch_snapshot_scope.py
index 2685fd732a..fecfcd3744 100644
--- a/tests/test_prefetch_snapshot_scope.py
+++ b/tests/test_prefetch_snapshot_scope.py
@@ -155,6 +155,25 @@ def test_tokenizer_only_warms_only_aux_files(capture):
     assert "adapter_model.safetensors" not in kept
 
 
+def test_aux_warm_covers_arbitrary_remote_code_modules(capture):
+    """A trust_remote_code auto_map can name its module arbitrarily (modeling.py, tokenization.py,
+    my_code.py), not just the transformers modeling_*.py convention, so the aux warm must cover any
+    *.py -- else the load fetches the code file in-process over Xet (Codex #6638)."""
+    _, st = capture(tokenizer_only = True)
+    allow = st["allow_patterns"]
+    assert "*.py" in allow
+    remote_code = [
+        "config.json",
+        "modeling.py",            # auto_map "modeling.Model" -- no underscore suffix
+        "tokenization.py",
+        "my_custom_code.py",
+        "configuration_foo.py",   # the convention still covered by *.py too
+    ]
+    kept = _filter(remote_code, allow, st["ignore_patterns"])
+    for name in ("modeling.py", "tokenization.py", "my_custom_code.py", "configuration_foo.py"):
+        assert name in kept, name
+
+
 def test_subfolder_warms_subfolder_plus_root_aux(capture):
     """A subfolder load warms that subfolder's weights plus the root tokenizer / config; the
     root weights and OTHER subfolders are skipped."""
diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py
index 0ce88fb520..292ef6947f 100644
--- a/unsloth/models/_utils.py
+++ b/unsloth/models/_utils.py
@@ -971,16 +971,13 @@ def _run_temporary_patches(phase):
     "processor_config.json",
     # Video processors (e.g. Qwen2.5-VL video) read a dedicated video_preprocessor_config.json.
     "video_preprocessor_config.json",
-    # Custom-code entry points a trust_remote_code config / model / tokenizer / processor
-    # load fetches from the repo root. Each carries a literal prefix, so it stays effectively
-    # root-anchored, and matches nothing on a non-remote-code repo (harmless there).
-    "configuration_*.py",
-    "modeling_*.py",
-    "tokenization_*.py",
-    "processing_*.py",
-    "image_processing_*.py",
-    "feature_extraction_*.py",
-    "video_processing_*.py",
+    # Custom-code entry points a trust_remote_code config / model / tokenizer / processor load
+    # fetches via auto_map. The module name is arbitrary: the transformers convention is
+    # modeling_*.py / tokenization_*.py, but an auto_map can point at any module (modeling.py,
+    # tokenization.py, my_code.py), so prefix globs would miss it and leave the .py to an
+    # in-process Xet fetch. Warm every *.py instead -- they are tiny, and a non-remote-code repo
+    # ships none (harmless). HF fnmatch "*" spans "/", so nested helper modules are covered too.
+    "*.py",
     # tiktoken vocab assets (e.g. Qwen's qwen.tiktoken) a custom tokenizer can require.
     "*.tiktoken",
 )

From 1c99378f8c3ffa36352709c5aaa1f44fb1d9a066 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Sun, 28 Jun 2026 05:42:19 +0000
Subject: [PATCH 45/80] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 tests/test_prefetch_snapshot_scope.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test_prefetch_snapshot_scope.py b/tests/test_prefetch_snapshot_scope.py
index fecfcd3744..327c87c0d1 100644
--- a/tests/test_prefetch_snapshot_scope.py
+++ b/tests/test_prefetch_snapshot_scope.py
@@ -164,10 +164,10 @@ def test_aux_warm_covers_arbitrary_remote_code_modules(capture):
     assert "*.py" in allow
     remote_code = [
         "config.json",
-        "modeling.py",            # auto_map "modeling.Model" -- no underscore suffix
+        "modeling.py",  # auto_map "modeling.Model" -- no underscore suffix
         "tokenization.py",
         "my_custom_code.py",
-        "configuration_foo.py",   # the convention still covered by *.py too
+        "configuration_foo.py",  # the convention still covered by *.py too
     ]
     kept = _filter(remote_code, allow, st["ignore_patterns"])
     for name in ("modeling.py", "tokenization.py", "my_custom_code.py", "configuration_foo.py"):

From 0b60a20a52fa15d3e0b300a676a14d4fd1810b99 Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Sun, 28 Jun 2026 06:42:23 +0000
Subject: [PATCH 46/80] Scope the adapter format probe to root, and warm the
 requested GGUF file

Address the latest Codex review round on the from_pretrained prefetch wiring:

- Scope _adapter_repo_has_safetensors to repo-ROOT files. It stripped directories before
  deciding whether a safetensors adapter exists, so an unrelated
  checkpoint-*/adapter_model.safetensors in a subdirectory made it report safetensors
  present and drop the root adapter_model.bin from the warm -- leaving PeftModel to fetch
  the only root adapter weight in-process over Xet. The adapter warm only pulls root
  adapter_model*, so the format probe now matches that root scope.
- Warm the requested GGUF when gguf_file is passed. from_pretrained(model_name,
  gguf_file=NAME) resolves that GGUF from the Hub, but the static prewarm ignore list
  always dropped *.gguf, so the file was never warmed and the load fetched it in-process
  over Xet. maybe_prefetch_hf_snapshot now takes gguf_file (threaded from the Llama /
  vision base warms) and allow-lists exactly that file plus the root aux files, so the
  requested quant is warmed without pulling every other quant the repo may publish.

Tests: the adapter format probe ignores a subdirectory adapter_model.safetensors and only
counts a root one; a gguf_file warm keeps the requested GGUF (and root aux) while excluding
other quants.
---
 tests/test_prefetch_snapshot_scope.py | 50 +++++++++++++++++++++++++++
 unsloth/models/_utils.py              | 23 ++++++++----
 unsloth/models/llama.py               |  3 ++
 unsloth/models/vision.py              |  3 ++
 4 files changed, 73 insertions(+), 6 deletions(-)

diff --git a/tests/test_prefetch_snapshot_scope.py b/tests/test_prefetch_snapshot_scope.py
index 327c87c0d1..86884f557d 100644
--- a/tests/test_prefetch_snapshot_scope.py
+++ b/tests/test_prefetch_snapshot_scope.py
@@ -356,6 +356,56 @@ def test_tokenizer_only_warms_slow_sentencepiece_vocab(capture):
         assert name in allow, name
 
 
+def test_adapter_safetensors_check_scoped_to_root(monkeypatch):
+    """_adapter_repo_has_safetensors must only count a ROOT adapter_model*.safetensors: a repo with
+    a root adapter_model.bin plus an unrelated checkpoint-5/adapter_model.safetensors must NOT drop
+    the root .bin (the adapter warm only pulls root adapter_model*) (Codex #6638)."""
+    import huggingface_hub
+
+    class _Sib:
+        def __init__(self, name):
+            self.rfilename = name
+
+    class _Api:
+        def __init__(self, names):
+            self._names = names
+
+        def model_info(self, *a, **k):
+            return type("MI", (), {"siblings": [_Sib(n) for n in self._names]})()
+
+    # Subdir safetensors only -> not at root -> must NOT report safetensors present.
+    monkeypatch.setattr(
+        huggingface_hub, "HfApi",
+        lambda: _Api(["adapter_config.json", "adapter_model.bin", "checkpoint-5/adapter_model.safetensors"]),
+    )
+    assert U._adapter_repo_has_safetensors("org/repo") is False
+    # Root safetensors -> reported present.
+    monkeypatch.setattr(
+        huggingface_hub, "HfApi",
+        lambda: _Api(["adapter_config.json", "adapter_model.safetensors"]),
+    )
+    assert U._adapter_repo_has_safetensors("org/repo") is True
+
+
+def test_gguf_file_warm_keeps_gguf(capture):
+    """A gguf_file load reads exactly that GGUF, so the warm must allow-list it (not drop *.gguf via
+    the static ignore list) while not pulling other quants the repo may publish (Codex #6638)."""
+    _, st = capture(weights_at_root = True, gguf_file = "model-Q4_K_M.gguf")
+    allow = st["allow_patterns"]
+    ig = st["ignore_patterns"]
+    assert allow is not None and "model-Q4_K_M.gguf" in allow
+    sample = [
+        "model-Q4_K_M.gguf",
+        "model-Q8_0.gguf",   # a different quant the load does not read
+        "config.json",
+        "tokenizer.json",
+    ]
+    kept = _filter(sample, allow, ig)
+    assert "model-Q4_K_M.gguf" in kept          # the requested GGUF is warmed
+    assert "config.json" in kept                # root aux warmed
+    assert "model-Q8_0.gguf" not in kept        # other quants are not pulled
+
+
 # ----- Finding Q: adapter weight-format selection -----
 
 
diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py
index 292ef6947f..6d0bb23c93 100644
--- a/unsloth/models/_utils.py
+++ b/unsloth/models/_utils.py
@@ -1078,16 +1078,20 @@ def _adapter_repo_has_safetensors(
     token = None,
     revision = None,
 ):
-    """Best-effort: does the adapter repo ship a safetensors adapter weight
+    """Best-effort: does the adapter repo ship a ROOT safetensors adapter weight
     (``adapter_model.safetensors`` or a sharded ``adapter_model*.safetensors``)?
     PeftModel.from_pretrained prefers safetensors, so when one is present the ``.bin`` form is
-    redundant and can be dropped from the warm. Any failure returns False so BOTH formats stay
-    eligible -- never under-warm a ``.bin``-only adapter into an in-process Xet fetch."""
+    redundant and can be dropped from the warm. Scoped to repo-ROOT files because the adapter warm
+    only pulls root ``adapter_model*``: an unrelated ``checkpoint-*/adapter_model.safetensors`` in a
+    subdirectory must NOT make us drop the root ``.bin`` the load actually reads. Any failure returns
+    False so BOTH formats stay eligible -- never under-warm a ``.bin``-only adapter into an
+    in-process Xet fetch."""
     try:
         from huggingface_hub import HfApi
         siblings = HfApi().model_info(model_name, revision = revision, token = token).siblings or []
         return any(
-            sibling.rfilename.replace("\\", "/").rsplit("/", 1)[-1].startswith("adapter_model")
+            "/" not in sibling.rfilename.replace("\\", "/")  # root files only
+            and sibling.rfilename.startswith("adapter_model")
             and sibling.rfilename.endswith(".safetensors")
             for sibling in siblings
         )
@@ -1210,6 +1214,7 @@ def maybe_prefetch_hf_snapshot(
     adapter_only = False,
     weights_at_root = False,
     variant = None,
+    gguf_file = None,
 ):
     """Warm the Hugging Face cache for a remote repo before the in-process load.
 
@@ -1274,7 +1279,7 @@ def maybe_prefetch_hf_snapshot(
     # guess would skip the .bin-drop even when no weights are cached and over-fetch both formats.
     ignore_patterns = (
         None
-        if tokenizer_only or adapter_only
+        if tokenizer_only or adapter_only or gguf_file
         else _prefetch_ignore_patterns(
             model_name,
             token = token,
@@ -1293,7 +1298,13 @@ def maybe_prefetch_hf_snapshot(
     # an unprotected in-process download. The root patterns are exact filenames or literal-prefixed
     # globs (e.g. modeling_*.py), so they stay anchored to repo-root files in practice.
     allow_patterns = None
-    if tokenizer_only:
+    if gguf_file:
+        # from_pretrained(model_name, gguf_file=NAME) reads exactly that GGUF from the repo
+        # (Transformers de-quantizes it on load). The static ignore list drops *.gguf, so without
+        # this the file would never be warmed and the load would fetch it in-process over Xet. Warm
+        # exactly that file (plus root aux) -- not every other quant the repo may also publish.
+        allow_patterns = [gguf_file, *_ROOT_AUX_PREFETCH_PATTERNS]
+    elif tokenizer_only:
         # A distinct tokenizer repo: warm only its tokenizer / config / vocab files. Restrict
         # to those exact root filenames so we never pull weights, even if that repo also
         # happens to ship them (the weights are not what the tokenizer load reads).
diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py
index 2fe8c397df..2ba68971c1 100644
--- a/unsloth/models/llama.py
+++ b/unsloth/models/llama.py
@@ -2447,6 +2447,9 @@ def from_pretrained(
             # format auto-pick keeps the variant .bin instead of dropping it for a default
             # safetensors the variant load never reads.
             variant = kwargs.get("variant"),
+            # A gguf_file load reads exactly that GGUF; forward it so the warm fetches it instead of
+            # dropping every *.gguf (the static ignore list otherwise excludes it).
+            gguf_file = kwargs.get("gguf_file"),
         )
         # The killable child already did the forced download; clear the flag so the
         # in-process load reuses that warm cache instead of re-forcing over Xet.
diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py
index 8835e3151e..2713bb1cf8 100644
--- a/unsloth/models/vision.py
+++ b/unsloth/models/vision.py
@@ -835,6 +835,9 @@ def from_pretrained(
             # format auto-pick keeps the variant .bin instead of dropping it for a default
             # safetensors the variant load never reads.
             variant = kwargs.get("variant"),
+            # A gguf_file load reads exactly that GGUF; forward it so the warm fetches it instead of
+            # dropping every *.gguf (the static ignore list otherwise excludes it).
+            gguf_file = kwargs.get("gguf_file"),
         )
         # The killable child already did the forced download; clear the flag so the
         # in-process load reuses that warm cache instead of re-forcing over Xet.

From 58b872f013c4dcce821752527edfab26794afd2e Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Sun, 28 Jun 2026 06:43:03 +0000
Subject: [PATCH 47/80] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 tests/test_prefetch_snapshot_scope.py | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/tests/test_prefetch_snapshot_scope.py b/tests/test_prefetch_snapshot_scope.py
index 86884f557d..96949d8a37 100644
--- a/tests/test_prefetch_snapshot_scope.py
+++ b/tests/test_prefetch_snapshot_scope.py
@@ -375,13 +375,17 @@ def model_info(self, *a, **k):
 
     # Subdir safetensors only -> not at root -> must NOT report safetensors present.
     monkeypatch.setattr(
-        huggingface_hub, "HfApi",
-        lambda: _Api(["adapter_config.json", "adapter_model.bin", "checkpoint-5/adapter_model.safetensors"]),
+        huggingface_hub,
+        "HfApi",
+        lambda: _Api(
+            ["adapter_config.json", "adapter_model.bin", "checkpoint-5/adapter_model.safetensors"]
+        ),
     )
     assert U._adapter_repo_has_safetensors("org/repo") is False
     # Root safetensors -> reported present.
     monkeypatch.setattr(
-        huggingface_hub, "HfApi",
+        huggingface_hub,
+        "HfApi",
         lambda: _Api(["adapter_config.json", "adapter_model.safetensors"]),
     )
     assert U._adapter_repo_has_safetensors("org/repo") is True
@@ -396,14 +400,14 @@ def test_gguf_file_warm_keeps_gguf(capture):
     assert allow is not None and "model-Q4_K_M.gguf" in allow
     sample = [
         "model-Q4_K_M.gguf",
-        "model-Q8_0.gguf",   # a different quant the load does not read
+        "model-Q8_0.gguf",  # a different quant the load does not read
         "config.json",
         "tokenizer.json",
     ]
     kept = _filter(sample, allow, ig)
-    assert "model-Q4_K_M.gguf" in kept          # the requested GGUF is warmed
-    assert "config.json" in kept                # root aux warmed
-    assert "model-Q8_0.gguf" not in kept        # other quants are not pulled
+    assert "model-Q4_K_M.gguf" in kept  # the requested GGUF is warmed
+    assert "config.json" in kept  # root aux warmed
+    assert "model-Q8_0.gguf" not in kept  # other quants are not pulled
 
 
 # ----- Finding Q: adapter weight-format selection -----

From 290944fcfd9298e65a4204911a53e440bf3ff3da Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Sun, 28 Jun 2026 07:11:23 +0000
Subject: [PATCH 48/80] Warm a subfolder GGUF at its real path and ignore
 nested TF/Flax weights for root loads

Address the latest Codex review round on the from_pretrained prefetch wiring:

- Warm a subfolder GGUF at its resolved path. When gguf_file is combined with subfolder,
  the load reads <subfolder>/<gguf_file>, but the GGUF warm allow-listed only the bare root
  name -- so the snapshot read as warm while the real GGUF was fetched in-process over Xet.
  The warm now allow-lists <subfolder>/<gguf_file> when a subfolder is set.
- Ignore nested TF / Flax weights for a root load. The root-only subdirectory ignore
  dropped only nested safetensors / bin, but a from_tf / from_flax root load keeps .h5 /
  .msgpack at the root (the format filter does not drop them), so warming a repo with
  alternate .h5 / .msgpack checkpoints under subdirectories still pulled those unread
  multi-GB files. The subdir ignore now covers every weight format (.h5, .msgpack, .pt,
  .pth too), while a root weight of any format (no "/") is still kept.

Tests: a gguf_file + subfolder warm allow-lists the subfolder path; a from_tf root load
warms the root .h5 but ignores a nested checkpoint-*/model.h5.
---
 tests/test_prefetch_snapshot_scope.py | 22 ++++++++++++++++++++++
 unsloth/models/_utils.py              | 21 +++++++++++++++++----
 2 files changed, 39 insertions(+), 4 deletions(-)

diff --git a/tests/test_prefetch_snapshot_scope.py b/tests/test_prefetch_snapshot_scope.py
index 96949d8a37..9e2ba51c07 100644
--- a/tests/test_prefetch_snapshot_scope.py
+++ b/tests/test_prefetch_snapshot_scope.py
@@ -454,3 +454,25 @@ def test_adapter_only_explicit_use_safetensors_false_keeps_bin(capture):
     )
     assert "adapter_model.bin" in kept
     assert "adapter_model.safetensors" not in kept
+
+
+def test_gguf_file_with_subfolder_warms_subfolder_path(capture):
+    """gguf_file + subfolder: the load resolves <subfolder>/<gguf_file>, so the warm must allow-list
+    that subfolder path, not the bare root name (Codex #6638)."""
+    _, st = capture(weights_at_root = True, gguf_file = "model-Q4_K_M.gguf", subfolder = "gguf")
+    allow = st["allow_patterns"]
+    assert "gguf/model-Q4_K_M.gguf" in allow
+    kept = _filter(["gguf/model-Q4_K_M.gguf", "config.json"], allow, st["ignore_patterns"])
+    assert "gguf/model-Q4_K_M.gguf" in kept and "config.json" in kept
+
+
+def test_from_tf_root_load_ignores_nested_h5(capture):
+    """A from_tf root load reads the ROOT .h5; nested .h5 / .msgpack checkpoints under subdirs are
+    unread, so the root-only subdir ignore must drop them (it covers every weight format, not only
+    safetensors / bin) (Codex #6638)."""
+    _, st = capture(weights_at_root = True, from_tf = True)
+    ig = st["ignore_patterns"]
+    assert "*/*.h5" in ig and "*/*.msgpack" in ig
+    kept = _filter(["model.h5", "checkpoint-1/model.h5", "config.json"], st["allow_patterns"], ig)
+    assert "model.h5" in kept                    # root TF weight warmed
+    assert "checkpoint-1/model.h5" not in kept   # nested TF checkpoint ignored
diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py
index 6d0bb23c93..13fbf6213b 100644
--- a/unsloth/models/_utils.py
+++ b/unsloth/models/_utils.py
@@ -998,11 +998,18 @@ def _run_temporary_patches(phase):
 # unread; ignoring them keeps a repo's alternate-precision / experimental weight dirs (fp16/,
 # experimental/) from being pulled by the otherwise-unfiltered warm. Hugging Face's fnmatch "*"
 # spans "/", so "*/*.safetensors" matches any nested .safetensors while a root "model.safetensors"
-# (no "/") is kept. Only applied when the caller asserts a root-only load (weights_at_root), never
-# to a diffusion pipeline warm whose component weights DO live in subfolders.
+# (no "/") is kept. Every weight format is covered -- including .h5 / .msgpack, which a
+# from_tf / from_flax root load keeps at the root (the format filter does NOT drop them), so without
+# the nested forms a TF / Flax root load would still pull alternate .h5 / .msgpack checkpoints under
+# subdirectories. Only applied when the caller asserts a root-only load (weights_at_root), never to a
+# diffusion pipeline warm whose component weights DO live in subfolders.
 _SUBDIR_WEIGHT_IGNORE_PATTERNS = (
     "*/*.safetensors",
     "*/*.bin",
+    "*/*.h5",
+    "*/*.msgpack",
+    "*/*.pt",
+    "*/*.pth",
 )
 
 
@@ -1302,8 +1309,14 @@ def maybe_prefetch_hf_snapshot(
         # from_pretrained(model_name, gguf_file=NAME) reads exactly that GGUF from the repo
         # (Transformers de-quantizes it on load). The static ignore list drops *.gguf, so without
         # this the file would never be warmed and the load would fetch it in-process over Xet. Warm
-        # exactly that file (plus root aux) -- not every other quant the repo may also publish.
-        allow_patterns = [gguf_file, *_ROOT_AUX_PREFETCH_PATTERNS]
+        # exactly that file (plus root aux) -- not every other quant the repo may also publish. When
+        # a subfolder is set, the load resolves <subfolder>/<gguf_file>, so warm THAT path.
+        _gguf_path = (
+            f"{subfolder.strip('/')}/{gguf_file}"
+            if isinstance(subfolder, str) and subfolder.strip("/")
+            else gguf_file
+        )
+        allow_patterns = [_gguf_path, *_ROOT_AUX_PREFETCH_PATTERNS]
     elif tokenizer_only:
         # A distinct tokenizer repo: warm only its tokenizer / config / vocab files. Restrict
         # to those exact root filenames so we never pull weights, even if that repo also

From affcc0a837efb946968666032d39e8cdf9cc1a0a Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Sun, 28 Jun 2026 07:12:03 +0000
Subject: [PATCH 49/80] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 tests/test_prefetch_snapshot_scope.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test_prefetch_snapshot_scope.py b/tests/test_prefetch_snapshot_scope.py
index 9e2ba51c07..7c4dfbfcc8 100644
--- a/tests/test_prefetch_snapshot_scope.py
+++ b/tests/test_prefetch_snapshot_scope.py
@@ -474,5 +474,5 @@ def test_from_tf_root_load_ignores_nested_h5(capture):
     ig = st["ignore_patterns"]
     assert "*/*.h5" in ig and "*/*.msgpack" in ig
     kept = _filter(["model.h5", "checkpoint-1/model.h5", "config.json"], st["allow_patterns"], ig)
-    assert "model.h5" in kept                    # root TF weight warmed
-    assert "checkpoint-1/model.h5" not in kept   # nested TF checkpoint ignored
+    assert "model.h5" in kept  # root TF weight warmed
+    assert "checkpoint-1/model.h5" not in kept  # nested TF checkpoint ignored

From 8e315aa117329515b88123a29510e2009a78135b Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Mon, 29 Jun 2026 04:51:11 +0000
Subject: [PATCH 50/80] Trim prefetch / Studio shim comments to a succinct core

Comment-only pass over the snapshot-prewarm wiring and the Studio shim. No code,
control flow, or test behavior changes (AST-verified: only comments / docstrings /
whitespace differ).

- Collapse the long blocks in _utils.py (the ignore / allow pattern tables and
  maybe_prefetch_hf_snapshot) to short explanations, keeping the load-bearing why
  (weight-format selection, root vs subfolder scope, variant / gguf / adapter
  handling) and dropping restatements of obvious code.
- Tighten the per-call comments in llama.py, loader.py, vision.py, diffusion.py,
  and tokenizer_utils.py, and the Studio shim's degraded-path notes.
- Tighten the test setup comments.

Tests: prefetch-scope 25 passed; Studio shim 6 passed / 2 skipped; ruff clean.
---
 studio/backend/tests/test_hf_xet_fallback.py |  34 +--
 studio/backend/utils/hf_xet_fallback.py      |  62 ++---
 unsloth/models/_utils.py                     | 273 +++++++------------
 unsloth/models/diffusion.py                  |  27 +-
 unsloth/models/llama.py                      |  45 +--
 unsloth/models/loader.py                     |  62 ++---
 unsloth/models/vision.py                     |  44 +--
 unsloth/tokenizer_utils.py                   |   8 +-
 8 files changed, 196 insertions(+), 359 deletions(-)

diff --git a/studio/backend/tests/test_hf_xet_fallback.py b/studio/backend/tests/test_hf_xet_fallback.py
index 5c42134587..00d84e7c73 100644
--- a/studio/backend/tests/test_hf_xet_fallback.py
+++ b/studio/backend/tests/test_hf_xet_fallback.py
@@ -23,9 +23,8 @@
 if _BACKEND_DIR not in sys.path:
     sys.path.insert(0, _BACKEND_DIR)
 
-# Stub heavy/unavailable deps before importing the module under test. Use the
-# real structlog when present; a bare stub left in sys.modules would break later
-# modules that log at import time.
+# Stub heavy/unavailable deps before importing the module under test. Use the real structlog when
+# present; a bare stub would break later modules that log at import time.
 _loggers_stub = _types.ModuleType("loggers")
 _loggers_stub.get_logger = lambda name: __import__("logging").getLogger(name)
 sys.modules.setdefault("loggers", _loggers_stub)
@@ -40,10 +39,8 @@
     import unsloth_zoo.hf_xet_fallback as _shared_mod
     shared = _shared_mod
 except Exception:  # noqa: BLE001
-    # The degraded-path test must still collect when unsloth_zoo lacks the helper,
-    # is not installed at all, fails to import because torch is absent (ImportError),
-    # or fails because the host has no GPU (NotImplementedError from its device
-    # init) -- the same failure cases the shim itself degrades for.
+    # Still collect the degraded-path test when unsloth_zoo is missing / too old / torch-less /
+    # GPU-less -- the same failure cases the shim itself degrades for.
     shared = None
 
 import utils.hf_xet_fallback as xf
@@ -276,9 +273,7 @@ def find_spec(
             target = None,
         ):
             if name == "unsloth_zoo.hf_xet_fallback":
-                # Mirror unsloth_zoo/__init__ raising on a torch-less install: a
-                # plain ImportError with no .name, surfaced while importing the
-                # submodule's parent package.
+                # Mirror unsloth_zoo/__init__ on a torch-less install: a plain ImportError with no .name.
                 raise ImportError("Unsloth: Pytorch is not installed.")
             return None
 
@@ -305,11 +300,9 @@ def find_spec(
 
 
 def test_retries_under_light_gpu_init_when_import_fails(monkeypatch):
-    """unsloth_zoo's package __init__ runs torch/GPU device detection that raises
-    NotImplementedError on a GPU-less host (CPU GGUF Studio). The shim must retry
-    the import under UNSLOTH_ZOO_DISABLE_GPU_INIT=1 (its light path) -- which is
-    what lets the real helper load on CPU-only hosts -- then restore the env. If
-    even the retry fails, it degrades instead of crashing the server."""
+    """unsloth_zoo's __init__ runs GPU detection that raises NotImplementedError on a GPU-less host
+    (CPU GGUF Studio). The shim must retry the import under UNSLOTH_ZOO_DISABLE_GPU_INIT=1 (its light
+    path), then restore the env; if even the retry fails, it degrades instead of crashing."""
     import importlib
     import os
 
@@ -323,14 +316,11 @@ def find_spec(
             path = None,
             target = None,
         ):
-            # The real crash is in unsloth_zoo's package __init__ (device
-            # detection), i.e. the PARENT import fails before the submodule is
-            # reached -- so intercept "unsloth_zoo" itself. This works whether or
-            # not unsloth_zoo is installed (the finder is first on meta_path).
+            # The crash is in unsloth_zoo's __init__ (the PARENT import fails before the submodule), so
+            # intercept "unsloth_zoo" itself (works installed or not; the finder is first on meta_path).
             if name == "unsloth_zoo":
-                # Record the env each import attempt sees; raise the no-GPU error
-                # both times so the shim ends up degrading (the recovery-succeeds
-                # path is covered by real unsloth_zoo on a CPU host in CI).
+                # Record the env each attempt sees; raise the no-GPU error both times so the shim
+                # degrades (the recovery-succeeds path is covered by real unsloth_zoo on a CPU host).
                 seen_env.append(os.environ.get("UNSLOTH_ZOO_DISABLE_GPU_INIT"))
                 raise NotImplementedError("Unsloth cannot find any torch accelerator")
             return None
diff --git a/studio/backend/utils/hf_xet_fallback.py b/studio/backend/utils/hf_xet_fallback.py
index d9fbcea87d..06b033c885 100644
--- a/studio/backend/utils/hf_xet_fallback.py
+++ b/studio/backend/utils/hf_xet_fallback.py
@@ -3,12 +3,10 @@
 
 """Studio shim over the shared Xet -> HTTP stall fallback.
 
-The no-progress watchdog, the spawn-child download, and the single Xet -> HTTP
-retry now live once in ``unsloth_zoo.hf_xet_fallback`` (so Unsloth main and Studio
-share one implementation). This module re-exports that API and injects Studio's
-marker-aware cache purge (``prepare_cache_for_transport``) so the hub download
-manager keeps its ``.transport`` marker semantics on the HTTP retry. Call sites
-(core/inference/llama_cpp.py, core/training/worker.py) and the orchestrator's
+The watchdog, spawn-child download, and Xet -> HTTP retry live once in
+``unsloth_zoo.hf_xet_fallback`` (shared by Unsloth main and Studio). This module re-exports that API
+and injects Studio's marker-aware cache purge (``prepare_cache_for_transport``) so the download
+manager keeps its ``.transport`` marker semantics on the HTTP retry. Call sites and the orchestrator's
 ``DownloadStallError`` import are unchanged.
 """
 
@@ -22,13 +20,11 @@
     import unsloth_zoo.hf_xet_fallback as _shared
     _shared_available = True
 except Exception as _exc:  # noqa: BLE001 - any import failure must degrade, not crash
-    # unsloth_zoo runs torch/GPU device detection in its package __init__, which
-    # raises on a Studio host without torch (ImportError) or without a GPU
-    # (NotImplementedError) -- a CPU / llama.cpp GGUF-only deployment. The download
-    # helper needs none of that, so retry with unsloth_zoo's documented light import
-    # path (UNSLOTH_ZOO_DISABLE_GPU_INIT) before giving up. The first attempt above
-    # keeps full device init unchanged on a normal GPU host; a failed import is
-    # dropped from sys.modules, so the retry re-runs __init__ on the light path.
+    # unsloth_zoo's package __init__ runs torch/GPU detection, which raises on a Studio host without
+    # torch / without a GPU (CPU / llama.cpp GGUF-only). The download helper needs none of that, so
+    # retry on the light import path (UNSLOTH_ZOO_DISABLE_GPU_INIT) before giving up. The full GPU
+    # path above is unchanged on a normal host; a failed import is dropped from sys.modules so __init__
+    # re-runs here.
     _shared_import_error = _exc
     import os as _os
 
@@ -39,8 +35,7 @@
         _shared_available = True
         _shared_import_error = None
     except Exception as _exc2:  # noqa: BLE001
-        # unsloth_zoo is absent/too old, or genuinely broken: degrade so Studio
-        # still boots with plain HF downloads instead of crashing on import.
+        # unsloth_zoo absent / too old / broken: degrade so Studio still boots with plain HF downloads.
         _shared_import_error = _exc2
         _shared_available = False
     finally:
@@ -50,9 +45,8 @@
             _os.environ["UNSLOTH_ZOO_DISABLE_GPU_INIT"] = _prev_gpu_init
 
 if _shared_available:
-    # Bind the shared API by assignment (not `from ... import`) so each public name
-    # has a single, unambiguous module-level binding shared between this branch and
-    # the degraded one below.
+    # Bind by assignment (not `from ... import`) so each public name has one module-level binding
+    # shared between this branch and the degraded one below.
     DEFAULT_GRACE_PERIOD = _shared.DEFAULT_GRACE_PERIOD
     DEFAULT_HEARTBEAT_INTERVAL = _shared.DEFAULT_HEARTBEAT_INTERVAL
     DEFAULT_STALL_TIMEOUT = _shared.DEFAULT_STALL_TIMEOUT
@@ -63,11 +57,9 @@
     _shared_hf_hub_download_with_xet_fallback = _shared.hf_hub_download_with_xet_fallback
     _shared_snapshot_download_with_xet_fallback = _shared.snapshot_download_with_xet_fallback
 else:
-    # Rather than crash Studio at startup, degrade gracefully: plain HF downloads
-    # with the no-progress stall watchdog disabled -- the same best-effort posture
-    # core Unsloth uses in from_pretrained. Automatic Xet -> HTTP recovery returns
-    # as soon as unsloth_zoo is upgraded. These are thin stubs, not a second copy
-    # of the orchestration.
+    # Degrade gracefully instead of crashing Studio: plain HF downloads with the stall watchdog
+    # disabled (the same best-effort posture core Unsloth uses). Recovery returns once unsloth_zoo is
+    # upgraded. Thin stubs, not a second copy of the orchestration.
     import logging as _logging
 
     _logging.getLogger(__name__).warning(
@@ -82,9 +74,8 @@
     DEFAULT_GRACE_PERIOD = 10.0
 
     class DownloadStallError(RuntimeError):
-        """Stub mirror of the shared type so callers and ``except`` clauses still
-        resolve when the shared helper is unavailable (it is simply never raised
-        in degraded mode, since there is no watchdog to detect a stall)."""
+        """Stub mirror of the shared type so callers' ``except`` clauses still resolve; never raised in
+        degraded mode (no watchdog to detect a stall)."""
 
     def child_should_disable_xet(config: dict) -> bool:
         return bool(config.get("disable_xet"))
@@ -99,9 +90,8 @@ def start_watchdog(
         xet_disabled: bool = False,
         **kwargs: Any,
     ) -> "threading.Event":
-        # No stall detection without the shared helper, but keep emitting heartbeat
-        # statuses so the orchestrator's inactivity deadline is not tripped during a
-        # legitimately long load/download in this degraded mode.
+        # No stall detection here, but keep emitting heartbeats so the orchestrator's inactivity
+        # deadline is not tripped during a legitimately long download.
         stop = threading.Event()
         if on_heartbeat is None:
             return stop
@@ -136,8 +126,8 @@ def _shared_hf_hub_download_with_xet_fallback(
         cancel_event: "Optional[threading.Event]" = None,
         **_ignored: Any,
     ) -> str:
-        # No subprocess to interrupt mid-call here, but keep the cancellation
-        # contract: do not start, and do not return, a download once cancelled.
+        # No subprocess to interrupt here, but keep the cancellation contract: do not start or return
+        # a download once cancelled.
         if _degraded_cancelled(cancel_event):
             raise RuntimeError("Cancelled")
 
@@ -203,13 +193,9 @@ def _shared_snapshot_download_with_xet_fallback(
 
 
 def _studio_prepare_for_http(repo_type: str, repo_id: str) -> None:
-    """Make the partial safe for an HTTP resume using Studio's marker-aware purge,
-    so the download manager's ``.transport`` marker accounting stays consistent
-    (vs the generic delete-incompletes default in unsloth_zoo).
-
-    The shared orchestrator already wraps this hook, but guard it here too so a
-    purge failure (locked file, missing dir) is logged rather than aborting the
-    HTTP retry that is the whole point of the fallback."""
+    """Make the partial safe for an HTTP resume using Studio's marker-aware purge, so the download
+    manager's ``.transport`` accounting stays consistent (vs unsloth_zoo's generic default). Guarded so
+    a purge failure (locked file, missing dir) is logged rather than aborting the HTTP retry."""
     try:
         from hub.utils.download_registry import prepare_cache_for_transport
         prepare_cache_for_transport(repo_type, repo_id, "http")
diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py
index 13fbf6213b..bd310b439b 100644
--- a/unsloth/models/_utils.py
+++ b/unsloth/models/_utils.py
@@ -906,9 +906,9 @@ def _run_temporary_patches(phase):
 TORCHAO_MSG = "Error: torchao not found, please install with `pip install torchao`"
 
 
-# Artifacts a Transformers/PEFT load never consumes -- skip them when prewarming
-# so a mixed-format repo does not pull ONNX/TF/Flax/CoreML/GGUF/training state.
-# Conservative ignore list (not an allowlist) so no file a load needs is dropped.
+# Artifacts a Transformers/PEFT load never consumes (ONNX/TF/Flax/CoreML/GGUF/training state) -- skip
+# them when prewarming so a mixed-format repo is not pulled in full. An ignore list, not an allowlist,
+# so no file a load needs is dropped.
 _PREFETCH_IGNORE_PATTERNS = (
     "*.onnx",
     "onnx/*",
@@ -919,9 +919,7 @@ def _run_temporary_patches(phase):
     "*.mlpackage/*",
     "*.mlmodel",
     "*.gguf",
-    # Original / training checkpoint formats that from_pretrained does not read (it loads
-    # safetensors / .bin, or h5 / msgpack under from_tf / from_flax). Skip them so the warm
-    # does not pull multi-GB of unused weights for a repo that ships them alongside HF weights.
+    # Training / original-checkpoint formats from_pretrained does not read.
     "*.pt",
     "*.pth",
     "*.ckpt",
@@ -934,9 +932,8 @@ def _run_temporary_patches(phase):
 )
 
 
-# Repo-root tokenizer / config / processor files that a from_pretrained reads from the
-# root even when the weights load from a subfolder. Exact filenames (no wildcard) so they
-# match only root-level files, never same-named files inside another subfolder.
+# Repo-root tokenizer / config / processor files a from_pretrained reads from the root even when the
+# weights load from a subfolder. Exact filenames (no wildcard) so they match only root-level files.
 _ROOT_AUX_PREFETCH_PATTERNS = (
     "config.json",
     "generation_config.json",
@@ -949,12 +946,10 @@ def _run_temporary_patches(phase):
     "vocab.txt",
     "merges.txt",
     "spiece.model",
-    # Additional SentencePiece / vocab files that are real from_pretrained load targets
-    # (VOCAB_FILES_NAMES entries) but are NOT covered by the names above: spm.model
-    # (DeBERTa-v2), normalizer.json (Whisper), tokenizer.model.v3 (Mistral versioned SP),
-    # sentencepiece.bpe.model (XLM-R / mBART / CamemBERT -- Unsloth tries the slow tokenizer
-    # first, which fetches these), source.spm / target.spm (Marian), bpe.codes (FSMT / XLM),
-    # vocab.bpe (GPT-2-style merges asset).
+    # More SentencePiece / vocab load targets (VOCAB_FILES_NAMES) not covered above: spm.model
+    # (DeBERTa-v2), normalizer.json (Whisper), tokenizer.model.v3 (Mistral), sentencepiece.bpe.model
+    # (XLM-R / mBART -- the slow tokenizer Unsloth tries first fetches these), source.spm / target.spm
+    # (Marian), bpe.codes (FSMT / XLM), vocab.bpe (GPT-2 merges).
     "spm.model",
     "normalizer.json",
     "tokenizer.model.v3",
@@ -969,40 +964,28 @@ def _run_temporary_patches(phase):
     "additional_chat_templates/*.jinja",
     "preprocessor_config.json",
     "processor_config.json",
-    # Video processors (e.g. Qwen2.5-VL video) read a dedicated video_preprocessor_config.json.
-    "video_preprocessor_config.json",
-    # Custom-code entry points a trust_remote_code config / model / tokenizer / processor load
-    # fetches via auto_map. The module name is arbitrary: the transformers convention is
-    # modeling_*.py / tokenization_*.py, but an auto_map can point at any module (modeling.py,
-    # tokenization.py, my_code.py), so prefix globs would miss it and leave the .py to an
-    # in-process Xet fetch. Warm every *.py instead -- they are tiny, and a non-remote-code repo
-    # ships none (harmless). HF fnmatch "*" spans "/", so nested helper modules are covered too.
+    "video_preprocessor_config.json",  # Qwen2.5-VL-style video processors
+    # trust_remote_code entry points via auto_map can point at any module name, so prefix globs would
+    # miss them. Warm every *.py (tiny; a non-remote-code repo ships none). HF fnmatch "*" spans "/".
     "*.py",
-    # tiktoken vocab assets (e.g. Qwen's qwen.tiktoken) a custom tokenizer can require.
-    "*.tiktoken",
+    "*.tiktoken",  # tiktoken vocab assets (e.g. Qwen's qwen.tiktoken)
 )
 
 
-# Exactly the files a PEFT adapter load (PeftModel.from_pretrained) reads: its config and its
-# weight files (adapter_model.safetensors / .bin, sharded or not). "adapter_model*" is a glob so
-# a sharded adapter is still covered; the merged / full-model weights an adapter repo may ALSO
-# publish (model*.safetensors, pytorch_model*.bin) match none of these and so are not pulled.
+# The files a PEFT adapter load reads: its config + weights ("adapter_model*" is a glob so a sharded
+# adapter is covered). Merged / full-model weights an adapter repo may also publish match none of these.
 _ADAPTER_PREFETCH_PATTERNS = (
     "adapter_config.json",
     "adapter_model*",
 )
 
 
-# Weight files that live in a SUBDIRECTORY, not the repo root. A bare from_pretrained(model_name)
-# (no subfolder) resolves only root weight files, so for such a root load these subdir weights are
-# unread; ignoring them keeps a repo's alternate-precision / experimental weight dirs (fp16/,
-# experimental/) from being pulled by the otherwise-unfiltered warm. Hugging Face's fnmatch "*"
-# spans "/", so "*/*.safetensors" matches any nested .safetensors while a root "model.safetensors"
-# (no "/") is kept. Every weight format is covered -- including .h5 / .msgpack, which a
-# from_tf / from_flax root load keeps at the root (the format filter does NOT drop them), so without
-# the nested forms a TF / Flax root load would still pull alternate .h5 / .msgpack checkpoints under
-# subdirectories. Only applied when the caller asserts a root-only load (weights_at_root), never to a
-# diffusion pipeline warm whose component weights DO live in subfolders.
+# Weight files in a SUBDIRECTORY. A bare root load reads only root weights, so ignoring these keeps a
+# repo's alternate-precision / experimental weight dirs (fp16/, experimental/) out of the otherwise
+# unfiltered warm. HF fnmatch "*" spans "/", so "*/*.safetensors" matches any nested weight while root
+# "model.safetensors" is kept. Every format is covered, incl. .h5 / .msgpack (a from_tf / from_flax
+# root load keeps root-level ones, so nested copies are ignored here). Only applied when
+# weights_at_root, never to a diffusion warm (its component weights live in subfolders).
 _SUBDIR_WEIGHT_IGNORE_PATTERNS = (
     "*/*.safetensors",
     "*/*.bin",
@@ -1014,24 +997,19 @@ def _run_temporary_patches(phase):
 
 
 def _in_requested_load_scope(filename, subfolder):
-    """True if a repo-relative *filename* belongs to the location being loaded.
-
-    The load reads from *subfolder* when one is given, otherwise the repo root.
-    Used so the ".bin is redundant when safetensors exist" decision is keyed off
-    the files that load actually uses, not any safetensors elsewhere in the repo
-    (a subfolder that ships only .bin must keep its .bin even when an unrelated
-    subfolder ships safetensors).
+    """True if a repo-relative *filename* belongs to the location being loaded (*subfolder*, else the
+    repo root). Keys the ".bin is redundant when safetensors exist" decision off the files the load
+    actually uses, so a .bin-only subfolder keeps its .bin even when another subfolder ships safetensors.
     """
     filename = filename.replace("\\", "/")
     if isinstance(subfolder, str) and subfolder.strip("/"):
         return filename.startswith(subfolder.strip("/") + "/")
-    # Root load: only files at the repo root (no directory component).
-    return "/" not in filename
+    return "/" not in filename  # root load: no directory component
 
 
-# Training-state files that carry a .safetensors suffix but are NOT loadable model weights. A
-# Trainer checkpoint can ship optimizer.safetensors / scheduler.safetensors next to the real
-# pytorch_model.bin, and counting one as "model safetensors present" would drop the needed .bin.
+# Training-state files with a .safetensors suffix that are NOT loadable model weights. A Trainer
+# checkpoint can ship optimizer.safetensors next to a real pytorch_model.bin, and counting it as
+# "model safetensors present" would drop the needed .bin.
 _NON_MODEL_WEIGHT_STEMS = frozenset(
     {
         "optimizer",
@@ -1044,23 +1022,18 @@ def _in_requested_load_scope(filename, subfolder):
 
 
 def _is_model_weight_safetensors(filename):
-    """True if a repo-relative *filename* is a model-weights safetensors file rather than
-    a PEFT adapter / sidecar (e.g. ``adapter_model.safetensors``) or a trainer-state file
-    (``optimizer.safetensors``).
-
-    Only a real model-weights safetensors proves the ``.bin`` full-model weights are
-    redundant. A repo can ship an ``adapter_model.safetensors`` sidecar -- or an
-    ``optimizer.safetensors`` training-state file -- while its actual weights are
-    ``pytorch_model.bin``; counting those would wrongly skip the needed ``.bin`` and leave the
-    in-process load to fetch it without the Xet fallback.
+    """True if a repo-relative *filename* is a model-weights safetensors, not a PEFT adapter / sidecar
+    (``adapter_model.safetensors``) or a trainer-state file (``optimizer.safetensors``). Only a real
+    model-weights safetensors proves the ``.bin`` full-model weights are redundant; counting a sidecar
+    would wrongly skip the needed ``.bin`` and leave the load to fetch it without the Xet fallback.
     """
     name = filename.replace("\\", "/").rsplit("/", 1)[-1]
     if not name.endswith((".safetensors", ".safetensors.index.json")):
         return False
     if name.startswith("adapter_"):
         return False
-    # Stem before the first dot: "optimizer.safetensors" -> "optimizer", "rng_state_0..." -> caught
-    # by the prefix test; a real weight ("model", "model-00001-of-00002", "consolidated") is kept.
+    # Stem before the first dot: "optimizer.safetensors" -> "optimizer" (a real "model-00001-of-00002"
+    # is kept); "rng_state_0..." is caught by the prefix test below.
     stem = name.split(".", 1)[0].lower()
     if stem in _NON_MODEL_WEIGHT_STEMS or stem.startswith("rng_state"):
         return False
@@ -1069,12 +1042,10 @@ def _is_model_weight_safetensors(filename):
 
 def _filename_has_variant(filename, variant):
     """True if a weight *filename* belongs to the requested *variant* (variant="fp16"). Transformers
-    inserts the variant token right before the extension, so a single-file weight carries it as a
-    ``.{variant}.`` infix (``model.fp16.safetensors``), while a SHARDED weight carries it as a
-    ``.{variant}-`` infix before the ``-NNNNN-of-NNNNN`` suffix
-    (``model.fp16-00001-of-00002.safetensors``). Both shapes are matched so a sharded variant
-    safetensors is recognized -- else its redundant ``.bin`` would not be dropped. Only meaningful
-    when a variant is requested; callers gate on *variant* being truthy."""
+    inserts the variant token before the extension, so a single-file weight carries it as a
+    ``.{variant}.`` infix (``model.fp16.safetensors``) and a sharded weight as a ``.{variant}-`` infix
+    (``model.fp16-00001-of-00002.safetensors``). Match both so a sharded variant is recognized and its
+    redundant ``.bin`` dropped. Callers gate on *variant* being truthy."""
     base = filename.replace("\\", "/").rsplit("/", 1)[-1]
     return f".{variant}." in base or f".{variant}-" in base
 
@@ -1085,14 +1056,10 @@ def _adapter_repo_has_safetensors(
     token = None,
     revision = None,
 ):
-    """Best-effort: does the adapter repo ship a ROOT safetensors adapter weight
-    (``adapter_model.safetensors`` or a sharded ``adapter_model*.safetensors``)?
-    PeftModel.from_pretrained prefers safetensors, so when one is present the ``.bin`` form is
-    redundant and can be dropped from the warm. Scoped to repo-ROOT files because the adapter warm
-    only pulls root ``adapter_model*``: an unrelated ``checkpoint-*/adapter_model.safetensors`` in a
-    subdirectory must NOT make us drop the root ``.bin`` the load actually reads. Any failure returns
-    False so BOTH formats stay eligible -- never under-warm a ``.bin``-only adapter into an
-    in-process Xet fetch."""
+    """Best-effort: does the adapter repo ship a ROOT safetensors adapter weight? PeftModel prefers
+    safetensors, so when one is present the ``.bin`` form is redundant. Scoped to repo-ROOT files (the
+    adapter warm only pulls root ``adapter_model*``), so an unrelated ``checkpoint-*/adapter_model.safetensors``
+    does not make us drop the root ``.bin``. Any failure returns False so both formats stay eligible."""
     try:
         from huggingface_hub import HfApi
         siblings = HfApi().model_info(model_name, revision = revision, token = token).siblings or []
@@ -1117,19 +1084,12 @@ def _prefetch_ignore_patterns(
     from_flax = False,
     variant = None,
 ):
-    """ignore_patterns for the prewarm snapshot: the static skip list, minus the
-    checkpoint guard when loading from a checkpoint-* subfolder, minus the weight
-    format the load will not read. An explicit use_safetensors is treated as a format
-    allowlist (True -> skip *.bin, False -> skip *.safetensors) so a bin-only repo is
-    not pulled in full just to be rejected by a safetensors-only load. use_safetensors
-    is None (auto) skips *.bin only when in-scope safetensors are also shipped, since
-    Transformers prefers them (pulling the other format just to discard it doubles the
-    very download we optimize). from_tf / from_flax keep the TF (*.h5) / Flax
-    (*.msgpack) weights, which Transformers reads as the actual weights under those load
-    modes; dropping them unconditionally would leave the only needed weight file unwarmed."""
-    # A checkpoint-* subfolder is exactly what "checkpoint-*/*" would drop, so
-    # do not ignore it when the caller is explicitly loading from that subfolder.
-    # from_tf / from_flax loads read *.h5 / *.msgpack as the weights, so keep them.
+    """ignore_patterns for the prewarm snapshot: the static skip list, minus the checkpoint guard when
+    loading from a checkpoint-* subfolder, minus the weight format the load will not read. Explicit
+    use_safetensors acts as a format allowlist (True -> skip *.bin, False -> skip *.safetensors); auto
+    (None) skips *.bin only when in-scope safetensors are also shipped (Transformers prefers them).
+    from_tf / from_flax keep the *.h5 / *.msgpack weights they read as the actual weights."""
+    # Keep checkpoint-*/* when loading from such a subfolder; keep *.h5 / *.msgpack under from_tf/flax.
     ignore_patterns = [
         pattern
         for pattern in _PREFETCH_IGNORE_PATTERNS
@@ -1143,13 +1103,10 @@ def _prefetch_ignore_patterns(
             or (from_flax and pattern == "*.msgpack")
         )
     ]
-    # Drop the weight format the load will not read. Transformers reads exactly one
-    # format, so prefetching the other doubles the very download we optimize.
+    # Drop the format the load will not read (Transformers reads exactly one, so prefetching the
+    # other doubles the download we optimize).
     if from_tf or from_flax:
-        # TF (*.h5) / Flax (*.msgpack) loads read those as the weights and never touch the
-        # PyTorch formats. The static list above already keeps the requested h5 / msgpack;
-        # drop safetensors and .bin outright so the auto branch does not warm multi-GB of
-        # PyTorch weights the load will discard (and risk a disk-full on a mixed-format repo).
+        # TF / Flax loads never touch the PyTorch formats; drop safetensors and .bin outright.
         ignore_patterns.extend(
             (
                 "*.safetensors",
@@ -1159,17 +1116,14 @@ def _prefetch_ignore_patterns(
             )
         )
     elif use_safetensors is True:
-        # Explicit safetensors: the load never reads .bin, so skip it outright -- even
-        # for a bin-only repo, where the load fails anyway, do not pull multi-GB of
-        # PyTorch weights first. No model_info call needed for an explicit request.
+        # Explicit safetensors: the load never reads .bin (no model_info call needed).
         ignore_patterns.extend(("*.bin", "*.bin.index.json"))
     elif use_safetensors is False:
-        # Explicit .bin: the load never reads safetensors, so skip them.
+        # Explicit .bin: the load never reads safetensors.
         ignore_patterns.extend(("*.safetensors", "*.safetensors.index.json"))
     else:
-        # Auto (use_safetensors is None): skip .bin only once in-scope safetensors are
-        # confirmed to load instead, since Transformers prefers them. Best-effort: any
-        # failure leaves both formats eligible (correct, just less efficient).
+        # Auto: skip .bin only once in-scope safetensors are confirmed (Transformers prefers them).
+        # Best-effort: any failure leaves both formats eligible.
         try:
             from huggingface_hub import HfApi
 
@@ -1183,14 +1137,9 @@ def _prefetch_ignore_patterns(
                 .siblings
                 or []
             )
-            # Only count model-weights safetensors that the load will actually read
-            # (same subfolder / root, and not an adapter / sidecar), so a .bin-only
-            # subfolder is not stripped of its weights because some other path in the
-            # repo ships safetensors, nor because an adapter_model.safetensors sidecar
-            # sits next to real pytorch_model.bin weights. When a variant is requested
-            # (variant="fp16"), only a variant-matching safetensors (model.fp16.safetensors)
-            # proves the variant's .bin (pytorch_model.fp16.bin) is redundant: counting the
-            # default-format safetensors would drop the variant .bin the load actually reads.
+            # Only count model-weights safetensors the load actually reads (in-scope subfolder/root,
+            # not an adapter / sidecar), so a .bin-only subfolder is not stripped of its weights. With
+            # a variant, only a variant-matching safetensors proves the variant's .bin redundant.
             has_safetensors = any(
                 _is_model_weight_safetensors(sibling.rfilename)
                 and _in_requested_load_scope(sibling.rfilename, subfolder)
@@ -1225,21 +1174,16 @@ def maybe_prefetch_hf_snapshot(
 ):
     """Warm the Hugging Face cache for a remote repo before the in-process load.
 
-    Xet downloads can hang on a blob with no progress and no exception, and a
-    blocked native Xet thread cannot be killed in-process. We pull the whole
-    snapshot first in a killable subprocess that automatically falls back from
-    Xet to plain HTTP on a no-progress stall (unsloth_zoo.hf_xet_fallback); the
-    from_pretrained that follows is then a cache hit and cannot stall on Xet.
-
-    Returns True iff the snapshot was warmed in the killable subprocess, so the
-    caller can clear force_download for the in-process load (else a forced reload
-    would re-download over the very Xet path this avoids). Returns False when
-    warming was skipped (local path / offline / local_files_only / fast_inference,
-    or an older unsloth_zoo) or failed.
-
-    Best-effort: a deterministic failure (missing repo, auth, disk) is left for
-    from_pretrained to surface canonically; only a both-transports-stalled
-    DownloadStallError is raised here, to avoid a silent in-process hang.
+    Xet downloads can hang on a blob with no progress and no exception, and a blocked native Xet thread
+    cannot be killed in-process. We pull the snapshot first in a killable subprocess that falls back
+    from Xet to HTTP on a no-progress stall (unsloth_zoo.hf_xet_fallback); the following from_pretrained
+    is then a cache hit and cannot stall on Xet.
+
+    Returns True iff the snapshot was warmed in the subprocess, so the caller can clear force_download
+    for the in-process load (else a forced reload re-downloads over the Xet path this avoids). Returns
+    False when warming was skipped (local path / offline / local_files_only / fast_inference, or an
+    older unsloth_zoo) or failed. Best-effort: a deterministic failure (missing repo, auth, disk) is
+    left for from_pretrained to surface; only a both-transports-stalled DownloadStallError is raised.
     """
     try:
         from unsloth_zoo.hf_xet_fallback import (
@@ -1247,43 +1191,36 @@ def maybe_prefetch_hf_snapshot(
             DownloadStallError,
         )
     except Exception:
-        # Older unsloth_zoo without the helper: skip warming, load normally.
-        return False
+        return False  # older unsloth_zoo without the helper: load normally
 
     if not isinstance(model_name, str) or not model_name:
         return False
-    # A local directory / file path has nothing to download. Expand ~ first, since
-    # os.path.exists does not, so a home-relative path is detected as local.
+    # A local path has nothing to download. Expand ~ first (os.path.exists does not).
     model_path = os.path.expanduser(model_name)
     if os.path.isdir(model_path) or os.path.exists(model_path):
         return False
-    # A path that looks local but is not on disk yet (e.g. a not-created output
-    # dir) is still not a Hub repo id ("org/name"); leave it for from_pretrained
-    # to surface canonically rather than trying to download it.
+    # Looks local but not yet on disk (e.g. an uncreated output dir): still not a Hub repo id; leave
+    # it for from_pretrained to surface rather than trying to download it.
     if (
         os.path.isabs(model_path)
         or model_name.startswith(("~", "./", "../", ".\\", "..\\"))
         or "\\" in model_name
     ):
         return False
-    # Offline / cache-only: never reach out.
-    if local_files_only:
+    if local_files_only:  # offline / cache-only: never reach out
         return False
     if any(
         os.environ.get(flag, "0").lower() in ("1", "true", "yes", "on")
         for flag in ("HF_HUB_OFFLINE", "TRANSFORMERS_OFFLINE")
     ):
         return False
-    # vLLM has its own download path; leave it untouched.
-    if fast_inference:
+    if fast_inference:  # vLLM has its own download path
         return False
 
-    # A tokenizer-only or adapter-only warm allow-lists the exact files the load reads below, so the
-    # weight-format ignore list is moot -- and skipping it avoids the model_info network call the
-    # auto branch would otherwise make. (An adapter-only warm sets its own format ignore further
-    # down.) The format probe keys off an ACTUAL weight file, not config.json: AutoConfig caches
-    # config.json before this helper runs in the Llama / diffusion paths, so a config-based "cached"
-    # guess would skip the .bin-drop even when no weights are cached and over-fetch both formats.
+    # A tokenizer-only / adapter-only / gguf_file warm allow-lists exact files below, so the format
+    # ignore list is moot -- skipping it also avoids the auto branch's model_info call. The format probe
+    # keys off an ACTUAL weight file, not config.json: AutoConfig caches config.json before this runs in
+    # the Llama / diffusion paths, so a config-based guess would skip the .bin-drop and over-fetch.
     ignore_patterns = (
         None
         if tokenizer_only or adapter_only or gguf_file
@@ -1298,19 +1235,13 @@ def maybe_prefetch_hf_snapshot(
             variant = variant,
         )
     )
-    # Narrow the warm to exactly what the in-process load reads, so a repo that ships extra
-    # weights (alternate checkpoints, merged full models, alternate precisions) is not pulled in
-    # full. Every branch still warms the repo-ROOT tokenizer / config / custom-code assets a
-    # tokenizer / processor / trust_remote_code load reads from the root, so those never fall to
-    # an unprotected in-process download. The root patterns are exact filenames or literal-prefixed
-    # globs (e.g. modeling_*.py), so they stay anchored to repo-root files in practice.
+    # Narrow the warm to what the load reads, so a repo shipping extra weights (alternate checkpoints,
+    # merged models, alternate precisions) is not pulled in full. Every branch still warms the root
+    # tokenizer / config / custom-code assets, so those never fall to an unprotected in-process download.
     allow_patterns = None
     if gguf_file:
-        # from_pretrained(model_name, gguf_file=NAME) reads exactly that GGUF from the repo
-        # (Transformers de-quantizes it on load). The static ignore list drops *.gguf, so without
-        # this the file would never be warmed and the load would fetch it in-process over Xet. Warm
-        # exactly that file (plus root aux) -- not every other quant the repo may also publish. When
-        # a subfolder is set, the load resolves <subfolder>/<gguf_file>, so warm THAT path.
+        # from_pretrained(gguf_file=NAME) reads exactly that GGUF (de-quantized on load), but the static
+        # ignore list drops *.gguf -- so warm exactly that file (plus root aux), under <subfolder>/ if set.
         _gguf_path = (
             f"{subfolder.strip('/')}/{gguf_file}"
             if isinstance(subfolder, str) and subfolder.strip("/")
@@ -1318,21 +1249,17 @@ def maybe_prefetch_hf_snapshot(
         )
         allow_patterns = [_gguf_path, *_ROOT_AUX_PREFETCH_PATTERNS]
     elif tokenizer_only:
-        # A distinct tokenizer repo: warm only its tokenizer / config / vocab files. Restrict
-        # to those exact root filenames so we never pull weights, even if that repo also
-        # happens to ship them (the weights are not what the tokenizer load reads).
+        # A distinct tokenizer repo: warm only its tokenizer / config / vocab files, never its weights
+        # (not what the tokenizer load reads) even if it ships them.
         allow_patterns = list(_ROOT_AUX_PREFETCH_PATTERNS)
     elif adapter_only:
-        # A PEFT adapter load reads only adapter_config.json + adapter_model.* (plus the root
-        # tokenizer / config it may also load). Restrict to those so an adapter repo that ALSO
-        # publishes merged / full-model weights does not pull multi-GB of weights PeftModel never
-        # reads (and risk filling disk before a small adapter loads).
+        # A PEFT adapter load reads only adapter_config.json + adapter_model.* (plus root tokenizer /
+        # config), so restrict to those: an adapter repo also publishing merged weights does not pull
+        # them.
         allow_patterns = [*_ADAPTER_PREFETCH_PATTERNS, *_ROOT_AUX_PREFETCH_PATTERNS]
-        # An adapter ships its weights in ONE format and PeftModel.from_pretrained reads ONE
-        # (safetensors when present), so an adapter repo carrying both adapter_model.safetensors
-        # and adapter_model.bin must not warm both. Pick the format the load will read: an
-        # explicit use_safetensors wins; otherwise prefer safetensors when the repo ships it
-        # (best-effort model_info; any failure keeps both, never under-warming a .bin-only adapter).
+        # An adapter ships ONE format and PeftModel reads ONE (safetensors when present), so don't warm
+        # both: explicit use_safetensors wins, else prefer safetensors when the repo ships it
+        # (best-effort; any failure keeps both, never under-warming a .bin-only adapter).
         if use_safetensors is False:
             ignore_patterns = [
                 "adapter_model*.safetensors",
@@ -1343,16 +1270,13 @@ def maybe_prefetch_hf_snapshot(
         ):
             ignore_patterns = ["adapter_model*.bin", "adapter_model*.bin.index.json"]
     elif isinstance(subfolder, str) and subfolder.strip("/"):
-        # Loading from a subfolder: a from_pretrained(..., subfolder=X) resolves every weight
-        # file under X/, so warm that subfolder (plus the root aux files) and skip the rest.
+        # subfolder=X: the load resolves every weight under X/, so warm that subfolder (plus root aux).
         allow_patterns = [f"{subfolder.strip('/')}/*", *_ROOT_AUX_PREFETCH_PATTERNS]
     elif weights_at_root:
-        # A bare from_pretrained(model_name) (no subfolder) reads only the ROOT weight files.
-        # Keep the warm otherwise unfiltered (config, tokenizer, root weights) but drop weights
-        # nested in subdirectories (fp16/, experimental/, alternate-checkpoint dirs) the root
-        # load never reads. Only weight files are excluded, so a subdir's config the load might
-        # still consult stays warmed. Not applied to diffusion (its component weights live in
-        # subfolders); see FastDiffusionModel's call, which leaves weights_at_root False.
+        # A bare load reads only ROOT weights: keep the warm otherwise unfiltered but drop weights
+        # nested in subdirs (fp16/, experimental/, checkpoint dirs) the root load never reads. Only
+        # weight files are excluded, so a subdir config stays warmed. Not applied to diffusion (its
+        # component weights live in subfolders; FastDiffusionModel leaves weights_at_root False).
         ignore_patterns = [*(ignore_patterns or []), *_SUBDIR_WEIGHT_IGNORE_PATTERNS]
     try:
         snapshot_download_with_xet_fallback(
@@ -1366,8 +1290,7 @@ def maybe_prefetch_hf_snapshot(
         )
         return True
     except DownloadStallError:
-        # Both Xet and HTTP stalled: surface a clear network error instead of
-        # letting the in-process load hang on the same stall.
+        # Both transports stalled: surface a clear network error, not a silent in-process hang.
         raise
     except Exception as exception:
         logger.warning_once(
diff --git a/unsloth/models/diffusion.py b/unsloth/models/diffusion.py
index 5806959122..512074b3fb 100644
--- a/unsloth/models/diffusion.py
+++ b/unsloth/models/diffusion.py
@@ -181,16 +181,11 @@ def from_pretrained(
 
         model_cls = _resolve_diffusion_model_class(config)
 
-        # The repo is confirmed a diffusion model: now pre-download it in a
-        # killable subprocess that falls back from Xet to HTTP on a no-progress
-        # stall, so the weight load below is a cache hit and cannot hang. Done
-        # after validation so a non-diffusion repo fails on config metadata alone,
-        # without first pulling multi-GB weights.
-        # subfolder is deliberately NOT forwarded: the pipeline / config / processor loads below
-        # resolve the repo ROOT (the whole DiffusionPipeline -- model_index.json plus every
-        # component subfolder), not a single subfolder, so narrowing the warm to one subfolder
-        # would leave the other components (unet/, vae/, text_encoder/) to an unprotected
-        # in-process Xet download while pulling subfolder weights the load never reads.
+        # Repo confirmed a diffusion model: pre-download it in a killable subprocess (Xet -> HTTP on a
+        # stall) so the weight load is a cache hit. After validation, so a non-diffusion repo fails on
+        # config metadata alone. subfolder is NOT forwarded: the pipeline loads the repo ROOT (the whole
+        # DiffusionPipeline -- model_index.json + every component subfolder), so narrowing to one
+        # subfolder would leave the other components (unet/, vae/, text_encoder/) to in-process Xet.
         maybe_prefetch_hf_snapshot(
             model_name,
             token = token,
@@ -201,9 +196,7 @@ def from_pretrained(
             force_download = kwargs.get("force_download", False),
             use_safetensors = kwargs.get("use_safetensors"),
             # Diffusion variants (variant="fp16") are common: forward it so the warm never drops a
-            # variant .bin for a non-variant safetensors. (When component weights live in
-            # subfolders the auto branch finds no root safetensors and keeps both formats anyway;
-            # forwarding variant simply makes that correct rather than incidental.)
+            # variant .bin for a non-variant safetensors.
             variant = kwargs.get("variant"),
         )
 
@@ -217,11 +210,9 @@ def from_pretrained(
             local_files_only = local_files_only,
             cache_dir = cache_dir,
         )
-        # Honor an explicit weight format on the real load too, so it reads the same
-        # format the prefetch warmed. The prefetch filters .bin / .safetensors by
-        # use_safetensors; without forwarding it the load could pick the other format
-        # on a mixed-format repo and start an unprotected in-process Xet download.
-        # use_safetensors=None (auto) already matches the prefetch's auto heuristic.
+        # Honor an explicit weight format on the real load too, so it reads the format the prefetch
+        # warmed (else a mixed-format repo could pick the other and start an in-process Xet download).
+        # use_safetensors=None (auto) already matches the prefetch's heuristic.
         if kwargs.get("use_safetensors") is not None:
             load_kwargs["use_safetensors"] = kwargs["use_safetensors"]
 
diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py
index 2ba68971c1..c0c98a9595 100644
--- a/unsloth/models/llama.py
+++ b/unsloth/models/llama.py
@@ -2420,14 +2420,11 @@ def from_pretrained(
 
         preferred_attn_impl = resolve_attention_implementation(model_function, model_config)
 
-        # Pre-download the repo in a killable subprocess that falls back from Xet to HTTP on a
-        # no-progress stall, so the in-process weight load below is a cache hit and cannot hang
-        # on a stalled Xet transfer. This runs AFTER the AutoConfig load + model-class check
-        # above, so an unsupported / incompatible repo fails on its small config fetch without
-        # first pulling multi-GB of weights. revision is intentionally not forwarded: the
-        # base-model load below resolves model_name (possibly a remapped prequantized repo,
-        # where the caller's revision does not exist) on its default branch without a revision,
-        # so warming a specific revision would predownload a snapshot the load never reads.
+        # Pre-download the repo in a killable subprocess (Xet -> HTTP on a no-progress stall) so the
+        # in-process weight load below is a cache hit and cannot hang. Runs AFTER the AutoConfig +
+        # model-class check, so an unsupported repo fails on its small config fetch without pulling
+        # weights. revision is NOT forwarded: the load resolves model_name (possibly a remapped
+        # prequantized repo, where the caller's revision does not exist) on its default branch.
         _prefetched = maybe_prefetch_hf_snapshot(
             model_name,
             token = token,
@@ -2439,29 +2436,19 @@ def from_pretrained(
             use_safetensors = kwargs.get("use_safetensors"),
             from_tf = kwargs.get("from_tf", False),
             from_flax = kwargs.get("from_flax", False),
-            # A bare from_pretrained(model_name) reads only the ROOT weight files, so skip
-            # weights nested in subdirs (fp16/, experimental/) the load never reads. Ignored
-            # when a subfolder is set (that branch narrows the warm to the subfolder instead).
+            # Bare load reads only ROOT weights; skip subdir weights (fp16/, experimental/). Ignored
+            # when a subfolder is set.
             weights_at_root = True,
-            # A variant load (variant="fp16") reads model.fp16.* -- forward it so the warm's
-            # format auto-pick keeps the variant .bin instead of dropping it for a default
-            # safetensors the variant load never reads.
-            variant = kwargs.get("variant"),
-            # A gguf_file load reads exactly that GGUF; forward it so the warm fetches it instead of
-            # dropping every *.gguf (the static ignore list otherwise excludes it).
-            gguf_file = kwargs.get("gguf_file"),
+            variant = kwargs.get("variant"),       # forward so the warm keeps the variant .bin
+            gguf_file = kwargs.get("gguf_file"),   # forward so the warm fetches the GGUF (else ignored)
         )
-        # The killable child already did the forced download; clear the flag so the
-        # in-process load reuses that warm cache instead of re-forcing over Xet.
+        # Child already did the forced download; clear the flag so the load reuses the warm cache.
         if _prefetched and kwargs.get("force_download", False):
             kwargs["force_download"] = False
 
-        # The tokenizer loads in-process below (load_correct_tokenizer) regardless of the
-        # vLLM weight path, so a stalled Xet download of its tokenizer / config files could
-        # still hang from_pretrained. Warm that repo's tokenizer files through the same
-        # killable subprocess. The base prefetch above already covered them when it warmed
-        # model_name itself, so only warm here when the tokenizer comes from a different repo,
-        # or when fast_inference skipped the base warm entirely.
+        # The tokenizer loads in-process below regardless of the vLLM weight path, so warm its files
+        # through the same killable subprocess. The base prefetch already covered model_name, so only
+        # warm here when the tokenizer is a different repo, or fast_inference skipped the base warm.
         _tokenizer_repo = (
             tokenizer_name if (isinstance(tokenizer_name, str) and tokenizer_name) else model_name
         )
@@ -2731,10 +2718,8 @@ def from_pretrained(
 
         # Counteract saved tokenizers
         tokenizer_name = model_name if tokenizer_name is None else tokenizer_name
-        # When the caller set a custom cache_dir, the prefetch warmed it (incl.
-        # tokenizer files); route the tokenizer load there too so it reuses that
-        # warm cache instead of doing its own in-process Hub/Xet download. With no
-        # custom cache_dir, keep load_correct_tokenizer's own default.
+        # With a custom cache_dir the prefetch warmed it (incl. tokenizer files); route the tokenizer
+        # load there too so it reuses that cache instead of its own in-process Hub/Xet download.
         _tokenizer_cache_kwargs = {}
         if kwargs.get("cache_dir") is not None:
             _tokenizer_cache_kwargs["cache_dir"] = kwargs["cache_dir"]
diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py
index 091a7eacfa..75bb122fbc 100644
--- a/unsloth/models/loader.py
+++ b/unsloth/models/loader.py
@@ -866,39 +866,28 @@ def from_pretrained(
         if is_peft:
             # From https://github.com/huggingface/peft/issues/184
             # Now add PEFT adapters
-            # Warm the adapter repo first: PeftModel.from_pretrained downloads it
-            # in-process and can hang on a stalled Xet transfer like the base model.
+            # Warm the adapter repo first: PeftModel downloads it in-process and can hang on Xet.
             _prefetched = maybe_prefetch_hf_snapshot(
                 old_model_name,
                 token = token,
                 revision = revision,
                 cache_dir = kwargs.get("cache_dir"),
                 local_files_only = local_files_only,
-                # The adapter is loaded in-process by PeftModel.from_pretrained below,
-                # not by vLLM, so warm it even under fast_inference (vLLM only owns the
-                # base model's download path; the adapter would otherwise still hit an
-                # unprotected in-process Xet transfer).
+                # The adapter loads in-process via PeftModel, not vLLM, so warm it even under
+                # fast_inference (vLLM owns only the base model's download path).
                 fast_inference = False,
                 force_download = kwargs.get("force_download", False),
-                # Do not inherit the base model's use_safetensors: it selects the BASE
-                # weight format, but the adapter has its own (usually
-                # adapter_model.safetensors). Passing use_safetensors=False here would skip
-                # a safetensors-only adapter's weights, leaving PeftModel.from_pretrained to
-                # fetch them in-process. Leave it as auto so the adapter's format is warmed.
-                # Restrict the warm to the adapter's own files (adapter_config.json +
-                # adapter_model.*) plus the root tokenizer / config: a repo that also publishes
-                # merged full-model weights must not pull them just to load a small adapter.
+                # Do NOT inherit the base use_safetensors (it selects the BASE format): the adapter has
+                # its own (usually adapter_model.safetensors), and use_safetensors=False would skip a
+                # safetensors-only adapter. Leave it auto. adapter_only restricts the warm to the
+                # adapter's own files + root aux, so a repo also publishing merged weights does not pull them.
                 adapter_only = True,
             )
-            # The killable child already did the forced download; clear the flag so
-            # the in-process load reuses that warm cache instead of re-forcing.
+            # Child already did the forced download; clear the flag so the load reuses the warm cache.
             if _prefetched and kwargs.get("force_download", False):
                 kwargs["force_download"] = False
-            # Read the adapter from the same place the prefetch warmed: forward cache_dir
-            # when set (local_files_only is already passed explicitly below). subfolder is
-            # NOT forwarded -- it targets the base checkpoint, and an adapter typically
-            # lives at the repo root, so forwarding it would make PeftModel look under
-            # old_model_name/<subfolder> and miss a root adapter.
+            # Read the adapter from where the prefetch warmed it: forward cache_dir when set. subfolder
+            # is NOT forwarded (it targets the base checkpoint; an adapter usually lives at the root).
             peft_load_kwargs = {}
             if kwargs.get("cache_dir") is not None:
                 peft_load_kwargs["cache_dir"] = kwargs["cache_dir"]
@@ -1828,39 +1817,28 @@ def _patched_car(
 
                 _LoraModel._create_and_replace = _patched_car
 
-            # Warm the adapter repo first: PeftModel.from_pretrained downloads it
-            # in-process and can hang on a stalled Xet transfer like the base model.
+            # Warm the adapter repo first: PeftModel downloads it in-process and can hang on Xet.
             _prefetched = maybe_prefetch_hf_snapshot(
                 old_model_name,
                 token = token,
                 revision = revision,
                 cache_dir = kwargs.get("cache_dir"),
                 local_files_only = local_files_only,
-                # The adapter is loaded in-process by PeftModel.from_pretrained below,
-                # not by vLLM, so warm it even under fast_inference (vLLM only owns the
-                # base model's download path; the adapter would otherwise still hit an
-                # unprotected in-process Xet transfer).
+                # The adapter loads in-process via PeftModel, not vLLM, so warm it even under
+                # fast_inference (vLLM owns only the base model's download path).
                 fast_inference = False,
                 force_download = kwargs.get("force_download", False),
-                # Do not inherit the base model's use_safetensors: it selects the BASE
-                # weight format, but the adapter has its own (usually
-                # adapter_model.safetensors). Passing use_safetensors=False here would skip
-                # a safetensors-only adapter's weights, leaving PeftModel.from_pretrained to
-                # fetch them in-process. Leave it as auto so the adapter's format is warmed.
-                # Restrict the warm to the adapter's own files (adapter_config.json +
-                # adapter_model.*) plus the root tokenizer / config: a repo that also publishes
-                # merged full-model weights must not pull them just to load a small adapter.
+                # Do NOT inherit the base use_safetensors (it selects the BASE format): the adapter has
+                # its own (usually adapter_model.safetensors), and use_safetensors=False would skip a
+                # safetensors-only adapter. Leave it auto. adapter_only restricts the warm to the
+                # adapter's own files + root aux, so a repo also publishing merged weights does not pull them.
                 adapter_only = True,
             )
-            # The killable child already did the forced download; clear the flag so
-            # the in-process load reuses that warm cache instead of re-forcing.
+            # Child already did the forced download; clear the flag so the load reuses the warm cache.
             if _prefetched and kwargs.get("force_download", False):
                 kwargs["force_download"] = False
-            # Read the adapter from the same place the prefetch warmed: forward cache_dir
-            # when set (local_files_only is already passed explicitly below). subfolder is
-            # NOT forwarded -- it targets the base checkpoint, and an adapter typically
-            # lives at the repo root, so forwarding it would make PeftModel look under
-            # old_model_name/<subfolder> and miss a root adapter.
+            # Read the adapter from where the prefetch warmed it: forward cache_dir when set. subfolder
+            # is NOT forwarded (it targets the base checkpoint; an adapter usually lives at the root).
             peft_load_kwargs = {}
             if kwargs.get("cache_dir") is not None:
                 peft_load_kwargs["cache_dir"] = kwargs["cache_dir"]
diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py
index 2713bb1cf8..1d8cf94b90 100644
--- a/unsloth/models/vision.py
+++ b/unsloth/models/vision.py
@@ -805,16 +805,13 @@ def from_pretrained(
         # For debugging - we use a download counter to see if environments are not breaking or if HF is down
         get_statistics(kwargs.get("local_files_only", False))
 
-        # vLLM owns the weight download only when it is actually available; if
-        # fast_inference was requested but vLLM is missing, the load falls through to the
-        # in-process HF path (fast_inference_setup flips the flag to False below), so the
-        # weights must still be warmed here rather than left to an unprotected in-process
-        # Xet download. Resolve availability now so the prefetch skip reflects the real path.
+        # vLLM owns the weight download only when actually available; if fast_inference was requested
+        # but vLLM is missing, the load falls through to the in-process HF path (fast_inference_setup
+        # flips the flag below), so the weights must still be warmed here. Resolve availability now.
         _vllm_owns_weights = fast_inference and is_vLLM_available()
 
-        # Pre-download the repo in a killable subprocess that falls back from Xet
-        # to HTTP on a no-progress stall, so the in-process load below is a cache
-        # hit and cannot hang on a stalled Xet transfer.
+        # Pre-download the repo in a killable subprocess (Xet -> HTTP on a no-progress stall) so the
+        # in-process load below is a cache hit and cannot hang.
         _prefetched = maybe_prefetch_hf_snapshot(
             model_name,
             token = token,
@@ -827,32 +824,21 @@ def from_pretrained(
             use_safetensors = kwargs.get("use_safetensors"),
             from_tf = kwargs.get("from_tf", False),
             from_flax = kwargs.get("from_flax", False),
-            # A bare from_pretrained(model_name) reads only the ROOT weight files, so skip
-            # weights nested in subdirs (fp16/, experimental/) the load never reads. Ignored
-            # when a subfolder is set (that branch narrows the warm to the subfolder instead).
+            # Bare load reads only ROOT weights; skip subdir weights (fp16/, experimental/). Ignored
+            # when a subfolder is set.
             weights_at_root = True,
-            # A variant load (variant="fp16") reads model.fp16.* -- forward it so the warm's
-            # format auto-pick keeps the variant .bin instead of dropping it for a default
-            # safetensors the variant load never reads.
-            variant = kwargs.get("variant"),
-            # A gguf_file load reads exactly that GGUF; forward it so the warm fetches it instead of
-            # dropping every *.gguf (the static ignore list otherwise excludes it).
-            gguf_file = kwargs.get("gguf_file"),
+            variant = kwargs.get("variant"),       # forward so the warm keeps the variant .bin
+            gguf_file = kwargs.get("gguf_file"),   # forward so the warm fetches the GGUF (else ignored)
         )
-        # The killable child already did the forced download; clear the flag so the
-        # in-process load reuses that warm cache instead of re-forcing over Xet.
+        # Child already did the forced download; clear the flag so the load reuses the warm cache.
         if _prefetched and kwargs.get("force_download", False):
             kwargs["force_download"] = False
 
-        # The tokenizer / processor loads in-process below (auto_processor / AutoTokenizer), so a
-        # stalled Xet download of its tokenizer / processor / config files could still hang
-        # from_pretrained. Warm a SEPARATE tokenizer repo (explicit tokenizer_name) through the
-        # killable subprocess. When the tokenizer comes from model_name itself, it is already
-        # covered: the base prefetch above warmed model_name, and on the vLLM path
-        # (_vllm_owns_weights) vLLM downloads its model repo -- including the tokenizer -- before
-        # the in-process tokenizer load. We must NOT warm model_name here on the vLLM path: that
-        # warm runs before fast_inference_setup may rewrite a "*-unsloth-bnb-4bit" name to the
-        # "*-bnb-4bit" repo the load actually reads, so it would warm the wrong (pre-remap) repo.
+        # The tokenizer / processor loads in-process below, so warm a SEPARATE tokenizer repo (explicit
+        # tokenizer_name) through the killable subprocess. When the tokenizer is model_name itself it is
+        # already covered (base prefetch, or vLLM's own download on the vLLM path). Do NOT warm model_name
+        # here on the vLLM path: this runs before fast_inference_setup may remap "*-unsloth-bnb-4bit" ->
+        # "*-bnb-4bit", so it would warm the wrong repo.
         _tokenizer_repo = (
             tokenizer_name if (isinstance(tokenizer_name, str) and tokenizer_name) else model_name
         )
diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py
index 41bcb106a5..b8f91fa8a4 100644
--- a/unsloth/tokenizer_utils.py
+++ b/unsloth/tokenizer_utils.py
@@ -564,12 +564,10 @@ def _load_correct_tokenizer(
         # Let's utilize them
         cache_dir = os.path.join(KAGGLE_TMP, cache_dir)
     elif cache_dir == "huggingface_tokenizers_cache":
-        # The default tokenizer-cache name is only meaningful on Colab/Kaggle; on a
-        # normal host fall back to the HF default cache.
+        # This default name is only meaningful on Colab/Kaggle; elsewhere use the HF default cache.
         cache_dir = None
-    # else: an explicit caller-supplied cache_dir is preserved, so the tokenizer loads
-    # from the same directory the snapshot prefetch warmed instead of probing the default
-    # cache and risking an unprotected in-process Hub/Xet transfer on a miss.
+    # else: a caller-supplied cache_dir is preserved, so the tokenizer loads from the directory the
+    # snapshot prefetch warmed rather than risking an in-process Hub/Xet transfer on a default-cache miss.
 
     # Try loading the slow tokenizer. If it fails, then try Fast only
     # Mainly to solve Deepseek models with no tokenizer.model file

From 44eded4e286d55cbd5818873875b3907a08e88c4 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 29 Jun 2026 04:52:30 +0000
Subject: [PATCH 51/80] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 unsloth/models/llama.py  | 6 ++++--
 unsloth/models/vision.py | 6 ++++--
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py
index c0c98a9595..2df3fa0b96 100644
--- a/unsloth/models/llama.py
+++ b/unsloth/models/llama.py
@@ -2439,8 +2439,10 @@ def from_pretrained(
             # Bare load reads only ROOT weights; skip subdir weights (fp16/, experimental/). Ignored
             # when a subfolder is set.
             weights_at_root = True,
-            variant = kwargs.get("variant"),       # forward so the warm keeps the variant .bin
-            gguf_file = kwargs.get("gguf_file"),   # forward so the warm fetches the GGUF (else ignored)
+            variant = kwargs.get("variant"),  # forward so the warm keeps the variant .bin
+            gguf_file = kwargs.get(
+                "gguf_file"
+            ),  # forward so the warm fetches the GGUF (else ignored)
         )
         # Child already did the forced download; clear the flag so the load reuses the warm cache.
         if _prefetched and kwargs.get("force_download", False):
diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py
index 1d8cf94b90..9532be20e9 100644
--- a/unsloth/models/vision.py
+++ b/unsloth/models/vision.py
@@ -827,8 +827,10 @@ def from_pretrained(
             # Bare load reads only ROOT weights; skip subdir weights (fp16/, experimental/). Ignored
             # when a subfolder is set.
             weights_at_root = True,
-            variant = kwargs.get("variant"),       # forward so the warm keeps the variant .bin
-            gguf_file = kwargs.get("gguf_file"),   # forward so the warm fetches the GGUF (else ignored)
+            variant = kwargs.get("variant"),  # forward so the warm keeps the variant .bin
+            gguf_file = kwargs.get(
+                "gguf_file"
+            ),  # forward so the warm fetches the GGUF (else ignored)
         )
         # Child already did the forced download; clear the flag so the load reuses the warm cache.
         if _prefetched and kwargs.get("force_download", False):

From 6702228f61376cfba4eaf14e09ce2687cda83bb0 Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Mon, 29 Jun 2026 10:25:42 +0000
Subject: [PATCH 52/80] Pre-download the snapshot for FastSentenceTransformer
 loads too

FastSentenceTransformer.from_pretrained loaded the repo in-process (SentenceTransformer(model_name)
plus hf_hub_download for modules.json) without first warming the cache, so a sentence-transformers
load could still hang on a stalled hf_xet transfer -- the exact failure the snapshot prefetch prevents
for the FastLanguage / FastModel / FastDiffusion entry points.

Warm the repo through maybe_prefetch_hf_snapshot at the top of from_pretrained, before any in-process
resolve, so every path (for_inference, fast-encoder, fallback) hits a warm cache. weights_at_root is
left False: a sentence-transformers repo can keep component weights in subfolders (pooling / dense
modules), so the whole snapshot is warmed rather than just the root. cache_folder is forwarded as the
cache_dir so the warm and the load share one location.

Add a lightweight static wiring guard (importing FastSentenceTransformer pulls heavy optional deps):
the prefetch call must appear before the SentenceTransformer instantiation in from_pretrained.
Prefetch-scope suite: 26 passed; ruff clean.
---
 tests/test_prefetch_snapshot_scope.py  | 17 +++++++++++++++++
 unsloth/models/sentence_transformer.py | 13 +++++++++++++
 2 files changed, 30 insertions(+)

diff --git a/tests/test_prefetch_snapshot_scope.py b/tests/test_prefetch_snapshot_scope.py
index 7c4dfbfcc8..fc0bce9ca5 100644
--- a/tests/test_prefetch_snapshot_scope.py
+++ b/tests/test_prefetch_snapshot_scope.py
@@ -476,3 +476,20 @@ def test_from_tf_root_load_ignores_nested_h5(capture):
     kept = _filter(["model.h5", "checkpoint-1/model.h5", "config.json"], st["allow_patterns"], ig)
     assert "model.h5" in kept  # root TF weight warmed
     assert "checkpoint-1/model.h5" not in kept  # nested TF checkpoint ignored
+
+
+def test_sentence_transformer_from_pretrained_is_prefetch_wired():
+    """FastSentenceTransformer.from_pretrained must warm the repo via maybe_prefetch_hf_snapshot
+    BEFORE it instantiates SentenceTransformer / fetches modules.json, else an ST load downloads over
+    unprotected in-process Xet. Static source guard (importing ST pulls heavy optional deps)."""
+    import os
+
+    src_path = os.path.join(os.path.dirname(U.__file__), "sentence_transformer.py")
+    with open(src_path, "r", encoding = "utf-8") as f:
+        src = f.read()
+    fp_idx = src.index("def from_pretrained", src.index("class FastSentenceTransformer"))
+    body = src[fp_idx:]
+    prefetch_idx = body.find("maybe_prefetch_hf_snapshot(")
+    st_idx = body.find("SentenceTransformer(")
+    assert prefetch_idx != -1, "from_pretrained must call maybe_prefetch_hf_snapshot"
+    assert st_idx != -1 and prefetch_idx < st_idx, "prefetch must run before the SentenceTransformer load"
diff --git a/unsloth/models/sentence_transformer.py b/unsloth/models/sentence_transformer.py
index 7e43442bfd..c7bfdd4492 100644
--- a/unsloth/models/sentence_transformer.py
+++ b/unsloth/models/sentence_transformer.py
@@ -19,6 +19,7 @@
     SUPPORTS_BFLOAT16,
     resolve_model_class,
     resolve_encoder_attention_implementation,
+    maybe_prefetch_hf_snapshot,
 )
 import inspect
 import json
@@ -1386,6 +1387,18 @@ def from_pretrained(
                 "Run `pip install sentence-transformers` to install it."
             )
 
+        # Pre-download in a killable subprocess (Xet -> HTTP on a no-progress stall) so the
+        # SentenceTransformer load below is a cache hit and cannot hang on a stalled Xet transfer.
+        # Covers every path (for_inference, fast-encoder, fallback), which all resolve the repo
+        # in-process. weights_at_root is left False: an ST repo's component weights can live in
+        # subfolders (the pooling / dense modules), so the whole snapshot is warmed.
+        maybe_prefetch_hf_snapshot(
+            model_name,
+            token = token,
+            revision = revision,
+            cache_dir = kwargs.get("cache_folder"),
+        )
+
         # if for_inference == True, skip Unsloth optimizations to avoid torch compile issues
         if for_inference:
             st_device = device_map

From 33255b1b127bb456798c9d582e70c2a5e89f28d8 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 29 Jun 2026 10:26:45 +0000
Subject: [PATCH 53/80] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 tests/test_prefetch_snapshot_scope.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/test_prefetch_snapshot_scope.py b/tests/test_prefetch_snapshot_scope.py
index fc0bce9ca5..b5dca0c621 100644
--- a/tests/test_prefetch_snapshot_scope.py
+++ b/tests/test_prefetch_snapshot_scope.py
@@ -492,4 +492,6 @@ def test_sentence_transformer_from_pretrained_is_prefetch_wired():
     prefetch_idx = body.find("maybe_prefetch_hf_snapshot(")
     st_idx = body.find("SentenceTransformer(")
     assert prefetch_idx != -1, "from_pretrained must call maybe_prefetch_hf_snapshot"
-    assert st_idx != -1 and prefetch_idx < st_idx, "prefetch must run before the SentenceTransformer load"
+    assert (
+        st_idx != -1 and prefetch_idx < st_idx
+    ), "prefetch must run before the SentenceTransformer load"

From 880f688f24c417557497d30e3c286ad91590b28b Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Tue, 30 Jun 2026 08:42:41 +0000
Subject: [PATCH 54/80] Strengthen the FastSentenceTransformer prefetch wiring
 test

Replace the substring source check with an AST check that the maybe_prefetch_hf_snapshot call is an UNCONDITIONAL top-level statement in from_pretrained (fires on every load path) and precedes any top-level return -- catching a prefetch accidentally nested in a dead branch, which the substring check could not.
---
 tests/test_prefetch_snapshot_scope.py | 39 ++++++++++++++++++---------
 1 file changed, 27 insertions(+), 12 deletions(-)

diff --git a/tests/test_prefetch_snapshot_scope.py b/tests/test_prefetch_snapshot_scope.py
index b5dca0c621..9b0a4fc1a3 100644
--- a/tests/test_prefetch_snapshot_scope.py
+++ b/tests/test_prefetch_snapshot_scope.py
@@ -479,19 +479,34 @@ def test_from_tf_root_load_ignores_nested_h5(capture):
 
 
 def test_sentence_transformer_from_pretrained_is_prefetch_wired():
-    """FastSentenceTransformer.from_pretrained must warm the repo via maybe_prefetch_hf_snapshot
-    BEFORE it instantiates SentenceTransformer / fetches modules.json, else an ST load downloads over
-    unprotected in-process Xet. Static source guard (importing ST pulls heavy optional deps)."""
+    """FastSentenceTransformer.from_pretrained must warm the repo via maybe_prefetch_hf_snapshot as an
+    UNCONDITIONAL top-level statement (so it fires on every load path: for_inference, fast-encoder,
+    fallback) and before any top-level return. Static AST guard (importing ST pulls heavy optional
+    deps); checking the call is top-level -- not nested in an if/for/try -- catches a dead-branch wire."""
+    import ast
     import os
 
     src_path = os.path.join(os.path.dirname(U.__file__), "sentence_transformer.py")
     with open(src_path, "r", encoding = "utf-8") as f:
-        src = f.read()
-    fp_idx = src.index("def from_pretrained", src.index("class FastSentenceTransformer"))
-    body = src[fp_idx:]
-    prefetch_idx = body.find("maybe_prefetch_hf_snapshot(")
-    st_idx = body.find("SentenceTransformer(")
-    assert prefetch_idx != -1, "from_pretrained must call maybe_prefetch_hf_snapshot"
-    assert (
-        st_idx != -1 and prefetch_idx < st_idx
-    ), "prefetch must run before the SentenceTransformer load"
+        tree = ast.parse(f.read())
+    cls = next(
+        n for n in tree.body
+        if isinstance(n, ast.ClassDef) and n.name == "FastSentenceTransformer"
+    )
+    fp = next(
+        n for n in cls.body
+        if isinstance(n, ast.FunctionDef) and n.name == "from_pretrained"
+    )
+
+    def _is_prefetch_call(node):
+        return (
+            isinstance(node, ast.Expr)
+            and isinstance(node.value, ast.Call)
+            and isinstance(node.value.func, ast.Name)
+            and node.value.func.id == "maybe_prefetch_hf_snapshot"
+        )
+
+    prefetch_pos = next((i for i, n in enumerate(fp.body) if _is_prefetch_call(n)), None)
+    return_pos = next((i for i, n in enumerate(fp.body) if isinstance(n, ast.Return)), len(fp.body))
+    assert prefetch_pos is not None, "from_pretrained must call maybe_prefetch_hf_snapshot at top level"
+    assert prefetch_pos < return_pos, "prefetch must run before any top-level return"

From 20737ca2ce90d4c3a33d4041af5ade410ca722ed Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 30 Jun 2026 08:43:10 +0000
Subject: [PATCH 55/80] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 tests/test_prefetch_snapshot_scope.py | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/tests/test_prefetch_snapshot_scope.py b/tests/test_prefetch_snapshot_scope.py
index 9b0a4fc1a3..3bf2b9a701 100644
--- a/tests/test_prefetch_snapshot_scope.py
+++ b/tests/test_prefetch_snapshot_scope.py
@@ -490,13 +490,9 @@ def test_sentence_transformer_from_pretrained_is_prefetch_wired():
     with open(src_path, "r", encoding = "utf-8") as f:
         tree = ast.parse(f.read())
     cls = next(
-        n for n in tree.body
-        if isinstance(n, ast.ClassDef) and n.name == "FastSentenceTransformer"
-    )
-    fp = next(
-        n for n in cls.body
-        if isinstance(n, ast.FunctionDef) and n.name == "from_pretrained"
+        n for n in tree.body if isinstance(n, ast.ClassDef) and n.name == "FastSentenceTransformer"
     )
+    fp = next(n for n in cls.body if isinstance(n, ast.FunctionDef) and n.name == "from_pretrained")
 
     def _is_prefetch_call(node):
         return (
@@ -508,5 +504,7 @@ def _is_prefetch_call(node):
 
     prefetch_pos = next((i for i, n in enumerate(fp.body) if _is_prefetch_call(n)), None)
     return_pos = next((i for i, n in enumerate(fp.body) if isinstance(n, ast.Return)), len(fp.body))
-    assert prefetch_pos is not None, "from_pretrained must call maybe_prefetch_hf_snapshot at top level"
+    assert (
+        prefetch_pos is not None
+    ), "from_pretrained must call maybe_prefetch_hf_snapshot at top level"
     assert prefetch_pos < return_pos, "prefetch must run before any top-level return"

From 83a7a93dc114f6dacadc854ff46bcdb259723259 Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Tue, 30 Jun 2026 11:02:36 +0000
Subject: [PATCH 56/80] Route a custom cache_dir through the ST module and
 tokenizer-repair fetches

Two in-process Hub fetches ignored a caller's custom cache directory, so with a custom cache_folder / cache_dir they hit the default HF cache that the snapshot prefetch did not warm (a redundant download, and in principle an unprotected fetch -- low risk in practice since these are small HTTP-served config files):

- FastSentenceTransformer's modules.json / pooling-config reads (_module_path, _read_pooling_mode, threaded through _load_modules) now forward the cache_folder the prefetch warmed.
- check_tokenizer's slow-tokenizer repair reload now honors a caller-supplied cache_dir (forwarded from FastLlamaModel.from_pretrained), falling back to the Colab/Kaggle sentinel only when none is given -- matching load_correct_tokenizer.

Prefetch-scope suite: 26 passed; ruff clean. No behavior change when no custom cache dir is passed.
---
 unsloth/models/llama.py                |  1 +
 unsloth/models/sentence_transformer.py | 25 ++++++++++++++++++-------
 unsloth/tokenizer_utils.py             | 13 ++++++++-----
 3 files changed, 27 insertions(+), 12 deletions(-)

diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py
index 2df3fa0b96..2cc2c0b3ed 100644
--- a/unsloth/models/llama.py
+++ b/unsloth/models/llama.py
@@ -2859,6 +2859,7 @@ def from_pretrained(
                 model_max_length = max_position_embeddings,
                 padding_side = "right",
                 token = token,
+                cache_dir = kwargs.get("cache_dir"),
             )
         patch_saving_functions(tokenizer)
 
diff --git a/unsloth/models/sentence_transformer.py b/unsloth/models/sentence_transformer.py
index c7bfdd4492..4a391c59c7 100644
--- a/unsloth/models/sentence_transformer.py
+++ b/unsloth/models/sentence_transformer.py
@@ -542,7 +542,7 @@ def _save_with_base_config(self, output_path, *args, **kwargs):
         return transformer_module
 
     @staticmethod
-    def _read_pooling_mode(model_name, token):
+    def _read_pooling_mode(model_name, token, cache_dir = None):
         """Read the pooling mode from modules.json, else return "mean"."""
         try:
             if os.path.exists(model_name) and os.path.exists(
@@ -550,7 +550,9 @@ def _read_pooling_mode(model_name, token):
             ):
                 modules_json_path = os.path.join(model_name, "modules.json")
             else:
-                modules_json_path = hf_hub_download(model_name, "modules.json", token = token)
+                modules_json_path = hf_hub_download(
+                    model_name, "modules.json", token = token, cache_dir = cache_dir
+                )
 
             with open(modules_json_path, "r", encoding = "utf-8") as f:
                 modules_config = json.load(f)
@@ -572,6 +574,7 @@ def _read_pooling_mode(model_name, token):
                                 model_name,
                                 os.path.join(pooling_path, "config.json"),
                                 token = token,
+                                cache_dir = cache_dir,
                             )
                         break
 
@@ -951,7 +954,7 @@ def _add_unsloth_branding(save_directory):
             f.write(content)
 
     @staticmethod
-    def _module_path(model_name, token = None):
+    def _module_path(model_name, token = None, cache_dir = None):
         """Return the path to the modules.json file, or None."""
         try:
             if os.path.exists(model_name) and os.path.isdir(model_name):
@@ -959,7 +962,9 @@ def _module_path(model_name, token = None):
                 return path if os.path.exists(path) else None
             else:
                 try:
-                    return hf_hub_download(model_name, "modules.json", token = token)
+                    return hf_hub_download(
+                        model_name, "modules.json", token = token, cache_dir = cache_dir
+                    )
                 except:
                     return None
         except:
@@ -1136,6 +1141,7 @@ def _load_modules(
         max_seq_length,
         pooling_mode,
         trust_remote_code = False,
+        cache_dir = None,
     ) -> tuple[OrderedDict, bool]:
         """Load modules from modules.json, else fall back to hard-coded modules.
 
@@ -1146,7 +1152,7 @@ def _load_modules(
         from sentence_transformers.models import Pooling, Normalize
 
         modules = OrderedDict()
-        modules_json_path = FastSentenceTransformer._module_path(model_name, token)
+        modules_json_path = FastSentenceTransformer._module_path(model_name, token, cache_dir = cache_dir)
 
         if modules_json_path:
             with open(modules_json_path, encoding = "utf8") as f:
@@ -1199,7 +1205,9 @@ def _load_modules(
         hidden_size = getattr(model.config, "hidden_size", 768)
 
         if pooling_mode == "mean":
-            pooling_mode = FastSentenceTransformer._read_pooling_mode(model_name, token)
+            pooling_mode = FastSentenceTransformer._read_pooling_mode(
+                model_name, token, cache_dir = cache_dir
+            )
 
         modules["1"] = Pooling(word_embedding_dimension = hidden_size, pooling_mode = pooling_mode)
         modules["2"] = Normalize()
@@ -1659,7 +1667,9 @@ def _push_to_hub_merged(self, repo_id, **push_kwargs):
 
         # No modules.json -> force 16-bit: saving is custom for these models and
         # 4-bit would need dequant in save_pretrained_merged, not worth it.
-        has_modules_json = FastSentenceTransformer._module_path(model_name, token) is not None
+        has_modules_json = FastSentenceTransformer._module_path(
+            model_name, token, cache_dir = kwargs.get("cache_folder")
+        ) is not None
 
         if not has_modules_json and load_in_4bit:
             print(
@@ -1710,6 +1720,7 @@ def _push_to_hub_merged(self, repo_id, **push_kwargs):
             max_seq_length,
             pooling_mode,
             trust_remote_code = trust_remote_code,
+            cache_dir = kwargs.get("cache_folder"),
         )
 
         st_device = device_map
diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py
index b8f91fa8a4..db3d3253ae 100644
--- a/unsloth/tokenizer_utils.py
+++ b/unsloth/tokenizer_utils.py
@@ -1326,6 +1326,7 @@ def check_tokenizer(
     padding_side = "right",
     token = None,
     _reload = True,
+    cache_dir = None,
 ):
     # Checks tokenizer for out of bounds ids.
     # Mainly a fix for https://huggingface.co/berkeley-nest/Starling-LM-7B-alpha
@@ -1416,10 +1417,11 @@ def check_tokenizer(
                     f"Fix your tokenizer since it'll perform out of bounds memory accesses."
                 )
 
-            if IS_COLAB_ENVIRONMENT or IS_KAGGLE_ENVIRONMENT:
-                cache_dir = "huggingface_tokenizers_cache"
-            else:
-                cache_dir = None
+            # Honor a caller-supplied cache_dir so the repair reload reuses any warmed cache; else fall
+            # back to the Colab/Kaggle sentinel (HF default elsewhere), as load_correct_tokenizer does.
+            reload_cache_dir = cache_dir
+            if reload_cache_dir is None and (IS_COLAB_ENVIRONMENT or IS_KAGGLE_ENVIRONMENT):
+                reload_cache_dir = "huggingface_tokenizers_cache"
 
             # Sometimes slow tokenizer does not work like Deepseek
             try:
@@ -1433,7 +1435,7 @@ def check_tokenizer(
                     use_fast = False,
                     legacy = False,
                     from_slow = True,
-                    cache_dir = cache_dir,
+                    cache_dir = reload_cache_dir,
                 )
                 return check_tokenizer(
                     model = model,
@@ -1443,6 +1445,7 @@ def check_tokenizer(
                     padding_side = padding_side,
                     token = token,
                     _reload = False,
+                    cache_dir = cache_dir,
                 )
                 break
             except:

From a8696abdf9f139f9e6b03d8ed00a6357a11ec2c0 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 30 Jun 2026 11:03:07 +0000
Subject: [PATCH 57/80] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 unsloth/models/sentence_transformer.py | 25 +++++++++++++++++++------
 1 file changed, 19 insertions(+), 6 deletions(-)

diff --git a/unsloth/models/sentence_transformer.py b/unsloth/models/sentence_transformer.py
index 4a391c59c7..ab364e6ac6 100644
--- a/unsloth/models/sentence_transformer.py
+++ b/unsloth/models/sentence_transformer.py
@@ -542,7 +542,11 @@ def _save_with_base_config(self, output_path, *args, **kwargs):
         return transformer_module
 
     @staticmethod
-    def _read_pooling_mode(model_name, token, cache_dir = None):
+    def _read_pooling_mode(
+        model_name,
+        token,
+        cache_dir = None,
+    ):
         """Read the pooling mode from modules.json, else return "mean"."""
         try:
             if os.path.exists(model_name) and os.path.exists(
@@ -954,7 +958,11 @@ def _add_unsloth_branding(save_directory):
             f.write(content)
 
     @staticmethod
-    def _module_path(model_name, token = None, cache_dir = None):
+    def _module_path(
+        model_name,
+        token = None,
+        cache_dir = None,
+    ):
         """Return the path to the modules.json file, or None."""
         try:
             if os.path.exists(model_name) and os.path.isdir(model_name):
@@ -1152,7 +1160,9 @@ def _load_modules(
         from sentence_transformers.models import Pooling, Normalize
 
         modules = OrderedDict()
-        modules_json_path = FastSentenceTransformer._module_path(model_name, token, cache_dir = cache_dir)
+        modules_json_path = FastSentenceTransformer._module_path(
+            model_name, token, cache_dir = cache_dir
+        )
 
         if modules_json_path:
             with open(modules_json_path, encoding = "utf8") as f:
@@ -1667,9 +1677,12 @@ def _push_to_hub_merged(self, repo_id, **push_kwargs):
 
         # No modules.json -> force 16-bit: saving is custom for these models and
         # 4-bit would need dequant in save_pretrained_merged, not worth it.
-        has_modules_json = FastSentenceTransformer._module_path(
-            model_name, token, cache_dir = kwargs.get("cache_folder")
-        ) is not None
+        has_modules_json = (
+            FastSentenceTransformer._module_path(
+                model_name, token, cache_dir = kwargs.get("cache_folder")
+            )
+            is not None
+        )
 
         if not has_modules_json and load_in_4bit:
             print(

From 010bb50c22edfe3998047d55fb813d7d6392c582 Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Wed, 1 Jul 2026 00:51:46 +0000
Subject: [PATCH 58/80] Forward variant to the killable prewarm so the
 cache-skip gate can defer

maybe_prefetch_hf_snapshot now passes variant to
snapshot_download_with_xet_fallback. Without it the pre-download cache-skip gate
was variant-blind: a cache holding only the default canonical weight fast-pathed
a variant="fp16" request, leaving the in-process load to fetch the missing
variant weight over un-killable Xet. Paired with the unsloth_zoo change that
defers the skip when a variant is requested.
---
 tests/test_prefetch_snapshot_scope.py | 12 ++++++++++++
 unsloth/models/_utils.py              |  1 +
 2 files changed, 13 insertions(+)

diff --git a/tests/test_prefetch_snapshot_scope.py b/tests/test_prefetch_snapshot_scope.py
index 3bf2b9a701..b4a7a7052f 100644
--- a/tests/test_prefetch_snapshot_scope.py
+++ b/tests/test_prefetch_snapshot_scope.py
@@ -48,6 +48,7 @@ def fake_download(repo_id, **kw):
         state["repo_id"] = repo_id
         state["allow_patterns"] = kw.get("allow_patterns")
         state["ignore_patterns"] = kw.get("ignore_patterns")
+        state["variant"] = kw.get("variant")
         return "/tmp/fake-snapshot"
 
     fake_module = types.ModuleType("unsloth_zoo.hf_xet_fallback")
@@ -257,6 +258,17 @@ def test_filename_has_variant_matches_single_and_sharded():
     assert U._filename_has_variant("model-00001-of-00002.safetensors", "fp16") is False
 
 
+def test_variant_is_forwarded_to_downloader(capture):
+    """maybe_prefetch_hf_snapshot must forward `variant` to snapshot_download_with_xet_fallback so the
+    PRE cache-skip gate can defer on a variant load: a cache holding only the default canonical weight
+    must not fast-path a variant='fp16' request, else the in-process load fetches the missing variant
+    weight over un-killable Xet. Absent a variant, nothing is forwarded (the fast path stays live)."""
+    _, st = capture(weights_at_root = True, use_safetensors = True, variant = "fp16")
+    assert st["variant"] == "fp16"
+    _, st = capture(weights_at_root = True, use_safetensors = True)
+    assert st["variant"] is None
+
+
 def test_variant_drops_bin_for_sharded_variant_safetensors(monkeypatch):
     """A SHARDED variant safetensors (model.fp16-00001-of-00002.safetensors) is recognized, so its
     redundant variant .bin is dropped rather than both formats warmed (gemini #6638)."""
diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py
index 3721ac5eaf..2045b6cae2 100644
--- a/unsloth/models/_utils.py
+++ b/unsloth/models/_utils.py
@@ -1287,6 +1287,7 @@ def maybe_prefetch_hf_snapshot(
             allow_patterns = allow_patterns,
             ignore_patterns = ignore_patterns,
             force_download = force_download,
+            variant = variant,
         )
         return True
     except DownloadStallError:

From cb8cf9172a207368304d6b412eb39539459d9548 Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Wed, 1 Jul 2026 01:44:00 +0000
Subject: [PATCH 59/80] Adapt the Studio force_download test to the
 deduplicated shim

test_force_download_bypasses_cache_first_early_return (from #5388) monkeypatched
the old private downloader's os and _run_download_attempt and asserted the
cache-first early-return inside Studio's copy. This PR moved that logic into the
shared unsloth_zoo helper, so the shim no longer imports os or defines those, and
the test failed with AttributeError: module 'utils.hf_xet_fallback' has no
attribute 'os'.

Rewrite it to assert the shim's actual contract: force_download is forwarded
unchanged to the shared helper. The cache-first bypass behavior itself is covered
in the zoo suite (test_force_download_file_skips_cache_probe /
test_force_download_skips_fast_path_and_threads).
---
 .../tests/test_model_update_robustness.py     | 52 ++++++-------------
 1 file changed, 17 insertions(+), 35 deletions(-)

diff --git a/studio/backend/tests/test_model_update_robustness.py b/studio/backend/tests/test_model_update_robustness.py
index 9cf2a62c39..a7b9a9caaa 100644
--- a/studio/backend/tests/test_model_update_robustness.py
+++ b/studio/backend/tests/test_model_update_robustness.py
@@ -5,8 +5,9 @@
 Covers:
   * GGUF variant listing computes update_available from the already-fetched
     sibling metadata instead of a second Hub call.
-  * hf_hub_download_with_xet_fallback(force_download=True) bypasses the
-    try_to_load_from_cache cache-first early-return.
+  * hf_hub_download_with_xet_fallback forwards force_download through the
+    deduplicated shim to the shared unsloth_zoo helper (which owns the
+    try_to_load_from_cache cache-first early-return and its bypass).
 
 The cache "Update" action now runs through the download manager as a normal
 managed download (so it shows in the Downloads panel with progress + cancel),
@@ -341,44 +342,25 @@ def test_cached_model_scan_keeps_local_safetensors_repo(monkeypatch, tmp_path):
 # ── hf_hub_download_with_xet_fallback force_download bypass (X2/F2) ───
 
 
-def test_force_download_bypasses_cache_first_early_return(monkeypatch):
-    """force_download=True skips the try_to_load_from_cache early-return and
-    proceeds to the real download path; force_download=False returns the cached
-    path without ever attempting a download (X2/F2)."""
-    import huggingface_hub as hf
+def test_force_download_is_forwarded_through_the_shim(monkeypatch):
+    """The deduplicated shim delegates to the shared unsloth_zoo helper, which owns the
+    try_to_load_from_cache early-return and its force_download bypass (covered by the zoo suite's
+    test_force_download_file_skips_cache_probe). The shim's own contract is to forward force_download
+    unchanged so Studio's model-update path re-fetches a newer blob; verify both force_download=False
+    and force_download=True reach the shared helper (X2/F2)."""
     import utils.hf_xet_fallback as X
 
-    cached_path = "/cache/blob/cached.gguf"
+    seen = []
 
-    # Pretend the blob IS cached on disk (try_to_load_from_cache is imported
-    # inside the function from huggingface_hub, and os.path.exists must agree).
-    monkeypatch.setattr(hf, "try_to_load_from_cache", lambda *a, **k: cached_path, raising = False)
-    monkeypatch.setattr(X.os.path, "exists", lambda p: True, raising = False)
+    def fake_shared(repo_id, filename, token, **kwargs):
+        seen.append(kwargs.get("force_download"))
+        return "/downloaded/path"
 
-    attempts = []
+    monkeypatch.setattr(X, "_shared_hf_hub_download_with_xet_fallback", fake_shared, raising = True)
 
-    def fake_attempt(repo_id, filename, token, **kwargs):
-        attempts.append(
-            {"repo_id": repo_id, "filename": filename, "force": kwargs.get("force_download")}
-        )
-        return ("ok", "/freshly/downloaded/path")
-
-    monkeypatch.setattr(X, "_run_download_attempt", fake_attempt, raising = True)
-
-    # force_download=False: cache-first early-return, no download attempt.
-    out = X.hf_hub_download_with_xet_fallback(
-        "unsloth/repo", "model.gguf", token = None, force_download = False
-    )
-    assert out == cached_path
-    assert attempts == []  # never reached the real download
-
-    # force_download=True: bypass the early-return, run the real download.
-    out2 = X.hf_hub_download_with_xet_fallback(
-        "unsloth/repo", "model.gguf", token = None, force_download = True
-    )
-    assert out2 == "/freshly/downloaded/path"
-    assert len(attempts) == 1
-    assert attempts[0]["force"] is True
+    X.hf_hub_download_with_xet_fallback("unsloth/repo", "model.gguf", token = None, force_download = False)
+    X.hf_hub_download_with_xet_fallback("unsloth/repo", "model.gguf", token = None, force_download = True)
+    assert seen == [False, True]  # the shim forwards force_download to the shared helper unchanged
 
 
 # ── multi-revision GGUF blob comparison and update reclaim ──

From 3203753db6457131d442f279925aec6b22545065 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 1 Jul 2026 01:45:33 +0000
Subject: [PATCH 60/80] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/backend/tests/test_model_update_robustness.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/studio/backend/tests/test_model_update_robustness.py b/studio/backend/tests/test_model_update_robustness.py
index a7b9a9caaa..00bda5f14f 100644
--- a/studio/backend/tests/test_model_update_robustness.py
+++ b/studio/backend/tests/test_model_update_robustness.py
@@ -358,8 +358,12 @@ def fake_shared(repo_id, filename, token, **kwargs):
 
     monkeypatch.setattr(X, "_shared_hf_hub_download_with_xet_fallback", fake_shared, raising = True)
 
-    X.hf_hub_download_with_xet_fallback("unsloth/repo", "model.gguf", token = None, force_download = False)
-    X.hf_hub_download_with_xet_fallback("unsloth/repo", "model.gguf", token = None, force_download = True)
+    X.hf_hub_download_with_xet_fallback(
+        "unsloth/repo", "model.gguf", token = None, force_download = False
+    )
+    X.hf_hub_download_with_xet_fallback(
+        "unsloth/repo", "model.gguf", token = None, force_download = True
+    )
     assert seen == [False, True]  # the shim forwards force_download to the shared helper unchanged
 
 

From 7423f33992a32c8717c54ac6c69df72179cb9a77 Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Wed, 1 Jul 2026 03:44:48 +0000
Subject: [PATCH 61/80] Close four SentenceTransformer prefetch gaps

A: FastSentenceTransformer.from_pretrained now forwards local_files_only to the
prefetch, so an offline / cache-only load does not start a Hub snapshot download
(or wait on a network timeout on firewalled hosts) before the ST load sees the flag.

B: _load_modules forwards the custom cache_folder into load_dir_path, so per-module
subdirs are read from the warmed cache instead of the default one (a custom
cache_folder no longer triggers a second in-process fetch / cache mix).

C: _prefetch_ignore_patterns no longer drops the redundant weight format for a
whole multi-component snapshot (weights_at_root=False, no subfolder: ST / diffusers
repos whose modules ship weights in per-module subfolders). HF fnmatch "*" spans
"/", so dropping "*.bin" there stripped a subdir module's only weight and left the
module load to an in-process Xet fetch. The drop still applies to root / subfolder
scoped loads.

D: add sentencepiece.model (RemBERT) and vocab-src.json / vocab-tgt.json (FSMT) to
the tokenizer aux allowlist so a distinct-tokenizer-repo warm caches those vocab
files instead of leaving them to an in-process Xet fetch.
---
 tests/test_prefetch_snapshot_scope.py  | 62 ++++++++++++++++++++++----
 unsloth/models/_utils.py               | 25 +++++++++--
 unsloth/models/sentence_transformer.py |  5 ++-
 3 files changed, 80 insertions(+), 12 deletions(-)

diff --git a/tests/test_prefetch_snapshot_scope.py b/tests/test_prefetch_snapshot_scope.py
index b4a7a7052f..a2eb710421 100644
--- a/tests/test_prefetch_snapshot_scope.py
+++ b/tests/test_prefetch_snapshot_scope.py
@@ -233,10 +233,10 @@ def test_variant_keeps_bin_when_only_default_safetensors(monkeypatch):
     pytorch_model.fp16.bin redundant: dropping it would leave the variant load to fetch the .bin
     in-process over Xet. The .bin stays warmed (Codex #6638)."""
     _install_fake_model_info(monkeypatch, ["model.safetensors", "pytorch_model.fp16.bin"])
-    ig = U._prefetch_ignore_patterns("org/repo", variant = "fp16")
+    ig = U._prefetch_ignore_patterns("org/repo", variant = "fp16", weights_at_root = True)
     assert "*.bin" not in ig
     # No variant: the default safetensors DOES make .bin redundant (existing behavior).
-    ig_default = U._prefetch_ignore_patterns("org/repo")
+    ig_default = U._prefetch_ignore_patterns("org/repo", weights_at_root = True)
     assert "*.bin" in ig_default
 
 
@@ -244,7 +244,7 @@ def test_variant_drops_bin_when_variant_safetensors_present(monkeypatch):
     """When a variant-matching safetensors (model.fp16.safetensors) is shipped, the variant load
     reads it and the variant .bin is redundant, so .bin is dropped from the warm."""
     _install_fake_model_info(monkeypatch, ["model.fp16.safetensors", "pytorch_model.fp16.bin"])
-    ig = U._prefetch_ignore_patterns("org/repo", variant = "fp16")
+    ig = U._prefetch_ignore_patterns("org/repo", variant = "fp16", weights_at_root = True)
     assert "*.bin" in ig
 
 
@@ -280,7 +280,7 @@ def test_variant_drops_bin_for_sharded_variant_safetensors(monkeypatch):
             "pytorch_model.fp16-00001-of-00002.bin",
         ],
     )
-    ig = U._prefetch_ignore_patterns("org/repo", variant = "fp16")
+    ig = U._prefetch_ignore_patterns("org/repo", variant = "fp16", weights_at_root = True)
     assert "*.bin" in ig
 
 
@@ -332,7 +332,7 @@ def test_optimizer_safetensors_does_not_drop_bin(monkeypatch):
     whose real weights are pytorch_model.bin alongside an optimizer.safetensors must keep its .bin,
     else the in-process load fetches the only weights over Xet without the fallback (Codex #6638)."""
     _install_fake_model_info(monkeypatch, ["pytorch_model.bin", "optimizer.safetensors"])
-    ig = U._prefetch_ignore_patterns("org/repo")
+    ig = U._prefetch_ignore_patterns("org/repo", weights_at_root = True)
     assert "*.bin" not in ig  # .bin is the only real weight -> not dropped
 
 
@@ -342,10 +342,23 @@ def test_model_safetensors_still_drops_bin(monkeypatch):
     _install_fake_model_info(
         monkeypatch, ["model.safetensors", "pytorch_model.bin", "optimizer.safetensors"]
     )
-    ig = U._prefetch_ignore_patterns("org/repo")
+    ig = U._prefetch_ignore_patterns("org/repo", weights_at_root = True)
     assert "*.bin" in ig
 
 
+def test_whole_multi_component_snapshot_keeps_subdir_bin(monkeypatch):
+    """A whole multi-component snapshot (weights_at_root=False, no subfolder: a SentenceTransformer /
+    diffusers repo) must NOT drop *.bin even when root safetensors exist -- HF's "*" spans "/", so the
+    drop would strip a subdir module's only weight (1_Dense/pytorch_model.bin) and leave the module load
+    to an in-process Xet fetch. A root-scoped load of the same repo still drops the redundant root .bin
+    (Codex #6638)."""
+    _install_fake_model_info(monkeypatch, ["model.safetensors", "1_Dense/pytorch_model.bin"])
+    ig = U._prefetch_ignore_patterns("org/repo", weights_at_root = False)
+    assert "*.bin" not in ig
+    ig_root = U._prefetch_ignore_patterns("org/repo", weights_at_root = True)
+    assert "*.bin" in ig_root
+
+
 def test_is_model_weight_safetensors_classification():
     """Direct unit coverage: real model weights count, adapter / trainer-state sidecars do not."""
     assert U._is_model_weight_safetensors("model.safetensors") is True
@@ -361,10 +374,14 @@ def test_is_model_weight_safetensors_classification():
 def test_tokenizer_only_warms_slow_sentencepiece_vocab(capture):
     """tokenizer_only must warm the slow-tokenizer SentencePiece / BPE vocab files AutoTokenizer
     fetches first (sentencepiece.bpe.model for XLM-R / mBART, source.spm / target.spm for Marian,
-    bpe.codes / vocab.bpe), so they are not left to an in-process Xet fetch (Codex #6638)."""
+    bpe.codes / vocab.bpe, sentencepiece.model for RemBERT, vocab-src.json / vocab-tgt.json for FSMT),
+    so they are not left to an in-process Xet fetch (Codex #6638)."""
     _, st = capture(tokenizer_only = True)
     allow = st["allow_patterns"]
-    for name in ("sentencepiece.bpe.model", "source.spm", "target.spm", "bpe.codes", "vocab.bpe"):
+    for name in (
+        "sentencepiece.bpe.model", "source.spm", "target.spm", "bpe.codes", "vocab.bpe",
+        "sentencepiece.model", "vocab-src.json", "vocab-tgt.json",
+    ):
         assert name in allow, name
 
 
@@ -520,3 +537,32 @@ def _is_prefetch_call(node):
         prefetch_pos is not None
     ), "from_pretrained must call maybe_prefetch_hf_snapshot at top level"
     assert prefetch_pos < return_pos, "prefetch must run before any top-level return"
+    # local_files_only must be forwarded so an offline / cache-only load does not start a Hub download
+    # via the prefetch before the ST load sees the flag (Codex #6638).
+    prefetch_call = fp.body[prefetch_pos].value
+    assert "local_files_only" in {kw.arg for kw in prefetch_call.keywords}, (
+        "prefetch must forward local_files_only"
+    )
+
+
+def test_st_module_download_forwards_cache_folder():
+    """_load_modules must forward the custom cache_folder into load_dir_path so per-module subdirs are
+    read from the warmed cache rather than the default one, avoiding a second in-process Hub/Xet fetch
+    (Codex #6638). Static AST guard (importing ST pulls heavy optional deps)."""
+    import ast
+    import os
+
+    src_path = os.path.join(os.path.dirname(U.__file__), "sentence_transformer.py")
+    with open(src_path, "r", encoding = "utf-8") as f:
+        tree = ast.parse(f.read())
+    calls = [
+        n
+        for n in ast.walk(tree)
+        if isinstance(n, ast.Call)
+        and isinstance(n.func, ast.Name)
+        and n.func.id == "load_dir_path"
+    ]
+    assert calls, "expected a load_dir_path call in sentence_transformer.py"
+    assert all("cache_folder" in {kw.arg for kw in c.keywords} for c in calls), (
+        "every load_dir_path call must forward cache_folder"
+    )
diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py
index 2045b6cae2..d99df02404 100644
--- a/unsloth/models/_utils.py
+++ b/unsloth/models/_utils.py
@@ -958,6 +958,11 @@ def _run_temporary_patches(phase):
     "target.spm",
     "bpe.codes",
     "vocab.bpe",
+    # sentencepiece.model (RemBERT) and vocab-src.json / vocab-tgt.json (FSMT) are VOCAB_FILES_NAMES
+    # not covered by the names above; a distinct-tokenizer-repo warm must cache them too.
+    "sentencepiece.model",
+    "vocab-src.json",
+    "vocab-tgt.json",
     "chat_template.jinja",
     "chat_template.json",
     # A non-default chat_template="<name>" load fetches additional_chat_templates/<name>.jinja.
@@ -1083,12 +1088,19 @@ def _prefetch_ignore_patterns(
     from_tf = False,
     from_flax = False,
     variant = None,
+    weights_at_root = False,
 ):
     """ignore_patterns for the prewarm snapshot: the static skip list, minus the checkpoint guard when
     loading from a checkpoint-* subfolder, minus the weight format the load will not read. Explicit
     use_safetensors acts as a format allowlist (True -> skip *.bin, False -> skip *.safetensors); auto
     (None) skips *.bin only when in-scope safetensors are also shipped (Transformers prefers them).
-    from_tf / from_flax keep the *.h5 / *.msgpack weights they read as the actual weights."""
+    from_tf / from_flax keep the *.h5 / *.msgpack weights they read as the actual weights.
+
+    The cross-format drop is suppressed for a WHOLE multi-component snapshot (weights_at_root=False and
+    no subfolder: a SentenceTransformer / diffusers repo whose modules ship weights in per-module
+    subfolders, each in its own format). HF fnmatch "*" spans "/", so dropping "*.bin" there would strip
+    a subdir module's only weight and leave the module load to an in-process Xet fetch. The redundancy
+    reasoning holds only for a single-scope (root / subfolder) load."""
     # Keep checkpoint-*/* when loading from such a subfolder; keep *.h5 / *.msgpack under from_tf/flax.
     ignore_patterns = [
         pattern
@@ -1104,8 +1116,14 @@ def _prefetch_ignore_patterns(
         )
     ]
     # Drop the format the load will not read (Transformers reads exactly one; the other doubles the
-    # download we optimize).
-    if from_tf or from_flax:
+    # download we optimize). Skipped for a whole multi-component snapshot (see docstring): each subdir
+    # module ships its own format, so keep every format rather than strip a module's only weight.
+    whole_multi_component = not weights_at_root and not (
+        isinstance(subfolder, str) and subfolder.strip("/")
+    )
+    if whole_multi_component:
+        pass
+    elif from_tf or from_flax:
         # TF / Flax loads never touch the PyTorch formats; drop safetensors and .bin outright.
         ignore_patterns.extend(
             (
@@ -1233,6 +1251,7 @@ def maybe_prefetch_hf_snapshot(
             from_tf = from_tf,
             from_flax = from_flax,
             variant = variant,
+            weights_at_root = weights_at_root,
         )
     )
     # Narrow the warm to what the load reads, so a repo shipping extra weights (alternate checkpoints,
diff --git a/unsloth/models/sentence_transformer.py b/unsloth/models/sentence_transformer.py
index ab364e6ac6..5d07e97b33 100644
--- a/unsloth/models/sentence_transformer.py
+++ b/unsloth/models/sentence_transformer.py
@@ -1188,7 +1188,9 @@ def _load_modules(
                         load_path = os.path.join(model_name, module_path)
                     else:
                         try:
-                            load_path = load_dir_path(model_name, module_path, token = token)
+                            load_path = load_dir_path(
+                                model_name, module_path, token = token, cache_folder = cache_dir
+                            )
                         except Exception as e:
                             print(f"Unsloth Warning: Could not download module {module_path}: {e}")
                             continue
@@ -1415,6 +1417,7 @@ def from_pretrained(
             token = token,
             revision = revision,
             cache_dir = kwargs.get("cache_folder"),
+            local_files_only = kwargs.get("local_files_only", False),
         )
 
         # if for_inference == True, skip Unsloth optimizations to avoid torch compile issues

From 83b60f2b508d0317a33f737664132e3dfcd9bfcb Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 1 Jul 2026 03:45:17 +0000
Subject: [PATCH 62/80] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 tests/test_prefetch_snapshot_scope.py | 26 +++++++++++++++-----------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/tests/test_prefetch_snapshot_scope.py b/tests/test_prefetch_snapshot_scope.py
index a2eb710421..5a13a63f0e 100644
--- a/tests/test_prefetch_snapshot_scope.py
+++ b/tests/test_prefetch_snapshot_scope.py
@@ -379,8 +379,14 @@ def test_tokenizer_only_warms_slow_sentencepiece_vocab(capture):
     _, st = capture(tokenizer_only = True)
     allow = st["allow_patterns"]
     for name in (
-        "sentencepiece.bpe.model", "source.spm", "target.spm", "bpe.codes", "vocab.bpe",
-        "sentencepiece.model", "vocab-src.json", "vocab-tgt.json",
+        "sentencepiece.bpe.model",
+        "source.spm",
+        "target.spm",
+        "bpe.codes",
+        "vocab.bpe",
+        "sentencepiece.model",
+        "vocab-src.json",
+        "vocab-tgt.json",
     ):
         assert name in allow, name
 
@@ -540,9 +546,9 @@ def _is_prefetch_call(node):
     # local_files_only must be forwarded so an offline / cache-only load does not start a Hub download
     # via the prefetch before the ST load sees the flag (Codex #6638).
     prefetch_call = fp.body[prefetch_pos].value
-    assert "local_files_only" in {kw.arg for kw in prefetch_call.keywords}, (
-        "prefetch must forward local_files_only"
-    )
+    assert "local_files_only" in {
+        kw.arg for kw in prefetch_call.keywords
+    }, "prefetch must forward local_files_only"
 
 
 def test_st_module_download_forwards_cache_folder():
@@ -558,11 +564,9 @@ def test_st_module_download_forwards_cache_folder():
     calls = [
         n
         for n in ast.walk(tree)
-        if isinstance(n, ast.Call)
-        and isinstance(n.func, ast.Name)
-        and n.func.id == "load_dir_path"
+        if isinstance(n, ast.Call) and isinstance(n.func, ast.Name) and n.func.id == "load_dir_path"
     ]
     assert calls, "expected a load_dir_path call in sentence_transformer.py"
-    assert all("cache_folder" in {kw.arg for kw in c.keywords} for c in calls), (
-        "every load_dir_path call must forward cache_folder"
-    )
+    assert all(
+        "cache_folder" in {kw.arg for kw in c.keywords} for c in calls
+    ), "every load_dir_path call must forward cache_folder"

From f1fe53a794a46e966d2ab1d64e62347810a005d0 Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Wed, 1 Jul 2026 07:58:56 +0000
Subject: [PATCH 63/80] Forward cache_folder to the fast-encoder
 SentenceTransformer load

FastSentenceTransformer.from_pretrained prefetches the snapshot into the caller's
cache_folder, and the for_inference branch already forwards cache_folder to the
native SentenceTransformer load, but the fast-encoder branch did not. With a custom
cache_folder the fast-encoder load then read the default cache, missed the warmed
snapshot, and started an unprotected in-process Hub/Xet download of the weights the
prefetch was meant to protect. Forward cache_folder there too; None keeps the
default cache, matching the prefetch's cache_dir.
---
 tests/test_prefetch_snapshot_scope.py  | 35 ++++++++++++++++++++++++++
 unsloth/models/sentence_transformer.py |  6 ++++-
 2 files changed, 40 insertions(+), 1 deletion(-)

diff --git a/tests/test_prefetch_snapshot_scope.py b/tests/test_prefetch_snapshot_scope.py
index 5a13a63f0e..9ec682e20b 100644
--- a/tests/test_prefetch_snapshot_scope.py
+++ b/tests/test_prefetch_snapshot_scope.py
@@ -570,3 +570,38 @@ def test_st_module_download_forwards_cache_folder():
     assert all(
         "cache_folder" in {kw.arg for kw in c.keywords} for c in calls
     ), "every load_dir_path call must forward cache_folder"
+
+
+def test_st_native_sentence_transformer_calls_forward_cache_folder():
+    """Every native SentenceTransformer(model_name, ...) load (for_inference AND fast-encoder) must
+    forward cache_folder, so a custom cache_folder reads the cache the prefetch warmed instead of
+    missing it and starting an unprotected in-process Hub/Xet download (Codex #6638). The modules-based
+    SentenceTransformer(modules=...) call builds from already-loaded modules and needs no cache_folder.
+    Static AST guard (importing ST pulls heavy optional deps)."""
+    import ast
+    import os
+
+    src_path = os.path.join(os.path.dirname(U.__file__), "sentence_transformer.py")
+    with open(src_path, "r", encoding = "utf-8") as f:
+        tree = ast.parse(f.read())
+    weight_loading_calls = []
+    for n in ast.walk(tree):
+        if not (isinstance(n, ast.Call) and isinstance(n.func, ast.Name)
+                and n.func.id == "SentenceTransformer"):
+            continue
+        kw_names = {kw.arg for kw in n.keywords}
+        # A modules-based build (SentenceTransformer(modules=...)) downloads nothing; only a
+        # repo-name load (positional model_name, no modules=) reads the cache.
+        if "modules" in kw_names:
+            continue
+        weight_loading_calls.append(n)
+    assert weight_loading_calls, "expected a repo-name SentenceTransformer load in sentence_transformer.py"
+    # cache_folder is forwarded either explicitly (fast-encoder branch) or via a **kwargs unpacking
+    # (for_inference branch builds st_kwargs incl. cache_folder). A ** unpacking has kw.arg == None.
+    for c in weight_loading_calls:
+        kw_names = {kw.arg for kw in c.keywords}
+        forwards = "cache_folder" in kw_names or None in kw_names
+        assert forwards, (
+            "a repo-name SentenceTransformer load must forward cache_folder "
+            f"(explicitly or via **kwargs) at line {c.lineno}"
+        )
diff --git a/unsloth/models/sentence_transformer.py b/unsloth/models/sentence_transformer.py
index 5d07e97b33..bcea82f71d 100644
--- a/unsloth/models/sentence_transformer.py
+++ b/unsloth/models/sentence_transformer.py
@@ -1567,7 +1567,10 @@ def from_pretrained(
                 elif is_mpnet:
                     FastSentenceTransformer._patch_mpnet_v5()
 
-            # Load via native SentenceTransformer (bypasses Unsloth patching)
+            # Load via native SentenceTransformer (bypasses Unsloth patching). Forward cache_folder so
+            # this load reads the cache the prefetch warmed (as the for_inference branch does); a custom
+            # cache_folder would otherwise miss the warm and start an unprotected in-process Hub/Xet
+            # download. None keeps the default cache, matching the prefetch's cache_dir.
             st_model = SentenceTransformer(
                 model_name,
                 device = st_device,
@@ -1575,6 +1578,7 @@ def from_pretrained(
                 token = token,
                 revision = revision,
                 model_kwargs = model_kwargs,
+                cache_folder = kwargs.get("cache_folder"),
             )
 
             # Store metadata for get_peft_model

From 43ffea73838846a1f5bcb1411609701f98463d54 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 1 Jul 2026 07:59:38 +0000
Subject: [PATCH 64/80] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 tests/test_prefetch_snapshot_scope.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/tests/test_prefetch_snapshot_scope.py b/tests/test_prefetch_snapshot_scope.py
index 9ec682e20b..55a151609c 100644
--- a/tests/test_prefetch_snapshot_scope.py
+++ b/tests/test_prefetch_snapshot_scope.py
@@ -586,8 +586,11 @@ def test_st_native_sentence_transformer_calls_forward_cache_folder():
         tree = ast.parse(f.read())
     weight_loading_calls = []
     for n in ast.walk(tree):
-        if not (isinstance(n, ast.Call) and isinstance(n.func, ast.Name)
-                and n.func.id == "SentenceTransformer"):
+        if not (
+            isinstance(n, ast.Call)
+            and isinstance(n.func, ast.Name)
+            and n.func.id == "SentenceTransformer"
+        ):
             continue
         kw_names = {kw.arg for kw in n.keywords}
         # A modules-based build (SentenceTransformer(modules=...)) downloads nothing; only a
@@ -595,7 +598,9 @@ def test_st_native_sentence_transformer_calls_forward_cache_folder():
         if "modules" in kw_names:
             continue
         weight_loading_calls.append(n)
-    assert weight_loading_calls, "expected a repo-name SentenceTransformer load in sentence_transformer.py"
+    assert (
+        weight_loading_calls
+    ), "expected a repo-name SentenceTransformer load in sentence_transformer.py"
     # cache_folder is forwarded either explicitly (fast-encoder branch) or via a **kwargs unpacking
     # (for_inference branch builds st_kwargs incl. cache_folder). A ** unpacking has kw.arg == None.
     for c in weight_loading_calls:

From a5e67ba4c1f111fcfb097d9c9fd8208acfa605e8 Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Wed, 1 Jul 2026 08:28:34 +0000
Subject: [PATCH 65/80] Close three SentenceTransformer / prefetch scoping gaps

- A no-variant load reads the canonical weights, so a lone variant safetensors
  (model.fp16.safetensors) must not prove the default pytorch_model.bin redundant.
  _prefetch_ignore_patterns now counts only a CANONICAL safetensors when no variant
  is requested (via a new _is_canonical_model_weight_safetensors helper), so a repo
  shipping only a variant safetensors plus the canonical .bin keeps the .bin the load
  reads instead of dropping it and fetching it in-process over Xet. This errs on the
  strict side: an unrecognized name keeps both formats.

- FastSentenceTransformer.from_pretrained validated the mutually-exclusive load
  modes (load_in_4bit / 8bit / 16bit) only after the prefetch, so a config that is
  rejected locally still downloaded the whole snapshot first. Move the validation
  (and the full_finetuning normalization) before the prefetch, guarded to the
  non-for_inference path so the for_inference branch keeps its current behavior.

- The ST prefetch warmed cache_dir=kwargs.get("cache_folder") (the default HF cache
  when unset), but SentenceTransformer resolves its cache from
  SENTENCE_TRANSFORMERS_HOME when no cache_folder is passed. A load relying on that
  env then missed the warm and started an unprotected in-process download. Resolve
  the same env-backed cache folder for the prefetch.
---
 tests/test_prefetch_snapshot_scope.py  | 54 ++++++++++++++++++++++++++
 unsloth/models/_utils.py               | 28 +++++++++++--
 unsloth/models/sentence_transformer.py | 48 +++++++++++++----------
 3 files changed, 107 insertions(+), 23 deletions(-)

diff --git a/tests/test_prefetch_snapshot_scope.py b/tests/test_prefetch_snapshot_scope.py
index 55a151609c..e722a7d47a 100644
--- a/tests/test_prefetch_snapshot_scope.py
+++ b/tests/test_prefetch_snapshot_scope.py
@@ -248,6 +248,60 @@ def test_variant_drops_bin_when_variant_safetensors_present(monkeypatch):
     assert "*.bin" in ig
 
 
+def test_no_variant_keeps_bin_when_only_variant_safetensors(monkeypatch):
+    """A no-variant load reads pytorch_model.bin; a lone variant safetensors (model.fp16.safetensors)
+    must NOT prove the .bin redundant -- only a CANONICAL safetensors does. Else the .bin the load reads
+    is dropped from the warm and fetched in-process over Xet (Codex #6638)."""
+    _install_fake_model_info(monkeypatch, ["model.fp16.safetensors", "pytorch_model.bin"])
+    ig = U._prefetch_ignore_patterns("org/repo", weights_at_root = True)  # variant unset
+    assert "*.bin" not in ig
+    # A canonical safetensors DOES make the .bin redundant for a no-variant load.
+    _install_fake_model_info(monkeypatch, ["model.safetensors", "pytorch_model.bin"])
+    ig2 = U._prefetch_ignore_patterns("org/repo", weights_at_root = True)
+    assert "*.bin" in ig2
+
+
+def test_is_canonical_model_weight_safetensors():
+    """The canonical detector matches only the non-variant model-weight safetensors names a default
+    load reads, and rejects variant / sidecar names (Codex #6638)."""
+    assert U._is_canonical_model_weight_safetensors("model.safetensors") is True
+    assert U._is_canonical_model_weight_safetensors("model-00001-of-00002.safetensors") is True
+    assert U._is_canonical_model_weight_safetensors("model.safetensors.index.json") is True
+    assert U._is_canonical_model_weight_safetensors("model.fp16.safetensors") is False
+    assert U._is_canonical_model_weight_safetensors("model.fp16-00001-of-00002.safetensors") is False
+    assert U._is_canonical_model_weight_safetensors("adapter_model.safetensors") is False
+
+
+def test_st_prefetch_resolves_env_cache_and_runs_after_validation():
+    """The ST prefetch must resolve SENTENCE_TRANSFORMERS_HOME for its cache (so a load relying on that
+    env is a cache hit, not an unprotected in-process download) and must run AFTER the mutually-exclusive
+    load-mode validation (so a config rejected locally wastes no multi-GB download) (Codex #6638). Static
+    guard: importing ST pulls heavy optional deps."""
+    import ast
+    import os
+
+    src_path = os.path.join(os.path.dirname(U.__file__), "sentence_transformer.py")
+    with open(src_path, "r", encoding = "utf-8") as f:
+        src = f.read()
+    tree = ast.parse(src)
+    prefetch_calls = [
+        n for n in ast.walk(tree)
+        if isinstance(n, ast.Call) and isinstance(n.func, ast.Name)
+        and n.func.id == "maybe_prefetch_hf_snapshot"
+    ]
+    assert len(prefetch_calls) == 1, "expected exactly one ST prefetch call"
+    call = prefetch_calls[0]
+    # F3: the cache_dir kwarg resolves SENTENCE_TRANSFORMERS_HOME.
+    cache_dir_kw = next((kw for kw in call.keywords if kw.arg == "cache_dir"), None)
+    assert cache_dir_kw is not None, "ST prefetch must pass cache_dir"
+    assert "SENTENCE_TRANSFORMERS_HOME" in ast.dump(cache_dir_kw.value), (
+        "ST prefetch cache_dir must resolve SENTENCE_TRANSFORMERS_HOME"
+    )
+    # F2: the load-mode validation runs before the prefetch (fewer source lines = earlier).
+    val_lineno = src[: src.index("Can only load in 4bit or 8bit or 16bit")].count("\n")
+    assert val_lineno < call.lineno, "load-mode validation must precede the ST prefetch"
+
+
 def test_filename_has_variant_matches_single_and_sharded():
     """The variant detector matches both the single-file (.fp16.) and SHARDED (.fp16-) infixes and
     rejects the default (non-variant) names (gemini #6638)."""
diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py
index d99df02404..017b438641 100644
--- a/unsloth/models/_utils.py
+++ b/unsloth/models/_utils.py
@@ -1055,6 +1055,22 @@ def _filename_has_variant(filename, variant):
     return f".{variant}." in base or f".{variant}-" in base
 
 
+_CANONICAL_MODEL_WEIGHT_SAFETENSORS_RE = re.compile(
+    r"^(?:model\.safetensors|model-\d{5}-of-\d{5}\.safetensors|model\.safetensors\.index\.json)$"
+)
+
+
+def _is_canonical_model_weight_safetensors(filename):
+    """True for a CANONICAL (non-variant) model-weights safetensors a DEFAULT (no-variant) load reads:
+    ``model.safetensors``, a numbered shard, or the shard index. A variant file
+    (``model.fp16.safetensors``) does NOT match, so a repo that ships only a variant safetensors plus
+    the canonical ``pytorch_model.bin`` does not get its ``.bin`` wrongly dropped for a no-variant load
+    (which reads the ``.bin``). Errs strict: an unrecognized name keeps both formats (a safe over-fetch),
+    never dropping a ``.bin`` the load reads."""
+    name = filename.replace("\\", "/").rsplit("/", 1)[-1]
+    return bool(_CANONICAL_MODEL_WEIGHT_SAFETENSORS_RE.match(name))
+
+
 def _adapter_repo_has_safetensors(
     model_name,
     *,
@@ -1156,12 +1172,18 @@ def _prefetch_ignore_patterns(
                 or []
             )
             # Only count model-weights safetensors the load actually reads (in-scope subfolder/root,
-            # not an adapter / sidecar), so a .bin-only subfolder is not stripped of its weights. With
-            # a variant, only a variant-matching safetensors proves the variant's .bin redundant.
+            # not an adapter / sidecar), so a .bin-only subfolder is not stripped of its weights. With a
+            # variant, only a variant-matching safetensors proves the variant's .bin redundant; WITHOUT a
+            # variant, only a CANONICAL safetensors does (a lone variant model.fp16.safetensors does not
+            # make the default pytorch_model.bin redundant for a no-variant load, which reads the .bin).
             has_safetensors = any(
                 _is_model_weight_safetensors(sibling.rfilename)
                 and _in_requested_load_scope(sibling.rfilename, subfolder)
-                and (not variant or _filename_has_variant(sibling.rfilename, variant))
+                and (
+                    _filename_has_variant(sibling.rfilename, variant)
+                    if variant
+                    else _is_canonical_model_weight_safetensors(sibling.rfilename)
+                )
                 for sibling in siblings
             )
             if has_safetensors:
diff --git a/unsloth/models/sentence_transformer.py b/unsloth/models/sentence_transformer.py
index bcea82f71d..72a965efea 100644
--- a/unsloth/models/sentence_transformer.py
+++ b/unsloth/models/sentence_transformer.py
@@ -1407,16 +1407,41 @@ def from_pretrained(
                 "Run `pip install sentence-transformers` to install it."
             )
 
+        # Validate the mutually-exclusive load modes BEFORE the prefetch (a config rejected locally must
+        # not first download many GB of weights). Only the non-for_inference path uses these flags; the
+        # for_inference branch below skips them, so guard the check to preserve its behavior.
+        if not for_inference:
+            # sanity check, thanks Etherl:
+            if full_finetuning and (load_in_4bit or load_in_8bit):
+                print(
+                    "Unsloth: You selected full finetuning support, but 4bit / 8bit is enabled - disabling LoRA / QLoRA."
+                )
+                load_in_4bit = False
+                load_in_8bit = False
+                load_in_fp8 = False
+                load_in_16bit = False
+
+            if int(load_in_4bit) + int(load_in_8bit) + int(load_in_16bit) >= 2:
+                raise RuntimeError(
+                    "Unsloth: Can only load in 4bit or 8bit or 16bit, not a combination!\n"
+                    "Also, we by default set `load_in_16bit = True`.\n"
+                    "If you want 4bit LoRA finetuning, set `load_in_16bit = False` and `load_in_4bit = True`\n"
+                    "If you want 8bit finetuning, set both `load_in_16bit = False` and `load_in_8bit = True`"
+                )
+
         # Pre-download in a killable subprocess (Xet -> HTTP on a no-progress stall) so the
         # SentenceTransformer load below is a cache hit and cannot hang on a stalled Xet transfer.
         # Covers every path (for_inference, fast-encoder, fallback), which all resolve the repo
         # in-process. weights_at_root is left False: an ST repo's component weights can live in
-        # subfolders (the pooling / dense modules), so the whole snapshot is warmed.
+        # subfolders (the pooling / dense modules), so the whole snapshot is warmed. Resolve the same
+        # cache the ST load will use: an explicit cache_folder, else SENTENCE_TRANSFORMERS_HOME (which
+        # SentenceTransformer honors when cache_folder is unset), else the default HF cache -- otherwise
+        # a warm into the wrong cache is missed and the load starts an unprotected in-process download.
         maybe_prefetch_hf_snapshot(
             model_name,
             token = token,
             revision = revision,
-            cache_dir = kwargs.get("cache_folder"),
+            cache_dir = kwargs.get("cache_folder") or os.environ.get("SENTENCE_TRANSFORMERS_HOME"),
             local_files_only = kwargs.get("local_files_only", False),
         )
 
@@ -1453,24 +1478,7 @@ def from_pretrained(
             st_model = SentenceTransformer(model_name, **st_kwargs)
             return st_model
 
-        # sanity check, thanks Etherl:
-        if full_finetuning and (load_in_4bit or load_in_8bit):
-            print(
-                "Unsloth: You selected full finetuning support, but 4bit / 8bit is enabled - disabling LoRA / QLoRA."
-            )
-            load_in_4bit = False
-            load_in_8bit = False
-            load_in_fp8 = False
-            load_in_16bit = False
-
-        if int(load_in_4bit) + int(load_in_8bit) + int(load_in_16bit) >= 2:
-            raise RuntimeError(
-                "Unsloth: Can only load in 4bit or 8bit or 16bit, not a combination!\n"
-                "Also, we by default set `load_in_16bit = True`.\n"
-                "If you want 4bit LoRA finetuning, set `load_in_16bit = False` and `load_in_4bit = True`\n"
-                "If you want 8bit finetuning, set both `load_in_16bit = False` and `load_in_8bit = True`"
-            )
-
+        # Load-mode validation + full_finetuning normalization already ran before the prefetch above.
         if "auto_model" not in kwargs:
             kwargs["auto_model"] = AutoModel
 

From 8998e9acb6a9d82fe1903cf5b0d1e2eafecbc347 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 1 Jul 2026 08:29:06 +0000
Subject: [PATCH 66/80] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 tests/test_prefetch_snapshot_scope.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/tests/test_prefetch_snapshot_scope.py b/tests/test_prefetch_snapshot_scope.py
index e722a7d47a..86039ec637 100644
--- a/tests/test_prefetch_snapshot_scope.py
+++ b/tests/test_prefetch_snapshot_scope.py
@@ -268,7 +268,9 @@ def test_is_canonical_model_weight_safetensors():
     assert U._is_canonical_model_weight_safetensors("model-00001-of-00002.safetensors") is True
     assert U._is_canonical_model_weight_safetensors("model.safetensors.index.json") is True
     assert U._is_canonical_model_weight_safetensors("model.fp16.safetensors") is False
-    assert U._is_canonical_model_weight_safetensors("model.fp16-00001-of-00002.safetensors") is False
+    assert (
+        U._is_canonical_model_weight_safetensors("model.fp16-00001-of-00002.safetensors") is False
+    )
     assert U._is_canonical_model_weight_safetensors("adapter_model.safetensors") is False
 
 
@@ -285,8 +287,10 @@ def test_st_prefetch_resolves_env_cache_and_runs_after_validation():
         src = f.read()
     tree = ast.parse(src)
     prefetch_calls = [
-        n for n in ast.walk(tree)
-        if isinstance(n, ast.Call) and isinstance(n.func, ast.Name)
+        n
+        for n in ast.walk(tree)
+        if isinstance(n, ast.Call)
+        and isinstance(n.func, ast.Name)
         and n.func.id == "maybe_prefetch_hf_snapshot"
     ]
     assert len(prefetch_calls) == 1, "expected exactly one ST prefetch call"
@@ -294,9 +298,9 @@ def test_st_prefetch_resolves_env_cache_and_runs_after_validation():
     # F3: the cache_dir kwarg resolves SENTENCE_TRANSFORMERS_HOME.
     cache_dir_kw = next((kw for kw in call.keywords if kw.arg == "cache_dir"), None)
     assert cache_dir_kw is not None, "ST prefetch must pass cache_dir"
-    assert "SENTENCE_TRANSFORMERS_HOME" in ast.dump(cache_dir_kw.value), (
-        "ST prefetch cache_dir must resolve SENTENCE_TRANSFORMERS_HOME"
-    )
+    assert "SENTENCE_TRANSFORMERS_HOME" in ast.dump(
+        cache_dir_kw.value
+    ), "ST prefetch cache_dir must resolve SENTENCE_TRANSFORMERS_HOME"
     # F2: the load-mode validation runs before the prefetch (fewer source lines = earlier).
     val_lineno = src[: src.index("Can only load in 4bit or 8bit or 16bit")].count("\n")
     assert val_lineno < call.lineno, "load-mode validation must precede the ST prefetch"

From 286683845958257eb6e512a5b77d68c4586f9393 Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Wed, 1 Jul 2026 08:51:24 +0000
Subject: [PATCH 67/80] Resolve SENTENCE_TRANSFORMERS_HOME for fallback module
 loads

The fallback path resolved cache_dir for _module_path / _load_modules from
kwargs.get("cache_folder") only, which is None when a caller relies on
SENTENCE_TRANSFORMERS_HOME. Those helpers call hf_hub_download directly, which
does not honor SENTENCE_TRANSFORMERS_HOME, so modules.json and module files were
fetched into the default HF cache and missed the snapshot the prefetch warmed at
SENTENCE_TRANSFORMERS_HOME, leaving them to an unprotected in-process Xet
download. Resolve the same cache the prefetch uses (cache_folder, else the env
var) at both fallback sites so those loads hit the warmed cache.

Add an AST regression test asserting every kwargs-derived cache_dir on the
fallback _module_path / _load_modules calls also resolves SENTENCE_TRANSFORMERS_HOME.
---
 tests/test_prefetch_snapshot_scope.py  | 39 ++++++++++++++++++++++++++
 unsloth/models/sentence_transformer.py | 12 ++++++--
 2 files changed, 49 insertions(+), 2 deletions(-)

diff --git a/tests/test_prefetch_snapshot_scope.py b/tests/test_prefetch_snapshot_scope.py
index 86039ec637..5ebf6f53cc 100644
--- a/tests/test_prefetch_snapshot_scope.py
+++ b/tests/test_prefetch_snapshot_scope.py
@@ -306,6 +306,45 @@ def test_st_prefetch_resolves_env_cache_and_runs_after_validation():
     assert val_lineno < call.lineno, "load-mode validation must precede the ST prefetch"
 
 
+def test_st_fallback_module_loads_resolve_env_cache():
+    """The fallback module loads must resolve the SAME cache the prefetch warmed. _module_path /
+    _read_pooling_mode call hf_hub_download directly, which does NOT honor SENTENCE_TRANSFORMERS_HOME,
+    so any cache_dir derived from cache_folder must also fall back to the env var; otherwise, when a
+    caller relies on SENTENCE_TRANSFORMERS_HOME without passing cache_folder, modules.json / module
+    files miss the warm and are fetched in-process over Xet (Codex #6638). Static guard: importing ST
+    pulls heavy optional deps."""
+    import ast
+    import os
+
+    src_path = os.path.join(os.path.dirname(U.__file__), "sentence_transformer.py")
+    with open(src_path, "r", encoding = "utf-8") as f:
+        src = f.read()
+    tree = ast.parse(src)
+
+    # Every _module_path / _load_modules call whose cache_dir is derived from cache_folder (i.e. the
+    # from_pretrained fallback sites, not the internal `cache_dir = cache_dir` pass-throughs) must also
+    # resolve SENTENCE_TRANSFORMERS_HOME so the resolution matches the prefetch above.
+    checked = 0
+    for node in ast.walk(tree):
+        if not (isinstance(node, ast.Call) and isinstance(node.func, ast.Attribute)):
+            continue
+        if node.func.attr not in ("_module_path", "_load_modules"):
+            continue
+        cache_dir_kw = next((kw for kw in node.keywords if kw.arg == "cache_dir"), None)
+        if cache_dir_kw is None:
+            continue
+        dumped = ast.dump(cache_dir_kw.value)
+        if "cache_folder" not in dumped:
+            continue  # internal pass-through (cache_dir = cache_dir): not a resolution site
+        checked += 1
+        assert "SENTENCE_TRANSFORMERS_HOME" in dumped, (
+            f"{node.func.attr} cache_dir resolves cache_folder but not SENTENCE_TRANSFORMERS_HOME"
+        )
+    assert checked >= 2, (
+        "expected the fallback _module_path and _load_modules calls to resolve the env cache"
+    )
+
+
 def test_filename_has_variant_matches_single_and_sharded():
     """The variant detector matches both the single-file (.fp16.) and SHARDED (.fp16-) infixes and
     rejects the default (non-variant) names (gemini #6638)."""
diff --git a/unsloth/models/sentence_transformer.py b/unsloth/models/sentence_transformer.py
index 72a965efea..ba873cdba5 100644
--- a/unsloth/models/sentence_transformer.py
+++ b/unsloth/models/sentence_transformer.py
@@ -1692,9 +1692,15 @@ def _push_to_hub_merged(self, repo_id, **push_kwargs):
 
         # No modules.json -> force 16-bit: saving is custom for these models and
         # 4-bit would need dequant in save_pretrained_merged, not worth it.
+        # Resolve the same cache the prefetch warmed: hf_hub_download (used by _module_path /
+        # _read_pooling_mode) does NOT honor SENTENCE_TRANSFORMERS_HOME, so passing only cache_folder
+        # (None when the caller relies on the env var) would miss the warm and fetch these files
+        # in-process over Xet.
         has_modules_json = (
             FastSentenceTransformer._module_path(
-                model_name, token, cache_dir = kwargs.get("cache_folder")
+                model_name,
+                token,
+                cache_dir = kwargs.get("cache_folder") or os.environ.get("SENTENCE_TRANSFORMERS_HOME"),
             )
             is not None
         )
@@ -1748,7 +1754,9 @@ def _push_to_hub_merged(self, repo_id, **push_kwargs):
             max_seq_length,
             pooling_mode,
             trust_remote_code = trust_remote_code,
-            cache_dir = kwargs.get("cache_folder"),
+            # Reuse the prefetch's resolved cache (see _module_path note above) so the fallback
+            # modules.json / module-file loads hit the warm instead of an in-process Xet download.
+            cache_dir = kwargs.get("cache_folder") or os.environ.get("SENTENCE_TRANSFORMERS_HOME"),
         )
 
         st_device = device_map

From 636ff82f871cca6221007a2f91d44b2fd164689b Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 1 Jul 2026 08:51:58 +0000
Subject: [PATCH 68/80] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 tests/test_prefetch_snapshot_scope.py  | 12 ++++++------
 unsloth/models/sentence_transformer.py |  3 ++-
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/tests/test_prefetch_snapshot_scope.py b/tests/test_prefetch_snapshot_scope.py
index 5ebf6f53cc..b4584ca255 100644
--- a/tests/test_prefetch_snapshot_scope.py
+++ b/tests/test_prefetch_snapshot_scope.py
@@ -337,12 +337,12 @@ def test_st_fallback_module_loads_resolve_env_cache():
         if "cache_folder" not in dumped:
             continue  # internal pass-through (cache_dir = cache_dir): not a resolution site
         checked += 1
-        assert "SENTENCE_TRANSFORMERS_HOME" in dumped, (
-            f"{node.func.attr} cache_dir resolves cache_folder but not SENTENCE_TRANSFORMERS_HOME"
-        )
-    assert checked >= 2, (
-        "expected the fallback _module_path and _load_modules calls to resolve the env cache"
-    )
+        assert (
+            "SENTENCE_TRANSFORMERS_HOME" in dumped
+        ), f"{node.func.attr} cache_dir resolves cache_folder but not SENTENCE_TRANSFORMERS_HOME"
+    assert (
+        checked >= 2
+    ), "expected the fallback _module_path and _load_modules calls to resolve the env cache"
 
 
 def test_filename_has_variant_matches_single_and_sharded():
diff --git a/unsloth/models/sentence_transformer.py b/unsloth/models/sentence_transformer.py
index ba873cdba5..d692033ae6 100644
--- a/unsloth/models/sentence_transformer.py
+++ b/unsloth/models/sentence_transformer.py
@@ -1700,7 +1700,8 @@ def _push_to_hub_merged(self, repo_id, **push_kwargs):
             FastSentenceTransformer._module_path(
                 model_name,
                 token,
-                cache_dir = kwargs.get("cache_folder") or os.environ.get("SENTENCE_TRANSFORMERS_HOME"),
+                cache_dir = kwargs.get("cache_folder")
+                or os.environ.get("SENTENCE_TRANSFORMERS_HOME"),
             )
             is not None
         )

From 76e0ced192ea5a0af74853bf35017cec01943764 Mon Sep 17 00:00:00 2001
From: danielhanchen <danielhanchen@gmail.com>
Date: Wed, 1 Jul 2026 09:24:51 +0000
Subject: [PATCH 69/80] Load ST fallback modules from the same revision as the
 weights

The custom SentenceTransformer fallback path loads the model weights via
FastModel.from_pretrained(revision=revision), which forwards revision to the
weight load, but resolved the module files (modules.json, the pooling config,
per-module directories) with no revision. On a revision-pinned repo, that left
those files resolving against the repo's default branch: they missed the
snapshot the prefetch warmed for the pinned revision and were fetched
in-process over Hub/Xet, and default-branch module configs could be mixed with
the revision-pinned weights.

Thread revision through _module_path, _read_pooling_mode, and _load_modules (and
the load_dir_path / hf_hub_download calls inside them), and forward it from the
from_pretrained fallback call sites. A None revision resolves to the default
branch exactly as before, so the common unpinned load is unchanged.

The llama loader is intentionally left alone: FastLlamaModel.from_pretrained
accepts revision as a named parameter but never passes it to the weight load
(it is not in **kwargs), so its prefetch and its load both resolve the default
branch. Forwarding revision only to that prefetch would warm a branch the load
never reads.

Adds a static AST regression test asserting the three fallback helpers accept
revision, every download primitive inside them forwards it, _load_modules
threads it into its internal helper calls, and the from_pretrained fallback
sites forward it.
---
 tests/test_prefetch_snapshot_scope.py  | 77 ++++++++++++++++++++++++++
 unsloth/models/sentence_transformer.py | 23 ++++++--
 2 files changed, 95 insertions(+), 5 deletions(-)

diff --git a/tests/test_prefetch_snapshot_scope.py b/tests/test_prefetch_snapshot_scope.py
index b4584ca255..92b79bacaf 100644
--- a/tests/test_prefetch_snapshot_scope.py
+++ b/tests/test_prefetch_snapshot_scope.py
@@ -345,6 +345,83 @@ def test_st_fallback_module_loads_resolve_env_cache():
     ), "expected the fallback _module_path and _load_modules calls to resolve the env cache"
 
 
+def test_st_fallback_module_loads_forward_revision():
+    """A revision-pinned ST repo loaded via the custom fallback path loads its model WEIGHTS from the
+    requested revision (FastModel forwards revision to the weight load), so the module files (modules.json,
+    pooling config, per-module dirs) must load from the SAME revision. Otherwise they resolve the repo
+    default branch: fetched in-process over Xet (missing the prefetch's revision-pinned warm) and mixed
+    with the revision-pinned weights (Codex #6638). Static guard: (a) _module_path / _read_pooling_mode /
+    _load_modules accept a revision arg, (b) every hf_hub_download / load_dir_path inside them forwards
+    revision, (c) _load_modules threads revision into its internal _module_path / _read_pooling_mode
+    calls, (d) the from_pretrained fallback _module_path / _load_modules calls forward revision. Importing
+    ST pulls heavy optional deps."""
+    import ast
+    import os
+
+    src_path = os.path.join(os.path.dirname(U.__file__), "sentence_transformer.py")
+    with open(src_path, "r", encoding = "utf-8") as f:
+        tree = ast.parse(f.read())
+
+    funcs = {
+        n.name: n
+        for n in ast.walk(tree)
+        if isinstance(n, ast.FunctionDef)
+        and n.name in ("_module_path", "_read_pooling_mode", "_load_modules")
+    }
+    assert set(funcs) == {"_module_path", "_read_pooling_mode", "_load_modules"}
+
+    # (a) each helper takes a revision parameter.
+    for name, fn in funcs.items():
+        arg_names = {a.arg for a in fn.args.args + fn.args.kwonlyargs}
+        assert "revision" in arg_names, f"{name} must accept a revision argument"
+
+    # (b) every direct download primitive inside the helpers forwards revision.
+    downloads = 0
+    for name, fn in funcs.items():
+        for node in ast.walk(fn):
+            if not (isinstance(node, ast.Call) and isinstance(node.func, ast.Name)):
+                continue
+            if node.func.id not in ("hf_hub_download", "load_dir_path"):
+                continue
+            downloads += 1
+            assert any(
+                kw.arg == "revision" for kw in node.keywords
+            ), f"{node.func.id} in {name} must forward revision"
+    assert downloads >= 3, "expected the module-download primitives to be revision-guarded"
+
+    # (c) _load_modules threads revision into its internal _module_path / _read_pooling_mode calls.
+    internal = 0
+    for node in ast.walk(funcs["_load_modules"]):
+        if not (isinstance(node, ast.Call) and isinstance(node.func, ast.Attribute)):
+            continue
+        if node.func.attr not in ("_module_path", "_read_pooling_mode"):
+            continue
+        internal += 1
+        assert any(
+            kw.arg == "revision" for kw in node.keywords
+        ), f"_load_modules must forward revision to {node.func.attr}"
+    assert internal >= 2, "expected _load_modules to call _module_path and _read_pooling_mode"
+
+    # (d) the from_pretrained fallback _module_path / _load_modules sites forward revision.
+    checked = 0
+    for node in ast.walk(tree):
+        if not (isinstance(node, ast.Call) and isinstance(node.func, ast.Attribute)):
+            continue
+        if node.func.attr not in ("_module_path", "_load_modules"):
+            continue
+        cache_dir_kw = next((kw for kw in node.keywords if kw.arg == "cache_dir"), None)
+        if cache_dir_kw is None or "cache_folder" not in ast.dump(cache_dir_kw.value):
+            continue  # internal pass-through, not a from_pretrained fallback site
+        checked += 1
+        rev_kw = next((kw for kw in node.keywords if kw.arg == "revision"), None)
+        assert rev_kw is not None and "revision" in ast.dump(
+            rev_kw.value
+        ), f"{node.func.attr} fallback call must forward revision"
+    assert (
+        checked >= 2
+    ), "expected the fallback _module_path and _load_modules calls to forward revision"
+
+
 def test_filename_has_variant_matches_single_and_sharded():
     """The variant detector matches both the single-file (.fp16.) and SHARDED (.fp16-) infixes and
     rejects the default (non-variant) names (gemini #6638)."""
diff --git a/unsloth/models/sentence_transformer.py b/unsloth/models/sentence_transformer.py
index d692033ae6..098e21c64e 100644
--- a/unsloth/models/sentence_transformer.py
+++ b/unsloth/models/sentence_transformer.py
@@ -546,6 +546,7 @@ def _read_pooling_mode(
         model_name,
         token,
         cache_dir = None,
+        revision = None,
     ):
         """Read the pooling mode from modules.json, else return "mean"."""
         try:
@@ -555,7 +556,8 @@ def _read_pooling_mode(
                 modules_json_path = os.path.join(model_name, "modules.json")
             else:
                 modules_json_path = hf_hub_download(
-                    model_name, "modules.json", token = token, cache_dir = cache_dir
+                    model_name, "modules.json", token = token, cache_dir = cache_dir,
+                    revision = revision,
                 )
 
             with open(modules_json_path, "r", encoding = "utf-8") as f:
@@ -579,6 +581,7 @@ def _read_pooling_mode(
                                 os.path.join(pooling_path, "config.json"),
                                 token = token,
                                 cache_dir = cache_dir,
+                                revision = revision,
                             )
                         break
 
@@ -962,6 +965,7 @@ def _module_path(
         model_name,
         token = None,
         cache_dir = None,
+        revision = None,
     ):
         """Return the path to the modules.json file, or None."""
         try:
@@ -971,7 +975,8 @@ def _module_path(
             else:
                 try:
                     return hf_hub_download(
-                        model_name, "modules.json", token = token, cache_dir = cache_dir
+                        model_name, "modules.json", token = token, cache_dir = cache_dir,
+                        revision = revision,
                     )
                 except:
                     return None
@@ -1150,6 +1155,7 @@ def _load_modules(
         pooling_mode,
         trust_remote_code = False,
         cache_dir = None,
+        revision = None,
     ) -> tuple[OrderedDict, bool]:
         """Load modules from modules.json, else fall back to hard-coded modules.
 
@@ -1161,7 +1167,7 @@ def _load_modules(
 
         modules = OrderedDict()
         modules_json_path = FastSentenceTransformer._module_path(
-            model_name, token, cache_dir = cache_dir
+            model_name, token, cache_dir = cache_dir, revision = revision
         )
 
         if modules_json_path:
@@ -1189,7 +1195,8 @@ def _load_modules(
                     else:
                         try:
                             load_path = load_dir_path(
-                                model_name, module_path, token = token, cache_folder = cache_dir
+                                model_name, module_path, token = token,
+                                cache_folder = cache_dir, revision = revision,
                             )
                         except Exception as e:
                             print(f"Unsloth Warning: Could not download module {module_path}: {e}")
@@ -1218,7 +1225,7 @@ def _load_modules(
 
         if pooling_mode == "mean":
             pooling_mode = FastSentenceTransformer._read_pooling_mode(
-                model_name, token, cache_dir = cache_dir
+                model_name, token, cache_dir = cache_dir, revision = revision
             )
 
         modules["1"] = Pooling(word_embedding_dimension = hidden_size, pooling_mode = pooling_mode)
@@ -1702,6 +1709,7 @@ def _push_to_hub_merged(self, repo_id, **push_kwargs):
                 token,
                 cache_dir = kwargs.get("cache_folder")
                 or os.environ.get("SENTENCE_TRANSFORMERS_HOME"),
+                revision = revision,
             )
             is not None
         )
@@ -1758,6 +1766,11 @@ def _push_to_hub_merged(self, repo_id, **push_kwargs):
             # Reuse the prefetch's resolved cache (see _module_path note above) so the fallback
             # modules.json / module-file loads hit the warm instead of an in-process Xet download.
             cache_dir = kwargs.get("cache_folder") or os.environ.get("SENTENCE_TRANSFORMERS_HOME"),
+            # Read the modules from the SAME revision the model weights load from (FastModel forwards
+            # revision to the weight load), so a revision-pinned repo hits the prefetch's warm instead
+            # of fetching default-branch module files in-process over Xet (and mixing them with the
+            # revision-pinned weights). A None revision resolves to the default branch as before.
+            revision = revision,
         )
 
         st_device = device_map

From a5ca1bce40d1722919e45477f36a00cd35b2f3cd Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 1 Jul 2026 09:26:17 +0000
Subject: [PATCH 70/80] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 unsloth/models/sentence_transformer.py | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/unsloth/models/sentence_transformer.py b/unsloth/models/sentence_transformer.py
index 098e21c64e..cd2e6fcfde 100644
--- a/unsloth/models/sentence_transformer.py
+++ b/unsloth/models/sentence_transformer.py
@@ -556,7 +556,10 @@ def _read_pooling_mode(
                 modules_json_path = os.path.join(model_name, "modules.json")
             else:
                 modules_json_path = hf_hub_download(
-                    model_name, "modules.json", token = token, cache_dir = cache_dir,
+                    model_name,
+                    "modules.json",
+                    token = token,
+                    cache_dir = cache_dir,
                     revision = revision,
                 )
 
@@ -975,7 +978,10 @@ def _module_path(
             else:
                 try:
                     return hf_hub_download(
-                        model_name, "modules.json", token = token, cache_dir = cache_dir,
+                        model_name,
+                        "modules.json",
+                        token = token,
+                        cache_dir = cache_dir,
                         revision = revision,
                     )
                 except:
@@ -1195,8 +1201,11 @@ def _load_modules(
                     else:
                         try:
                             load_path = load_dir_path(
-                                model_name, module_path, token = token,
-                                cache_folder = cache_dir, revision = revision,
+                                model_name,
+                                module_path,
+                                token = token,
+                                cache_folder = cache_dir,
+                                revision = revision,
                             )
                         except Exception as e:
                             print(f"Unsloth Warning: Could not download module {module_path}: {e}")

From ca1fd06a39e2569f283736d369af9a5c59408dbd Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Wed, 1 Jul 2026 09:50:29 +0000
Subject: [PATCH 71/80] Tighten comments (comment-only, no logic change)

---
 unsloth/models/_utils.py               | 96 ++++++++++----------------
 unsloth/models/diffusion.py            |  8 +--
 unsloth/models/llama.py                | 14 ++--
 unsloth/models/loader.py               | 12 ++--
 unsloth/models/sentence_transformer.py | 27 +++-----
 unsloth/models/vision.py               | 13 ++--
 6 files changed, 63 insertions(+), 107 deletions(-)

diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py
index 017b438641..2513cfa9ad 100644
--- a/unsloth/models/_utils.py
+++ b/unsloth/models/_utils.py
@@ -1046,11 +1046,8 @@ def _is_model_weight_safetensors(filename):
 
 
 def _filename_has_variant(filename, variant):
-    """True if a weight *filename* belongs to the requested *variant* (variant="fp16"). Transformers
-    inserts the variant token before the extension, so a single-file weight carries it as a
-    ``.{variant}.`` infix (``model.fp16.safetensors``) and a sharded weight as a ``.{variant}-`` infix
-    (``model.fp16-00001-of-00002.safetensors``). Match both so a sharded variant is recognized and its
-    redundant ``.bin`` dropped. Callers gate on *variant* being truthy."""
+    """True if a weight filename carries the variant token: `.{variant}.` (single file) or
+    `.{variant}-` (sharded) infix. Matching both recognizes a sharded variant."""
     base = filename.replace("\\", "/").rsplit("/", 1)[-1]
     return f".{variant}." in base or f".{variant}-" in base
 
@@ -1061,12 +1058,10 @@ def _filename_has_variant(filename, variant):
 
 
 def _is_canonical_model_weight_safetensors(filename):
-    """True for a CANONICAL (non-variant) model-weights safetensors a DEFAULT (no-variant) load reads:
-    ``model.safetensors``, a numbered shard, or the shard index. A variant file
-    (``model.fp16.safetensors``) does NOT match, so a repo that ships only a variant safetensors plus
-    the canonical ``pytorch_model.bin`` does not get its ``.bin`` wrongly dropped for a no-variant load
-    (which reads the ``.bin``). Errs strict: an unrecognized name keeps both formats (a safe over-fetch),
-    never dropping a ``.bin`` the load reads."""
+    """True for a canonical (non-variant) model-weights safetensors a default load reads
+    (model.safetensors, a numbered shard, or the index). Errs strict: an unrecognized name keeps
+    both formats, so a repo shipping only a variant safetensors plus pytorch_model.bin never has
+    its .bin wrongly dropped for a no-variant load."""
     name = filename.replace("\\", "/").rsplit("/", 1)[-1]
     return bool(_CANONICAL_MODEL_WEIGHT_SAFETENSORS_RE.match(name))
 
@@ -1077,10 +1072,8 @@ def _adapter_repo_has_safetensors(
     token = None,
     revision = None,
 ):
-    """Best-effort: does the adapter repo ship a ROOT safetensors adapter weight? PeftModel prefers
-    safetensors, so when one is present the ``.bin`` form is redundant. Scoped to repo-ROOT files (the
-    adapter warm only pulls root ``adapter_model*``), so an unrelated ``checkpoint-*/adapter_model.safetensors``
-    does not make us drop the root ``.bin``. Any failure returns False so both formats stay eligible."""
+    """Best-effort: does the adapter repo ship a root safetensors adapter weight (making the
+    .bin redundant)? Scoped to root adapter_model* files; any failure returns False."""
     try:
         from huggingface_hub import HfApi
         siblings = HfApi().model_info(model_name, revision = revision, token = token).siblings or []
@@ -1106,17 +1099,14 @@ def _prefetch_ignore_patterns(
     variant = None,
     weights_at_root = False,
 ):
-    """ignore_patterns for the prewarm snapshot: the static skip list, minus the checkpoint guard when
-    loading from a checkpoint-* subfolder, minus the weight format the load will not read. Explicit
-    use_safetensors acts as a format allowlist (True -> skip *.bin, False -> skip *.safetensors); auto
-    (None) skips *.bin only when in-scope safetensors are also shipped (Transformers prefers them).
-    from_tf / from_flax keep the *.h5 / *.msgpack weights they read as the actual weights.
-
-    The cross-format drop is suppressed for a WHOLE multi-component snapshot (weights_at_root=False and
-    no subfolder: a SentenceTransformer / diffusers repo whose modules ship weights in per-module
-    subfolders, each in its own format). HF fnmatch "*" spans "/", so dropping "*.bin" there would strip
-    a subdir module's only weight and leave the module load to an in-process Xet fetch. The redundancy
-    reasoning holds only for a single-scope (root / subfolder) load."""
+    """ignore_patterns for the prewarm snapshot: the static skip list, minus the checkpoint guard
+    when loading from a checkpoint-* subfolder, minus the weight format the load will not read.
+    use_safetensors is a format allowlist (True -> skip *.bin, False -> skip *.safetensors); auto
+    (None) skips *.bin only when in-scope safetensors are shipped. from_tf/from_flax keep *.h5/*.msgpack.
+
+    Suppressed for a whole multi-component snapshot (weights_at_root=False, no subfolder: ST/diffusers
+    repos whose modules ship weights per-subfolder, each in its own format), since HF fnmatch "*" spans
+    "/" so dropping "*.bin" would strip a module's only weight."""
     # Keep checkpoint-*/* when loading from such a subfolder; keep *.h5 / *.msgpack under from_tf/flax.
     ignore_patterns = [
         pattern
@@ -1131,9 +1121,8 @@ def _prefetch_ignore_patterns(
             or (from_flax and pattern == "*.msgpack")
         )
     ]
-    # Drop the format the load will not read (Transformers reads exactly one; the other doubles the
-    # download we optimize). Skipped for a whole multi-component snapshot (see docstring): each subdir
-    # module ships its own format, so keep every format rather than strip a module's only weight.
+    # Drop the format the load will not read (the other doubles the download). Skipped for a whole
+    # multi-component snapshot (see docstring) so a subdir module's only weight is not stripped.
     whole_multi_component = not weights_at_root and not (
         isinstance(subfolder, str) and subfolder.strip("/")
     )
@@ -1171,11 +1160,8 @@ def _prefetch_ignore_patterns(
                 .siblings
                 or []
             )
-            # Only count model-weights safetensors the load actually reads (in-scope subfolder/root,
-            # not an adapter / sidecar), so a .bin-only subfolder is not stripped of its weights. With a
-            # variant, only a variant-matching safetensors proves the variant's .bin redundant; WITHOUT a
-            # variant, only a CANONICAL safetensors does (a lone variant model.fp16.safetensors does not
-            # make the default pytorch_model.bin redundant for a no-variant load, which reads the .bin).
+            # Only count in-scope model-weights safetensors the load reads (not adapters/sidecars):
+            # with a variant, a variant-matching one; without, a canonical one proves the .bin redundant.
             has_safetensors = any(
                 _is_model_weight_safetensors(sibling.rfilename)
                 and _in_requested_load_scope(sibling.rfilename, subfolder)
@@ -1214,16 +1200,13 @@ def maybe_prefetch_hf_snapshot(
 ):
     """Warm the Hugging Face cache for a remote repo before the in-process load.
 
-    Xet downloads can hang on a blob with no progress and no exception, and a blocked native Xet thread
-    cannot be killed in-process. We pull the snapshot first in a killable subprocess that falls back
-    from Xet to HTTP on a no-progress stall (unsloth_zoo.hf_xet_fallback); the following from_pretrained
-    is then a cache hit and cannot stall on Xet.
+    Xet downloads can hang on a blob with no progress and no exception, and a blocked native Xet
+    thread cannot be killed in-process. We pull the snapshot first in a killable subprocess that falls
+    back Xet -> HTTP on a stall (unsloth_zoo.hf_xet_fallback), so from_pretrained is then a cache hit.
 
-    Returns True iff the snapshot was warmed in the subprocess, so the caller can clear force_download
-    for the in-process load (else a forced reload re-downloads over the Xet path this avoids). Returns
-    False when warming was skipped (local path / offline / local_files_only / fast_inference, or an
-    older unsloth_zoo) or failed. Best-effort: a deterministic failure (missing repo, auth, disk) is
-    left for from_pretrained to surface; only a both-transports-stalled DownloadStallError is raised.
+    Returns True iff warmed (caller can then clear force_download), False when skipped (local / offline /
+    local_files_only / fast_inference / old unsloth_zoo) or failed. Best-effort: only a both-transports-
+    stalled DownloadStallError is raised; other failures are left for from_pretrained to surface.
     """
     try:
         from unsloth_zoo.hf_xet_fallback import (
@@ -1257,10 +1240,8 @@ def maybe_prefetch_hf_snapshot(
     if fast_inference:  # vLLM has its own download path
         return False
 
-    # A tokenizer-only / adapter-only warm allow-lists exact files below, so the weight-format ignore
-    # list is moot -- skipping it also avoids the auto branch's model_info call. The format probe keys
-    # off an ACTUAL weight file, not config.json (which AutoConfig caches before this runs in the Llama
-    # / diffusion paths), so a config-based guess would skip the .bin-drop and over-fetch both formats.
+    # tokenizer-only / adapter-only warms allow-list exact files below, so the weight-format ignore
+    # list (and its auto-branch model_info call) is moot and skipped.
     ignore_patterns = (
         None
         if tokenizer_only or adapter_only or gguf_file
@@ -1276,9 +1257,8 @@ def maybe_prefetch_hf_snapshot(
             weights_at_root = weights_at_root,
         )
     )
-    # Narrow the warm to what the load reads, so a repo shipping extra weights (alternate checkpoints,
-    # merged models, alternate precisions) is not pulled in full. Every branch still warms the root
-    # tokenizer / config / custom-code assets, so those never fall to an unprotected in-process download.
+    # Narrow the warm to what the load reads (a repo with extra checkpoints/precisions is not pulled in
+    # full); every branch still warms root tokenizer/config/custom-code so none is fetched in-process.
     allow_patterns = None
     if gguf_file:
         # from_pretrained(gguf_file=NAME) reads exactly that GGUF (de-quantized on load), but the static
@@ -1294,13 +1274,11 @@ def maybe_prefetch_hf_snapshot(
         # (not what the tokenizer load reads) even if it ships them.
         allow_patterns = list(_ROOT_AUX_PREFETCH_PATTERNS)
     elif adapter_only:
-        # A PEFT adapter load reads only adapter_config.json + adapter_model.* (plus root tokenizer /
-        # config), so restrict to those: an adapter repo also publishing merged weights does not pull
-        # them.
+        # A PEFT adapter load reads only adapter_config.json + adapter_model.* (plus root aux), so an
+        # adapter repo also publishing merged weights does not pull them.
         allow_patterns = [*_ADAPTER_PREFETCH_PATTERNS, *_ROOT_AUX_PREFETCH_PATTERNS]
-        # An adapter ships ONE format and PeftModel reads ONE (safetensors when present), so don't warm
-        # both: explicit use_safetensors wins, else prefer safetensors when the repo ships it
-        # (best-effort; any failure keeps both, never under-warming a .bin-only adapter).
+        # PeftModel reads one format (safetensors when present): explicit use_safetensors wins, else
+        # prefer safetensors when shipped (best-effort; any failure keeps both).
         if use_safetensors is False:
             ignore_patterns = [
                 "adapter_model*.safetensors",
@@ -1314,10 +1292,8 @@ def maybe_prefetch_hf_snapshot(
         # subfolder=X: the load resolves every weight under X/, so warm that subfolder (plus root aux).
         allow_patterns = [f"{subfolder.strip('/')}/*", *_ROOT_AUX_PREFETCH_PATTERNS]
     elif weights_at_root:
-        # A bare load reads only ROOT weights: keep the warm otherwise unfiltered but drop weights
-        # nested in subdirs (fp16/, experimental/, checkpoint dirs) the root load never reads. Only
-        # weight files are excluded, so a subdir config stays warmed. Not applied to diffusion (its
-        # component weights live in subfolders; FastDiffusionModel leaves weights_at_root False).
+        # A bare load reads only root weights: drop weights nested in subdirs (fp16/, checkpoint dirs)
+        # it never reads, while keeping subdir configs. Diffusion leaves weights_at_root False.
         ignore_patterns = [*(ignore_patterns or []), *_SUBDIR_WEIGHT_IGNORE_PATTERNS]
     try:
         snapshot_download_with_xet_fallback(
diff --git a/unsloth/models/diffusion.py b/unsloth/models/diffusion.py
index 512074b3fb..d12ea58eff 100644
--- a/unsloth/models/diffusion.py
+++ b/unsloth/models/diffusion.py
@@ -181,11 +181,9 @@ def from_pretrained(
 
         model_cls = _resolve_diffusion_model_class(config)
 
-        # Repo confirmed a diffusion model: pre-download it in a killable subprocess (Xet -> HTTP on a
-        # stall) so the weight load is a cache hit. After validation, so a non-diffusion repo fails on
-        # config metadata alone. subfolder is NOT forwarded: the pipeline loads the repo ROOT (the whole
-        # DiffusionPipeline -- model_index.json + every component subfolder), so narrowing to one
-        # subfolder would leave the other components (unet/, vae/, text_encoder/) to in-process Xet.
+        # Pre-download the confirmed diffusion repo (Xet -> HTTP on a stall) so the weight load is a cache
+        # hit. subfolder is NOT forwarded: the pipeline loads the whole repo root (every component
+        # subfolder), so narrowing to one would leave unet/, vae/, text_encoder/ to in-process Xet.
         maybe_prefetch_hf_snapshot(
             model_name,
             token = token,
diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py
index 2cc2c0b3ed..2158c5d23c 100644
--- a/unsloth/models/llama.py
+++ b/unsloth/models/llama.py
@@ -2420,11 +2420,10 @@ def from_pretrained(
 
         preferred_attn_impl = resolve_attention_implementation(model_function, model_config)
 
-        # Pre-download the repo in a killable subprocess (Xet -> HTTP on a no-progress stall) so the
-        # in-process weight load below is a cache hit and cannot hang. Runs AFTER the AutoConfig +
-        # model-class check, so an unsupported repo fails on its small config fetch without pulling
-        # weights. revision is NOT forwarded: the load resolves model_name (possibly a remapped
-        # prequantized repo, where the caller's revision does not exist) on its default branch.
+        # Pre-download the repo in a killable subprocess (Xet -> HTTP on a stall) so the weight load is
+        # a cache hit. After the AutoConfig/model-class check, so an unsupported repo fails on its small
+        # config fetch. revision is NOT forwarded: the load resolves model_name (possibly a remapped
+        # prequantized repo where the caller's revision does not exist) on its default branch.
         _prefetched = maybe_prefetch_hf_snapshot(
             model_name,
             token = token,
@@ -2448,9 +2447,8 @@ def from_pretrained(
         if _prefetched and kwargs.get("force_download", False):
             kwargs["force_download"] = False
 
-        # The tokenizer loads in-process below regardless of the vLLM weight path, so warm its files
-        # through the same killable subprocess. The base prefetch already covered model_name, so only
-        # warm here when the tokenizer is a different repo, or fast_inference skipped the base warm.
+        # The tokenizer loads in-process regardless of the vLLM path; the base prefetch already covered
+        # model_name, so only warm here for a different tokenizer repo, or when fast_inference skipped it.
         _tokenizer_repo = (
             tokenizer_name if (isinstance(tokenizer_name, str) and tokenizer_name) else model_name
         )
diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py
index 75bb122fbc..f4d2e6c1cf 100644
--- a/unsloth/models/loader.py
+++ b/unsloth/models/loader.py
@@ -877,10 +877,8 @@ def from_pretrained(
                 # fast_inference (vLLM owns only the base model's download path).
                 fast_inference = False,
                 force_download = kwargs.get("force_download", False),
-                # Do NOT inherit the base use_safetensors (it selects the BASE format): the adapter has
-                # its own (usually adapter_model.safetensors), and use_safetensors=False would skip a
-                # safetensors-only adapter. Leave it auto. adapter_only restricts the warm to the
-                # adapter's own files + root aux, so a repo also publishing merged weights does not pull them.
+                # Leave use_safetensors auto: inheriting the base format could skip a safetensors-only
+                # adapter. adapter_only restricts the warm to the adapter's own files + root aux.
                 adapter_only = True,
             )
             # Child already did the forced download; clear the flag so the load reuses the warm cache.
@@ -1828,10 +1826,8 @@ def _patched_car(
                 # fast_inference (vLLM owns only the base model's download path).
                 fast_inference = False,
                 force_download = kwargs.get("force_download", False),
-                # Do NOT inherit the base use_safetensors (it selects the BASE format): the adapter has
-                # its own (usually adapter_model.safetensors), and use_safetensors=False would skip a
-                # safetensors-only adapter. Leave it auto. adapter_only restricts the warm to the
-                # adapter's own files + root aux, so a repo also publishing merged weights does not pull them.
+                # Leave use_safetensors auto: inheriting the base format could skip a safetensors-only
+                # adapter. adapter_only restricts the warm to the adapter's own files + root aux.
                 adapter_only = True,
             )
             # Child already did the forced download; clear the flag so the load reuses the warm cache.
diff --git a/unsloth/models/sentence_transformer.py b/unsloth/models/sentence_transformer.py
index cd2e6fcfde..ca774cff9d 100644
--- a/unsloth/models/sentence_transformer.py
+++ b/unsloth/models/sentence_transformer.py
@@ -1445,14 +1445,10 @@ def from_pretrained(
                     "If you want 8bit finetuning, set both `load_in_16bit = False` and `load_in_8bit = True`"
                 )
 
-        # Pre-download in a killable subprocess (Xet -> HTTP on a no-progress stall) so the
-        # SentenceTransformer load below is a cache hit and cannot hang on a stalled Xet transfer.
-        # Covers every path (for_inference, fast-encoder, fallback), which all resolve the repo
-        # in-process. weights_at_root is left False: an ST repo's component weights can live in
-        # subfolders (the pooling / dense modules), so the whole snapshot is warmed. Resolve the same
-        # cache the ST load will use: an explicit cache_folder, else SENTENCE_TRANSFORMERS_HOME (which
-        # SentenceTransformer honors when cache_folder is unset), else the default HF cache -- otherwise
-        # a warm into the wrong cache is missed and the load starts an unprotected in-process download.
+        # Pre-download in a killable subprocess (Xet -> HTTP on a stall) so the ST load below is a cache
+        # hit. weights_at_root stays False since ST component weights live in per-module subfolders.
+        # Resolve the cache the load uses: cache_folder, else SENTENCE_TRANSFORMERS_HOME (which ST honors
+        # when cache_folder is unset), else the default HF cache -- a wrong-cache warm would be missed.
         maybe_prefetch_hf_snapshot(
             model_name,
             token = token,
@@ -1591,10 +1587,8 @@ def from_pretrained(
                 elif is_mpnet:
                     FastSentenceTransformer._patch_mpnet_v5()
 
-            # Load via native SentenceTransformer (bypasses Unsloth patching). Forward cache_folder so
-            # this load reads the cache the prefetch warmed (as the for_inference branch does); a custom
-            # cache_folder would otherwise miss the warm and start an unprotected in-process Hub/Xet
-            # download. None keeps the default cache, matching the prefetch's cache_dir.
+            # Forward cache_folder so this load reads the cache the prefetch warmed (None lets ST honor
+            # SENTENCE_TRANSFORMERS_HOME, matching the prefetch); a custom one would otherwise miss it.
             st_model = SentenceTransformer(
                 model_name,
                 device = st_device,
@@ -1708,10 +1702,8 @@ def _push_to_hub_merged(self, repo_id, **push_kwargs):
 
         # No modules.json -> force 16-bit: saving is custom for these models and
         # 4-bit would need dequant in save_pretrained_merged, not worth it.
-        # Resolve the same cache the prefetch warmed: hf_hub_download (used by _module_path /
-        # _read_pooling_mode) does NOT honor SENTENCE_TRANSFORMERS_HOME, so passing only cache_folder
-        # (None when the caller relies on the env var) would miss the warm and fetch these files
-        # in-process over Xet.
+        # Resolve the same cache the prefetch warmed: hf_hub_download (used here and by
+        # _load_modules) ignores SENTENCE_TRANSFORMERS_HOME, so passing bare cache_folder would miss it.
         has_modules_json = (
             FastSentenceTransformer._module_path(
                 model_name,
@@ -1772,8 +1764,7 @@ def _push_to_hub_merged(self, repo_id, **push_kwargs):
             max_seq_length,
             pooling_mode,
             trust_remote_code = trust_remote_code,
-            # Reuse the prefetch's resolved cache (see _module_path note above) so the fallback
-            # modules.json / module-file loads hit the warm instead of an in-process Xet download.
+            # Same resolved cache as above so the fallback module loads hit the warm, not Xet.
             cache_dir = kwargs.get("cache_folder") or os.environ.get("SENTENCE_TRANSFORMERS_HOME"),
             # Read the modules from the SAME revision the model weights load from (FastModel forwards
             # revision to the weight load), so a revision-pinned repo hits the prefetch's warm instead
diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py
index 9532be20e9..33d69879c3 100644
--- a/unsloth/models/vision.py
+++ b/unsloth/models/vision.py
@@ -805,9 +805,8 @@ def from_pretrained(
         # For debugging - we use a download counter to see if environments are not breaking or if HF is down
         get_statistics(kwargs.get("local_files_only", False))
 
-        # vLLM owns the weight download only when actually available; if fast_inference was requested
-        # but vLLM is missing, the load falls through to the in-process HF path (fast_inference_setup
-        # flips the flag below), so the weights must still be warmed here. Resolve availability now.
+        # vLLM owns the weight download only when available; if requested but missing, the load falls
+        # through to the in-process HF path, so weights must still be warmed here.
         _vllm_owns_weights = fast_inference and is_vLLM_available()
 
         # Pre-download the repo in a killable subprocess (Xet -> HTTP on a no-progress stall) so the
@@ -836,11 +835,9 @@ def from_pretrained(
         if _prefetched and kwargs.get("force_download", False):
             kwargs["force_download"] = False
 
-        # The tokenizer / processor loads in-process below, so warm a SEPARATE tokenizer repo (explicit
-        # tokenizer_name) through the killable subprocess. When the tokenizer is model_name itself it is
-        # already covered (base prefetch, or vLLM's own download on the vLLM path). Do NOT warm model_name
-        # here on the vLLM path: this runs before fast_inference_setup may remap "*-unsloth-bnb-4bit" ->
-        # "*-bnb-4bit", so it would warm the wrong repo.
+        # Warm a SEPARATE tokenizer repo (explicit tokenizer_name); when it is model_name it is already
+        # covered. Do NOT warm model_name here on the vLLM path: this runs before fast_inference_setup may
+        # remap "*-unsloth-bnb-4bit" -> "*-bnb-4bit", so it would warm the wrong repo.
         _tokenizer_repo = (
             tokenizer_name if (isinstance(tokenizer_name, str) and tokenizer_name) else model_name
         )

From 3f750f7d7484d403435648ccf0750e922bfab049 Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Wed, 1 Jul 2026 09:56:11 +0000
Subject: [PATCH 72/80] Tighten frontend comments (comment-only, no logic
 change)

---
 .../model-selector/model-update-action.tsx         | 11 ++++-------
 .../assistant-ui/model-selector/pickers.tsx        | 14 ++++----------
 2 files changed, 8 insertions(+), 17 deletions(-)

diff --git a/studio/frontend/src/components/assistant-ui/model-selector/model-update-action.tsx b/studio/frontend/src/components/assistant-ui/model-selector/model-update-action.tsx
index d00c812325..ee0952f7c3 100644
--- a/studio/frontend/src/components/assistant-ui/model-selector/model-update-action.tsx
+++ b/studio/frontend/src/components/assistant-ui/model-selector/model-update-action.tsx
@@ -42,10 +42,8 @@ export function ModelUpdateAction({
 }: ModelUpdateActionProps) {
   const [open, setOpen] = useState(false);
 
-  // The update is a managed download (it surfaces in the global Downloads panel
-  // with progress + cancel). When this exact repo+variant finishes, refresh the
-  // caller so the "update available" cue clears once the new revision is on
-  // disk. A ref keeps the subscription stable across renders without resubscribing.
+  // When this repo+variant's managed download finishes, refresh the caller so the
+  // "update available" cue clears. A ref keeps the subscription stable across renders.
   const onUpdatedRef = useRef(onUpdated);
   onUpdatedRef.current = onUpdated;
   useEffect(() => {
@@ -60,9 +58,8 @@ export function ModelUpdateAction({
   }, [repoId, variant]);
 
   const handleConfirm = useCallback(() => {
-    // Start the background re-download and close the dialog immediately; the
-    // Downloads panel owns progress + cancel from here. Only a failure to START
-    // surfaces a toast — a failed download reports itself in the panel.
+    // Start the background re-download and close the dialog; the Downloads panel owns
+    // progress + cancel. Only a failure to START toasts (a failed download shows in the panel).
     void Promise.resolve()
       .then(onConfirm)
       .catch((err) => {
diff --git a/studio/frontend/src/components/assistant-ui/model-selector/pickers.tsx b/studio/frontend/src/components/assistant-ui/model-selector/pickers.tsx
index e11a08f8ab..9a6b93eb8e 100644
--- a/studio/frontend/src/components/assistant-ui/model-selector/pickers.tsx
+++ b/studio/frontend/src/components/assistant-ui/model-selector/pickers.tsx
@@ -1245,11 +1245,8 @@ export function HubModelPicker({
   onEject?: () => void;
 }) {
   const gpu = useGpuInfo();
-  // The currently-loaded/running model id. We read params.checkpoint from the
-  // runtime store (backend-mirrored from /api/inference/status.active_model, see
-  // chat-runtime-store) rather than the dropdown `isSelected` highlight (which is
-  // just `value === repo_id` and can reflect a staged, not-yet-loaded pick). Used
-  // to disable the cached-row update action for the model that's live in memory.
+  // The live model id from the runtime store (backend-mirrored active_model), not the
+  // dropdown highlight which can be a staged pick. Disables the update action for it.
   const loadedModelId = useChatRuntimeStore((s) => s.params.checkpoint);
   // Last-loaded timestamps power the "Recent" sort (vs "Downloaded" = file date).
   const loadTimes = useModelLoadTimes(value);
@@ -1584,11 +1581,8 @@ export function HubModelPicker({
     refreshLocalModelsList();
   }, [hfToken, refreshLocalModelsList]);
 
-  // Updates run as MANAGED downloads (they show in the global Downloads panel
-  // with manifest-based progress + a working Cancel), instead of a blocking
-  // call. The worker re-resolves `main` and pulls only changed blobs, so the
-  // cached copy stays usable until the new revision lands. The row's
-  // ModelUpdateAction refreshes the list when this repo+variant completes.
+  // Updates run as managed downloads (Downloads panel: progress + Cancel), not a blocking
+  // call. The worker pulls only changed blobs, so the cached copy stays usable until done.
   const startManagedUpdate = useCallback((repoId: string, variant: string, expectedBytes: number) => {
     return downloadManager
       .requestStart({

From a89ec0da0d79905a09e401dfc5fa909f1c63d00e Mon Sep 17 00:00:00 2001
From: danielhanchen <danielhanchen@gmail.com>
Date: Wed, 1 Jul 2026 09:59:15 +0000
Subject: [PATCH 73/80] Warm classification loads and gate the vision prefetch
 on load-mode validation

Two paths either let a real in-process weight download run without the Xet
stall fallback, or ran the prefetch before an invalid call could fail locally:

- A num_labels classification load takes the AutoModelForSequenceClassification
  branch (an in-process download) even under fast_inference=True, but the llama
  prefetch skipped the warm whenever fast_inference was set, so those weights
  were fetched over un-killable Xet. Gate the skip on `num_labels is None`, so
  only a real vLLM-owned load bypasses the warm.

- The FastBaseModel (vision / FastModel) prefetch ran before the mutually-
  exclusive load_in_4bit/8bit/16bit validation, so an invalid combination
  downloaded a multi-GB snapshot before failing. SentenceTransformer already
  moved that check ahead of its prefetch; the vision loader cannot, because
  check_and_disable_bitsandbytes_loading only resolves the effective flags after
  the config fetch (moving the check earlier would false-reject a valid load of
  an already-quantized model). Move the base + tokenizer prefetch to run after
  the validation instead, so an invalid call fails without downloading. The moved
  block is unchanged and still precedes the in-process load and the vLLM
  fast_inference_setup remap.

Adds static AST regression tests asserting the vision prefetch follows the
load-mode validation and the llama prefetch's fast_inference skip is gated on
num_labels.
---
 tests/test_prefetch_snapshot_scope.py |  54 +++++++++++++
 unsloth/models/llama.py               |   5 +-
 unsloth/models/vision.py              | 105 ++++++++++++++------------
 3 files changed, 114 insertions(+), 50 deletions(-)

diff --git a/tests/test_prefetch_snapshot_scope.py b/tests/test_prefetch_snapshot_scope.py
index 92b79bacaf..3aa328df98 100644
--- a/tests/test_prefetch_snapshot_scope.py
+++ b/tests/test_prefetch_snapshot_scope.py
@@ -306,6 +306,60 @@ def test_st_prefetch_resolves_env_cache_and_runs_after_validation():
     assert val_lineno < call.lineno, "load-mode validation must precede the ST prefetch"
 
 
+def test_vision_prefetch_runs_after_load_mode_validation():
+    """The FastBaseModel (vision / FastModel) prefetch must run AFTER the mutually-exclusive load-mode
+    validation, so an invalid load_in_4bit/8bit/16bit combination fails locally without first downloading
+    a multi-GB snapshot (Codex #6638). check_and_disable_bitsandbytes_loading can only resolve after the
+    config fetch, so the check cannot move earlier; the prefetch moves after it instead. Static guard:
+    importing the loader pulls heavy optional deps."""
+    import ast
+    import os
+
+    src_path = os.path.join(os.path.dirname(U.__file__), "vision.py")
+    with open(src_path, "r", encoding = "utf-8") as f:
+        src = f.read()
+    tree = ast.parse(src)
+    prefetch_calls = [
+        n
+        for n in ast.walk(tree)
+        if isinstance(n, ast.Call)
+        and isinstance(n.func, ast.Name)
+        and n.func.id == "maybe_prefetch_hf_snapshot"
+    ]
+    assert prefetch_calls, "expected a vision prefetch call"
+    first_prefetch = min(call.lineno for call in prefetch_calls)
+    val_lineno = src[: src.index("Can only load in 4bit or 8bit or 16bit")].count("\n")
+    assert val_lineno < first_prefetch, "load-mode validation must precede the vision prefetch"
+
+
+def test_llama_prefetch_skips_only_real_vllm_loads():
+    """A num_labels classification load takes the AutoModelForSequenceClassification branch (an in-process
+    download) even under fast_inference=True, so the llama prefetch's fast_inference skip must be gated on
+    num_labels is None -- else that load's weights fetch over un-killable Xet (Codex #6638). Static guard:
+    the base prefetch's fast_inference kwarg references both fast_inference and num_labels."""
+    import ast
+    import os
+
+    src_path = os.path.join(os.path.dirname(U.__file__), "llama.py")
+    with open(src_path, "r", encoding = "utf-8") as f:
+        tree = ast.parse(f.read())
+    gated = False
+    for n in ast.walk(tree):
+        if not (
+            isinstance(n, ast.Call)
+            and isinstance(n.func, ast.Name)
+            and n.func.id == "maybe_prefetch_hf_snapshot"
+        ):
+            continue
+        fi_kw = next((kw for kw in n.keywords if kw.arg == "fast_inference"), None)
+        if fi_kw is None:
+            continue
+        dumped = ast.dump(fi_kw.value)
+        if "fast_inference" in dumped and "num_labels" in dumped:
+            gated = True
+    assert gated, "llama prefetch fast_inference must be gated on num_labels is None"
+
+
 def test_st_fallback_module_loads_resolve_env_cache():
     """The fallback module loads must resolve the SAME cache the prefetch warmed. _module_path /
     _read_pooling_mode call hf_hub_download directly, which does NOT honor SENTENCE_TRANSFORMERS_HOME,
diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py
index 2158c5d23c..dac95f3129 100644
--- a/unsloth/models/llama.py
+++ b/unsloth/models/llama.py
@@ -2429,7 +2429,10 @@ def from_pretrained(
             token = token,
             cache_dir = kwargs.get("cache_dir"),
             local_files_only = kwargs.get("local_files_only", False),
-            fast_inference = fast_inference,
+            # Only a real vLLM-owned load skips the warm. A num_labels classification load takes the
+            # AutoModelForSequenceClassification branch below (an in-process download) even under
+            # fast_inference=True, so it must still be warmed or its weights fetch over un-killable Xet.
+            fast_inference = fast_inference and num_labels is None,
             subfolder = kwargs.get("subfolder"),
             force_download = kwargs.get("force_download", False),
             use_safetensors = kwargs.get("use_safetensors"),
diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py
index 33d69879c3..813b71d5e1 100644
--- a/unsloth/models/vision.py
+++ b/unsloth/models/vision.py
@@ -805,55 +805,10 @@ def from_pretrained(
         # For debugging - we use a download counter to see if environments are not breaking or if HF is down
         get_statistics(kwargs.get("local_files_only", False))
 
-        # vLLM owns the weight download only when available; if requested but missing, the load falls
-        # through to the in-process HF path, so weights must still be warmed here.
-        _vllm_owns_weights = fast_inference and is_vLLM_available()
-
-        # Pre-download the repo in a killable subprocess (Xet -> HTTP on a no-progress stall) so the
-        # in-process load below is a cache hit and cannot hang.
-        _prefetched = maybe_prefetch_hf_snapshot(
-            model_name,
-            token = token,
-            revision = kwargs.get("revision"),
-            cache_dir = kwargs.get("cache_dir"),
-            local_files_only = kwargs.get("local_files_only", False),
-            fast_inference = _vllm_owns_weights,
-            subfolder = kwargs.get("subfolder"),
-            force_download = kwargs.get("force_download", False),
-            use_safetensors = kwargs.get("use_safetensors"),
-            from_tf = kwargs.get("from_tf", False),
-            from_flax = kwargs.get("from_flax", False),
-            # Bare load reads only ROOT weights; skip subdir weights (fp16/, experimental/). Ignored
-            # when a subfolder is set.
-            weights_at_root = True,
-            variant = kwargs.get("variant"),  # forward so the warm keeps the variant .bin
-            gguf_file = kwargs.get(
-                "gguf_file"
-            ),  # forward so the warm fetches the GGUF (else ignored)
-        )
-        # Child already did the forced download; clear the flag so the load reuses the warm cache.
-        if _prefetched and kwargs.get("force_download", False):
-            kwargs["force_download"] = False
-
-        # Warm a SEPARATE tokenizer repo (explicit tokenizer_name); when it is model_name it is already
-        # covered. Do NOT warm model_name here on the vLLM path: this runs before fast_inference_setup may
-        # remap "*-unsloth-bnb-4bit" -> "*-bnb-4bit", so it would warm the wrong repo.
-        _tokenizer_repo = (
-            tokenizer_name if (isinstance(tokenizer_name, str) and tokenizer_name) else model_name
-        )
-        _warm_tokenizer_repo = (
-            isinstance(_tokenizer_repo, str)
-            and bool(_tokenizer_repo)
-            and _tokenizer_repo != model_name
-        )
-        if _warm_tokenizer_repo:
-            maybe_prefetch_hf_snapshot(
-                _tokenizer_repo,
-                token = token,
-                cache_dir = kwargs.get("cache_dir"),
-                local_files_only = kwargs.get("local_files_only", False),
-                tokenizer_only = True,
-            )
+        # NOTE: the base + tokenizer prefetch (the Xet -> HTTP stall fallback warm) runs AFTER the
+        # load-mode validation below, so an invalid load_in_4bit/8bit/16bit combination fails locally
+        # without first downloading a multi-GB snapshot. See the maybe_prefetch_hf_snapshot block placed
+        # right after that check.
 
         if dtype is None:
             dtype = torch.float16 if not SUPPORTS_BFLOAT16 else torch.bfloat16
@@ -951,6 +906,58 @@ def from_pretrained(
             raise RuntimeError(
                 "Unsloth: Can only load in 4bit or 8bit or 16bit, not a combination!"
             )
+
+        # Pre-download the repo in a killable subprocess (Xet -> HTTP on a no-progress stall) so the
+        # in-process load below is a cache hit and cannot hang. Runs AFTER the load-mode validation
+        # above so an invalid load_in_* combination fails without first pulling a multi-GB snapshot.
+        # vLLM owns the weight download only when actually available; if fast_inference was requested
+        # but vLLM is missing, the load falls through to the in-process HF path (fast_inference_setup
+        # flips the flag below), so the weights must still be warmed here. Resolve availability now.
+        _vllm_owns_weights = fast_inference and is_vLLM_available()
+        _prefetched = maybe_prefetch_hf_snapshot(
+            model_name,
+            token = token,
+            revision = kwargs.get("revision"),
+            cache_dir = kwargs.get("cache_dir"),
+            local_files_only = kwargs.get("local_files_only", False),
+            fast_inference = _vllm_owns_weights,
+            subfolder = kwargs.get("subfolder"),
+            force_download = kwargs.get("force_download", False),
+            use_safetensors = kwargs.get("use_safetensors"),
+            from_tf = kwargs.get("from_tf", False),
+            from_flax = kwargs.get("from_flax", False),
+            # Bare load reads only ROOT weights; skip subdir weights (fp16/, experimental/). Ignored
+            # when a subfolder is set.
+            weights_at_root = True,
+            variant = kwargs.get("variant"),  # forward so the warm keeps the variant .bin
+            gguf_file = kwargs.get(
+                "gguf_file"
+            ),  # forward so the warm fetches the GGUF (else ignored)
+        )
+        # Child already did the forced download; clear the flag so the load reuses the warm cache.
+        if _prefetched and kwargs.get("force_download", False):
+            kwargs["force_download"] = False
+
+        # Warm a SEPARATE tokenizer repo (explicit tokenizer_name); when it is model_name it is already
+        # covered. Do NOT warm model_name here on the vLLM path: this runs before fast_inference_setup may
+        # remap "*-unsloth-bnb-4bit" -> "*-bnb-4bit", so it would warm the wrong repo.
+        _tokenizer_repo = (
+            tokenizer_name if (isinstance(tokenizer_name, str) and tokenizer_name) else model_name
+        )
+        _warm_tokenizer_repo = (
+            isinstance(_tokenizer_repo, str)
+            and bool(_tokenizer_repo)
+            and _tokenizer_repo != model_name
+        )
+        if _warm_tokenizer_repo:
+            maybe_prefetch_hf_snapshot(
+                _tokenizer_repo,
+                token = token,
+                cache_dir = kwargs.get("cache_dir"),
+                local_files_only = kwargs.get("local_files_only", False),
+                tokenizer_only = True,
+            )
+
         _skip_modules = SKIP_QUANTIZATION_MODULES.copy()
         # Nemotron-H uses 'mixer' (not 'mamba') for Mamba layers.
         # Mamba fused kernels pass out_proj.weight directly to F.linear,

From 4f5f8da7b5eddb8b20c9fa93cde9038d726d5ff6 Mon Sep 17 00:00:00 2001
From: danielhanchen <danielhanchen@gmail.com>
Date: Wed, 1 Jul 2026 10:37:27 +0000
Subject: [PATCH 74/80] Point the fallback FastModel weight load at the warmed
 ST cache

The custom SentenceTransformer fallback path prefetches into the cache the ST
load uses (cache_folder, else SENTENCE_TRANSFORMERS_HOME), and the module-file
loads were already resolved to that same cache. But the model WEIGHTS load runs
through FastModel.from_pretrained, which resolves its cache from the HF cache_dir
and never reads cache_folder / SENTENCE_TRANSFORMERS_HOME. With either set, the
weights loaded from the default HF cache and missed the warm, so they still ran
the unprotected in-process Xet download the prefetch is meant to avoid.

Resolve the same cache into kwargs["cache_dir"] before the FastModel load, so the
weights hit the warm. This is done only when a custom ST cache is in play and the
caller passed no explicit HF cache_dir; an explicit cache_dir always takes
precedence. A None resolution leaves the default HF cache, matching the prefetch.

Adds a static AST regression test asserting from_pretrained resolves the ST cache
into kwargs["cache_dir"] before the FastModel.from_pretrained call.
---
 tests/test_prefetch_snapshot_scope.py  | 60 ++++++++++++++++++++++++++
 unsloth/models/sentence_transformer.py |  8 ++++
 2 files changed, 68 insertions(+)

diff --git a/tests/test_prefetch_snapshot_scope.py b/tests/test_prefetch_snapshot_scope.py
index 3aa328df98..083453c57b 100644
--- a/tests/test_prefetch_snapshot_scope.py
+++ b/tests/test_prefetch_snapshot_scope.py
@@ -476,6 +476,66 @@ def test_st_fallback_module_loads_forward_revision():
     ), "expected the fallback _module_path and _load_modules calls to forward revision"
 
 
+def test_st_fallback_model_load_resolves_env_cache():
+    """The fallback FastModel weight load resolves its cache from the HF cache_dir, not ST's cache_folder /
+    SENTENCE_TRANSFORMERS_HOME. from_pretrained must therefore resolve the SAME cache the prefetch warmed
+    into kwargs['cache_dir'] BEFORE the FastModel.from_pretrained call -- else the weights miss the warm
+    and start an unprotected in-process Xet download (Codex #6638). Static guard: importing ST pulls heavy
+    optional deps."""
+    import ast
+    import os
+
+    src_path = os.path.join(os.path.dirname(U.__file__), "sentence_transformer.py")
+    with open(src_path, "r", encoding = "utf-8") as f:
+        tree = ast.parse(f.read())
+
+    def _resolves_st_cache(value_node):
+        # The resolution may be inline in the assigned value, or in the assignment to the intermediate
+        # variable the value references (kwargs['cache_dir'] = _st_cache_dir; _st_cache_dir = ...).
+        dumped = ast.dump(value_node)
+        if "cache_folder" in dumped and "SENTENCE_TRANSFORMERS_HOME" in dumped:
+            return True
+        if isinstance(value_node, ast.Name):
+            for n in ast.walk(tree):
+                if isinstance(n, ast.Assign) and any(
+                    isinstance(t, ast.Name) and t.id == value_node.id for t in n.targets
+                ):
+                    d = ast.dump(n.value)
+                    if "cache_folder" in d and "SENTENCE_TRANSFORMERS_HOME" in d:
+                        return True
+        return False
+
+    resolved_lines = []
+    for node in ast.walk(tree):
+        if not isinstance(node, ast.Assign):
+            continue
+        for tgt in node.targets:
+            if (
+                isinstance(tgt, ast.Subscript)
+                and isinstance(tgt.value, ast.Name)
+                and tgt.value.id == "kwargs"
+                and isinstance(tgt.slice, ast.Constant)
+                and tgt.slice.value == "cache_dir"
+                and _resolves_st_cache(node.value)
+            ):
+                resolved_lines.append(node.lineno)
+    assert resolved_lines, "from_pretrained must resolve the ST cache into kwargs['cache_dir']"
+
+    fastmodel_calls = [
+        n.lineno
+        for n in ast.walk(tree)
+        if isinstance(n, ast.Call)
+        and isinstance(n.func, ast.Attribute)
+        and n.func.attr == "from_pretrained"
+        and isinstance(n.func.value, ast.Name)
+        and n.func.value.id == "FastModel"
+    ]
+    assert fastmodel_calls, "expected a FastModel.from_pretrained call"
+    assert min(resolved_lines) < min(
+        fastmodel_calls
+    ), "kwargs['cache_dir'] must be resolved before the fallback FastModel weight load"
+
+
 def test_filename_has_variant_matches_single_and_sharded():
     """The variant detector matches both the single-file (.fp16.) and SHARDED (.fp16-) infixes and
     rejects the default (non-variant) names (gemini #6638)."""
diff --git a/unsloth/models/sentence_transformer.py b/unsloth/models/sentence_transformer.py
index ca774cff9d..12280282ed 100644
--- a/unsloth/models/sentence_transformer.py
+++ b/unsloth/models/sentence_transformer.py
@@ -1723,6 +1723,14 @@ def _push_to_hub_merged(self, repo_id, **push_kwargs):
             load_in_4bit = False
             load_in_16bit = True
 
+        # The fallback FastModel weight load resolves its cache from the HF cache_dir, not ST's
+        # cache_folder / SENTENCE_TRANSFORMERS_HOME. Point it at the SAME cache the prefetch warmed above,
+        # else the weights miss the warm and start an unprotected in-process Xet download. Only set it when
+        # a custom ST cache is in play and the caller passed no explicit HF cache_dir (which wins).
+        _st_cache_dir = kwargs.get("cache_folder") or os.environ.get("SENTENCE_TRANSFORMERS_HOME")
+        if _st_cache_dir is not None and "cache_dir" not in kwargs:
+            kwargs["cache_dir"] = _st_cache_dir
+
         try:
             model, tokenizer = FastModel.from_pretrained(
                 model_name = model_name,

From e883f71414dcc4dd29799519ae6124b59e5dba15 Mon Sep 17 00:00:00 2001
From: danielhanchen <danielhanchen@gmail.com>
Date: Wed, 1 Jul 2026 12:10:36 +0000
Subject: [PATCH 75/80] Close four prefetch cache/variant/tokenizer gaps that
 left an in-process Xet fetch

Four cases where the prewarm and the real load targeted different files or
different caches, so the load still ran an unprotected in-process Xet download:

- ST prefetch ignored an explicit HF cache_dir. FastSentenceTransformer resolves
  the prefetch (and fallback module-load) cache from cache_folder /
  SENTENCE_TRANSFORMERS_HOME, but the FastModel fallback weight load forwards
  kwargs["cache_dir"]. A caller passing cache_dir warmed one cache and read
  another, missing the warm. The three ST cache resolutions now honor
  kwargs.get("cache_dir") first.

- Variant .bin dropped for a non-canonical safetensors sidecar. For a variant
  load the prewarm dropped *.bin as soon as ANY variant-infixed safetensors was
  shipped, so a repo with consolidated.fp16.safetensors (a sidecar the load does
  not read) plus the real pytorch_model.fp16.bin had its only loadable weights
  dropped from the warm. The check now requires a CANONICAL model variant name
  (model.<variant>.safetensors, either shard infix, or the variant index) via
  _is_canonical_variant_model_weight_safetensors, matching the strictness already
  used for the no-variant path.

- vLLM path never warmed the tokenizer. With fast_inference and no separate
  tokenizer_name, the base warm is skipped (vLLM owns the weights) and the
  tokenizer warm was deferred because fast_inference_setup may remap model_name.
  The final tokenizer repo was then loaded in-process (AutoProcessor /
  AutoTokenizer) with no warm. Warm the final tokenizer repo (tokenizer-only)
  after the remap on the vLLM path.

- Diffusion variant not forwarded to the load. FastDiffusionModel forwarded
  variant to the prefetch but not to model_cls.from_pretrained, so the pipeline
  asked for the default weight variant -- missing the warmed variant weights
  (wrong precision, or a default weight a variant-only repo may not ship, fetched
  in-process). load_kwargs now forwards variant.

Adds regression tests for each (a non-canonical sidecar keeps its .bin, the ST
cache resolutions honor cache_dir, the vLLM tokenizer warm runs after the remap,
and the diffusion load forwards variant).
---
 tests/test_prefetch_snapshot_scope.py  | 97 +++++++++++++++++++++++---
 unsloth/models/_utils.py               | 23 ++++--
 unsloth/models/diffusion.py            |  5 ++
 unsloth/models/sentence_transformer.py | 12 ++--
 unsloth/models/vision.py               | 15 ++++
 5 files changed, 134 insertions(+), 18 deletions(-)

diff --git a/tests/test_prefetch_snapshot_scope.py b/tests/test_prefetch_snapshot_scope.py
index 083453c57b..a283c848f4 100644
--- a/tests/test_prefetch_snapshot_scope.py
+++ b/tests/test_prefetch_snapshot_scope.py
@@ -261,6 +261,22 @@ def test_no_variant_keeps_bin_when_only_variant_safetensors(monkeypatch):
     assert "*.bin" in ig2
 
 
+def test_variant_keeps_bin_for_noncanonical_sidecar(monkeypatch):
+    """With variant='fp16', a NON-canonical sidecar (consolidated.fp16.safetensors) must not prove the
+    variant pytorch_model.fp16.bin redundant: a transformers variant load reads model.fp16.safetensors,
+    not consolidated.*, so dropping the .bin would leave the only loadable weights to an in-process Xet
+    fetch. The .bin stays warmed (Codex #6638)."""
+    _install_fake_model_info(
+        monkeypatch, ["consolidated.fp16.safetensors", "pytorch_model.fp16.bin"]
+    )
+    ig = U._prefetch_ignore_patterns("org/repo", variant = "fp16", weights_at_root = True)
+    assert "*.bin" not in ig
+    # A canonical variant safetensors DOES make the variant .bin redundant.
+    _install_fake_model_info(monkeypatch, ["model.fp16.safetensors", "pytorch_model.fp16.bin"])
+    ig2 = U._prefetch_ignore_patterns("org/repo", variant = "fp16", weights_at_root = True)
+    assert "*.bin" in ig2
+
+
 def test_is_canonical_model_weight_safetensors():
     """The canonical detector matches only the non-variant model-weight safetensors names a default
     load reads, and rejects variant / sidecar names (Codex #6638)."""
@@ -306,6 +322,63 @@ def test_st_prefetch_resolves_env_cache_and_runs_after_validation():
     assert val_lineno < call.lineno, "load-mode validation must precede the ST prefetch"
 
 
+def test_st_cache_resolutions_honor_explicit_hf_cache_dir():
+    """Every ST cache resolution (the prefetch and the fallback module loads) that falls back to
+    SENTENCE_TRANSFORMERS_HOME must first honor an explicit HF cache_dir. The FastModel fallback load
+    forwards kwargs['cache_dir'], so a caller passing cache_dir would otherwise warm one cache and read
+    another, missing the warm and fetching in-process over Xet (Codex #6638). Static guard."""
+    import ast
+    import os
+
+    src_path = os.path.join(os.path.dirname(U.__file__), "sentence_transformer.py")
+    with open(src_path, "r", encoding = "utf-8") as f:
+        tree = ast.parse(f.read())
+    resolutions = [
+        kw
+        for kw in ast.walk(tree)
+        if isinstance(kw, ast.keyword)
+        and kw.arg == "cache_dir"
+        and "SENTENCE_TRANSFORMERS_HOME" in ast.dump(kw.value)
+    ]
+    assert resolutions, "expected cache_dir resolutions referencing SENTENCE_TRANSFORMERS_HOME"
+    for kw in resolutions:
+        assert "'cache_dir'" in ast.dump(kw.value), (
+            "an ST cache_dir resolution must read an explicit kwargs.get('cache_dir') first"
+        )
+
+
+def test_vision_warms_vllm_tokenizer_after_remap():
+    """On the vLLM path the base warm is skipped and the tokenizer warm is deferred until after
+    fast_inference_setup may remap model_name. The final tokenizer repo must then be warmed (tokenizer
+    only) so the in-process processor / tokenizer load is a cache hit, not an unprotected Xet fetch
+    (Codex #6638). Static guard: the vLLM-gated tokenizer warm appears after the remap."""
+    import os
+
+    src_path = os.path.join(os.path.dirname(U.__file__), "vision.py")
+    with open(src_path, "r", encoding = "utf-8") as f:
+        src = f.read()
+    guard = "if _vllm_owns_weights and isinstance(tokenizer_name"
+    assert guard in src, "expected a vLLM-gated tokenizer warm"
+    assert src.index(guard) > src.index("fast_inference_setup("), (
+        "the vLLM tokenizer warm must run after the fast_inference_setup remap"
+    )
+
+
+def test_diffusion_forwards_variant_to_real_load():
+    """FastDiffusionModel must forward `variant` to the real model_cls.from_pretrained load, not only to
+    the prefetch: without it the pipeline asks for the default weight variant, missing the warmed variant
+    weights (wrong precision, or a default weight fetched in-process over Xet) (Codex #6638). Static
+    guard."""
+    import os
+
+    src_path = os.path.join(os.path.dirname(U.__file__), "diffusion.py")
+    with open(src_path, "r", encoding = "utf-8") as f:
+        src = f.read()
+    assert 'load_kwargs["variant"] = kwargs["variant"]' in src, (
+        "the diffusion load must forward variant to model_cls.from_pretrained"
+    )
+
+
 def test_vision_prefetch_runs_after_load_mode_validation():
     """The FastBaseModel (vision / FastModel) prefetch must run AFTER the mutually-exclusive load-mode
     validation, so an invalid load_in_4bit/8bit/16bit combination fails locally without first downloading
@@ -536,14 +609,22 @@ def _resolves_st_cache(value_node):
     ), "kwargs['cache_dir'] must be resolved before the fallback FastModel weight load"
 
 
-def test_filename_has_variant_matches_single_and_sharded():
-    """The variant detector matches both the single-file (.fp16.) and SHARDED (.fp16-) infixes and
-    rejects the default (non-variant) names (gemini #6638)."""
-    assert U._filename_has_variant("model.fp16.safetensors", "fp16") is True
-    assert U._filename_has_variant("model.fp16-00001-of-00002.safetensors", "fp16") is True
-    assert U._filename_has_variant("diffusion_pytorch_model.fp16.safetensors", "fp16") is True
-    assert U._filename_has_variant("model.safetensors", "fp16") is False
-    assert U._filename_has_variant("model-00001-of-00002.safetensors", "fp16") is False
+def test_canonical_variant_model_weight_matches_transformers_names():
+    """The variant safetensors detector matches only CANONICAL model variant names a transformers load
+    reads (single, either shard infix, and the variant index) and rejects a non-canonical sidecar
+    (consolidated.fp16.safetensors) so its variant .bin is never wrongly dropped, plus the default and
+    wrong-variant names (Codex #6638)."""
+    f = U._is_canonical_variant_model_weight_safetensors
+    assert f("model.fp16.safetensors", "fp16") is True
+    assert f("model.fp16-00001-of-00002.safetensors", "fp16") is True
+    assert f("model-00001-of-00002.fp16.safetensors", "fp16") is True
+    assert f("model.safetensors.index.fp16.json", "fp16") is True
+    # A non-canonical sidecar variant does NOT prove the .bin redundant (the M2 hang guard).
+    assert f("consolidated.fp16.safetensors", "fp16") is False
+    # Default (non-variant) and wrong-variant names are not a match for variant='fp16'.
+    assert f("model.safetensors", "fp16") is False
+    assert f("model-00001-of-00002.safetensors", "fp16") is False
+    assert f("model.bf16.safetensors", "fp16") is False
 
 
 def test_variant_is_forwarded_to_downloader(capture):
diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py
index 2513cfa9ad..dfdbeddcee 100644
--- a/unsloth/models/_utils.py
+++ b/unsloth/models/_utils.py
@@ -1045,11 +1045,24 @@ def _is_model_weight_safetensors(filename):
     return True
 
 
-def _filename_has_variant(filename, variant):
-    """True if a weight filename carries the variant token: `.{variant}.` (single file) or
-    `.{variant}-` (sharded) infix. Matching both recognizes a sharded variant."""
+def _is_canonical_variant_model_weight_safetensors(filename, variant):
+    """True for a canonical model-weights safetensors carrying the requested *variant* token, in the
+    forms transformers reads: model.<variant>.safetensors (single), a numbered shard (either
+    model.<variant>-00001-of-00002.safetensors or model-00001-of-00002.<variant>.safetensors), or the
+    index model.safetensors.index.<variant>.json. Errs strict (base must be ``model``): a non-canonical
+    sidecar such as consolidated.<variant>.safetensors does NOT prove the variant .bin redundant, so its
+    .bin is not wrongly dropped from the warm and left to an unprotected in-process fetch."""
     base = filename.replace("\\", "/").rsplit("/", 1)[-1]
-    return f".{variant}." in base or f".{variant}-" in base
+    v = re.escape(variant)
+    return bool(
+        re.match(
+            rf"^(?:model\.{v}\.safetensors"
+            rf"|model\.{v}-\d{{5}}-of-\d{{5}}\.safetensors"
+            rf"|model-\d{{5}}-of-\d{{5}}\.{v}\.safetensors"
+            rf"|model\.safetensors\.index\.{v}\.json)$",
+            base,
+        )
+    )
 
 
 _CANONICAL_MODEL_WEIGHT_SAFETENSORS_RE = re.compile(
@@ -1166,7 +1179,7 @@ def _prefetch_ignore_patterns(
                 _is_model_weight_safetensors(sibling.rfilename)
                 and _in_requested_load_scope(sibling.rfilename, subfolder)
                 and (
-                    _filename_has_variant(sibling.rfilename, variant)
+                    _is_canonical_variant_model_weight_safetensors(sibling.rfilename, variant)
                     if variant
                     else _is_canonical_model_weight_safetensors(sibling.rfilename)
                 )
diff --git a/unsloth/models/diffusion.py b/unsloth/models/diffusion.py
index d12ea58eff..064b79112e 100644
--- a/unsloth/models/diffusion.py
+++ b/unsloth/models/diffusion.py
@@ -213,6 +213,11 @@ def from_pretrained(
         # use_safetensors=None (auto) already matches the prefetch's heuristic.
         if kwargs.get("use_safetensors") is not None:
             load_kwargs["use_safetensors"] = kwargs["use_safetensors"]
+        # Forward the variant to the real load too, so it reads the variant weights the prefetch warmed.
+        # Without it the pipeline asks for the default weight variant, missing the warm (wrong precision,
+        # or a default weight a variant-only repo may not ship, fetched in-process over un-killable Xet).
+        if kwargs.get("variant") is not None:
+            load_kwargs["variant"] = kwargs["variant"]
 
         # Optional bitsandbytes quant. The MoE experts (3D Parameters) are not nn.Linear so bnb skips
         # them; only attention + dense MLP Linears quantize, lm_head/embeddings stay full precision.
diff --git a/unsloth/models/sentence_transformer.py b/unsloth/models/sentence_transformer.py
index 12280282ed..d8462c4ec5 100644
--- a/unsloth/models/sentence_transformer.py
+++ b/unsloth/models/sentence_transformer.py
@@ -1447,13 +1447,14 @@ def from_pretrained(
 
         # Pre-download in a killable subprocess (Xet -> HTTP on a stall) so the ST load below is a cache
         # hit. weights_at_root stays False since ST component weights live in per-module subfolders.
-        # Resolve the cache the load uses: cache_folder, else SENTENCE_TRANSFORMERS_HOME (which ST honors
-        # when cache_folder is unset), else the default HF cache -- a wrong-cache warm would be missed.
+        # Resolve the cache the load uses: an explicit HF cache_dir wins (the FastModel fallback load
+        # forwards it), else cache_folder, else SENTENCE_TRANSFORMERS_HOME (which ST honors when
+        # cache_folder is unset), else the default HF cache -- a wrong-cache warm would be missed.
         maybe_prefetch_hf_snapshot(
             model_name,
             token = token,
             revision = revision,
-            cache_dir = kwargs.get("cache_folder") or os.environ.get("SENTENCE_TRANSFORMERS_HOME"),
+            cache_dir = kwargs.get("cache_dir") or kwargs.get("cache_folder") or os.environ.get("SENTENCE_TRANSFORMERS_HOME"),
             local_files_only = kwargs.get("local_files_only", False),
         )
 
@@ -1708,7 +1709,8 @@ def _push_to_hub_merged(self, repo_id, **push_kwargs):
             FastSentenceTransformer._module_path(
                 model_name,
                 token,
-                cache_dir = kwargs.get("cache_folder")
+                cache_dir = kwargs.get("cache_dir")
+                or kwargs.get("cache_folder")
                 or os.environ.get("SENTENCE_TRANSFORMERS_HOME"),
                 revision = revision,
             )
@@ -1773,7 +1775,7 @@ def _push_to_hub_merged(self, repo_id, **push_kwargs):
             pooling_mode,
             trust_remote_code = trust_remote_code,
             # Same resolved cache as above so the fallback module loads hit the warm, not Xet.
-            cache_dir = kwargs.get("cache_folder") or os.environ.get("SENTENCE_TRANSFORMERS_HOME"),
+            cache_dir = kwargs.get("cache_dir") or kwargs.get("cache_folder") or os.environ.get("SENTENCE_TRANSFORMERS_HOME"),
             # Read the modules from the SAME revision the model weights load from (FastModel forwards
             # revision to the weight load), so a revision-pinned repo hits the prefetch's warm instead
             # of fetching default-branch module files in-process over Xet (and mixing them with the
diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py
index 813b71d5e1..01cf6aee1c 100644
--- a/unsloth/models/vision.py
+++ b/unsloth/models/vision.py
@@ -1257,6 +1257,21 @@ def from_pretrained(
         # Counteract saved tokenizers
         tokenizer_name = model_name if tokenizer_name is None else tokenizer_name
 
+        # On the vLLM path the base warm above was skipped (vLLM owns the weight download) and the
+        # tokenizer warm was deferred because fast_inference_setup may remap model_name. Now that the
+        # final tokenizer repo is known, warm it (tokenizer-only) so the in-process processor / tokenizer
+        # load below is a cache hit rather than an unprotected in-process Xet fetch. A re-warm of an
+        # already-cached repo (or a local path) is a fast no-op.
+        if _vllm_owns_weights and isinstance(tokenizer_name, str) and tokenizer_name:
+            maybe_prefetch_hf_snapshot(
+                tokenizer_name,
+                token = token,
+                revision = kwargs.get("revision"),
+                cache_dir = kwargs.get("cache_dir"),
+                local_files_only = kwargs.get("local_files_only", False),
+                tokenizer_only = True,
+            )
+
         # Fix _Unsloth_Patched_ prefix in local config files from old saves (issue #4085)
         if os.path.isdir(tokenizer_name):
             import json as _json

From 08ba45175604fd07d0a4fee35f9ac12d75f432c6 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 1 Jul 2026 12:11:06 +0000
Subject: [PATCH 76/80] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 tests/test_prefetch_snapshot_scope.py  | 18 +++++++++---------
 unsloth/models/sentence_transformer.py |  8 ++++++--
 2 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/tests/test_prefetch_snapshot_scope.py b/tests/test_prefetch_snapshot_scope.py
index a283c848f4..778fbfd2a0 100644
--- a/tests/test_prefetch_snapshot_scope.py
+++ b/tests/test_prefetch_snapshot_scope.py
@@ -342,9 +342,9 @@ def test_st_cache_resolutions_honor_explicit_hf_cache_dir():
     ]
     assert resolutions, "expected cache_dir resolutions referencing SENTENCE_TRANSFORMERS_HOME"
     for kw in resolutions:
-        assert "'cache_dir'" in ast.dump(kw.value), (
-            "an ST cache_dir resolution must read an explicit kwargs.get('cache_dir') first"
-        )
+        assert "'cache_dir'" in ast.dump(
+            kw.value
+        ), "an ST cache_dir resolution must read an explicit kwargs.get('cache_dir') first"
 
 
 def test_vision_warms_vllm_tokenizer_after_remap():
@@ -359,9 +359,9 @@ def test_vision_warms_vllm_tokenizer_after_remap():
         src = f.read()
     guard = "if _vllm_owns_weights and isinstance(tokenizer_name"
     assert guard in src, "expected a vLLM-gated tokenizer warm"
-    assert src.index(guard) > src.index("fast_inference_setup("), (
-        "the vLLM tokenizer warm must run after the fast_inference_setup remap"
-    )
+    assert src.index(guard) > src.index(
+        "fast_inference_setup("
+    ), "the vLLM tokenizer warm must run after the fast_inference_setup remap"
 
 
 def test_diffusion_forwards_variant_to_real_load():
@@ -374,9 +374,9 @@ def test_diffusion_forwards_variant_to_real_load():
     src_path = os.path.join(os.path.dirname(U.__file__), "diffusion.py")
     with open(src_path, "r", encoding = "utf-8") as f:
         src = f.read()
-    assert 'load_kwargs["variant"] = kwargs["variant"]' in src, (
-        "the diffusion load must forward variant to model_cls.from_pretrained"
-    )
+    assert (
+        'load_kwargs["variant"] = kwargs["variant"]' in src
+    ), "the diffusion load must forward variant to model_cls.from_pretrained"
 
 
 def test_vision_prefetch_runs_after_load_mode_validation():
diff --git a/unsloth/models/sentence_transformer.py b/unsloth/models/sentence_transformer.py
index d8462c4ec5..3c4ce7f4aa 100644
--- a/unsloth/models/sentence_transformer.py
+++ b/unsloth/models/sentence_transformer.py
@@ -1454,7 +1454,9 @@ def from_pretrained(
             model_name,
             token = token,
             revision = revision,
-            cache_dir = kwargs.get("cache_dir") or kwargs.get("cache_folder") or os.environ.get("SENTENCE_TRANSFORMERS_HOME"),
+            cache_dir = kwargs.get("cache_dir")
+            or kwargs.get("cache_folder")
+            or os.environ.get("SENTENCE_TRANSFORMERS_HOME"),
             local_files_only = kwargs.get("local_files_only", False),
         )
 
@@ -1775,7 +1777,9 @@ def _push_to_hub_merged(self, repo_id, **push_kwargs):
             pooling_mode,
             trust_remote_code = trust_remote_code,
             # Same resolved cache as above so the fallback module loads hit the warm, not Xet.
-            cache_dir = kwargs.get("cache_dir") or kwargs.get("cache_folder") or os.environ.get("SENTENCE_TRANSFORMERS_HOME"),
+            cache_dir = kwargs.get("cache_dir")
+            or kwargs.get("cache_folder")
+            or os.environ.get("SENTENCE_TRANSFORMERS_HOME"),
             # Read the modules from the SAME revision the model weights load from (FastModel forwards
             # revision to the weight load), so a revision-pinned repo hits the prefetch's warm instead
             # of fetching default-branch module files in-process over Xet (and mixing them with the

From 6d4ce26cac1c875c2ef6e0a46f52e0cd0ba59a80 Mon Sep 17 00:00:00 2001
From: danielhanchen <danielhanchen@gmail.com>
Date: Thu, 2 Jul 2026 05:22:38 +0000
Subject: [PATCH 77/80] Map explicit HF cache_dir onto cache_folder for the
 native SentenceTransformer loads

The ST prefetch warms the cache resolved as cache_dir -> cache_folder ->
SENTENCE_TRANSFORMERS_HOME (an explicit HF cache_dir wins). But the for_inference and
fast-encoder branches then construct a native SentenceTransformer, which takes
cache_folder, not cache_dir: for_inference copied only a caller-supplied cache_folder
into st_kwargs, and the fast-encoder passed cache_folder = kwargs.get("cache_folder").
So a caller passing cache_dir warmed one cache and the native load read another
(cache_folder / SENTENCE_TRANSFORMERS_HOME / the default HF cache), missing the warm and
starting an unprotected in-process Hub/Xet download that defeats the stall fallback and
duplicates the transfer.

Both native loads now set cache_folder = kwargs.get("cache_dir") or
kwargs.get("cache_folder"): the explicit HF cache_dir wins, else the caller's
cache_folder, else None so ST honors SENTENCE_TRANSFORMERS_HOME -- matching the
prefetch's resolution so the native load hits the warmed cache. The FastModel fallback
branch already resolves cache_dir (unchanged). Adds a static regression guard.
---
 tests/test_prefetch_snapshot_scope.py  | 35 ++++++++++++++++++++++++++
 unsloth/models/sentence_transformer.py | 15 ++++++++---
 2 files changed, 47 insertions(+), 3 deletions(-)

diff --git a/tests/test_prefetch_snapshot_scope.py b/tests/test_prefetch_snapshot_scope.py
index 778fbfd2a0..067d235551 100644
--- a/tests/test_prefetch_snapshot_scope.py
+++ b/tests/test_prefetch_snapshot_scope.py
@@ -347,6 +347,41 @@ def test_st_cache_resolutions_honor_explicit_hf_cache_dir():
         ), "an ST cache_dir resolution must read an explicit kwargs.get('cache_dir') first"
 
 
+def test_st_native_loads_map_hf_cache_dir_to_cache_folder():
+    """The for_inference and fast-encoder branches construct a native SentenceTransformer, which takes
+    cache_folder (not cache_dir). The prefetch warms cache_dir first, so an explicit HF cache_dir must be
+    mapped onto cache_folder for those native loads; otherwise the load reads a different cache, misses the
+    warm, and starts an unprotected in-process Xet download (Codex #6638). Static guard."""
+    import ast
+    import os
+
+    src_path = os.path.join(os.path.dirname(U.__file__), "sentence_transformer.py")
+    with open(src_path, "r", encoding = "utf-8") as f:
+        src = f.read()
+    tree = ast.parse(src)
+    # Every native SentenceTransformer(...) constructor that forwards cache_folder must read cache_dir.
+    st_calls = [
+        n
+        for n in ast.walk(tree)
+        if isinstance(n, ast.Call)
+        and isinstance(n.func, ast.Name)
+        and n.func.id == "SentenceTransformer"
+    ]
+    cache_folder_kws = [kw for call in st_calls for kw in call.keywords if kw.arg == "cache_folder"]
+    assert cache_folder_kws, "expected a native SentenceTransformer call forwarding cache_folder"
+    for kw in cache_folder_kws:
+        assert "'cache_dir'" in ast.dump(
+            kw.value
+        ), "a native SentenceTransformer cache_folder must map the explicit HF cache_dir first"
+    # The for_inference branch feeds cache_folder through st_kwargs; it must map cache_dir there too, and
+    # both native branches resolve cache_dir -> cache_folder (reformatting-tolerant normalized check).
+    normalized = "".join(src.split())
+    assert 'st_kwargs["cache_folder"]=' in normalized, "for_inference must set st_kwargs cache_folder"
+    assert (
+        normalized.count('kwargs.get("cache_dir")orkwargs.get("cache_folder")') >= 2
+    ), "both native ST branches (for_inference, fast-encoder) must map cache_dir -> cache_folder"
+
+
 def test_vision_warms_vllm_tokenizer_after_remap():
     """On the vLLM path the base warm is skipped and the tokenizer warm is deferred until after
     fast_inference_setup may remap model_name. The final tokenizer repo must then be warmed (tokenizer
diff --git a/unsloth/models/sentence_transformer.py b/unsloth/models/sentence_transformer.py
index 3c4ce7f4aa..f5d02c0ba7 100644
--- a/unsloth/models/sentence_transformer.py
+++ b/unsloth/models/sentence_transformer.py
@@ -1490,6 +1490,14 @@ def from_pretrained(
                 if k in kwargs:
                     st_kwargs[k] = kwargs[k]
 
+            # ST takes cache_folder, not cache_dir. Map an explicit HF cache_dir onto cache_folder so this
+            # native load reads the cache the prefetch warmed above (cache_dir wins, else the caller's
+            # cache_folder; None lets ST honor SENTENCE_TRANSFORMERS_HOME, matching the prefetch) -- else a
+            # cache_dir warm is missed and the load starts an unprotected in-process Xet download.
+            _st_cache = kwargs.get("cache_dir") or kwargs.get("cache_folder")
+            if _st_cache is not None:
+                st_kwargs["cache_folder"] = _st_cache
+
             st_model = SentenceTransformer(model_name, **st_kwargs)
             return st_model
 
@@ -1590,8 +1598,9 @@ def from_pretrained(
                 elif is_mpnet:
                     FastSentenceTransformer._patch_mpnet_v5()
 
-            # Forward cache_folder so this load reads the cache the prefetch warmed (None lets ST honor
-            # SENTENCE_TRANSFORMERS_HOME, matching the prefetch); a custom one would otherwise miss it.
+            # ST takes cache_folder, not cache_dir; map an explicit HF cache_dir onto it so this load reads
+            # the cache the prefetch warmed (cache_dir wins, else cache_folder; None lets ST honor
+            # SENTENCE_TRANSFORMERS_HOME, matching the prefetch) -- a mismatched cache would miss the warm.
             st_model = SentenceTransformer(
                 model_name,
                 device = st_device,
@@ -1599,7 +1608,7 @@ def from_pretrained(
                 token = token,
                 revision = revision,
                 model_kwargs = model_kwargs,
-                cache_folder = kwargs.get("cache_folder"),
+                cache_folder = kwargs.get("cache_dir") or kwargs.get("cache_folder"),
             )
 
             # Store metadata for get_peft_model

From 7a6fc8d89c4aef9adec663220f026d7ce102ea2c Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 2 Jul 2026 05:23:11 +0000
Subject: [PATCH 78/80] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 tests/test_prefetch_snapshot_scope.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/test_prefetch_snapshot_scope.py b/tests/test_prefetch_snapshot_scope.py
index 067d235551..4327472506 100644
--- a/tests/test_prefetch_snapshot_scope.py
+++ b/tests/test_prefetch_snapshot_scope.py
@@ -376,7 +376,9 @@ def test_st_native_loads_map_hf_cache_dir_to_cache_folder():
     # The for_inference branch feeds cache_folder through st_kwargs; it must map cache_dir there too, and
     # both native branches resolve cache_dir -> cache_folder (reformatting-tolerant normalized check).
     normalized = "".join(src.split())
-    assert 'st_kwargs["cache_folder"]=' in normalized, "for_inference must set st_kwargs cache_folder"
+    assert (
+        'st_kwargs["cache_folder"]=' in normalized
+    ), "for_inference must set st_kwargs cache_folder"
     assert (
         normalized.count('kwargs.get("cache_dir")orkwargs.get("cache_folder")') >= 2
     ), "both native ST branches (for_inference, fast-encoder) must map cache_dir -> cache_folder"

From af2f079168962154f4b489101fa1e86d826c92cd Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Fri, 3 Jul 2026 12:27:39 +0000
Subject: [PATCH 79/80] Trim verbose comments in the HF prefetch wiring and
 Studio shim

Condense PR-added comments/docstrings across the loader prefetch wiring (_utils.py,
vision/llama/loader/diffusion/sentence_transformer.py, tokenizer_utils.py), the
prefetch-scope tests, and the Studio Xet->HTTP shim (backend utils + tests, model-selector
tsx). Comments/docstrings/JSDoc only, no code changed (AST + TS-compiler verified).
Preserves the prefetch-placement and Studio marker-injection rationale tersely.
Prefetch-scope tests 43 passed; Studio backend tests 18 passed.
---
 studio/backend/tests/test_hf_xet_fallback.py  |  67 ++--
 .../tests/test_model_update_robustness.py     |  12 +-
 studio/backend/utils/hf_xet_fallback.py       |  53 ++--
 .../model-selector/model-update-action.tsx    |   8 +-
 .../assistant-ui/model-selector/pickers.tsx   |   4 +-
 tests/test_prefetch_snapshot_scope.py         | 289 +++++-------------
 unsloth/models/_utils.py                      | 160 +++++-----
 unsloth/models/diffusion.py                   |  16 +-
 unsloth/models/llama.py                       |  24 +-
 unsloth/models/loader.py                      |  30 +-
 unsloth/models/sentence_transformer.py        |  40 +--
 unsloth/models/vision.py                      |  34 +--
 unsloth/tokenizer_utils.py                    |  10 +-
 13 files changed, 272 insertions(+), 475 deletions(-)

diff --git a/studio/backend/tests/test_hf_xet_fallback.py b/studio/backend/tests/test_hf_xet_fallback.py
index 8b7893678a..4d73213d15 100644
--- a/studio/backend/tests/test_hf_xet_fallback.py
+++ b/studio/backend/tests/test_hf_xet_fallback.py
@@ -1,14 +1,11 @@
 # SPDX-License-Identifier: AGPL-3.0-only
 # Copyright 2026-present the Unsloth AI Inc. team. All rights reserved. See /studio/LICENSE.AGPL-3.0
 
-"""Studio shim over the shared unsloth_zoo Xet -> HTTP stall fallback.
-
-The watchdog and transport-policy matrix (cached short-circuit, cancel, error
-propagation, the single Xet -> HTTP retry, the snapshot variant, the knobs) is
-tested once in unsloth_zoo (tests/test_hf_xet_fallback.py). Here we assert only
-the Studio-specific seam: the shim re-exports the shared API and injects Studio's
-marker-aware prepare_cache_for_transport on the HTTP retry. CPU-only, no network,
-no real subprocess (the per-attempt download seam is monkeypatched).
+"""Tests for the Studio shim over the shared unsloth_zoo Xet -> HTTP fallback.
+
+The transport-policy matrix is tested once in unsloth_zoo; here we assert only the
+Studio seam: re-exporting the shared API and injecting the marker-aware
+prepare_cache_for_transport on the HTTP retry. CPU-only, no network, no real subprocess.
 """
 
 from __future__ import annotations
@@ -23,8 +20,8 @@
 if _BACKEND_DIR not in sys.path:
     sys.path.insert(0, _BACKEND_DIR)
 
-# Stub heavy/unavailable deps before importing the module under test. Use the real structlog when
-# present; a bare stub would break later modules that log at import time.
+# Stub heavy/unavailable deps before importing the module under test. Use real structlog when present;
+# a bare stub would break later modules that log at import time.
 _loggers_stub = _types.ModuleType("loggers")
 _loggers_stub.get_logger = lambda name: __import__("logging").getLogger(name)
 sys.modules.setdefault("loggers", _loggers_stub)
@@ -38,9 +35,7 @@
 try:
     import unsloth_zoo.hf_xet_fallback as _shared_mod
     shared = _shared_mod
-except Exception:  # noqa: BLE001
-    # Still collect the degraded-path test when unsloth_zoo is missing / too old / torch-less /
-    # GPU-less -- the same failure cases the shim itself degrades for.
+except Exception:  # noqa: BLE001 - still collect degraded-path tests when unsloth_zoo is unavailable
     shared = None
 
 import utils.hf_xet_fallback as xf
@@ -74,9 +69,8 @@ def test_child_should_disable_xet_truth_table():
 
 
 def test_shim_injects_studio_prepare_on_http_retry(monkeypatch):
-    """A stall on Xet retries over HTTP, and the shim runs Studio's marker-aware
-    ``prepare_cache_for_transport(..., 'http')`` before the retry (not the generic
-    delete-incompletes default)."""
+    """A Xet stall retries over HTTP and the shim runs Studio's marker-aware
+    ``prepare_cache_for_transport(..., 'http')`` before the retry."""
     _requires_shared()
     for var in ("UNSLOTH_DISABLE_XET", "UNSLOTH_STABLE_DOWNLOADS", "HF_HUB_DISABLE_XET"):
         monkeypatch.delenv(var, raising = False)
@@ -133,9 +127,8 @@ def fake_snapshot(repo_id, **kwargs):
 
 
 def test_degrades_gracefully_without_shared_helper(monkeypatch):
-    """On an older unsloth_zoo that lacks the shared helper, the shim must still
-    import (Studio boots) and provide stub API that does plain HF downloads with
-    the watchdog disabled, instead of crashing at startup."""
+    """On an older unsloth_zoo lacking the shared helper, the shim still imports (Studio
+    boots) and exposes stub API doing plain HF downloads with the watchdog disabled."""
     import importlib
 
     class _BlockShared:
@@ -163,8 +156,7 @@ def find_spec(
         event = degraded.start_watchdog(repo_ids = ["x"], on_stall = lambda m: None)
         assert hasattr(event, "set") and not event.is_set()  # never fires
 
-        # Degraded mode still emits heartbeats so the orchestrator's inactivity
-        # deadline is not tripped during a long load.
+        # Degraded mode still emits heartbeats so the inactivity deadline is not tripped.
         import time as _time
 
         beats = []
@@ -193,8 +185,7 @@ def _fake_snapshot(repo_id, **kwargs):
         assert degraded.snapshot_download_with_xet_fallback("org/model") == "/snap-dir"
         assert called["repo_id"] == "org/model"
 
-        # Cancellation contract is preserved even in degraded mode: an already-set
-        # cancel_event must abort before starting the plain HF download.
+        # Cancellation still holds: an already-set cancel_event aborts before the HF download.
         import threading as _threading
 
         cancelled = _threading.Event()
@@ -213,11 +204,9 @@ def _fake_snapshot(repo_id, **kwargs):
 
 
 def test_degrades_when_unsloth_zoo_entirely_absent():
-    """Studio-only test/build environments do not install unsloth_zoo at all. The
-    shim must degrade for that case too: ``import unsloth_zoo.hf_xet_fallback``
-    then raises ModuleNotFoundError(name='unsloth_zoo') (the top-level package),
-    not name='unsloth_zoo.hf_xet_fallback'. Regression guard for the shim re-raising
-    and breaking every Studio import that transitively pulls it in."""
+    """When unsloth_zoo is absent entirely, the import raises
+    ModuleNotFoundError(name='unsloth_zoo') (top-level package). Guard that the shim still
+    degrades and does not re-raise, breaking every Studio import that pulls it in."""
     import importlib
 
     class _BlockZoo:
@@ -227,8 +216,7 @@ def find_spec(
             path = None,
             target = None,
         ):
-            # Simulate the whole package being absent: the first missing component
-            # is 'unsloth_zoo', so that is what ModuleNotFoundError.name carries.
+            # Whole package absent, so ModuleNotFoundError.name is the top-level 'unsloth_zoo'.
             if name == "unsloth_zoo" or name.startswith("unsloth_zoo."):
                 raise ModuleNotFoundError("No module named 'unsloth_zoo'", name = "unsloth_zoo")
             return None
@@ -259,11 +247,8 @@ def find_spec(
 
 
 def test_degrades_when_shared_helper_import_raises_importerror():
-    """unsloth_zoo can be installed yet fail to import because a heavy dependency
-    it initializes at package import (torch) is missing -- a llama.cpp/GGUF-only
-    Studio install has no torch. That raises ImportError (not ModuleNotFoundError),
-    so the shim must degrade for it too rather than crash the Studio server at
-    startup. Regression guard for the inference E2E jobs."""
+    """unsloth_zoo can be installed yet fail to import when torch is missing (llama.cpp/GGUF-only
+    Studio), raising ImportError not ModuleNotFoundError. The shim must degrade for that too."""
     import importlib
 
     class _BlockWithImportError:
@@ -274,7 +259,7 @@ def find_spec(
             target = None,
         ):
             if name == "unsloth_zoo.hf_xet_fallback":
-                # Mirror unsloth_zoo/__init__ on a torch-less install: a plain ImportError with no .name.
+                # Mirror a torch-less install: a plain ImportError with no .name.
                 raise ImportError("Unsloth: Pytorch is not installed.")
             return None
 
@@ -301,9 +286,8 @@ def find_spec(
 
 
 def test_retries_under_light_gpu_init_when_import_fails(monkeypatch):
-    """unsloth_zoo's __init__ runs GPU detection that raises NotImplementedError on a GPU-less host
-    (CPU GGUF Studio). The shim must retry the import under UNSLOTH_ZOO_DISABLE_GPU_INIT=1 (its light
-    path), then restore the env; if even the retry fails, it degrades instead of crashing."""
+    """GPU detection in unsloth_zoo's __init__ raises NotImplementedError on a GPU-less host. The shim
+    retries under UNSLOTH_ZOO_DISABLE_GPU_INIT=1, restores the env, and degrades if the retry fails."""
     import importlib
     import os
 
@@ -317,11 +301,10 @@ def find_spec(
             path = None,
             target = None,
         ):
-            # The crash is in unsloth_zoo's __init__ (the PARENT import fails before the submodule), so
-            # intercept "unsloth_zoo" itself (works whether or not it is installed; finder is first).
+            # Crash is in unsloth_zoo's __init__, so intercept "unsloth_zoo" itself (the parent).
             if name == "unsloth_zoo":
                 # Record the env each attempt sees; raise the no-GPU error both times so the shim
-                # degrades (the recovery-succeeds path is covered by real unsloth_zoo on a CPU host).
+                # degrades.
                 seen_env.append(os.environ.get("UNSLOTH_ZOO_DISABLE_GPU_INIT"))
                 raise NotImplementedError("Unsloth cannot find any torch accelerator")
             return None
diff --git a/studio/backend/tests/test_model_update_robustness.py b/studio/backend/tests/test_model_update_robustness.py
index 00bda5f14f..300eb587b3 100644
--- a/studio/backend/tests/test_model_update_robustness.py
+++ b/studio/backend/tests/test_model_update_robustness.py
@@ -5,9 +5,8 @@
 Covers:
   * GGUF variant listing computes update_available from the already-fetched
     sibling metadata instead of a second Hub call.
-  * hf_hub_download_with_xet_fallback forwards force_download through the
-    deduplicated shim to the shared unsloth_zoo helper (which owns the
-    try_to_load_from_cache cache-first early-return and its bypass).
+  * hf_hub_download_with_xet_fallback forwards force_download through the shim to the
+    shared unsloth_zoo helper (which owns the cache-first early-return and its bypass).
 
 The cache "Update" action now runs through the download manager as a normal
 managed download (so it shows in the Downloads panel with progress + cancel),
@@ -343,11 +342,8 @@ def test_cached_model_scan_keeps_local_safetensors_repo(monkeypatch, tmp_path):
 
 
 def test_force_download_is_forwarded_through_the_shim(monkeypatch):
-    """The deduplicated shim delegates to the shared unsloth_zoo helper, which owns the
-    try_to_load_from_cache early-return and its force_download bypass (covered by the zoo suite's
-    test_force_download_file_skips_cache_probe). The shim's own contract is to forward force_download
-    unchanged so Studio's model-update path re-fetches a newer blob; verify both force_download=False
-    and force_download=True reach the shared helper (X2/F2)."""
+    """The shim's contract is to forward force_download unchanged to the shared helper (which owns the
+    cache-first early-return and bypass). Verify both False and True reach it (X2/F2)."""
     import utils.hf_xet_fallback as X
 
     seen = []
diff --git a/studio/backend/utils/hf_xet_fallback.py b/studio/backend/utils/hf_xet_fallback.py
index 5451099da6..2dd2247396 100644
--- a/studio/backend/utils/hf_xet_fallback.py
+++ b/studio/backend/utils/hf_xet_fallback.py
@@ -1,13 +1,11 @@
 # SPDX-License-Identifier: AGPL-3.0-only
 # Copyright 2026-present the Unsloth AI Inc. team. All rights reserved. See /studio/LICENSE.AGPL-3.0
 
-"""Studio shim over the shared Xet -> HTTP stall fallback.
+"""Studio shim over the shared ``unsloth_zoo.hf_xet_fallback`` Xet -> HTTP stall fallback.
 
-The watchdog, spawn-child download, and Xet -> HTTP retry live once in
-``unsloth_zoo.hf_xet_fallback`` (shared by Unsloth main and Studio). This module re-exports that API
-and injects Studio's marker-aware cache purge (``prepare_cache_for_transport``) so the download
-manager keeps its ``.transport`` marker semantics on the HTTP retry. Call sites and the orchestrator's
-``DownloadStallError`` import are unchanged.
+Re-exports the shared API and injects Studio's marker-aware cache purge
+(``prepare_cache_for_transport``) so the download manager keeps its ``.transport``
+marker semantics on the HTTP retry.
 """
 
 from __future__ import annotations
@@ -20,11 +18,9 @@
     import unsloth_zoo.hf_xet_fallback as _shared
     _shared_available = True
 except Exception as _exc:  # noqa: BLE001 - any import failure must degrade, not crash
-    # unsloth_zoo's package __init__ runs torch/GPU detection, which raises on a Studio host without
-    # torch / without a GPU (CPU / llama.cpp GGUF-only). The download helper needs none of that, so
-    # retry on the light import path (UNSLOTH_ZOO_DISABLE_GPU_INIT) before giving up. The full GPU
-    # path above is unchanged on a normal host; a failed import is dropped from sys.modules so __init__
-    # re-runs here.
+    # unsloth_zoo's __init__ runs torch/GPU detection, which raises on a torch-less/GPU-less Studio
+    # host. The download helper needs none of it, so retry via the light UNSLOTH_ZOO_DISABLE_GPU_INIT
+    # path before giving up.
     _shared_import_error = _exc
     import os as _os
 
@@ -34,8 +30,7 @@
         import unsloth_zoo.hf_xet_fallback as _shared
         _shared_available = True
         _shared_import_error = None
-    except Exception as _exc2:  # noqa: BLE001
-        # unsloth_zoo absent / too old / broken: degrade so Studio still boots with plain HF downloads.
+    except Exception as _exc2:  # noqa: BLE001 - degrade so Studio still boots with plain HF downloads
         _shared_import_error = _exc2
         _shared_available = False
     finally:
@@ -45,8 +40,7 @@
             _os.environ["UNSLOTH_ZOO_DISABLE_GPU_INIT"] = _prev_gpu_init
 
 if _shared_available:
-    # Bind by assignment (not `from ... import`) so each public name has one module-level binding
-    # shared between this branch and the degraded one below.
+    # Bind by assignment so each public name shares one module-level binding with the degraded branch.
     DEFAULT_GRACE_PERIOD = _shared.DEFAULT_GRACE_PERIOD
     DEFAULT_HEARTBEAT_INTERVAL = _shared.DEFAULT_HEARTBEAT_INTERVAL
     DEFAULT_STALL_TIMEOUT = _shared.DEFAULT_STALL_TIMEOUT
@@ -57,9 +51,8 @@
     _shared_hf_hub_download_with_xet_fallback = _shared.hf_hub_download_with_xet_fallback
     _shared_snapshot_download_with_xet_fallback = _shared.snapshot_download_with_xet_fallback
 else:
-    # Degrade gracefully instead of crashing Studio: plain HF downloads with the stall watchdog
-    # disabled (the same best-effort posture core Unsloth uses). Recovery returns once unsloth_zoo is
-    # upgraded. Thin stubs, not a second copy of the orchestration.
+    # Degrade instead of crashing Studio: plain HF downloads, stall watchdog disabled. Thin stubs,
+    # not a second copy of the orchestration; recovery returns once unsloth_zoo is upgraded.
     import logging as _logging
 
     _logging.getLogger(__name__).warning(
@@ -74,8 +67,7 @@
     DEFAULT_GRACE_PERIOD = 10.0
 
     class DownloadStallError(RuntimeError):
-        """Stub mirror of the shared type so callers' ``except`` clauses still resolve; never raised in
-        degraded mode (no watchdog to detect a stall)."""
+        """Stub mirror so callers' ``except`` clauses resolve; never raised in degraded mode."""
 
     def child_should_disable_xet(config: dict) -> bool:
         return bool(config.get("disable_xet"))
@@ -90,8 +82,8 @@ def start_watchdog(
         xet_disabled: bool = False,
         **kwargs: Any,
     ) -> "threading.Event":
-        # No stall detection here, but keep emitting heartbeats so the orchestrator's inactivity
-        # deadline is not tripped during a legitimately long download.
+        # No stall detection, but keep emitting heartbeats so the orchestrator's inactivity deadline
+        # is not tripped during a long download.
         stop = threading.Event()
         if on_heartbeat is None:
             return stop
@@ -126,8 +118,7 @@ def _shared_hf_hub_download_with_xet_fallback(
         cancel_event: "Optional[threading.Event]" = None,
         **_ignored: Any,
     ) -> str:
-        # No subprocess to interrupt here, but keep the cancellation contract: do not start or return
-        # a download once cancelled.
+        # Keep the cancellation contract: do not start or return a download once cancelled.
         if _degraded_cancelled(cancel_event):
             raise RuntimeError("Cancelled")
 
@@ -193,9 +184,9 @@ def _shared_snapshot_download_with_xet_fallback(
 
 
 def _studio_prepare_for_http(repo_type: str, repo_id: str) -> None:
-    """Make the partial safe for an HTTP resume using Studio's marker-aware purge, so the download
-    manager's ``.transport`` accounting stays consistent (vs unsloth_zoo's generic default). Guarded so
-    a purge failure (locked file, missing dir) is logged rather than aborting the HTTP retry."""
+    """Studio's marker-aware purge before an HTTP resume, keeping the download manager's ``.transport``
+    accounting consistent (vs unsloth_zoo's generic default). Guarded: a purge failure is logged,
+    not fatal to the retry."""
     try:
         from hub.utils.download_registry import prepare_cache_for_transport
         prepare_cache_for_transport(repo_type, repo_id, "http")
@@ -224,9 +215,8 @@ def hf_hub_download_with_xet_fallback(
     on_status: Optional[Callable[[str], None]] = None,
     force_download: bool = False,
 ) -> str:
-    """Single-file download with the shared Xet -> HTTP stall fallback, using
-    Studio's marker-aware cache prep on the HTTP retry. ``force_download`` re-fetches a newer
-    remote blob even if an older one is cached (Studio's model-update path)."""
+    """Single-file download via the shared fallback with Studio's marker-aware HTTP-retry prep.
+    ``force_download`` re-fetches a newer blob over a cached one (Studio's model-update path)."""
     return _shared_hf_hub_download_with_xet_fallback(
         repo_id,
         filename,
@@ -244,7 +234,6 @@ def hf_hub_download_with_xet_fallback(
 
 
 def snapshot_download_with_xet_fallback(repo_id: str, **kwargs: Any) -> str:
-    """Whole-repo download with the shared Xet -> HTTP stall fallback, using Studio's
-    marker-aware cache prep on the HTTP retry (same injection as the single-file path)."""
+    """Whole-repo download via the shared fallback with Studio's marker-aware HTTP-retry prep."""
     kwargs.setdefault("prepare_for_http_fn", _studio_prepare_for_http)
     return _shared_snapshot_download_with_xet_fallback(repo_id, **kwargs)
diff --git a/studio/frontend/src/components/assistant-ui/model-selector/model-update-action.tsx b/studio/frontend/src/components/assistant-ui/model-selector/model-update-action.tsx
index ee0952f7c3..db7628777a 100644
--- a/studio/frontend/src/components/assistant-ui/model-selector/model-update-action.tsx
+++ b/studio/frontend/src/components/assistant-ui/model-selector/model-update-action.tsx
@@ -42,8 +42,8 @@ export function ModelUpdateAction({
 }: ModelUpdateActionProps) {
   const [open, setOpen] = useState(false);
 
-  // When this repo+variant's managed download finishes, refresh the caller so the
-  // "update available" cue clears. A ref keeps the subscription stable across renders.
+  // Refresh the caller when this repo+variant's download finishes so the "update available" cue
+  // clears. A ref keeps the subscription stable across renders.
   const onUpdatedRef = useRef(onUpdated);
   onUpdatedRef.current = onUpdated;
   useEffect(() => {
@@ -58,8 +58,8 @@ export function ModelUpdateAction({
   }, [repoId, variant]);
 
   const handleConfirm = useCallback(() => {
-    // Start the background re-download and close the dialog; the Downloads panel owns
-    // progress + cancel. Only a failure to START toasts (a failed download shows in the panel).
+    // Start the re-download and close the dialog; the Downloads panel owns progress + cancel.
+    // Only a failure to START toasts (a failed download shows in the panel).
     void Promise.resolve()
       .then(onConfirm)
       .catch((err) => {
diff --git a/studio/frontend/src/components/assistant-ui/model-selector/pickers.tsx b/studio/frontend/src/components/assistant-ui/model-selector/pickers.tsx
index 9a6b93eb8e..d3ae638e5c 100644
--- a/studio/frontend/src/components/assistant-ui/model-selector/pickers.tsx
+++ b/studio/frontend/src/components/assistant-ui/model-selector/pickers.tsx
@@ -1245,8 +1245,8 @@ export function HubModelPicker({
   onEject?: () => void;
 }) {
   const gpu = useGpuInfo();
-  // The live model id from the runtime store (backend-mirrored active_model), not the
-  // dropdown highlight which can be a staged pick. Disables the update action for it.
+  // Live model id from the runtime store (backend-mirrored active_model), not the dropdown
+  // highlight which can be a staged pick. Disables the update action for it.
   const loadedModelId = useChatRuntimeStore((s) => s.params.checkpoint);
   // Last-loaded timestamps power the "Recent" sort (vs "Downloaded" = file date).
   const loadTimes = useModelLoadTimes(value);
diff --git a/tests/test_prefetch_snapshot_scope.py b/tests/test_prefetch_snapshot_scope.py
index 4327472506..9fc2878c17 100644
--- a/tests/test_prefetch_snapshot_scope.py
+++ b/tests/test_prefetch_snapshot_scope.py
@@ -1,12 +1,9 @@
-"""Pure-CPU, no-network unit tests for the prefetch snapshot scoping in
-unsloth/models/_utils.py.
-
-maybe_prefetch_hf_snapshot warms the HF cache before the in-process load so the load is a cache
-hit and cannot hang on a stalled Xet transfer. The warm must download AT LEAST what the load
-reads (else the missing file falls to an unprotected in-process Xet fetch) but should not pull
-weights the load never reads. These tests lock the allow_patterns / ignore_patterns each mode
-hands snapshot_download_with_xet_fallback (Codex #6638: adapter-only, weights-at-root, subfolder).
-No network, no subprocess: the zoo downloader is monkeypatched to capture its kwargs.
+"""Pure-CPU, no-network unit tests for prefetch snapshot scoping in unsloth/models/_utils.py.
+
+maybe_prefetch_hf_snapshot warms the HF cache before the in-process load. The warm must cover at
+least what the load reads (else the missing file falls to an unprotected in-process Xet fetch) but
+not pull weights the load never reads. These tests lock the allow/ignore patterns each mode hands
+snapshot_download_with_xet_fallback. The zoo downloader is monkeypatched to capture its kwargs.
 """
 
 import fnmatch
@@ -19,9 +16,7 @@
 
 
 def _filter(names, allow_patterns, ignore_patterns):
-    """Mirror Hugging Face filter_repo_objects: keep a name if it matches any allow pattern
-    (or allow is None), then drop it if it matches any ignore pattern. fnmatch '*' spans '/'
-    exactly as HF's matcher does, so this reproduces the real selection over a sample file list."""
+    """Mirror HF filter_repo_objects: keep on allow match (or None), drop on ignore match."""
     kept = []
     for name in names:
         if allow_patterns is not None and not any(fnmatch.fnmatch(name, p) for p in allow_patterns):
@@ -34,11 +29,8 @@ def _filter(names, allow_patterns, ignore_patterns):
 
 @pytest.fixture
 def capture(monkeypatch):
-    """Call maybe_prefetch_hf_snapshot with a fake repo id and capture the allow/ignore patterns
-    it forwards to the zoo downloader. A fake unsloth_zoo.hf_xet_fallback module is injected into
-    sys.modules so the test is independent of the installed unsloth_zoo version (the published
-    package may predate the helper, which maybe_prefetch_hf_snapshot then imports lazily). Offline
-    env vars are cleared so the warm is not skipped."""
+    """Run maybe_prefetch_hf_snapshot with a fake repo, capturing the patterns forwarded to a
+    fake injected zoo downloader (independent of the installed unsloth_zoo). Offline env cleared."""
     monkeypatch.delenv("HF_HUB_OFFLINE", raising = False)
     monkeypatch.delenv("TRANSFORMERS_OFFLINE", raising = False)
 
@@ -56,9 +48,8 @@ def fake_download(repo_id, **kw):
     fake_module.DownloadStallError = type("DownloadStallError", (RuntimeError,), {})
     monkeypatch.setitem(sys.modules, "unsloth_zoo.hf_xet_fallback", fake_module)
 
-    # Neutralize the model_info network call (adapter format selection / use_safetensors auto
-    # branch) by default so the pure-CPU tests never reach the Hub. A best-effort failure leaves
-    # both weight formats eligible; tests that exercise format selection install their own.
+    # Neutralize the model_info network call by default; tests exercising format selection
+    # install their own.
     import huggingface_hub
 
     class _NoNetworkApi:
@@ -75,8 +66,7 @@ def run(**call_kwargs):
     return run
 
 
-# A representative repo file listing: root weights + tokenizer/config, plus an alternate-precision
-# subdir, an adapter, a checkpoint dir, and merged full-model weights an adapter repo might ship.
+# Representative repo listing: root weights + aux, subdir, adapter, checkpoint, merged weights.
 _SAMPLE_FILES = [
     "config.json",
     "tokenizer.json",
@@ -94,16 +84,13 @@ def run(**call_kwargs):
 
 
 def test_weights_at_root_excludes_subdir_weights(capture):
-    """A bare root load reads only root weight files, so weights nested in subdirs (fp16/,
-    experimental/, checkpoint-500/) must be ignored while the root weights stay warmed. An
-    explicit use_safetensors avoids the auto branch's model_info network call."""
+    """A root load ignores subdir weights (fp16/, experimental/, checkpoint-500/) but keeps root weights."""
     ok, st = capture(weights_at_root = True, use_safetensors = True)
     assert ok is True
-    assert st["allow_patterns"] is None  # the warm stays otherwise unfiltered
+    assert st["allow_patterns"] is None
     ig = st["ignore_patterns"]
     assert "*/*.safetensors" in ig and "*/*.bin" in ig
     kept = _filter(_SAMPLE_FILES, st["allow_patterns"], ig)
-    # Root weights + config/tokenizer survive; subdir weights are dropped.
     assert "model-00001-of-00002.safetensors" in kept
     assert "model.safetensors.index.json" in kept
     assert "config.json" in kept
@@ -113,27 +100,23 @@ def test_weights_at_root_excludes_subdir_weights(capture):
 
 
 def test_adapter_only_excludes_merged_weights(capture):
-    """An adapter warm reads only adapter_config.json + adapter_model.* (plus root tokenizer /
-    config); a repo that also ships merged full-model weights must not pull them."""
+    """An adapter warm keeps adapter files + root aux, not merged full-model weights."""
     ok, st = capture(adapter_only = True)
     assert ok is True
-    assert st["ignore_patterns"] is None  # the exact allowlist makes the format filter moot
+    assert st["ignore_patterns"] is None
     allow = st["allow_patterns"]
     assert "adapter_config.json" in allow and "adapter_model*" in allow
     kept = _filter(_SAMPLE_FILES, allow, st["ignore_patterns"])
-    # The adapter's own files + the root aux files are warmed.
     assert "adapter_config.json" in kept
     assert "adapter_model.safetensors" in kept
     assert "config.json" in kept and "tokenizer.json" in kept
-    # The merged / full-model weights are NOT pulled.
     assert "model-00001-of-00002.safetensors" not in kept
     assert "pytorch_model.bin" not in kept
     assert "fp16/model.safetensors" not in kept
 
 
 def test_adapter_only_warms_sharded_adapter(capture):
-    """A sharded adapter (adapter_model-00001-of-00002.safetensors) is still covered by the
-    adapter_model* glob, so a large adapter is not left to an in-process Xet fetch."""
+    """A sharded adapter is still covered by the adapter_model* glob."""
     _, st = capture(adapter_only = True)
     sharded = [
         "adapter_config.json",
@@ -146,7 +129,7 @@ def test_adapter_only_warms_sharded_adapter(capture):
 
 
 def test_tokenizer_only_warms_only_aux_files(capture):
-    """A distinct tokenizer repo warms only its tokenizer / config / vocab files, never weights."""
+    """A tokenizer-only repo warms tokenizer/config/vocab files, never weights."""
     _, st = capture(tokenizer_only = True)
     assert st["ignore_patterns"] is None
     assert st["allow_patterns"] == list(U._ROOT_AUX_PREFETCH_PATTERNS)
@@ -157,18 +140,16 @@ def test_tokenizer_only_warms_only_aux_files(capture):
 
 
 def test_aux_warm_covers_arbitrary_remote_code_modules(capture):
-    """A trust_remote_code auto_map can name its module arbitrarily (modeling.py, tokenization.py,
-    my_code.py), not just the transformers modeling_*.py convention, so the aux warm must cover any
-    *.py -- else the load fetches the code file in-process over Xet (Codex #6638)."""
+    """The aux warm must cover any *.py, since trust_remote_code auto_map names modules freely."""
     _, st = capture(tokenizer_only = True)
     allow = st["allow_patterns"]
     assert "*.py" in allow
     remote_code = [
         "config.json",
-        "modeling.py",  # auto_map "modeling.Model" -- no underscore suffix
+        "modeling.py",
         "tokenization.py",
         "my_custom_code.py",
-        "configuration_foo.py",  # the convention still covered by *.py too
+        "configuration_foo.py",
     ]
     kept = _filter(remote_code, allow, st["ignore_patterns"])
     for name in ("modeling.py", "tokenization.py", "my_custom_code.py", "configuration_foo.py"):
@@ -176,8 +157,7 @@ def test_aux_warm_covers_arbitrary_remote_code_modules(capture):
 
 
 def test_subfolder_warms_subfolder_plus_root_aux(capture):
-    """A subfolder load warms that subfolder's weights plus the root tokenizer / config; the
-    root weights and OTHER subfolders are skipped."""
+    """A subfolder load warms that subfolder's weights plus root aux; other subdirs/root weights skipped."""
     _, st = capture(subfolder = "fp16")
     allow = st["allow_patterns"]
     assert "fp16/*" in allow
@@ -189,9 +169,7 @@ def test_subfolder_warms_subfolder_plus_root_aux(capture):
 
 
 def test_subfolder_takes_precedence_over_weights_at_root(capture):
-    """weights_at_root is a root-load assertion; when a subfolder IS requested the subfolder
-    branch wins (the load reads that subfolder), so the warm is the subfolder, not a
-    root-with-subdir-weights-excluded warm."""
+    """When a subfolder is requested the subfolder branch wins over weights_at_root."""
     _, st = capture(subfolder = "fp16", weights_at_root = True)
     assert "fp16/*" in st["allow_patterns"]
     kept = _filter(_SAMPLE_FILES, st["allow_patterns"], st["ignore_patterns"])
@@ -199,7 +177,7 @@ def test_subfolder_takes_precedence_over_weights_at_root(capture):
 
 
 def test_local_dir_is_not_warmed(capture, tmp_path):
-    """A local directory path has nothing to download: the warm is skipped (returns False)."""
+    """A local directory path skips the warm (returns False)."""
     d = tmp_path / "local-model"
     d.mkdir()
     ok = U.maybe_prefetch_hf_snapshot(str(d), weights_at_root = True)
@@ -207,7 +185,7 @@ def test_local_dir_is_not_warmed(capture, tmp_path):
 
 
 def _install_fake_model_info(monkeypatch, filenames):
-    """Make HfApi().model_info(...).siblings report *filenames*, with no network."""
+    """Make HfApi().model_info(...).siblings report filenames, with no network."""
     import huggingface_hub
 
     class _Sib:
@@ -229,57 +207,45 @@ def model_info(self, *a, **k):
 
 
 def test_variant_keeps_bin_when_only_default_safetensors(monkeypatch):
-    """With variant='fp16' requested, a DEFAULT model.safetensors must not prove the variant
-    pytorch_model.fp16.bin redundant: dropping it would leave the variant load to fetch the .bin
-    in-process over Xet. The .bin stays warmed (Codex #6638)."""
+    """A default model.safetensors must not prove a variant .bin redundant; without a variant it does."""
     _install_fake_model_info(monkeypatch, ["model.safetensors", "pytorch_model.fp16.bin"])
     ig = U._prefetch_ignore_patterns("org/repo", variant = "fp16", weights_at_root = True)
     assert "*.bin" not in ig
-    # No variant: the default safetensors DOES make .bin redundant (existing behavior).
     ig_default = U._prefetch_ignore_patterns("org/repo", weights_at_root = True)
     assert "*.bin" in ig_default
 
 
 def test_variant_drops_bin_when_variant_safetensors_present(monkeypatch):
-    """When a variant-matching safetensors (model.fp16.safetensors) is shipped, the variant load
-    reads it and the variant .bin is redundant, so .bin is dropped from the warm."""
+    """A variant-matching safetensors makes the variant .bin redundant, so .bin is dropped."""
     _install_fake_model_info(monkeypatch, ["model.fp16.safetensors", "pytorch_model.fp16.bin"])
     ig = U._prefetch_ignore_patterns("org/repo", variant = "fp16", weights_at_root = True)
     assert "*.bin" in ig
 
 
 def test_no_variant_keeps_bin_when_only_variant_safetensors(monkeypatch):
-    """A no-variant load reads pytorch_model.bin; a lone variant safetensors (model.fp16.safetensors)
-    must NOT prove the .bin redundant -- only a CANONICAL safetensors does. Else the .bin the load reads
-    is dropped from the warm and fetched in-process over Xet (Codex #6638)."""
+    """For a no-variant load, only a canonical safetensors (not a lone variant) makes .bin redundant."""
     _install_fake_model_info(monkeypatch, ["model.fp16.safetensors", "pytorch_model.bin"])
-    ig = U._prefetch_ignore_patterns("org/repo", weights_at_root = True)  # variant unset
+    ig = U._prefetch_ignore_patterns("org/repo", weights_at_root = True)
     assert "*.bin" not in ig
-    # A canonical safetensors DOES make the .bin redundant for a no-variant load.
     _install_fake_model_info(monkeypatch, ["model.safetensors", "pytorch_model.bin"])
     ig2 = U._prefetch_ignore_patterns("org/repo", weights_at_root = True)
     assert "*.bin" in ig2
 
 
 def test_variant_keeps_bin_for_noncanonical_sidecar(monkeypatch):
-    """With variant='fp16', a NON-canonical sidecar (consolidated.fp16.safetensors) must not prove the
-    variant pytorch_model.fp16.bin redundant: a transformers variant load reads model.fp16.safetensors,
-    not consolidated.*, so dropping the .bin would leave the only loadable weights to an in-process Xet
-    fetch. The .bin stays warmed (Codex #6638)."""
+    """A non-canonical variant sidecar must not prove the variant .bin redundant; a canonical one does."""
     _install_fake_model_info(
         monkeypatch, ["consolidated.fp16.safetensors", "pytorch_model.fp16.bin"]
     )
     ig = U._prefetch_ignore_patterns("org/repo", variant = "fp16", weights_at_root = True)
     assert "*.bin" not in ig
-    # A canonical variant safetensors DOES make the variant .bin redundant.
     _install_fake_model_info(monkeypatch, ["model.fp16.safetensors", "pytorch_model.fp16.bin"])
     ig2 = U._prefetch_ignore_patterns("org/repo", variant = "fp16", weights_at_root = True)
     assert "*.bin" in ig2
 
 
 def test_is_canonical_model_weight_safetensors():
-    """The canonical detector matches only the non-variant model-weight safetensors names a default
-    load reads, and rejects variant / sidecar names (Codex #6638)."""
+    """The canonical detector matches only non-variant model-weight safetensors names."""
     assert U._is_canonical_model_weight_safetensors("model.safetensors") is True
     assert U._is_canonical_model_weight_safetensors("model-00001-of-00002.safetensors") is True
     assert U._is_canonical_model_weight_safetensors("model.safetensors.index.json") is True
@@ -291,10 +257,7 @@ def test_is_canonical_model_weight_safetensors():
 
 
 def test_st_prefetch_resolves_env_cache_and_runs_after_validation():
-    """The ST prefetch must resolve SENTENCE_TRANSFORMERS_HOME for its cache (so a load relying on that
-    env is a cache hit, not an unprotected in-process download) and must run AFTER the mutually-exclusive
-    load-mode validation (so a config rejected locally wastes no multi-GB download) (Codex #6638). Static
-    guard: importing ST pulls heavy optional deps."""
+    """The ST prefetch must resolve SENTENCE_TRANSFORMERS_HOME and run after load-mode validation."""
     import ast
     import os
 
@@ -311,22 +274,19 @@ def test_st_prefetch_resolves_env_cache_and_runs_after_validation():
     ]
     assert len(prefetch_calls) == 1, "expected exactly one ST prefetch call"
     call = prefetch_calls[0]
-    # F3: the cache_dir kwarg resolves SENTENCE_TRANSFORMERS_HOME.
+    # cache_dir kwarg resolves SENTENCE_TRANSFORMERS_HOME.
     cache_dir_kw = next((kw for kw in call.keywords if kw.arg == "cache_dir"), None)
     assert cache_dir_kw is not None, "ST prefetch must pass cache_dir"
     assert "SENTENCE_TRANSFORMERS_HOME" in ast.dump(
         cache_dir_kw.value
     ), "ST prefetch cache_dir must resolve SENTENCE_TRANSFORMERS_HOME"
-    # F2: the load-mode validation runs before the prefetch (fewer source lines = earlier).
+    # Load-mode validation runs before the prefetch (fewer source lines = earlier).
     val_lineno = src[: src.index("Can only load in 4bit or 8bit or 16bit")].count("\n")
     assert val_lineno < call.lineno, "load-mode validation must precede the ST prefetch"
 
 
 def test_st_cache_resolutions_honor_explicit_hf_cache_dir():
-    """Every ST cache resolution (the prefetch and the fallback module loads) that falls back to
-    SENTENCE_TRANSFORMERS_HOME must first honor an explicit HF cache_dir. The FastModel fallback load
-    forwards kwargs['cache_dir'], so a caller passing cache_dir would otherwise warm one cache and read
-    another, missing the warm and fetching in-process over Xet (Codex #6638). Static guard."""
+    """Every ST cache resolution falling back to SENTENCE_TRANSFORMERS_HOME must first honor an explicit HF cache_dir."""
     import ast
     import os
 
@@ -348,10 +308,7 @@ def test_st_cache_resolutions_honor_explicit_hf_cache_dir():
 
 
 def test_st_native_loads_map_hf_cache_dir_to_cache_folder():
-    """The for_inference and fast-encoder branches construct a native SentenceTransformer, which takes
-    cache_folder (not cache_dir). The prefetch warms cache_dir first, so an explicit HF cache_dir must be
-    mapped onto cache_folder for those native loads; otherwise the load reads a different cache, misses the
-    warm, and starts an unprotected in-process Xet download (Codex #6638). Static guard."""
+    """Native SentenceTransformer loads take cache_folder, so an explicit HF cache_dir must be mapped onto it."""
     import ast
     import os
 
@@ -359,7 +316,7 @@ def test_st_native_loads_map_hf_cache_dir_to_cache_folder():
     with open(src_path, "r", encoding = "utf-8") as f:
         src = f.read()
     tree = ast.parse(src)
-    # Every native SentenceTransformer(...) constructor that forwards cache_folder must read cache_dir.
+    # Every native SentenceTransformer(...) forwarding cache_folder must read cache_dir.
     st_calls = [
         n
         for n in ast.walk(tree)
@@ -373,8 +330,7 @@ def test_st_native_loads_map_hf_cache_dir_to_cache_folder():
         assert "'cache_dir'" in ast.dump(
             kw.value
         ), "a native SentenceTransformer cache_folder must map the explicit HF cache_dir first"
-    # The for_inference branch feeds cache_folder through st_kwargs; it must map cache_dir there too, and
-    # both native branches resolve cache_dir -> cache_folder (reformatting-tolerant normalized check).
+    # for_inference feeds cache_folder via st_kwargs; both native branches map cache_dir -> cache_folder.
     normalized = "".join(src.split())
     assert (
         'st_kwargs["cache_folder"]=' in normalized
@@ -385,10 +341,7 @@ def test_st_native_loads_map_hf_cache_dir_to_cache_folder():
 
 
 def test_vision_warms_vllm_tokenizer_after_remap():
-    """On the vLLM path the base warm is skipped and the tokenizer warm is deferred until after
-    fast_inference_setup may remap model_name. The final tokenizer repo must then be warmed (tokenizer
-    only) so the in-process processor / tokenizer load is a cache hit, not an unprotected Xet fetch
-    (Codex #6638). Static guard: the vLLM-gated tokenizer warm appears after the remap."""
+    """On the vLLM path the tokenizer warm is deferred until after the fast_inference_setup remap."""
     import os
 
     src_path = os.path.join(os.path.dirname(U.__file__), "vision.py")
@@ -402,10 +355,7 @@ def test_vision_warms_vllm_tokenizer_after_remap():
 
 
 def test_diffusion_forwards_variant_to_real_load():
-    """FastDiffusionModel must forward `variant` to the real model_cls.from_pretrained load, not only to
-    the prefetch: without it the pipeline asks for the default weight variant, missing the warmed variant
-    weights (wrong precision, or a default weight fetched in-process over Xet) (Codex #6638). Static
-    guard."""
+    """FastDiffusionModel must forward variant to the real model_cls.from_pretrained load, not just the prefetch."""
     import os
 
     src_path = os.path.join(os.path.dirname(U.__file__), "diffusion.py")
@@ -417,11 +367,7 @@ def test_diffusion_forwards_variant_to_real_load():
 
 
 def test_vision_prefetch_runs_after_load_mode_validation():
-    """The FastBaseModel (vision / FastModel) prefetch must run AFTER the mutually-exclusive load-mode
-    validation, so an invalid load_in_4bit/8bit/16bit combination fails locally without first downloading
-    a multi-GB snapshot (Codex #6638). check_and_disable_bitsandbytes_loading can only resolve after the
-    config fetch, so the check cannot move earlier; the prefetch moves after it instead. Static guard:
-    importing the loader pulls heavy optional deps."""
+    """The FastBaseModel (vision) prefetch must run after the load-mode validation."""
     import ast
     import os
 
@@ -443,10 +389,7 @@ def test_vision_prefetch_runs_after_load_mode_validation():
 
 
 def test_llama_prefetch_skips_only_real_vllm_loads():
-    """A num_labels classification load takes the AutoModelForSequenceClassification branch (an in-process
-    download) even under fast_inference=True, so the llama prefetch's fast_inference skip must be gated on
-    num_labels is None -- else that load's weights fetch over un-killable Xet (Codex #6638). Static guard:
-    the base prefetch's fast_inference kwarg references both fast_inference and num_labels."""
+    """The llama prefetch's fast_inference skip must be gated on num_labels is None (a classification load still downloads)."""
     import ast
     import os
 
@@ -471,12 +414,7 @@ def test_llama_prefetch_skips_only_real_vllm_loads():
 
 
 def test_st_fallback_module_loads_resolve_env_cache():
-    """The fallback module loads must resolve the SAME cache the prefetch warmed. _module_path /
-    _read_pooling_mode call hf_hub_download directly, which does NOT honor SENTENCE_TRANSFORMERS_HOME,
-    so any cache_dir derived from cache_folder must also fall back to the env var; otherwise, when a
-    caller relies on SENTENCE_TRANSFORMERS_HOME without passing cache_folder, modules.json / module
-    files miss the warm and are fetched in-process over Xet (Codex #6638). Static guard: importing ST
-    pulls heavy optional deps."""
+    """Fallback module loads deriving cache_dir from cache_folder must also fall back to SENTENCE_TRANSFORMERS_HOME."""
     import ast
     import os
 
@@ -485,9 +423,7 @@ def test_st_fallback_module_loads_resolve_env_cache():
         src = f.read()
     tree = ast.parse(src)
 
-    # Every _module_path / _load_modules call whose cache_dir is derived from cache_folder (i.e. the
-    # from_pretrained fallback sites, not the internal `cache_dir = cache_dir` pass-throughs) must also
-    # resolve SENTENCE_TRANSFORMERS_HOME so the resolution matches the prefetch above.
+    # Fallback sites (cache_dir derived from cache_folder) must resolve SENTENCE_TRANSFORMERS_HOME.
     checked = 0
     for node in ast.walk(tree):
         if not (isinstance(node, ast.Call) and isinstance(node.func, ast.Attribute)):
@@ -499,7 +435,7 @@ def test_st_fallback_module_loads_resolve_env_cache():
             continue
         dumped = ast.dump(cache_dir_kw.value)
         if "cache_folder" not in dumped:
-            continue  # internal pass-through (cache_dir = cache_dir): not a resolution site
+            continue  # internal pass-through, not a resolution site
         checked += 1
         assert (
             "SENTENCE_TRANSFORMERS_HOME" in dumped
@@ -510,15 +446,9 @@ def test_st_fallback_module_loads_resolve_env_cache():
 
 
 def test_st_fallback_module_loads_forward_revision():
-    """A revision-pinned ST repo loaded via the custom fallback path loads its model WEIGHTS from the
-    requested revision (FastModel forwards revision to the weight load), so the module files (modules.json,
-    pooling config, per-module dirs) must load from the SAME revision. Otherwise they resolve the repo
-    default branch: fetched in-process over Xet (missing the prefetch's revision-pinned warm) and mixed
-    with the revision-pinned weights (Codex #6638). Static guard: (a) _module_path / _read_pooling_mode /
-    _load_modules accept a revision arg, (b) every hf_hub_download / load_dir_path inside them forwards
-    revision, (c) _load_modules threads revision into its internal _module_path / _read_pooling_mode
-    calls, (d) the from_pretrained fallback _module_path / _load_modules calls forward revision. Importing
-    ST pulls heavy optional deps."""
+    """The fallback module loads must forward revision so module files match the revision-pinned weights.
+    Guards: (a) helpers accept revision, (b) every download primitive forwards it, (c) _load_modules
+    threads it into internal calls, (d) the from_pretrained fallback sites forward it."""
     import ast
     import os
 
@@ -539,7 +469,7 @@ def test_st_fallback_module_loads_forward_revision():
         arg_names = {a.arg for a in fn.args.args + fn.args.kwonlyargs}
         assert "revision" in arg_names, f"{name} must accept a revision argument"
 
-    # (b) every direct download primitive inside the helpers forwards revision.
+    # (b) every download primitive inside the helpers forwards revision.
     downloads = 0
     for name, fn in funcs.items():
         for node in ast.walk(fn):
@@ -575,7 +505,7 @@ def test_st_fallback_module_loads_forward_revision():
             continue
         cache_dir_kw = next((kw for kw in node.keywords if kw.arg == "cache_dir"), None)
         if cache_dir_kw is None or "cache_folder" not in ast.dump(cache_dir_kw.value):
-            continue  # internal pass-through, not a from_pretrained fallback site
+            continue  # internal pass-through, not a fallback site
         checked += 1
         rev_kw = next((kw for kw in node.keywords if kw.arg == "revision"), None)
         assert rev_kw is not None and "revision" in ast.dump(
@@ -587,11 +517,7 @@ def test_st_fallback_module_loads_forward_revision():
 
 
 def test_st_fallback_model_load_resolves_env_cache():
-    """The fallback FastModel weight load resolves its cache from the HF cache_dir, not ST's cache_folder /
-    SENTENCE_TRANSFORMERS_HOME. from_pretrained must therefore resolve the SAME cache the prefetch warmed
-    into kwargs['cache_dir'] BEFORE the FastModel.from_pretrained call -- else the weights miss the warm
-    and start an unprotected in-process Xet download (Codex #6638). Static guard: importing ST pulls heavy
-    optional deps."""
+    """from_pretrained must resolve the warmed ST cache into kwargs['cache_dir'] before the FastModel weight load."""
     import ast
     import os
 
@@ -600,8 +526,7 @@ def test_st_fallback_model_load_resolves_env_cache():
         tree = ast.parse(f.read())
 
     def _resolves_st_cache(value_node):
-        # The resolution may be inline in the assigned value, or in the assignment to the intermediate
-        # variable the value references (kwargs['cache_dir'] = _st_cache_dir; _st_cache_dir = ...).
+        # Resolution may be inline or in the assignment to an intermediate variable the value references.
         dumped = ast.dump(value_node)
         if "cache_folder" in dumped and "SENTENCE_TRANSFORMERS_HOME" in dumped:
             return True
@@ -647,28 +572,20 @@ def _resolves_st_cache(value_node):
 
 
 def test_canonical_variant_model_weight_matches_transformers_names():
-    """The variant safetensors detector matches only CANONICAL model variant names a transformers load
-    reads (single, either shard infix, and the variant index) and rejects a non-canonical sidecar
-    (consolidated.fp16.safetensors) so its variant .bin is never wrongly dropped, plus the default and
-    wrong-variant names (Codex #6638)."""
+    """The variant safetensors detector matches only canonical variant names, rejecting sidecars and wrong variants."""
     f = U._is_canonical_variant_model_weight_safetensors
     assert f("model.fp16.safetensors", "fp16") is True
     assert f("model.fp16-00001-of-00002.safetensors", "fp16") is True
     assert f("model-00001-of-00002.fp16.safetensors", "fp16") is True
     assert f("model.safetensors.index.fp16.json", "fp16") is True
-    # A non-canonical sidecar variant does NOT prove the .bin redundant (the M2 hang guard).
     assert f("consolidated.fp16.safetensors", "fp16") is False
-    # Default (non-variant) and wrong-variant names are not a match for variant='fp16'.
     assert f("model.safetensors", "fp16") is False
     assert f("model-00001-of-00002.safetensors", "fp16") is False
     assert f("model.bf16.safetensors", "fp16") is False
 
 
 def test_variant_is_forwarded_to_downloader(capture):
-    """maybe_prefetch_hf_snapshot must forward `variant` to snapshot_download_with_xet_fallback so the
-    PRE cache-skip gate can defer on a variant load: a cache holding only the default canonical weight
-    must not fast-path a variant='fp16' request, else the in-process load fetches the missing variant
-    weight over un-killable Xet. Absent a variant, nothing is forwarded (the fast path stays live)."""
+    """maybe_prefetch_hf_snapshot must forward variant to the downloader (absent a variant, nothing is forwarded)."""
     _, st = capture(weights_at_root = True, use_safetensors = True, variant = "fp16")
     assert st["variant"] == "fp16"
     _, st = capture(weights_at_root = True, use_safetensors = True)
@@ -676,8 +593,7 @@ def test_variant_is_forwarded_to_downloader(capture):
 
 
 def test_variant_drops_bin_for_sharded_variant_safetensors(monkeypatch):
-    """A SHARDED variant safetensors (model.fp16-00001-of-00002.safetensors) is recognized, so its
-    redundant variant .bin is dropped rather than both formats warmed (gemini #6638)."""
+    """A sharded variant safetensors is recognized, so its redundant variant .bin is dropped."""
     _install_fake_model_info(
         monkeypatch,
         [
@@ -691,10 +607,7 @@ def test_variant_drops_bin_for_sharded_variant_safetensors(monkeypatch):
 
 
 def test_tokenizer_only_warms_extra_vocab_files(capture):
-    """tokenizer_only must warm the SentencePiece / vocab / processor files real tokenizers load
-    (spm.model, normalizer.json, video_preprocessor_config.json, tokenizer.model.v3, and a named
-    additional_chat_templates/<name>.jinja) so a separate-repo tokenizer / processor load does not
-    fetch them in-process over Xet (Codex #6638)."""
+    """tokenizer_only must warm SentencePiece / vocab / processor files, including a named jinja template."""
     _, st = capture(tokenizer_only = True)
     allow = st["allow_patterns"]
     for name in (
@@ -716,35 +629,28 @@ def test_tokenizer_only_warms_extra_vocab_files(capture):
 
 
 def test_format_probe_runs_even_when_config_cached(capture, monkeypatch):
-    """A cached config.json must NOT skip the weight-format probe: AutoConfig caches config.json
-    before this helper runs (Llama / diffusion), so a config-based "cached" guess would leave both
-    formats eligible and over-fetch both multi-GB weight sets even when no weights are cached. The
-    auto branch still consults model_info and drops the redundant .bin for a safetensors repo
-    (Codex #6638)."""
+    """A cached config.json must not skip the weight-format probe; model_info still drops the redundant .bin."""
     import huggingface_hub
 
-    # Pretend config.json is locally cached (the AutoConfig side effect). This must not gate the probe.
+    # Pretend config.json is cached (the AutoConfig side effect); this must not gate the probe.
     monkeypatch.setattr(
         huggingface_hub, "try_to_load_from_cache", lambda *a, **k: "/cache/config.json"
     )
     _install_fake_model_info(monkeypatch, ["model.safetensors", "pytorch_model.bin"])
     _, st = capture(weights_at_root = True)
     ig = st["ignore_patterns"] or []
-    assert "*.bin" in ig  # redundant .bin dropped because real model safetensors is present
+    assert "*.bin" in ig
 
 
 def test_optimizer_safetensors_does_not_drop_bin(monkeypatch):
-    """A training-state optimizer.safetensors sidecar must NOT count as model safetensors: a repo
-    whose real weights are pytorch_model.bin alongside an optimizer.safetensors must keep its .bin,
-    else the in-process load fetches the only weights over Xet without the fallback (Codex #6638)."""
+    """An optimizer.safetensors sidecar must not count as model safetensors, so the real .bin weights are kept."""
     _install_fake_model_info(monkeypatch, ["pytorch_model.bin", "optimizer.safetensors"])
     ig = U._prefetch_ignore_patterns("org/repo", weights_at_root = True)
-    assert "*.bin" not in ig  # .bin is the only real weight -> not dropped
+    assert "*.bin" not in ig
 
 
 def test_model_safetensors_still_drops_bin(monkeypatch):
-    """Control for the optimizer case: a real model.safetensors next to pytorch_model.bin still
-    drops the redundant .bin (the sidecar exclusion must not over-trigger) (Codex #6638)."""
+    """Control for the optimizer case: a real model.safetensors next to pytorch_model.bin still drops the .bin."""
     _install_fake_model_info(
         monkeypatch, ["model.safetensors", "pytorch_model.bin", "optimizer.safetensors"]
     )
@@ -753,11 +659,7 @@ def test_model_safetensors_still_drops_bin(monkeypatch):
 
 
 def test_whole_multi_component_snapshot_keeps_subdir_bin(monkeypatch):
-    """A whole multi-component snapshot (weights_at_root=False, no subfolder: a SentenceTransformer /
-    diffusers repo) must NOT drop *.bin even when root safetensors exist -- HF's "*" spans "/", so the
-    drop would strip a subdir module's only weight (1_Dense/pytorch_model.bin) and leave the module load
-    to an in-process Xet fetch. A root-scoped load of the same repo still drops the redundant root .bin
-    (Codex #6638)."""
+    """A whole multi-component snapshot must not drop *.bin (it would strip a subdir module's weight); a root load still does."""
     _install_fake_model_info(monkeypatch, ["model.safetensors", "1_Dense/pytorch_model.bin"])
     ig = U._prefetch_ignore_patterns("org/repo", weights_at_root = False)
     assert "*.bin" not in ig
@@ -766,7 +668,7 @@ def test_whole_multi_component_snapshot_keeps_subdir_bin(monkeypatch):
 
 
 def test_is_model_weight_safetensors_classification():
-    """Direct unit coverage: real model weights count, adapter / trainer-state sidecars do not."""
+    """Real model weights count; adapter / trainer-state sidecars do not."""
     assert U._is_model_weight_safetensors("model.safetensors") is True
     assert U._is_model_weight_safetensors("model-00001-of-00002.safetensors") is True
     assert U._is_model_weight_safetensors("model.safetensors.index.json") is True
@@ -778,10 +680,7 @@ def test_is_model_weight_safetensors_classification():
 
 
 def test_tokenizer_only_warms_slow_sentencepiece_vocab(capture):
-    """tokenizer_only must warm the slow-tokenizer SentencePiece / BPE vocab files AutoTokenizer
-    fetches first (sentencepiece.bpe.model for XLM-R / mBART, source.spm / target.spm for Marian,
-    bpe.codes / vocab.bpe, sentencepiece.model for RemBERT, vocab-src.json / vocab-tgt.json for FSMT),
-    so they are not left to an in-process Xet fetch (Codex #6638)."""
+    """tokenizer_only must warm the slow-tokenizer SentencePiece / BPE vocab files AutoTokenizer fetches first."""
     _, st = capture(tokenizer_only = True)
     allow = st["allow_patterns"]
     for name in (
@@ -798,9 +697,7 @@ def test_tokenizer_only_warms_slow_sentencepiece_vocab(capture):
 
 
 def test_adapter_safetensors_check_scoped_to_root(monkeypatch):
-    """_adapter_repo_has_safetensors must only count a ROOT adapter_model*.safetensors: a repo with
-    a root adapter_model.bin plus an unrelated checkpoint-5/adapter_model.safetensors must NOT drop
-    the root .bin (the adapter warm only pulls root adapter_model*) (Codex #6638)."""
+    """_adapter_repo_has_safetensors must only count a root adapter_model*.safetensors, not a subdir one."""
     import huggingface_hub
 
     class _Sib:
@@ -814,7 +711,7 @@ def __init__(self, names):
         def model_info(self, *a, **k):
             return type("MI", (), {"siblings": [_Sib(n) for n in self._names]})()
 
-    # Subdir safetensors only -> not at root -> must NOT report safetensors present.
+    # Subdir safetensors only -> not reported present.
     monkeypatch.setattr(
         huggingface_hub,
         "HfApi",
@@ -833,30 +730,28 @@ def model_info(self, *a, **k):
 
 
 def test_gguf_file_warm_keeps_gguf(capture):
-    """A gguf_file load reads exactly that GGUF, so the warm must allow-list it (not drop *.gguf via
-    the static ignore list) while not pulling other quants the repo may publish (Codex #6638)."""
+    """A gguf_file load allow-lists that GGUF while not pulling other quants the repo publishes."""
     _, st = capture(weights_at_root = True, gguf_file = "model-Q4_K_M.gguf")
     allow = st["allow_patterns"]
     ig = st["ignore_patterns"]
     assert allow is not None and "model-Q4_K_M.gguf" in allow
     sample = [
         "model-Q4_K_M.gguf",
-        "model-Q8_0.gguf",  # a different quant the load does not read
+        "model-Q8_0.gguf",
         "config.json",
         "tokenizer.json",
     ]
     kept = _filter(sample, allow, ig)
-    assert "model-Q4_K_M.gguf" in kept  # the requested GGUF is warmed
-    assert "config.json" in kept  # root aux warmed
-    assert "model-Q8_0.gguf" not in kept  # other quants are not pulled
+    assert "model-Q4_K_M.gguf" in kept
+    assert "config.json" in kept
+    assert "model-Q8_0.gguf" not in kept
 
 
 # ----- Finding Q: adapter weight-format selection -----
 
 
 def test_adapter_only_prefers_safetensors_over_bin(capture, monkeypatch):
-    """A mixed-format adapter repo (adapter_model.safetensors AND adapter_model.bin) warms only
-    the safetensors PeftModel.from_pretrained reads, not both formats (Codex #6638)."""
+    """A mixed-format adapter repo warms only the safetensors PeftModel reads, not both formats."""
     _install_fake_model_info(
         monkeypatch, ["adapter_config.json", "adapter_model.safetensors", "adapter_model.bin"]
     )
@@ -873,8 +768,7 @@ def test_adapter_only_prefers_safetensors_over_bin(capture, monkeypatch):
 
 
 def test_adapter_only_bin_only_keeps_bin(capture, monkeypatch):
-    """A .bin-only adapter repo must keep adapter_model.bin -- never under-warm it into an
-    in-process Xet fetch (best-effort: no safetensors found -> both formats eligible)."""
+    """A .bin-only adapter repo must keep adapter_model.bin (no safetensors found -> both formats eligible)."""
     _install_fake_model_info(monkeypatch, ["adapter_config.json", "adapter_model.bin"])
     _, st = capture(adapter_only = True)
     kept = _filter(
@@ -898,8 +792,7 @@ def test_adapter_only_explicit_use_safetensors_false_keeps_bin(capture):
 
 
 def test_gguf_file_with_subfolder_warms_subfolder_path(capture):
-    """gguf_file + subfolder: the load resolves <subfolder>/<gguf_file>, so the warm must allow-list
-    that subfolder path, not the bare root name (Codex #6638)."""
+    """gguf_file + subfolder: the warm allow-lists <subfolder>/<gguf_file>, not the bare root name."""
     _, st = capture(weights_at_root = True, gguf_file = "model-Q4_K_M.gguf", subfolder = "gguf")
     allow = st["allow_patterns"]
     assert "gguf/model-Q4_K_M.gguf" in allow
@@ -908,22 +801,17 @@ def test_gguf_file_with_subfolder_warms_subfolder_path(capture):
 
 
 def test_from_tf_root_load_ignores_nested_h5(capture):
-    """A from_tf root load reads the ROOT .h5; nested .h5 / .msgpack checkpoints under subdirs are
-    unread, so the root-only subdir ignore must drop them (it covers every weight format, not only
-    safetensors / bin) (Codex #6638)."""
+    """A from_tf root load keeps the root .h5 but drops nested .h5 / .msgpack checkpoints."""
     _, st = capture(weights_at_root = True, from_tf = True)
     ig = st["ignore_patterns"]
     assert "*/*.h5" in ig and "*/*.msgpack" in ig
     kept = _filter(["model.h5", "checkpoint-1/model.h5", "config.json"], st["allow_patterns"], ig)
-    assert "model.h5" in kept  # root TF weight warmed
-    assert "checkpoint-1/model.h5" not in kept  # nested TF checkpoint ignored
+    assert "model.h5" in kept
+    assert "checkpoint-1/model.h5" not in kept
 
 
 def test_sentence_transformer_from_pretrained_is_prefetch_wired():
-    """FastSentenceTransformer.from_pretrained must warm the repo via maybe_prefetch_hf_snapshot as an
-    UNCONDITIONAL top-level statement (so it fires on every load path: for_inference, fast-encoder,
-    fallback) and before any top-level return. Static AST guard (importing ST pulls heavy optional
-    deps); checking the call is top-level -- not nested in an if/for/try -- catches a dead-branch wire."""
+    """from_pretrained must call maybe_prefetch_hf_snapshot as an unconditional top-level statement before any return."""
     import ast
     import os
 
@@ -949,8 +837,7 @@ def _is_prefetch_call(node):
         prefetch_pos is not None
     ), "from_pretrained must call maybe_prefetch_hf_snapshot at top level"
     assert prefetch_pos < return_pos, "prefetch must run before any top-level return"
-    # local_files_only must be forwarded so an offline / cache-only load does not start a Hub download
-    # via the prefetch before the ST load sees the flag (Codex #6638).
+    # local_files_only must be forwarded so an offline load does not start a Hub download.
     prefetch_call = fp.body[prefetch_pos].value
     assert "local_files_only" in {
         kw.arg for kw in prefetch_call.keywords
@@ -958,9 +845,7 @@ def _is_prefetch_call(node):
 
 
 def test_st_module_download_forwards_cache_folder():
-    """_load_modules must forward the custom cache_folder into load_dir_path so per-module subdirs are
-    read from the warmed cache rather than the default one, avoiding a second in-process Hub/Xet fetch
-    (Codex #6638). Static AST guard (importing ST pulls heavy optional deps)."""
+    """_load_modules must forward the custom cache_folder into load_dir_path so per-module subdirs read the warmed cache."""
     import ast
     import os
 
@@ -979,11 +864,7 @@ def test_st_module_download_forwards_cache_folder():
 
 
 def test_st_native_sentence_transformer_calls_forward_cache_folder():
-    """Every native SentenceTransformer(model_name, ...) load (for_inference AND fast-encoder) must
-    forward cache_folder, so a custom cache_folder reads the cache the prefetch warmed instead of
-    missing it and starting an unprotected in-process Hub/Xet download (Codex #6638). The modules-based
-    SentenceTransformer(modules=...) call builds from already-loaded modules and needs no cache_folder.
-    Static AST guard (importing ST pulls heavy optional deps)."""
+    """Every native SentenceTransformer(model_name, ...) load must forward cache_folder; a modules-based build needs none."""
     import ast
     import os
 
@@ -999,16 +880,14 @@ def test_st_native_sentence_transformer_calls_forward_cache_folder():
         ):
             continue
         kw_names = {kw.arg for kw in n.keywords}
-        # A modules-based build (SentenceTransformer(modules=...)) downloads nothing; only a
-        # repo-name load (positional model_name, no modules=) reads the cache.
+        # A modules-based build downloads nothing; only a repo-name load reads the cache.
         if "modules" in kw_names:
             continue
         weight_loading_calls.append(n)
     assert (
         weight_loading_calls
     ), "expected a repo-name SentenceTransformer load in sentence_transformer.py"
-    # cache_folder is forwarded either explicitly (fast-encoder branch) or via a **kwargs unpacking
-    # (for_inference branch builds st_kwargs incl. cache_folder). A ** unpacking has kw.arg == None.
+    # cache_folder is forwarded explicitly or via a **kwargs unpacking (kw.arg == None).
     for c in weight_loading_calls:
         kw_names = {kw.arg for kw in c.keywords}
         forwards = "cache_folder" in kw_names or None in kw_names
diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py
index dfdbeddcee..c9fdce9aaf 100644
--- a/unsloth/models/_utils.py
+++ b/unsloth/models/_utils.py
@@ -906,9 +906,8 @@ def _run_temporary_patches(phase):
 TORCHAO_MSG = "Error: torchao not found, please install with `pip install torchao`"
 
 
-# Artifacts a Transformers/PEFT load never consumes (ONNX/TF/Flax/CoreML/GGUF/training state) -- skip
-# them when prewarming so a mixed-format repo is not pulled in full. An ignore list, not an allowlist,
-# so no file a load needs is dropped.
+# Artifacts a Transformers/PEFT load never reads (ONNX/TF/Flax/CoreML/GGUF/training state); skip when
+# prewarming so a mixed-format repo is not pulled in full. Ignore list (not allowlist) so nothing needed is dropped.
 _PREFETCH_IGNORE_PATTERNS = (
     "*.onnx",
     "onnx/*",
@@ -919,7 +918,7 @@ def _run_temporary_patches(phase):
     "*.mlpackage/*",
     "*.mlmodel",
     "*.gguf",
-    # Training / original-checkpoint formats from_pretrained does not read.
+    # Training / checkpoint formats from_pretrained never reads.
     "*.pt",
     "*.pth",
     "*.ckpt",
@@ -932,8 +931,8 @@ def _run_temporary_patches(phase):
 )
 
 
-# Repo-root tokenizer / config / processor files a from_pretrained reads from the root even when the
-# weights load from a subfolder. Exact filenames (no wildcard) so they match only root-level files.
+# Repo-root tokenizer / config / processor files from_pretrained reads from root even when weights
+# load from a subfolder. Exact names (no wildcard) so they match only root-level files.
 _ROOT_AUX_PREFETCH_PATTERNS = (
     "config.json",
     "generation_config.json",
@@ -946,10 +945,8 @@ def _run_temporary_patches(phase):
     "vocab.txt",
     "merges.txt",
     "spiece.model",
-    # More SentencePiece / vocab load targets (VOCAB_FILES_NAMES) not covered above: spm.model
-    # (DeBERTa-v2), normalizer.json (Whisper), tokenizer.model.v3 (Mistral), sentencepiece.bpe.model
-    # (XLM-R / mBART -- the slow tokenizer Unsloth tries first fetches these), source/target.spm
-    # (Marian), bpe.codes (FSMT / XLM), vocab.bpe (GPT-2 merges).
+    # More SentencePiece / vocab load targets (VOCAB_FILES_NAMES) the slow tokenizer Unsloth tries first
+    # may fetch: DeBERTa-v2, Whisper, Mistral, XLM-R/mBART, Marian, FSMT/XLM, GPT-2.
     "spm.model",
     "normalizer.json",
     "tokenizer.model.v3",
@@ -958,39 +955,35 @@ def _run_temporary_patches(phase):
     "target.spm",
     "bpe.codes",
     "vocab.bpe",
-    # sentencepiece.model (RemBERT) and vocab-src.json / vocab-tgt.json (FSMT) are VOCAB_FILES_NAMES
-    # not covered by the names above; a distinct-tokenizer-repo warm must cache them too.
+    # More VOCAB_FILES_NAMES (RemBERT, FSMT) a distinct-tokenizer-repo warm must cache too.
     "sentencepiece.model",
     "vocab-src.json",
     "vocab-tgt.json",
     "chat_template.jinja",
     "chat_template.json",
-    # A non-default chat_template="<name>" load fetches additional_chat_templates/<name>.jinja.
+    # chat_template="<name>" fetches additional_chat_templates/<name>.jinja.
     "additional_chat_templates/*.jinja",
     "preprocessor_config.json",
     "processor_config.json",
     "video_preprocessor_config.json",  # Qwen2.5-VL-style video processors
-    # trust_remote_code entry points via auto_map can point at any module name, so prefix globs would
-    # miss them. Warm every *.py (tiny; a non-remote-code repo ships none). HF fnmatch "*" spans "/".
+    # trust_remote_code auto_map can name any module, so warm every *.py (tiny; none in a non-remote repo).
     "*.py",
-    "*.tiktoken",  # tiktoken vocab assets (e.g. Qwen's qwen.tiktoken)
+    "*.tiktoken",  # tiktoken vocab (e.g. Qwen's qwen.tiktoken)
 )
 
 
-# The files a PEFT adapter load reads: its config + weights ("adapter_model*" is a glob so a sharded
-# adapter is covered). Merged / full-model weights an adapter repo may also publish match none of these.
+# Files a PEFT adapter load reads: config + weights (glob covers sharded adapters). Merged / full-model
+# weights an adapter repo may also publish match none of these.
 _ADAPTER_PREFETCH_PATTERNS = (
     "adapter_config.json",
     "adapter_model*",
 )
 
 
-# Weight files in a SUBDIRECTORY. A bare root load reads only root weights, so ignoring these keeps a
-# repo's alternate-precision / experimental weight dirs (fp16/, experimental/) from the otherwise
-# unfiltered warm. HF fnmatch "*" spans "/", so "*/*.safetensors" matches any nested weight while root
-# "model.safetensors" is kept. Every format is covered, incl. .h5 / .msgpack (a from_tf / from_flax
-# root load keeps those at the root). Only applied when weights_at_root, never to a diffusion warm
-# (its component weights live in subfolders).
+# Weight files in a SUBDIRECTORY. A bare root load reads only root weights, so ignoring these keeps a
+# repo's alternate-precision / experimental weight dirs (fp16/, experimental/) out of the warm. HF
+# fnmatch "*" spans "/", so "*/*.safetensors" matches any nested weight while root "model.safetensors"
+# is kept; .h5/.msgpack are covered too (from_tf/from_flax keep those at root). Only applied when weights_at_root (diffusion keeps weights in subfolders).
 _SUBDIR_WEIGHT_IGNORE_PATTERNS = (
     "*/*.safetensors",
     "*/*.bin",
@@ -1002,19 +995,16 @@ def _run_temporary_patches(phase):
 
 
 def _in_requested_load_scope(filename, subfolder):
-    """True if a repo-relative *filename* belongs to the location being loaded (*subfolder*, else the
-    repo root). Keys the ".bin is redundant when safetensors exist" decision off the files the load
-    actually uses, so a .bin-only subfolder keeps its .bin even when another subfolder ships safetensors.
-    """
+    """True if *filename* is in the location being loaded (*subfolder*, else root). Scopes the ".bin is
+    redundant when safetensors exist" test so a .bin-only subfolder keeps its .bin."""
     filename = filename.replace("\\", "/")
     if isinstance(subfolder, str) and subfolder.strip("/"):
         return filename.startswith(subfolder.strip("/") + "/")
     return "/" not in filename  # root load: no directory component
 
 
-# Training-state files with a .safetensors suffix that are NOT loadable model weights. A Trainer
-# checkpoint can ship optimizer.safetensors next to a real pytorch_model.bin, and counting it as
-# "model safetensors present" would drop the needed .bin.
+# .safetensors training-state files that are NOT model weights (e.g. optimizer.safetensors next to a
+# real pytorch_model.bin); counting them as "model safetensors present" would drop the needed .bin.
 _NON_MODEL_WEIGHT_STEMS = frozenset(
     {
         "optimizer",
@@ -1027,18 +1017,15 @@ def _in_requested_load_scope(filename, subfolder):
 
 
 def _is_model_weight_safetensors(filename):
-    """True if a repo-relative *filename* is a model-weights safetensors, not a PEFT adapter / sidecar
-    (``adapter_model.safetensors``) or a trainer-state file (``optimizer.safetensors``). Only a real
-    model-weights safetensors proves the ``.bin`` full-model weights are redundant; counting a sidecar
-    would wrongly skip the needed ``.bin`` and leave the load to fetch it without the Xet fallback.
-    """
+    """True if *filename* is a model-weights safetensors, not a PEFT adapter/sidecar
+    (adapter_model.safetensors) or trainer-state file (optimizer.safetensors). Only a real one proves the
+    .bin redundant; counting a sidecar would wrongly drop the needed .bin, leaving the load to fetch it without the Xet fallback."""
     name = filename.replace("\\", "/").rsplit("/", 1)[-1]
     if not name.endswith((".safetensors", ".safetensors.index.json")):
         return False
     if name.startswith("adapter_"):
         return False
-    # Stem before the first dot: "optimizer.safetensors" -> "optimizer" (a real "model-00001-of-00002"
-    # is kept); "rng_state_0..." is caught by the prefix test below.
+    # Stem before the first dot: "optimizer.safetensors" -> "optimizer" (real shards are kept); "rng_state_*" is caught by the prefix test.
     stem = name.split(".", 1)[0].lower()
     if stem in _NON_MODEL_WEIGHT_STEMS or stem.startswith("rng_state"):
         return False
@@ -1046,12 +1033,9 @@ def _is_model_weight_safetensors(filename):
 
 
 def _is_canonical_variant_model_weight_safetensors(filename, variant):
-    """True for a canonical model-weights safetensors carrying the requested *variant* token, in the
-    forms transformers reads: model.<variant>.safetensors (single), a numbered shard (either
-    model.<variant>-00001-of-00002.safetensors or model-00001-of-00002.<variant>.safetensors), or the
-    index model.safetensors.index.<variant>.json. Errs strict (base must be ``model``): a non-canonical
-    sidecar such as consolidated.<variant>.safetensors does NOT prove the variant .bin redundant, so its
-    .bin is not wrongly dropped from the warm and left to an unprotected in-process fetch."""
+    """True for a canonical model-weights safetensors carrying the requested *variant*, in the forms
+    transformers reads (single, either numbered-shard layout, or the index). Strict (base must be
+    "model"): a sidecar like consolidated.<variant>.safetensors does not prove the variant .bin redundant."""
     base = filename.replace("\\", "/").rsplit("/", 1)[-1]
     v = re.escape(variant)
     return bool(
@@ -1071,10 +1055,9 @@ def _is_canonical_variant_model_weight_safetensors(filename, variant):
 
 
 def _is_canonical_model_weight_safetensors(filename):
-    """True for a canonical (non-variant) model-weights safetensors a default load reads
-    (model.safetensors, a numbered shard, or the index). Errs strict: an unrecognized name keeps
-    both formats, so a repo shipping only a variant safetensors plus pytorch_model.bin never has
-    its .bin wrongly dropped for a no-variant load."""
+    """True for a canonical (non-variant) model-weights safetensors a default load reads (model.safetensors,
+    a numbered shard, or the index). Strict: an unrecognized name keeps both formats, so a variant-only
+    safetensors + pytorch_model.bin repo never has its .bin dropped for a no-variant load."""
     name = filename.replace("\\", "/").rsplit("/", 1)[-1]
     return bool(_CANONICAL_MODEL_WEIGHT_SAFETENSORS_RE.match(name))
 
@@ -1085,13 +1068,13 @@ def _adapter_repo_has_safetensors(
     token = None,
     revision = None,
 ):
-    """Best-effort: does the adapter repo ship a root safetensors adapter weight (making the
-    .bin redundant)? Scoped to root adapter_model* files; any failure returns False."""
+    """Best-effort: does the adapter repo ship a root safetensors adapter weight (making the .bin
+    redundant)? Scoped to root adapter_model* files; any failure returns False."""
     try:
         from huggingface_hub import HfApi
         siblings = HfApi().model_info(model_name, revision = revision, token = token).siblings or []
         return any(
-            "/" not in sibling.rfilename.replace("\\", "/")  # root files only
+            "/" not in sibling.rfilename.replace("\\", "/")  # root only
             and sibling.rfilename.startswith("adapter_model")
             and sibling.rfilename.endswith(".safetensors")
             for sibling in siblings
@@ -1112,15 +1095,15 @@ def _prefetch_ignore_patterns(
     variant = None,
     weights_at_root = False,
 ):
-    """ignore_patterns for the prewarm snapshot: the static skip list, minus the checkpoint guard
-    when loading from a checkpoint-* subfolder, minus the weight format the load will not read.
-    use_safetensors is a format allowlist (True -> skip *.bin, False -> skip *.safetensors); auto
-    (None) skips *.bin only when in-scope safetensors are shipped. from_tf/from_flax keep *.h5/*.msgpack.
+    """ignore_patterns for the prewarm snapshot: the static skip list, minus the checkpoint guard when
+    loading from a checkpoint-* subfolder, minus the weight format the load will not read. use_safetensors
+    is a format allowlist (True -> skip *.bin, False -> skip *.safetensors); auto (None) skips *.bin only
+    when in-scope safetensors are shipped. from_tf/from_flax keep *.h5/*.msgpack.
 
     Suppressed for a whole multi-component snapshot (weights_at_root=False, no subfolder: ST/diffusers
-    repos whose modules ship weights per-subfolder, each in its own format), since HF fnmatch "*" spans
-    "/" so dropping "*.bin" would strip a module's only weight."""
-    # Keep checkpoint-*/* when loading from such a subfolder; keep *.h5 / *.msgpack under from_tf/flax.
+    repos with per-subfolder weights, each in its own format), since "*" spans "/" so dropping "*.bin"
+    would strip a module's only weight."""
+    # Keep checkpoint-*/* when loading from a checkpoint-* subfolder; keep *.h5 / *.msgpack under from_tf / from_flax.
     ignore_patterns = [
         pattern
         for pattern in _PREFETCH_IGNORE_PATTERNS
@@ -1134,15 +1117,15 @@ def _prefetch_ignore_patterns(
             or (from_flax and pattern == "*.msgpack")
         )
     ]
-    # Drop the format the load will not read (the other doubles the download). Skipped for a whole
-    # multi-component snapshot (see docstring) so a subdir module's only weight is not stripped.
+    # Drop the format the load will not read (the other doubles the download); skipped for a whole
+    # multi-component snapshot (see docstring).
     whole_multi_component = not weights_at_root and not (
         isinstance(subfolder, str) and subfolder.strip("/")
     )
     if whole_multi_component:
         pass
     elif from_tf or from_flax:
-        # TF / Flax loads never touch the PyTorch formats; drop safetensors and .bin outright.
+        # TF / Flax loads never read the PyTorch formats; drop safetensors and .bin.
         ignore_patterns.extend(
             (
                 "*.safetensors",
@@ -1152,14 +1135,14 @@ def _prefetch_ignore_patterns(
             )
         )
     elif use_safetensors is True:
-        # Explicit safetensors: the load never reads .bin (no model_info call needed).
+        # Explicit safetensors: load never reads .bin (no model_info call needed).
         ignore_patterns.extend(("*.bin", "*.bin.index.json"))
     elif use_safetensors is False:
-        # Explicit .bin: the load never reads safetensors.
+        # Explicit .bin: load never reads safetensors.
         ignore_patterns.extend(("*.safetensors", "*.safetensors.index.json"))
     else:
         # Auto: skip .bin only once in-scope safetensors are confirmed (Transformers prefers them).
-        # Best-effort: any failure leaves both formats eligible.
+        # Best-effort: on any failure both formats stay eligible.
         try:
             from huggingface_hub import HfApi
 
@@ -1173,8 +1156,8 @@ def _prefetch_ignore_patterns(
                 .siblings
                 or []
             )
-            # Only count in-scope model-weights safetensors the load reads (not adapters/sidecars):
-            # with a variant, a variant-matching one; without, a canonical one proves the .bin redundant.
+            # Count only in-scope model-weights safetensors (not adapters/sidecars): variant-matching if
+            # a variant is requested, else canonical, proving the .bin redundant.
             has_safetensors = any(
                 _is_model_weight_safetensors(sibling.rfilename)
                 and _in_requested_load_scope(sibling.rfilename, subfolder)
@@ -1211,15 +1194,15 @@ def maybe_prefetch_hf_snapshot(
     variant = None,
     gguf_file = None,
 ):
-    """Warm the Hugging Face cache for a remote repo before the in-process load.
+    """Warm the HF cache for a remote repo before the in-process load.
 
-    Xet downloads can hang on a blob with no progress and no exception, and a blocked native Xet
-    thread cannot be killed in-process. We pull the snapshot first in a killable subprocess that falls
-    back Xet -> HTTP on a stall (unsloth_zoo.hf_xet_fallback), so from_pretrained is then a cache hit.
+    Xet can hang on a blob with no progress or exception, and a blocked native Xet thread cannot be
+    killed in-process. So pull the snapshot first in a killable subprocess that falls back Xet -> HTTP
+    on a stall (unsloth_zoo.hf_xet_fallback), making from_pretrained a cache hit.
 
-    Returns True iff warmed (caller can then clear force_download), False when skipped (local / offline /
-    local_files_only / fast_inference / old unsloth_zoo) or failed. Best-effort: only a both-transports-
-    stalled DownloadStallError is raised; other failures are left for from_pretrained to surface.
+    Returns True iff warmed (caller can clear force_download), else False (skipped: local/offline/
+    local_files_only/fast_inference/old unsloth_zoo, or failed). Only a both-transports-stalled
+    DownloadStallError is raised; other failures are left for from_pretrained to surface.
     """
     try:
         from unsloth_zoo.hf_xet_fallback import (
@@ -1231,19 +1214,19 @@ def maybe_prefetch_hf_snapshot(
 
     if not isinstance(model_name, str) or not model_name:
         return False
-    # A local path has nothing to download. Expand ~ first (os.path.exists does not).
+    # Local path: nothing to download. Expand ~ first (os.path.exists does not).
     model_path = os.path.expanduser(model_name)
     if os.path.isdir(model_path) or os.path.exists(model_path):
         return False
-    # Looks local but not yet on disk (e.g. an uncreated output dir): still not a Hub repo id; leave
-    # it for from_pretrained to surface rather than trying to download it.
+    # Looks local but not yet on disk (e.g. an uncreated output dir): not a Hub repo id, so leave it
+    # for from_pretrained rather than download it.
     if (
         os.path.isabs(model_path)
         or model_name.startswith(("~", "./", "../", ".\\", "..\\"))
         or "\\" in model_name
     ):
         return False
-    if local_files_only:  # offline / cache-only: never reach out
+    if local_files_only:  # cache-only: never reach out
         return False
     if any(
         os.environ.get(flag, "0").lower() in ("1", "true", "yes", "on")
@@ -1254,7 +1237,7 @@ def maybe_prefetch_hf_snapshot(
         return False
 
     # tokenizer-only / adapter-only warms allow-list exact files below, so the weight-format ignore
-    # list (and its auto-branch model_info call) is moot and skipped.
+    # list (and its auto-branch model_info call) is skipped.
     ignore_patterns = (
         None
         if tokenizer_only or adapter_only or gguf_file
@@ -1270,12 +1253,12 @@ def maybe_prefetch_hf_snapshot(
             weights_at_root = weights_at_root,
         )
     )
-    # Narrow the warm to what the load reads (a repo with extra checkpoints/precisions is not pulled in
-    # full); every branch still warms root tokenizer/config/custom-code so those never fall in-process.
+    # Narrow the warm to what the load reads (skip extra checkpoints/precisions); every branch still warms
+    # root tokenizer/config/custom-code so those are never left to an in-process fetch.
     allow_patterns = None
     if gguf_file:
-        # from_pretrained(gguf_file=NAME) reads exactly that GGUF (de-quantized on load), but the static
-        # ignore list drops *.gguf -- so warm exactly that file (plus root aux), under <subfolder>/ if set.
+        # gguf_file=NAME reads exactly that GGUF, but the static ignore list drops *.gguf; so warm just
+        # that file (plus root aux), under <subfolder>/ if set.
         _gguf_path = (
             f"{subfolder.strip('/')}/{gguf_file}"
             if isinstance(subfolder, str) and subfolder.strip("/")
@@ -1283,12 +1266,11 @@ def maybe_prefetch_hf_snapshot(
         )
         allow_patterns = [_gguf_path, *_ROOT_AUX_PREFETCH_PATTERNS]
     elif tokenizer_only:
-        # A distinct tokenizer repo: warm only its tokenizer / config / vocab files, never its weights
-        # (not what the tokenizer load reads) even if it ships them.
+        # A distinct tokenizer repo: warm only tokenizer / config / vocab files, never its weights.
         allow_patterns = list(_ROOT_AUX_PREFETCH_PATTERNS)
     elif adapter_only:
-        # A PEFT adapter load reads only adapter_config.json + adapter_model.* (plus root aux), so an
-        # adapter repo also publishing merged weights does not pull them.
+        # A PEFT adapter load reads only adapter_config.json + adapter_model.* (plus root aux), not any
+        # merged weights the repo may also publish.
         allow_patterns = [*_ADAPTER_PREFETCH_PATTERNS, *_ROOT_AUX_PREFETCH_PATTERNS]
         # PeftModel reads one format (safetensors when present): explicit use_safetensors wins, else
         # prefer safetensors when shipped (best-effort; any failure keeps both).
@@ -1302,11 +1284,11 @@ def maybe_prefetch_hf_snapshot(
         ):
             ignore_patterns = ["adapter_model*.bin", "adapter_model*.bin.index.json"]
     elif isinstance(subfolder, str) and subfolder.strip("/"):
-        # subfolder=X: the load resolves every weight under X/, so warm that subfolder (plus root aux).
+        # subfolder=X: load resolves every weight under X/, so warm that subfolder (plus root aux).
         allow_patterns = [f"{subfolder.strip('/')}/*", *_ROOT_AUX_PREFETCH_PATTERNS]
     elif weights_at_root:
-        # A bare load reads only root weights: drop weights nested in subdirs (fp16/, checkpoint dirs)
-        # it never reads, while keeping subdir configs. Diffusion leaves weights_at_root False.
+        # A bare load reads only root weights: drop subdir weights (fp16/, checkpoint dirs) while keeping
+        # subdir configs. Diffusion leaves weights_at_root False.
         ignore_patterns = [*(ignore_patterns or []), *_SUBDIR_WEIGHT_IGNORE_PATTERNS]
     try:
         snapshot_download_with_xet_fallback(
diff --git a/unsloth/models/diffusion.py b/unsloth/models/diffusion.py
index 064b79112e..955bf55987 100644
--- a/unsloth/models/diffusion.py
+++ b/unsloth/models/diffusion.py
@@ -181,9 +181,8 @@ def from_pretrained(
 
         model_cls = _resolve_diffusion_model_class(config)
 
-        # Pre-download the confirmed diffusion repo (Xet -> HTTP on a stall) so the weight load is a cache
-        # hit. subfolder is NOT forwarded: the pipeline loads the whole repo root (every component
-        # subfolder), so narrowing to one would leave unet/, vae/, text_encoder/ to in-process Xet.
+        # Prefetch the whole repo root so the weight load is a cache hit. subfolder is not forwarded: the
+        # pipeline loads every component subfolder, so narrowing to one would leave unet/vae/text_encoder to in-process Xet.
         maybe_prefetch_hf_snapshot(
             model_name,
             token = token,
@@ -193,8 +192,7 @@ def from_pretrained(
             fast_inference = False,
             force_download = kwargs.get("force_download", False),
             use_safetensors = kwargs.get("use_safetensors"),
-            # Diffusion variants (variant="fp16") are common: forward it so the warm never drops a
-            # variant .bin for a non-variant safetensors.
+            # Forward variant (e.g. "fp16") so the warm keeps variant weights.
             variant = kwargs.get("variant"),
         )
 
@@ -208,14 +206,10 @@ def from_pretrained(
             local_files_only = local_files_only,
             cache_dir = cache_dir,
         )
-        # Honor an explicit weight format on the real load too, so it reads the format the prefetch
-        # warmed (else a mixed-format repo could pick the other and start an in-process Xet download).
-        # use_safetensors=None (auto) already matches the prefetch's heuristic.
+        # Match the load's weight format to the warm (None/auto already matches).
         if kwargs.get("use_safetensors") is not None:
             load_kwargs["use_safetensors"] = kwargs["use_safetensors"]
-        # Forward the variant to the real load too, so it reads the variant weights the prefetch warmed.
-        # Without it the pipeline asks for the default weight variant, missing the warm (wrong precision,
-        # or a default weight a variant-only repo may not ship, fetched in-process over un-killable Xet).
+        # Forward variant to the real load so it reads the warmed variant weights.
         if kwargs.get("variant") is not None:
             load_kwargs["variant"] = kwargs["variant"]
 
diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py
index dac95f3129..bcc53112e6 100644
--- a/unsloth/models/llama.py
+++ b/unsloth/models/llama.py
@@ -2420,38 +2420,35 @@ def from_pretrained(
 
         preferred_attn_impl = resolve_attention_implementation(model_function, model_config)
 
-        # Pre-download the repo in a killable subprocess (Xet -> HTTP on a stall) so the weight load is
-        # a cache hit. After the AutoConfig/model-class check, so an unsupported repo fails on its small
-        # config fetch. revision is NOT forwarded: the load resolves model_name (possibly a remapped
-        # prequantized repo where the caller's revision does not exist) on its default branch.
+        # Prefetch the repo (killable child) so the weight load is a cache hit. Runs after the
+        # AutoConfig/model-class check so an unsupported repo fails on its small config fetch. No
+        # revision: the load resolves model_name (maybe a remapped prequant repo) on its default branch.
         _prefetched = maybe_prefetch_hf_snapshot(
             model_name,
             token = token,
             cache_dir = kwargs.get("cache_dir"),
             local_files_only = kwargs.get("local_files_only", False),
-            # Only a real vLLM-owned load skips the warm. A num_labels classification load takes the
-            # AutoModelForSequenceClassification branch below (an in-process download) even under
-            # fast_inference=True, so it must still be warmed or its weights fetch over un-killable Xet.
+            # Skip the warm only for a real vLLM load; a num_labels classification load still goes
+            # in-process below, so it must be warmed even under fast_inference.
             fast_inference = fast_inference and num_labels is None,
             subfolder = kwargs.get("subfolder"),
             force_download = kwargs.get("force_download", False),
             use_safetensors = kwargs.get("use_safetensors"),
             from_tf = kwargs.get("from_tf", False),
             from_flax = kwargs.get("from_flax", False),
-            # Bare load reads only ROOT weights; skip subdir weights (fp16/, experimental/). Ignored
-            # when a subfolder is set.
+            # Bare load reads only ROOT weights; skip subdir weights. Ignored when a subfolder is set.
             weights_at_root = True,
             variant = kwargs.get("variant"),  # forward so the warm keeps the variant .bin
             gguf_file = kwargs.get(
                 "gguf_file"
             ),  # forward so the warm fetches the GGUF (else ignored)
         )
-        # Child already did the forced download; clear the flag so the load reuses the warm cache.
+        # Child did the forced download; clear the flag so the load reuses the warm cache.
         if _prefetched and kwargs.get("force_download", False):
             kwargs["force_download"] = False
 
-        # The tokenizer loads in-process regardless of the vLLM path; the base prefetch already covered
-        # model_name, so only warm here for a different tokenizer repo, or when fast_inference skipped it.
+        # Tokenizer always loads in-process; base prefetch covered model_name, so only warm here for a
+        # different tokenizer repo or when fast_inference skipped it.
         _tokenizer_repo = (
             tokenizer_name if (isinstance(tokenizer_name, str) and tokenizer_name) else model_name
         )
@@ -2721,8 +2718,7 @@ def from_pretrained(
 
         # Counteract saved tokenizers
         tokenizer_name = model_name if tokenizer_name is None else tokenizer_name
-        # With a custom cache_dir the prefetch warmed it (incl. tokenizer files); route the tokenizer
-        # load there too so it reuses that cache instead of its own in-process Hub/Xet download.
+        # Route the tokenizer load to the custom cache_dir the prefetch warmed.
         _tokenizer_cache_kwargs = {}
         if kwargs.get("cache_dir") is not None:
             _tokenizer_cache_kwargs["cache_dir"] = kwargs["cache_dir"]
diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py
index f4d2e6c1cf..84f808d2b5 100644
--- a/unsloth/models/loader.py
+++ b/unsloth/models/loader.py
@@ -866,26 +866,25 @@ def from_pretrained(
         if is_peft:
             # From https://github.com/huggingface/peft/issues/184
             # Now add PEFT adapters
-            # Warm the adapter repo first: PeftModel downloads it in-process and can hang on Xet.
+            # Warm the adapter repo: PeftModel downloads it in-process and can hang on Xet.
             _prefetched = maybe_prefetch_hf_snapshot(
                 old_model_name,
                 token = token,
                 revision = revision,
                 cache_dir = kwargs.get("cache_dir"),
                 local_files_only = local_files_only,
-                # The adapter loads in-process via PeftModel, not vLLM, so warm it even under
-                # fast_inference (vLLM owns only the base model's download path).
+                # Adapter always loads in-process via PeftModel, so warm it even under fast_inference.
                 fast_inference = False,
                 force_download = kwargs.get("force_download", False),
-                # Leave use_safetensors auto: inheriting the base format could skip a safetensors-only
-                # adapter. adapter_only restricts the warm to the adapter's own files + root aux.
+                # Leave use_safetensors auto (inheriting base format could skip a safetensors-only
+                # adapter). adapter_only restricts the warm to the adapter files + root aux.
                 adapter_only = True,
             )
-            # Child already did the forced download; clear the flag so the load reuses the warm cache.
+            # Child did the forced download; clear the flag so the load reuses the warm cache.
             if _prefetched and kwargs.get("force_download", False):
                 kwargs["force_download"] = False
-            # Read the adapter from where the prefetch warmed it: forward cache_dir when set. subfolder
-            # is NOT forwarded (it targets the base checkpoint; an adapter usually lives at the root).
+            # Forward cache_dir so the load reads the warmed adapter. No subfolder (that targets the
+            # base checkpoint; adapters usually live at the root).
             peft_load_kwargs = {}
             if kwargs.get("cache_dir") is not None:
                 peft_load_kwargs["cache_dir"] = kwargs["cache_dir"]
@@ -1815,26 +1814,25 @@ def _patched_car(
 
                 _LoraModel._create_and_replace = _patched_car
 
-            # Warm the adapter repo first: PeftModel downloads it in-process and can hang on Xet.
+            # Warm the adapter repo: PeftModel downloads it in-process and can hang on Xet.
             _prefetched = maybe_prefetch_hf_snapshot(
                 old_model_name,
                 token = token,
                 revision = revision,
                 cache_dir = kwargs.get("cache_dir"),
                 local_files_only = local_files_only,
-                # The adapter loads in-process via PeftModel, not vLLM, so warm it even under
-                # fast_inference (vLLM owns only the base model's download path).
+                # Adapter always loads in-process via PeftModel, so warm it even under fast_inference.
                 fast_inference = False,
                 force_download = kwargs.get("force_download", False),
-                # Leave use_safetensors auto: inheriting the base format could skip a safetensors-only
-                # adapter. adapter_only restricts the warm to the adapter's own files + root aux.
+                # Leave use_safetensors auto (inheriting base format could skip a safetensors-only
+                # adapter). adapter_only restricts the warm to the adapter files + root aux.
                 adapter_only = True,
             )
-            # Child already did the forced download; clear the flag so the load reuses the warm cache.
+            # Child did the forced download; clear the flag so the load reuses the warm cache.
             if _prefetched and kwargs.get("force_download", False):
                 kwargs["force_download"] = False
-            # Read the adapter from where the prefetch warmed it: forward cache_dir when set. subfolder
-            # is NOT forwarded (it targets the base checkpoint; an adapter usually lives at the root).
+            # Forward cache_dir so the load reads the warmed adapter. No subfolder (that targets the
+            # base checkpoint; adapters usually live at the root).
             peft_load_kwargs = {}
             if kwargs.get("cache_dir") is not None:
                 peft_load_kwargs["cache_dir"] = kwargs["cache_dir"]
diff --git a/unsloth/models/sentence_transformer.py b/unsloth/models/sentence_transformer.py
index f5d02c0ba7..38e51d8af1 100644
--- a/unsloth/models/sentence_transformer.py
+++ b/unsloth/models/sentence_transformer.py
@@ -1423,9 +1423,8 @@ def from_pretrained(
                 "Run `pip install sentence-transformers` to install it."
             )
 
-        # Validate the mutually-exclusive load modes BEFORE the prefetch (a config rejected locally must
-        # not first download many GB of weights). Only the non-for_inference path uses these flags; the
-        # for_inference branch below skips them, so guard the check to preserve its behavior.
+        # Validate the load modes BEFORE the prefetch so a bad config fails without downloading weights.
+        # Guarded by `not for_inference`: the for_inference branch below never uses these flags.
         if not for_inference:
             # sanity check, thanks Etherl:
             if full_finetuning and (load_in_4bit or load_in_8bit):
@@ -1445,11 +1444,9 @@ def from_pretrained(
                     "If you want 8bit finetuning, set both `load_in_16bit = False` and `load_in_8bit = True`"
                 )
 
-        # Pre-download in a killable subprocess (Xet -> HTTP on a stall) so the ST load below is a cache
-        # hit. weights_at_root stays False since ST component weights live in per-module subfolders.
-        # Resolve the cache the load uses: an explicit HF cache_dir wins (the FastModel fallback load
-        # forwards it), else cache_folder, else SENTENCE_TRANSFORMERS_HOME (which ST honors when
-        # cache_folder is unset), else the default HF cache -- a wrong-cache warm would be missed.
+        # Prefetch so the ST load below is a cache hit. weights_at_root stays False (ST component
+        # weights live in per-module subfolders). Resolve the same cache the load uses: HF cache_dir,
+        # else cache_folder, else SENTENCE_TRANSFORMERS_HOME, else default -- a wrong cache misses the warm.
         maybe_prefetch_hf_snapshot(
             model_name,
             token = token,
@@ -1490,10 +1487,8 @@ def from_pretrained(
                 if k in kwargs:
                     st_kwargs[k] = kwargs[k]
 
-            # ST takes cache_folder, not cache_dir. Map an explicit HF cache_dir onto cache_folder so this
-            # native load reads the cache the prefetch warmed above (cache_dir wins, else the caller's
-            # cache_folder; None lets ST honor SENTENCE_TRANSFORMERS_HOME, matching the prefetch) -- else a
-            # cache_dir warm is missed and the load starts an unprotected in-process Xet download.
+            # ST takes cache_folder, not cache_dir: map cache_dir onto it so this load hits the warm
+            # (None lets ST honor SENTENCE_TRANSFORMERS_HOME, matching the prefetch).
             _st_cache = kwargs.get("cache_dir") or kwargs.get("cache_folder")
             if _st_cache is not None:
                 st_kwargs["cache_folder"] = _st_cache
@@ -1501,7 +1496,7 @@ def from_pretrained(
             st_model = SentenceTransformer(model_name, **st_kwargs)
             return st_model
 
-        # Load-mode validation + full_finetuning normalization already ran before the prefetch above.
+        # Load-mode validation already ran before the prefetch above.
         if "auto_model" not in kwargs:
             kwargs["auto_model"] = AutoModel
 
@@ -1598,9 +1593,8 @@ def from_pretrained(
                 elif is_mpnet:
                     FastSentenceTransformer._patch_mpnet_v5()
 
-            # ST takes cache_folder, not cache_dir; map an explicit HF cache_dir onto it so this load reads
-            # the cache the prefetch warmed (cache_dir wins, else cache_folder; None lets ST honor
-            # SENTENCE_TRANSFORMERS_HOME, matching the prefetch) -- a mismatched cache would miss the warm.
+            # ST takes cache_folder, not cache_dir: map cache_dir onto it so this load hits the warm
+            # (None lets ST honor SENTENCE_TRANSFORMERS_HOME, matching the prefetch).
             st_model = SentenceTransformer(
                 model_name,
                 device = st_device,
@@ -1714,8 +1708,7 @@ def _push_to_hub_merged(self, repo_id, **push_kwargs):
 
         # No modules.json -> force 16-bit: saving is custom for these models and
         # 4-bit would need dequant in save_pretrained_merged, not worth it.
-        # Resolve the same cache the prefetch warmed: hf_hub_download (used here and by
-        # _load_modules) ignores SENTENCE_TRANSFORMERS_HOME, so passing bare cache_folder would miss it.
+        # Resolve the warmed cache: hf_hub_download ignores SENTENCE_TRANSFORMERS_HOME, so pass it as cache_dir.
         has_modules_json = (
             FastSentenceTransformer._module_path(
                 model_name,
@@ -1736,10 +1729,8 @@ def _push_to_hub_merged(self, repo_id, **push_kwargs):
             load_in_4bit = False
             load_in_16bit = True
 
-        # The fallback FastModel weight load resolves its cache from the HF cache_dir, not ST's
-        # cache_folder / SENTENCE_TRANSFORMERS_HOME. Point it at the SAME cache the prefetch warmed above,
-        # else the weights miss the warm and start an unprotected in-process Xet download. Only set it when
-        # a custom ST cache is in play and the caller passed no explicit HF cache_dir (which wins).
+        # The fallback FastModel load reads HF cache_dir, not ST's cache_folder/SENTENCE_TRANSFORMERS_HOME.
+        # Point it at the warmed cache, but only when no explicit cache_dir was passed (which wins).
         _st_cache_dir = kwargs.get("cache_folder") or os.environ.get("SENTENCE_TRANSFORMERS_HOME")
         if _st_cache_dir is not None and "cache_dir" not in kwargs:
             kwargs["cache_dir"] = _st_cache_dir
@@ -1789,10 +1780,7 @@ def _push_to_hub_merged(self, repo_id, **push_kwargs):
             cache_dir = kwargs.get("cache_dir")
             or kwargs.get("cache_folder")
             or os.environ.get("SENTENCE_TRANSFORMERS_HOME"),
-            # Read the modules from the SAME revision the model weights load from (FastModel forwards
-            # revision to the weight load), so a revision-pinned repo hits the prefetch's warm instead
-            # of fetching default-branch module files in-process over Xet (and mixing them with the
-            # revision-pinned weights). A None revision resolves to the default branch as before.
+            # Same revision as the weight load so modules hit the warm (None = default branch).
             revision = revision,
         )
 
diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py
index 01cf6aee1c..6b6dae69f4 100644
--- a/unsloth/models/vision.py
+++ b/unsloth/models/vision.py
@@ -805,10 +805,8 @@ def from_pretrained(
         # For debugging - we use a download counter to see if environments are not breaking or if HF is down
         get_statistics(kwargs.get("local_files_only", False))
 
-        # NOTE: the base + tokenizer prefetch (the Xet -> HTTP stall fallback warm) runs AFTER the
-        # load-mode validation below, so an invalid load_in_4bit/8bit/16bit combination fails locally
-        # without first downloading a multi-GB snapshot. See the maybe_prefetch_hf_snapshot block placed
-        # right after that check.
+        # NOTE: the base + tokenizer prefetch runs AFTER the load-mode validation below, so an invalid
+        # load_in_* combination fails without first downloading a snapshot. See the block after that check.
 
         if dtype is None:
             dtype = torch.float16 if not SUPPORTS_BFLOAT16 else torch.bfloat16
@@ -907,12 +905,10 @@ def from_pretrained(
                 "Unsloth: Can only load in 4bit or 8bit or 16bit, not a combination!"
             )
 
-        # Pre-download the repo in a killable subprocess (Xet -> HTTP on a no-progress stall) so the
-        # in-process load below is a cache hit and cannot hang. Runs AFTER the load-mode validation
-        # above so an invalid load_in_* combination fails without first pulling a multi-GB snapshot.
-        # vLLM owns the weight download only when actually available; if fast_inference was requested
-        # but vLLM is missing, the load falls through to the in-process HF path (fast_inference_setup
-        # flips the flag below), so the weights must still be warmed here. Resolve availability now.
+        # Prefetch the repo (killable child) so the in-process load below is a cache hit. Runs after
+        # the load-mode validation above. vLLM owns the weight download only when actually available;
+        # if fast_inference was requested but vLLM is missing, the load falls through in-process, so
+        # weights must still be warmed here. Resolve availability now.
         _vllm_owns_weights = fast_inference and is_vLLM_available()
         _prefetched = maybe_prefetch_hf_snapshot(
             model_name,
@@ -926,21 +922,19 @@ def from_pretrained(
             use_safetensors = kwargs.get("use_safetensors"),
             from_tf = kwargs.get("from_tf", False),
             from_flax = kwargs.get("from_flax", False),
-            # Bare load reads only ROOT weights; skip subdir weights (fp16/, experimental/). Ignored
-            # when a subfolder is set.
+            # Bare load reads only ROOT weights; skip subdir weights. Ignored when a subfolder is set.
             weights_at_root = True,
             variant = kwargs.get("variant"),  # forward so the warm keeps the variant .bin
             gguf_file = kwargs.get(
                 "gguf_file"
             ),  # forward so the warm fetches the GGUF (else ignored)
         )
-        # Child already did the forced download; clear the flag so the load reuses the warm cache.
+        # Child did the forced download; clear the flag so the load reuses the warm cache.
         if _prefetched and kwargs.get("force_download", False):
             kwargs["force_download"] = False
 
-        # Warm a SEPARATE tokenizer repo (explicit tokenizer_name); when it is model_name it is already
-        # covered. Do NOT warm model_name here on the vLLM path: this runs before fast_inference_setup may
-        # remap "*-unsloth-bnb-4bit" -> "*-bnb-4bit", so it would warm the wrong repo.
+        # Warm a SEPARATE tokenizer repo only (model_name is covered above). Not model_name here: this
+        # runs before fast_inference_setup may remap the repo, so it would warm the wrong one.
         _tokenizer_repo = (
             tokenizer_name if (isinstance(tokenizer_name, str) and tokenizer_name) else model_name
         )
@@ -1257,11 +1251,9 @@ def from_pretrained(
         # Counteract saved tokenizers
         tokenizer_name = model_name if tokenizer_name is None else tokenizer_name
 
-        # On the vLLM path the base warm above was skipped (vLLM owns the weight download) and the
-        # tokenizer warm was deferred because fast_inference_setup may remap model_name. Now that the
-        # final tokenizer repo is known, warm it (tokenizer-only) so the in-process processor / tokenizer
-        # load below is a cache hit rather than an unprotected in-process Xet fetch. A re-warm of an
-        # already-cached repo (or a local path) is a fast no-op.
+        # On the vLLM path the tokenizer warm was deferred (fast_inference_setup may remap model_name).
+        # Now that the final tokenizer repo is known, warm it so the processor/tokenizer load below
+        # hits the cache. Re-warming an already-cached repo (or a local path) is a fast no-op.
         if _vllm_owns_weights and isinstance(tokenizer_name, str) and tokenizer_name:
             maybe_prefetch_hf_snapshot(
                 tokenizer_name,
diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py
index db3d3253ae..3a91ef188d 100644
--- a/unsloth/tokenizer_utils.py
+++ b/unsloth/tokenizer_utils.py
@@ -564,10 +564,10 @@ def _load_correct_tokenizer(
         # Let's utilize them
         cache_dir = os.path.join(KAGGLE_TMP, cache_dir)
     elif cache_dir == "huggingface_tokenizers_cache":
-        # This default name is only meaningful on Colab/Kaggle; elsewhere use the HF default cache.
+        # This default name is Colab/Kaggle-only; elsewhere use the HF default cache.
         cache_dir = None
-    # else: a caller-supplied cache_dir is preserved, so the tokenizer loads from the directory the
-    # snapshot prefetch warmed rather than risking an in-process Hub/Xet transfer on a default-cache miss.
+    # else: keep a caller-supplied cache_dir so the tokenizer loads from the prefetch-warmed dir instead
+    # of risking an in-process Hub/Xet transfer.
 
     # Try loading the slow tokenizer. If it fails, then try Fast only
     # Mainly to solve Deepseek models with no tokenizer.model file
@@ -1417,8 +1417,8 @@ def check_tokenizer(
                     f"Fix your tokenizer since it'll perform out of bounds memory accesses."
                 )
 
-            # Honor a caller-supplied cache_dir so the repair reload reuses any warmed cache; else fall
-            # back to the Colab/Kaggle sentinel (HF default elsewhere), as load_correct_tokenizer does.
+            # Reuse a caller-supplied cache_dir (warmed cache) for the repair reload; else the
+            # Colab/Kaggle sentinel (HF default elsewhere), as load_correct_tokenizer does.
             reload_cache_dir = cache_dir
             if reload_cache_dir is None and (IS_COLAB_ENVIRONMENT or IS_KAGGLE_ENVIRONMENT):
                 reload_cache_dir = "huggingface_tokenizers_cache"

From 9889390b12fc33d216c05a49c87867d5d06a1bf1 Mon Sep 17 00:00:00 2001
From: danielhanchen <danielhanchen@gmail.com>
Date: Fri, 3 Jul 2026 13:41:12 +0000
Subject: [PATCH 80/80] Warm SentenceTransformer force_download in the killable
 child

From a consolidated review pass (12x reviewer.py personas + 3 forks). The
SentenceTransformer prefetch now forwards force_download and clears it after a
successful warm, matching the llama / vision / diffusion sites. Before, a
force_download=True ST load warmed without forcing, so a stale cache could be
reused or the refresh could fall to an unguarded in-process Hub/Xet download.

Generalises the ST prefetch-wiring AST test to accept the return-captured call.
Prefetch suite 43 passed.

Not taken: the llama / vision base-load prefetch omitting revision is correct --
revision is a named from_pretrained parameter that never reaches the in-process
AutoModel load (which resolves model_name on its default branch), so the warm
matches the load; forwarding it would warm the wrong branch. The degraded Studio
shim dropping local_files_only is not production-reachable (its GGUF callers never
pass it, and the public snapshot wrapper is test-only).
---
 tests/test_prefetch_snapshot_scope.py  | 21 ++++++++++++---------
 unsloth/models/sentence_transformer.py |  7 ++++++-
 2 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/tests/test_prefetch_snapshot_scope.py b/tests/test_prefetch_snapshot_scope.py
index 9fc2878c17..9f975cea2f 100644
--- a/tests/test_prefetch_snapshot_scope.py
+++ b/tests/test_prefetch_snapshot_scope.py
@@ -823,22 +823,25 @@ def test_sentence_transformer_from_pretrained_is_prefetch_wired():
     )
     fp = next(n for n in cls.body if isinstance(n, ast.FunctionDef) and n.name == "from_pretrained")
 
-    def _is_prefetch_call(node):
-        return (
-            isinstance(node, ast.Expr)
-            and isinstance(node.value, ast.Call)
-            and isinstance(node.value.func, ast.Name)
-            and node.value.func.id == "maybe_prefetch_hf_snapshot"
-        )
+    def _prefetch_call(node):
+        # Prefetch Call node from a bare call or a captured one (_st_prefetched = ...), else None.
+        value = node.value if isinstance(node, (ast.Expr, ast.Assign)) else None
+        if (
+            isinstance(value, ast.Call)
+            and isinstance(value.func, ast.Name)
+            and value.func.id == "maybe_prefetch_hf_snapshot"
+        ):
+            return value
+        return None
 
-    prefetch_pos = next((i for i, n in enumerate(fp.body) if _is_prefetch_call(n)), None)
+    prefetch_pos = next((i for i, n in enumerate(fp.body) if _prefetch_call(n)), None)
     return_pos = next((i for i, n in enumerate(fp.body) if isinstance(n, ast.Return)), len(fp.body))
     assert (
         prefetch_pos is not None
     ), "from_pretrained must call maybe_prefetch_hf_snapshot at top level"
     assert prefetch_pos < return_pos, "prefetch must run before any top-level return"
     # local_files_only must be forwarded so an offline load does not start a Hub download.
-    prefetch_call = fp.body[prefetch_pos].value
+    prefetch_call = _prefetch_call(fp.body[prefetch_pos])
     assert "local_files_only" in {
         kw.arg for kw in prefetch_call.keywords
     }, "prefetch must forward local_files_only"
diff --git a/unsloth/models/sentence_transformer.py b/unsloth/models/sentence_transformer.py
index 38e51d8af1..c1172faa94 100644
--- a/unsloth/models/sentence_transformer.py
+++ b/unsloth/models/sentence_transformer.py
@@ -1447,7 +1447,7 @@ def from_pretrained(
         # Prefetch so the ST load below is a cache hit. weights_at_root stays False (ST component
         # weights live in per-module subfolders). Resolve the same cache the load uses: HF cache_dir,
         # else cache_folder, else SENTENCE_TRANSFORMERS_HOME, else default -- a wrong cache misses the warm.
-        maybe_prefetch_hf_snapshot(
+        _st_prefetched = maybe_prefetch_hf_snapshot(
             model_name,
             token = token,
             revision = revision,
@@ -1455,7 +1455,12 @@ def from_pretrained(
             or kwargs.get("cache_folder")
             or os.environ.get("SENTENCE_TRANSFORMERS_HOME"),
             local_files_only = kwargs.get("local_files_only", False),
+            # Forward force_download so the refresh happens in the killable child, then clear it so the
+            # in-process ST load reuses the warm cache instead of re-downloading over unguarded Xet.
+            force_download = kwargs.get("force_download", False),
         )
+        if _st_prefetched and kwargs.get("force_download", False):
+            kwargs["force_download"] = False
 
         # if for_inference == True, skip Unsloth optimizations to avoid torch compile issues
         if for_inference: