Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 40 additions & 0 deletions unsloth/models/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,45 @@ def _get_user_task_config_attrs(user_config):
"gpt_oss",
]

# Architectures with gated-deltanet (linear attention) layers. Unsloth bundles the
# flash-linear-attention Triton kernels (unsloth_zoo/_vendored/fla), so no install is
# needed; transformers uses the much slower pure PyTorch path only when they can't be enabled.
# Entries are matched with str.startswith, so each one covers every model_type beginning with it.
FLA_MODEL_TYPE_PREFIXES = ("qwen3_next", "qwen3_5", "kimi_linear")
# Set to True once the slow-path advisory has been printed; later calls return early.
_fla_advised = False


def _maybe_advise_fla_install(model_types):
"""One-time note when a gated-deltanet model loads without the fast kernels.

The kernels ship with Unsloth (no install needed); this fires only when they
could not be enabled on this platform (e.g. no CUDA, torch < 2.7 or
triton < 3.3), i.e. exactly when transformers uses the slow pure PyTorch path.
"""
global _fla_advised
if _fla_advised:
return
if model_types is None:
return
if isinstance(model_types, str):
model_types = [model_types] # a lone string would otherwise iterate chars
try:
if not any(
isinstance(t, str) and t.startswith(FLA_MODEL_TYPE_PREFIXES) for t in model_types
):
return
from transformers.utils.import_utils import is_flash_linear_attention_available
if is_flash_linear_attention_available():
return # bundled (or user-installed) fast kernels are active
except Exception:
return
Comment on lines +213 to +236

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

To make _maybe_advise_fla_install more robust and defensive, we should handle cases where model_types is None, empty, or a single string (which would otherwise cause character-by-character iteration and fail to match).

Additionally, to ensure this advisory is triggered regardless of the loading path, consider calling _maybe_advise_fla_install(model_types) in FastLanguageModel.from_pretrained (around line 620) right after model_types is resolved.

def _maybe_advise_fla_install(model_types):
    """Print a one-time advisory when a gated-deltanet model is loaded without fla."""
    global _fla_advised
    if _fla_advised or not model_types:
        return
    if isinstance(model_types, str):
        model_types = [model_types]
    try:
        if not any(
            isinstance(t, str) and t.startswith(FLA_MODEL_TYPE_PREFIXES) for t in model_types
        ):
            return
        if importlib.util.find_spec("fla") is not None:
            return
    except Exception:
        return

Comment on lines +227 to +236

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

If is_flash_linear_attention_available fails to import (for example, on older transformers versions that do not define it, or because of any other ImportError), the exception is raised inside the try block, caught by except Exception:, and the function silently returns. This prevents the warning from being printed, even though the fast kernels are not active and the model will run on the slower pure PyTorch path.

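To make this more robust, we should isolate the import and check of is_flash_linear_attention_available so that any failure/absence of the import is treated as fla_available = False, allowing the warning to be printed correctly. Additionally, we should avoid broad, silent exception handlers (like except Exception:) and instead catch specific exceptions (like ModuleNotFoundError for optional imports) and log them at a debug level to aid in troubleshooting. Note that when transformers is installed but does not define is_flash_linear_attention_available (the older-version case above), the from-import raises plain ImportError rather than ModuleNotFoundError, so the handler must cover ImportError for that case to reach the fla_available = False fallback.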

Suggested change
try:
if not any(
isinstance(t, str) and t.startswith(FLA_MODEL_TYPE_PREFIXES) for t in model_types
):
return
from transformers.utils.import_utils import is_flash_linear_attention_available
if is_flash_linear_attention_available():
return # bundled (or user-installed) fast kernels are active
except Exception:
return
try:
if not any(
isinstance(t, str) and t.startswith(FLA_MODEL_TYPE_PREFIXES) for t in model_types
):
return
except Exception as e:
import logging
logging.getLogger(__name__).debug("Error checking model types: %s", e)
return
try:
from transformers.utils.import_utils import is_flash_linear_attention_available
fla_available = is_flash_linear_attention_available()
except ModuleNotFoundError as e:
import logging
logging.getLogger(__name__).debug("transformers.utils.import_utils or is_flash_linear_attention_available not found: %s", e)
fla_available = False
if fla_available:
return # bundled (or user-installed) fast kernels are active
References
  1. Avoid using broad, silent exception handlers like except Exception: pass. Instead, log the exception, even if at a debug level, to aid in future debugging.
  2. When catching an ImportError for an optional dependency, prefer catching the more specific ModuleNotFoundError and check the module name to avoid suppressing unrelated import errors.
  3. When handling exceptions, avoid broad except Exception: pass clauses. Instead, catch specific exceptions and log them (at least at a debug level) to aid in troubleshooting. If a failure is expected, log the specific exception type and its details.

_fla_advised = True
print(
"Unsloth: This model uses gated-deltanet linear attention layers. Unsloth\n"
"bundles the flash-linear-attention kernels, but they could not be enabled\n"
"on this setup (they need CUDA with torch >= 2.7 and triton >= 3.3), so\n"
"transformers will use a slower pure PyTorch path."
)


def _fix_rope_inv_freq(model):
"""Fix inv_freq corruption caused by transformers v5 meta-device loading.
Expand Down Expand Up @@ -1263,6 +1302,7 @@ def _dispatch_diffusion():
trust_remote_code = trust_remote_code,
)
model_types_all = ",".join(model_types) + ","
_maybe_advise_fla_install(model_types)

# ---- Text-diffusion models (e.g. DiffusionGemma) take a transformers-only slow path. ----
# These use a custom block-diffusion `generate` and a novel backbone, so we skip Unsloth's
Expand Down
Loading