diff --git a/tests/test_dora_merge.py b/tests/test_dora_merge.py new file mode 100644 index 000000000..aac36415f --- /dev/null +++ b/tests/test_dora_merge.py @@ -0,0 +1,107 @@ +"""DoRA (use_dora=True) merge support in the safetensors merge path. + +The dense merge must (1) no longer raise a key-mismatch on a DoRA adapter (the magnitude vector +is now captured), and (2) produce the same merged weight as PEFT's own DoRA merge. MoE-expert +DoRA is explicitly refused (fail loud) rather than silently dropping the magnitude. +""" +import copy + +import pytest +import torch +import torch.nn as nn + +from unsloth_zoo.saving_utils import create_lora_statistics, _merge_lora, LoraStats + + +class _Tiny(nn.Module): + def __init__(self, d_in=32, d_out=24): + super().__init__() + self.q_proj = nn.Linear(d_in, d_out, bias=False) + + def forward(self, x): + return self.q_proj(x) + + +def _find_q_stats(lora_weights): + for v in lora_weights.values(): + if v.lora_A is not None and v.lora_B is not None: + return v + return None + + +def test_dora_merge_matches_peft(): + from peft import LoraConfig, get_peft_model + + torch.manual_seed(0) + base = _Tiny().to(torch.float32) + W0 = base.q_proj.weight.detach().clone() + + cfg = LoraConfig(r=8, lora_alpha=16, target_modules=["q_proj"], use_dora=True) + pm = get_peft_model(copy.deepcopy(base), cfg) + + # Give the adapter a non-trivial delta and magnitude so DoRA actually rescales. + for n, p in pm.named_parameters(): + if n.endswith("lora_B.default.weight"): + with torch.no_grad(): + p.copy_(torch.randn_like(p) * 0.1) + if n.endswith("lora_magnitude_vector.default.weight"): + with torch.no_grad(): + p.add_(torch.randn_like(p) * 0.1) + + # Ground truth: PEFT's own DoRA merge. 
def test_plain_lora_unaffected():
    """Plain LoRA (use_dora=False) has no magnitude and merges as W0 + alpha * (B @ A)."""
    from peft import LoraConfig, get_peft_model

    torch.manual_seed(1)
    tiny = _Tiny().to(torch.float32)
    base_weight = tiny.q_proj.weight.detach().clone()
    lora_config = LoraConfig(
        r=8,
        lora_alpha=16,
        target_modules=["q_proj"],
        use_dora=False,
    )
    peft_model = get_peft_model(copy.deepcopy(tiny), lora_config)

    # Overwrite lora_B with small random values so the adapter delta is non-trivial.
    with torch.no_grad():
        for param_name, param in peft_model.named_parameters():
            if param_name.endswith("lora_B.default.weight"):
                param.copy_(torch.randn_like(param) * 0.1)

    # create_lora_statistics may hand back either the stats mapping or a tuple led by it.
    outcome = create_lora_statistics(
        peft_model, merge_into_original=True, return_state_dict=True
    )
    if isinstance(outcome, tuple):
        lora_weights = outcome[0]
    else:
        lora_weights = outcome

    stats = _find_q_stats(lora_weights)
    assert stats is not None
    assert stats.magnitude is None

    merged = _merge_lora(base_weight.clone(), stats, "q_proj").cpu().float()
    low_rank_delta = stats.lora_B.float() @ stats.lora_A.float()
    expected = base_weight.float() + stats.alpha * low_rank_delta
    assert torch.allclose(merged, expected, atol=1e-5)
_merge_moe_fused_gate_up_expert + + E, rank, H, I = 4, 4, 8, 6 + gate_up_W = torch.randn(E, 2 * I, H) + A = torch.randn(E * rank, H) + B = torch.randn(2 * I, E * rank) + stats = LoraStats(None, A, B, 1.0, magnitude=torch.randn(2 * I)) + with pytest.raises(RuntimeError, match="DoRA"): + _merge_moe_fused_gate_up_expert(gate_up_W, stats, torch.float32) diff --git a/unsloth_zoo/saving_utils.py b/unsloth_zoo/saving_utils.py index f42c0f77c..38700286b 100644 --- a/unsloth_zoo/saving_utils.py +++ b/unsloth_zoo/saving_utils.py @@ -243,6 +243,19 @@ def _merge_lora(W, lora_stats, name, use_dequant_base = False): W = W_new.addmm_(lora_B, lora_A, alpha=lora_stats.alpha) else: W = W.addmm_(lora_B, lora_A, alpha=lora_stats.alpha) + # DoRA: rescale the merged direction to the learned magnitude. With delta = alpha*(B@A), + # PEFT's DoRA merge is (m / ||W0 + delta||_row) * (W0 + delta), one L2 norm per output row + # over the input dim. W already holds W0 + delta here, so fold m onto it. + magnitude = getattr(lora_stats, "magnitude", None) + if magnitude is not None: + magnitude = magnitude.to(device, dtype = torch.float32, non_blocking = True).reshape(-1) + if magnitude.shape[0] != W.shape[0]: + raise ValueError( + f"Unsloth: DoRA magnitude for `{name}` has {magnitude.shape[0]} entries but the " + f"merged weight has {W.shape[0]} output rows." 
+ ) + weight_norm = torch.linalg.norm(W, dim = 1).clamp_min(1e-9) + W = (magnitude / weight_norm).unsqueeze(1) * W if not torch.isfinite(torch.amax(W)).item(): raise ValueError('Unsloth: Merge failed as there are infinite elements in ' + name) return W @@ -332,6 +345,7 @@ class LoraStats: lora_A : torch.Tensor lora_B : torch.Tensor alpha : float + magnitude : object = None # DoRA lora_magnitude_vector weight (None for plain LoRA) pass @@ -344,13 +358,15 @@ def assert_same_keys(model, new_state_dict): def _should_ignore(key: str) -> bool: # Ignore helper wrappers and raw LoRA adapter tensors; the merged - # state_dict intentionally omits lora_A / lora_B weights. + # state_dict intentionally omits lora_A / lora_B / DoRA magnitude weights + # (the magnitude is folded into the merged weight in _merge_lora). return ( "modules_to_save" in key or "original_module" in key or ".lora_A" in key or ".lora_B" in key or ".lora_embedding" in key + or ".lora_magnitude_vector" in key ) def _normalize(key: str) -> str: @@ -440,6 +456,12 @@ def create_lora_statistics(model, merge_into_original = False, return_state_dict lora_B_count += 1 expand_module_keys(name, module, remove_keys) + elif name.endswith(".lora_magnitude_vector.default"): + # DoRA magnitude vector m; folded onto the merged weight in _merge_lora. Register its + # key so the key-consistency check does not flag it (the merged model omits it). + lora_weights[name[:-len(".lora_magnitude_vector.default")]].magnitude = module.weight + expand_module_keys(name, module, remove_keys) + elif isinstance(module, Linear_LoRA_Layers): lora_weights[name].alpha = _get_lora_scaling(module) scaling_count += 1 @@ -1179,10 +1201,24 @@ def _detect_moe_lora_layout(lora_A, lora_B, num_experts, out_dim, in_dim, lora_m return "unknown", r +def _refuse_dora_on_moe(lora_stats): + """DoRA on MoE experts is not yet supported: the expert merge helpers fold only the LoRA + delta, not the DoRA magnitude (the dense path handles it in _merge_lora). 
Fail loud rather + than emit a checkpoint with the magnitude silently dropped.""" + if getattr(lora_stats, "magnitude", None) is not None: + raise RuntimeError( + "Unsloth: DoRA (use_dora=True) merging is not yet supported for MoE expert layers. " + "Fine-tune only the non-expert (attention/MLP) layers with DoRA, or open an issue at " + "https://github.com/unslothai/unsloth/issues." + ) +pass + + def _merge_moe_gate_or_up_expert(W, lora_stats, expert_idx, num_experts, output_dtype, *, role): """Per-expert merge for gate_proj/up_proj (role='gate' -> first I, 'up' -> last I).""" if lora_stats is None or lora_stats.lora_A is None or lora_stats.lora_B is None: return W + _refuse_dora_on_moe(lora_stats) _MOE_MERGE_STATE["attempted"] += 1 try: num_experts = _resolve_num_experts_from_lora_stats(lora_stats, num_experts) @@ -1266,6 +1302,7 @@ def _merge_moe_up_expert(up_W, lora_stats, expert_idx, num_experts, output_dtype def _merge_moe_down_proj_expert(down_W, lora_stats, expert_idx, num_experts, output_dtype): if lora_stats is None or lora_stats.lora_A is None or lora_stats.lora_B is None: return down_W + _refuse_dora_on_moe(lora_stats) _MOE_MERGE_STATE["attempted"] += 1 try: num_experts = _resolve_num_experts_from_lora_stats(lora_stats, num_experts) @@ -1650,6 +1687,7 @@ def _merge_moe_fused_gate_up_expert(gate_up_W, lora_stats, output_dtype, is_tran - Standard (Gemma4): (E, 2*I, H) with lora_A (E*R, H), lora_B (2*I, E*R) is_transposed: if provided, overrides dimension-based heuristic (needed when dims are equal). """ + _refuse_dora_on_moe(lora_stats) _MOE_MERGE_STATE["attempted"] += 1 try: if lora_stats.lora_A is None or lora_stats.lora_B is None: @@ -1733,6 +1771,7 @@ def _merge_moe_fused_down_proj_expert(down_W, lora_stats, output_dtype, is_trans - Standard (Gemma4): (E, H, I) with lora_A (E*R, H), lora_B (I, E*R) is_transposed: if provided, overrides dimension-based heuristic (needed when H==I). 
""" + _refuse_dora_on_moe(lora_stats) _MOE_MERGE_STATE["attempted"] += 1 try: if lora_stats.lora_A is None or lora_stats.lora_B is None: