Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 58 additions & 0 deletions src/transformers/conversion_mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -1479,6 +1479,64 @@ def _build_checkpoint_conversion_mapping():
WeightRenaming("mlp.shared_expert.", "mlp.shared_experts."),
]

# Conversion rules for the `internvl` model type.
#
# Original `OpenGVLab/InternVL2-*` (`internvl_chat`) checkpoints store a different
# weight layout than the native `InternVLForConditionalGeneration`. These rules map
# the original names onto the native ones (and split the fused vision `attn.qkv`), so
# the checkpoints load without remote code. The native `llava`-style rules are appended
# so already-converted InternVL checkpoints keep loading through the same mapping.
#
# NOTE(review): the leaf renames and the qkv converters below only line up if the rules
# are applied in list order (prefix rename -> leaf rename -> split) — TODO confirm
# against the loader.
mapping["internvl"] = [
# Language model (Qwen2 / InternLM2 backbone): only the key prefix is rewritten.
# NOTE(review): no rule visible in this list renames parameters *inside* the language
# model, so this presumably relies on the backbone already using native parameter
# names — confirm for InternLM2-based checkpoints.
WeightRenaming(source_patterns=r"^language_model\.lm_head", target_patterns="lm_head"),
WeightRenaming(source_patterns=r"^language_model\.model\.", target_patterns="model.language_model."),
# Multi-modal projector (`mlp1.{0,1,3}` -> layer_norm / linear_1 / linear_2).
# `mlp1.2` has no rule; presumably it is a parameter-free activation — TODO confirm.
WeightRenaming(source_patterns=r"^mlp1\.0\.", target_patterns="model.multi_modal_projector.layer_norm."),
WeightRenaming(source_patterns=r"^mlp1\.1\.", target_patterns="model.multi_modal_projector.linear_1."),
WeightRenaming(source_patterns=r"^mlp1\.3\.", target_patterns="model.multi_modal_projector.linear_2."),
# Vision embeddings: class token, position embeddings and patch projection.
WeightRenaming(
source_patterns=r"^vision_model\.embeddings\.class_embedding",
target_patterns="model.vision_tower.embeddings.cls_token",
),
WeightRenaming(
source_patterns=r"^vision_model\.embeddings\.position_embedding",
target_patterns="model.vision_tower.embeddings.position_embeddings",
),
WeightRenaming(
source_patterns=r"^vision_model\.embeddings\.patch_embedding\.",
target_patterns="model.vision_tower.embeddings.patch_embeddings.projection.",
),
# Vision encoder: prefix + `layers` -> `layer`, then per-layer leaf renames.
WeightRenaming(
source_patterns=r"^vision_model\.encoder\.layers\.",
target_patterns="model.vision_tower.encoder.layer.",
),
# NOTE(review): the leaf patterns below are not anchored to the vision tower prefix;
# verify that no language-model key contains `.attn.proj.`, `.attn.qkv.`, `.norm1.`,
# `.norm2.` or ends in `.ls1` / `.ls2`.
WeightRenaming(source_patterns=r"\.attn\.proj\.", target_patterns=".attention.projection_layer."),
# Intermediate name only: `.attention.qkv.` is what the two converters below match on.
WeightRenaming(source_patterns=r"\.attn\.qkv\.", target_patterns=".attention.qkv."),
WeightRenaming(source_patterns=r"\.norm1\.", target_patterns=".layernorm_before."),
WeightRenaming(source_patterns=r"\.norm2\.", target_patterns=".layernorm_after."),
# Layer-scale parameters are bare tensors (no `.weight` suffix), hence the `$` anchor.
WeightRenaming(source_patterns=r"\.ls1$", target_patterns=".lambda_1"),
WeightRenaming(source_patterns=r"\.ls2$", target_patterns=".lambda_2"),
# Split the fused vision attention `qkv` into `q_proj`/`k_proj`/`v_proj`.
# `Chunk(dim=0)` presumably cuts the tensor into one equal slice per target, in the
# order the targets are listed (q, k, v) — TODO confirm against `Chunk`.
WeightConverter(
source_patterns="attention.qkv.weight",
target_patterns=[
"attention.q_proj.weight",
"attention.k_proj.weight",
"attention.v_proj.weight",
],
operations=[Chunk(dim=0)],
),
# Same split for the fused bias.
WeightConverter(
source_patterns="attention.qkv.bias",
target_patterns=[
"attention.q_proj.bias",
"attention.k_proj.bias",
"attention.v_proj.bias",
],
operations=[Chunk(dim=0)],
),
# `+` already builds a new list, so the `.copy()` is a redundant (harmless) shallow copy;
# either way `mapping["llava"]` itself is left untouched.
] + mapping["llava"].copy()

for model_type, base_pattern in _MODEL_TO_CONVERSION_PATTERN.items():
if model_type in mapping:
continue
Expand Down
3 changes: 3 additions & 0 deletions src/transformers/models/auto/configuration_auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@
CONFIG_MAPPING_NAMES.update(
{
"EvollaModel": "EvollaConfig",
# Original `OpenGVLab/InternVL2-*` checkpoints; loaded via the native InternVL.
"internvl_chat": "InternVLConfig",
"mlcd": "MLCDVisionConfig",
"parakeet_tdt": "ParakeetTDTConfig",
"vibevoice_acoustic_tokenizer_decoder": "VibeVoiceAcousticTokenizerDecoderConfig",
Expand All @@ -50,6 +52,7 @@
SPECIAL_MODEL_TYPE_TO_MODULE_NAME.update(
{
"EvollaModel": "evolla",
"internvl_chat": "internvl",
"parakeet_tdt": "parakeet",
"vibevoice_acoustic_tokenizer_encoder": "vibevoice_acoustic_tokenizer",
"vibevoice_acoustic_tokenizer_decoder": "vibevoice_acoustic_tokenizer",
Expand Down
60 changes: 60 additions & 0 deletions src/transformers/models/internvl/configuration_internvl.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,65 @@
# limitations under the License.


import copy

from huggingface_hub.dataclasses import strict

from ...configuration_utils import PreTrainedConfig
from ...utils import auto_docstring
from ..auto import CONFIG_MAPPING, AutoConfig


# Language backbone architecture -> (native text model_type, InternVL image token id).
# The original `internvl_chat` checkpoints do not store these natively, so they are
# looked up from the first entry of `llm_config["architectures"]` when an original
# config is converted. Note that the InternLM2 backbone is mapped onto the native
# `llama` model type.
_INTERNVL_CHAT_LM_MAPPING = {
"Qwen2ForCausalLM": ("qwen2", 151667),
"InternLM2ForCausalLM": ("llama", 92546),
}


def _convert_internvl_chat_config_dict(config_dict: dict) -> dict:
    """Translate an original ``internvl_chat`` config dict into the native layout.

    ``OpenGVLab/InternVL2-*`` checkpoints ship a bespoke ``internvl_chat`` config:
    the sub-configs live under ``llm_config``/``vision_config``, the vision part
    uses ``intern_vit_6b`` field names, and the vision feature layer is stored as
    ``select_layer``. This helper rewrites those onto the fields ``InternVLConfig``
    expects so the checkpoint can be loaded by the native implementation instead of
    remote code. It is intended to mirror the offline conversion in
    ``convert_internvl_weights_to_hf.py``.

    Args:
        config_dict: The raw config dictionary of an original checkpoint. It is
            deep-copied first, so the caller's dictionary is never modified.

    Returns:
        A new dictionary holding exactly ``vision_config``, ``text_config``,
        ``image_token_id``, ``downsample_ratio``, ``vision_feature_layer`` and
        ``tie_word_embeddings``. Every other top-level key of the input is dropped.
    """
    source = copy.deepcopy(config_dict)
    text_section = source.get("llm_config") or {}
    vision_section = source.get("vision_config") or {}

    # --- Text backbone -----------------------------------------------------------
    # The first listed architecture decides the native model type and image token id;
    # a missing/empty `architectures` entry is treated as Qwen2.
    # NOTE(review): an architecture that is not in the table also falls back silently
    # to the Qwen2 pair — confirm this is intended rather than an error case.
    architectures = text_section.get("architectures") or ["Qwen2ForCausalLM"]
    text_model_type, image_token_id = _INTERNVL_CHAT_LM_MAPPING.get(architectures[0], ("qwen2", 151667))
    text_section["model_type"] = text_model_type
    # Only fill in `use_cache` when the checkpoint did not specify it.
    if "use_cache" not in text_section:
        text_section["use_cache"] = True

    # --- Vision tower: InternViT -> InternVLVisionConfig field renames -------------
    # Each legacy field is removed and its value re-stored under one or more native
    # names (the single legacy dropout value feeds both native dropout fields).
    field_renames = (
        ("attention_probs_dropout_prob", ("attention_dropout", "projection_dropout")),
        ("qk_normalization", ("use_qk_norm",)),
        ("qkv_bias", ("attention_bias",)),
    )
    for legacy_name, native_names in field_renames:
        if legacy_name not in vision_section:
            continue
        legacy_value = vision_section.pop(legacy_name)
        for native_name in native_names:
            vision_section[native_name] = legacy_value
    vision_section["use_absolute_position_embeddings"] = True

    # Keep only the keys annotated directly on `InternVLVisionConfig`; anything else
    # from the original vision config is discarded.
    known_fields = set(InternVLVisionConfig.__annotations__)
    native_vision = {name: value for name, value in vision_section.items() if name in known_fields}

    native_config = {
        "vision_config": native_vision,
        "text_config": text_section,
        "image_token_id": image_token_id,
        "downsample_ratio": source.get("downsample_ratio", 0.5),
        "vision_feature_layer": source.get("select_layer", -1),
        "tie_word_embeddings": text_section.get("tie_word_embeddings", False),
    }
    return native_config


@auto_docstring(checkpoint="OpenGVLab/InternVL3-1B-hf")
@strict
class InternVLVisionConfig(PreTrainedConfig):
Expand Down Expand Up @@ -118,6 +170,14 @@ class InternVLConfig(PreTrainedConfig):
vision_feature_select_strategy: str = "default"
tie_word_embeddings: bool = True

@classmethod
def from_dict(cls, config_dict, **kwargs):
    """Build the config from a dictionary, accepting the original ``internvl_chat`` layout.

    A dictionary is treated as an original checkpoint config when its ``model_type``
    is ``"internvl_chat"`` or when it carries an ``llm_config`` entry. Such a
    dictionary is first remapped onto the native layout so the checkpoint can be
    loaded without remote code; any other dictionary is forwarded unchanged.

    Args:
        config_dict: The configuration dictionary to instantiate from.
        **kwargs: Forwarded untouched to the parent ``from_dict``.

    Returns:
        Whatever the parent class' ``from_dict`` returns for the (possibly remapped)
        dictionary.
    """
    # Native layout: nothing to translate, defer straight to the parent.
    if config_dict.get("model_type") != "internvl_chat" and "llm_config" not in config_dict:
        return super().from_dict(config_dict, **kwargs)

    native_dict = _convert_internvl_chat_config_dict(config_dict)
    return super().from_dict(native_dict, **kwargs)

def __post_init__(self, **kwargs):
if isinstance(self.vision_config, dict):
self.vision_config = InternVLVisionConfig(**self.vision_config)
Expand Down
34 changes: 34 additions & 0 deletions tests/models/internvl/test_modeling_internvl.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import requests

from transformers import (
AutoConfig,
AutoProcessor,
BitsAndBytesConfig,
InternVLConfig,
Expand Down Expand Up @@ -215,6 +216,39 @@ def test_flash_attn_2_fp32_ln(self):
pass


@slow
@require_torch
class InternVLOriginalCheckpointTest(unittest.TestCase):
    """Loading of original `OpenGVLab/InternVL2-*` checkpoints through the native classes.

    These checkpoints use the bespoke `internvl_chat` remote-code layout. The test
    checks that, with `trust_remote_code=False` and no explicit device placement, the
    config resolves to `InternVLConfig` and the model loads with every weight mapped
    (no missing, unexpected or mismatched keys)."""

    def setUp(self):
        # Original (non `-hf`) checkpoint exercised by this test case.
        self.original_v2_checkpoint = "OpenGVLab/InternVL2-1B"
        cleanup(torch_device, gc_collect=True)

    def tearDown(self):
        cleanup(torch_device, gc_collect=True)

    def test_original_internvl2_checkpoint_loads_natively(self):
        # 1) The config must resolve to the native class with usable sub-configs.
        native_config = AutoConfig.from_pretrained(self.original_v2_checkpoint, trust_remote_code=False)
        self.assertIsInstance(native_config, InternVLConfig)
        self.assertEqual(native_config.text_config.model_type, "qwen2")
        for section in ("text_config", "vision_config"):
            self.assertGreater(getattr(native_config, section).num_attention_heads, 0)

        # 2) The weights must load into the native model with nothing left over.
        load_kwargs = {
            "dtype": torch.bfloat16,
            "trust_remote_code": False,
            "output_loading_info": True,
        }
        loaded_model, report = InternVLForConditionalGeneration.from_pretrained(
            self.original_v2_checkpoint, **load_kwargs
        )
        self.assertIsInstance(loaded_model, InternVLForConditionalGeneration)
        for bucket in ("missing_keys", "unexpected_keys", "mismatched_keys"):
            self.assertEqual(len(report[bucket]), 0)


@slow
@require_torch_accelerator
class InternVLQwen2IntegrationTest(unittest.TestCase):
Expand Down
Loading