model, mtmd: fix gguf conversion for audio/vision mmproj (#21309)

* fix gguf conversion for audio/vision mmproj

* fix test
This commit is contained in:
Xuan-Son Nguyen 2026-04-02 17:10:32 +02:00 committed by GitHub
parent 223373742b
commit 63f8fe0ef4
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
27 changed files with 1462 additions and 41 deletions

View file

@ -419,6 +419,7 @@ class MODEL_ARCH(IntEnum):
GEMMA2 = auto()
GEMMA3 = auto()
GEMMA3N = auto()
GEMMA4 = auto()
GEMMA_EMBEDDING = auto()
STARCODER2 = auto()
RWKV6 = auto()
@ -535,8 +536,11 @@ class MODEL_TENSOR(IntEnum):
FFN_GATE_INP = auto()
FFN_GATE_INP_SHEXP = auto()
FFN_NORM = auto()
FFN_PRE_NORM = auto()
FFN_PRE_NORM = auto() # alias of FFN_NORM
FFN_PRE_NORM_2 = auto() # gemma4
FFN_POST_NORM = auto()
FFN_POST_NORM_1 = auto() # gemma4
FFN_POST_NORM_2 = auto() # gemma4
FFN_GATE = auto()
FFN_DOWN = auto()
FFN_UP = auto()
@ -558,6 +562,7 @@ class MODEL_TENSOR(IntEnum):
ATTN_Q_NORM = auto()
ATTN_K_NORM = auto()
LAYER_OUT_NORM = auto()
LAYER_OUT_SCALE = auto()
PER_LAYER_TOKEN_EMBD = auto() # gemma3n
PER_LAYER_MODEL_PROJ = auto() # gemma3n
PER_LAYER_INP_GATE = auto() # gemma3n
@ -722,8 +727,11 @@ class MODEL_TENSOR(IntEnum):
V_ENC_FFN_UP = auto()
V_ENC_FFN_GATE = auto()
V_ENC_FFN_DOWN = auto()
V_ENC_ATTN_POST_NORM = auto() # gemma4
V_ENC_FFN_POST_NORM = auto()
V_LAYER_SCALE_1 = auto()
V_LAYER_SCALE_2 = auto()
V_LAYER_OUT_SCALE = auto()
V_PRE_NORM = auto()
V_POST_NORM = auto()
V_MM_POST_NORM = auto()
@ -761,6 +769,8 @@ class MODEL_TENSOR(IntEnum):
V_MM_GATE = auto() # cogvlm
V_TOK_BOI = auto() # cogvlm
V_TOK_EOI = auto() # cogvlm
V_STD_BIAS = auto() # gemma4
V_STD_SCALE = auto() # gemma4
V_SAM_POS_EMBD = auto() # Deepseek-OCR
V_SAM_PATCH_EMBD = auto() # Deepseek-OCR
V_SAM_PRE_NORM = auto() # Deepseek-OCR
@ -781,6 +791,7 @@ class MODEL_TENSOR(IntEnum):
A_ENC_EMBD_POS = auto()
A_ENC_EMBD_NORM = auto()
A_ENC_EMBD_TO_LOGITS = auto() # lfm2
A_ENC_INP_PROJ = auto() # gemma4
A_ENC_CONV1D = auto()
A_ENC_CONV1D_NORM = auto() # gemma3n
A_PRE_NORM = auto()
@ -789,10 +800,13 @@ class MODEL_TENSOR(IntEnum):
A_ENC_ATTN_Q = auto()
A_ENC_ATTN_K = auto()
A_ENC_ATTN_V = auto()
A_ENC_ATTN_POST_NORM = auto()
A_ENC_ATTN_PRE_NORM = auto()
A_ENC_ATTN_K_REL = auto() # gemma4
A_ENC_PER_DIM_SCALE = auto() # gemma3n
A_ENC_INPUT_NORM = auto()
A_ENC_OUTPUT = auto()
A_ENC_OUTPUT_NORM = auto()
A_ENC_OUTPUT = auto() # TODO @ngxson: rename to ATTN_OUT
A_ENC_OUTPUT_NORM = auto() # TODO @ngxson: rename to ATTN_OUT_NORM
A_ENC_FFN_UP = auto()
A_ENC_FFN_NORM = auto()
A_ENC_FFN_POST_NORM = auto() # gemma3n
@ -813,6 +827,8 @@ class MODEL_TENSOR(IntEnum):
A_MM_HARD_EMB_NORM = auto() # gemma3n
A_MM_SOFT_EMB_NORM = auto() # gemma3n
A_MM_INP_PROJ = auto() # gemma3n
A_PER_DIM_K_SCALE = auto() # gemma4
A_PER_DIM_SCALE = auto() # gemma4
# nextn/mtp
NEXTN_EH_PROJ = auto()
NEXTN_EMBED_TOKENS = auto()
@ -882,6 +898,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
MODEL_ARCH.GEMMA2: "gemma2",
MODEL_ARCH.GEMMA3: "gemma3",
MODEL_ARCH.GEMMA3N: "gemma3n",
MODEL_ARCH.GEMMA4: "gemma4",
MODEL_ARCH.GEMMA_EMBEDDING: "gemma-embedding",
MODEL_ARCH.STARCODER2: "starcoder2",
MODEL_ARCH.RWKV6: "rwkv6",
@ -1000,6 +1017,9 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
MODEL_TENSOR.FFN_PRE_NORM: "blk.{bid}.ffn_norm",
MODEL_TENSOR.FFN_POST_NORM: "blk.{bid}.post_ffw_norm",
MODEL_TENSOR.FFN_PRE_NORM_2: "blk.{bid}.pre_ffw_norm_2", # gemma4
MODEL_TENSOR.FFN_POST_NORM_1: "blk.{bid}.post_ffw_norm_1", # gemma4
MODEL_TENSOR.FFN_POST_NORM_2: "blk.{bid}.post_ffw_norm_2", # gemma4
MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
@ -1019,6 +1039,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
MODEL_TENSOR.MOE_LATENT_DOWN: "blk.{bid}.ffn_latent_down", # nemotron 3 super
MODEL_TENSOR.MOE_LATENT_UP: "blk.{bid}.ffn_latent_up", # nemotron 3 super
MODEL_TENSOR.LAYER_OUT_NORM: "blk.{bid}.layer_output_norm",
MODEL_TENSOR.LAYER_OUT_SCALE: "blk.{bid}.layer_output_scale",
MODEL_TENSOR.PER_LAYER_TOKEN_EMBD: "per_layer_token_embd", # gemma3n
MODEL_TENSOR.PER_LAYER_MODEL_PROJ: "per_layer_model_proj", # gemma3n
MODEL_TENSOR.PER_LAYER_PROJ_NORM: "per_layer_proj_norm", # gemma3n
@ -1183,8 +1204,11 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
MODEL_TENSOR.V_ENC_FFN_UP: "v.blk.{bid}.ffn_up",
MODEL_TENSOR.V_ENC_FFN_GATE: "v.blk.{bid}.ffn_gate",
MODEL_TENSOR.V_ENC_FFN_DOWN: "v.blk.{bid}.ffn_down",
MODEL_TENSOR.V_ENC_ATTN_POST_NORM: "v.blk.{bid}.attn_post_norm",
MODEL_TENSOR.V_ENC_FFN_POST_NORM: "v.blk.{bid}.ffn_post_norm",
MODEL_TENSOR.V_LAYER_SCALE_1: "v.blk.{bid}.ls1",
MODEL_TENSOR.V_LAYER_SCALE_2: "v.blk.{bid}.ls2",
MODEL_TENSOR.V_LAYER_OUT_SCALE: "v.blk.{bid}.out_scale",
MODEL_TENSOR.V_PRE_NORM: "v.pre_ln",
MODEL_TENSOR.V_POST_NORM: "v.post_ln",
MODEL_TENSOR.V_MM_POST_NORM: "mm.post_norm",
@ -1222,6 +1246,8 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
MODEL_TENSOR.V_MM_GATE: "mm.gate",
MODEL_TENSOR.V_TOK_BOI: "v.boi",
MODEL_TENSOR.V_TOK_EOI: "v.eoi",
MODEL_TENSOR.V_STD_BIAS: "v.std_bias", # gemma4
MODEL_TENSOR.V_STD_SCALE: "v.std_scale", # gemma4
# DeepSeek-OCR SAM
MODEL_TENSOR.V_SAM_POS_EMBD: "v.sam.pos_embd",
MODEL_TENSOR.V_SAM_PATCH_EMBD: "v.sam.patch_embd",
@ -1243,6 +1269,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
MODEL_TENSOR.A_ENC_EMBD_POS: "a.position_embd",
MODEL_TENSOR.A_ENC_EMBD_NORM: "a.position_embd_norm",
MODEL_TENSOR.A_ENC_EMBD_TO_LOGITS: "a.embd_to_logits",
MODEL_TENSOR.A_ENC_INP_PROJ: "a.input_projection",
MODEL_TENSOR.A_ENC_CONV1D: "a.conv1d.{bid}",
MODEL_TENSOR.A_ENC_CONV1D_NORM: "a.conv1d.{bid}.norm",
MODEL_TENSOR.A_PRE_NORM: "a.pre_ln",
@ -1251,6 +1278,9 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
MODEL_TENSOR.A_ENC_ATTN_Q: "a.blk.{bid}.attn_q",
MODEL_TENSOR.A_ENC_ATTN_K: "a.blk.{bid}.attn_k",
MODEL_TENSOR.A_ENC_ATTN_V: "a.blk.{bid}.attn_v",
MODEL_TENSOR.A_ENC_ATTN_POST_NORM: "a.blk.{bid}.attn_post_norm",
MODEL_TENSOR.A_ENC_ATTN_PRE_NORM: "a.blk.{bid}.attn_pre_norm",
MODEL_TENSOR.A_ENC_ATTN_K_REL: "a.blk.{bid}.attn_k_rel",
MODEL_TENSOR.A_ENC_PER_DIM_SCALE: "a.blk.{bid}.per_dim_scale",
MODEL_TENSOR.A_ENC_INPUT_NORM: "a.blk.{bid}.ln1",
MODEL_TENSOR.A_ENC_OUTPUT: "a.blk.{bid}.attn_out",
@ -1275,6 +1305,8 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
MODEL_TENSOR.A_MM_SOFT_EMB_NORM: "mm.a.soft_emb_norm", # gemma3n
MODEL_TENSOR.A_MM_EMBEDDING: "mm.a.embedding", # gemma3n
MODEL_TENSOR.A_MM_HARD_EMB_NORM: "mm.a.hard_emb_norm", # gemma3n
MODEL_TENSOR.A_PER_DIM_K_SCALE: "a.blk.{bid}.per_dim_k_scale", # gemma4
MODEL_TENSOR.A_PER_DIM_SCALE: "a.blk.{bid}.per_dim_scale", # gemma4
# lfm2 audio
MODEL_TENSOR.A_ENC_NORM_CONV: "a.blk.{bid}.norm_conv",
MODEL_TENSOR.A_ENC_LINEAR_POS: "a.blk.{bid}.linear_pos",
@ -1319,8 +1351,11 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.V_ENC_FFN_UP,
MODEL_TENSOR.V_ENC_FFN_GATE,
MODEL_TENSOR.V_ENC_FFN_DOWN,
MODEL_TENSOR.V_ENC_ATTN_POST_NORM,
MODEL_TENSOR.V_ENC_FFN_POST_NORM,
MODEL_TENSOR.V_LAYER_SCALE_1,
MODEL_TENSOR.V_LAYER_SCALE_2,
MODEL_TENSOR.V_LAYER_OUT_SCALE,
MODEL_TENSOR.V_PRE_NORM,
MODEL_TENSOR.V_POST_NORM,
MODEL_TENSOR.V_MM_POST_NORM,
@ -1358,6 +1393,8 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.V_MM_GATE,
MODEL_TENSOR.V_TOK_BOI,
MODEL_TENSOR.V_TOK_EOI,
MODEL_TENSOR.V_STD_BIAS,
MODEL_TENSOR.V_STD_SCALE,
MODEL_TENSOR.V_SAM_POS_EMBD,
MODEL_TENSOR.V_SAM_PATCH_EMBD,
MODEL_TENSOR.V_SAM_PRE_NORM,
@ -1375,6 +1412,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.A_ENC_EMBD_POS,
MODEL_TENSOR.A_ENC_EMBD_NORM,
MODEL_TENSOR.A_ENC_EMBD_TO_LOGITS,
MODEL_TENSOR.A_ENC_INP_PROJ,
MODEL_TENSOR.A_ENC_CONV1D,
MODEL_TENSOR.A_ENC_CONV1D_NORM,
MODEL_TENSOR.A_PRE_NORM,
@ -1383,6 +1421,9 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.A_ENC_ATTN_Q,
MODEL_TENSOR.A_ENC_ATTN_K,
MODEL_TENSOR.A_ENC_ATTN_V,
MODEL_TENSOR.A_ENC_ATTN_POST_NORM,
MODEL_TENSOR.A_ENC_ATTN_PRE_NORM,
MODEL_TENSOR.A_ENC_ATTN_K_REL,
MODEL_TENSOR.A_ENC_PER_DIM_SCALE,
MODEL_TENSOR.A_ENC_INPUT_NORM,
MODEL_TENSOR.A_ENC_OUTPUT,
@ -1416,6 +1457,8 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.A_MM_SOFT_EMB_NORM,
MODEL_TENSOR.A_MM_EMBEDDING,
MODEL_TENSOR.A_MM_HARD_EMB_NORM,
MODEL_TENSOR.A_PER_DIM_K_SCALE,
MODEL_TENSOR.A_PER_DIM_SCALE,
],
MODEL_ARCH.LLAMA: [
MODEL_TENSOR.TOKEN_EMBD,
@ -2273,6 +2316,38 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.LAUREL_R,
MODEL_TENSOR.LAUREL_POST_NORM,
],
MODEL_ARCH.GEMMA4: [
MODEL_TENSOR.ROPE_FREQS,
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_Q_NORM,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_K_NORM,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.FFN_GATE,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
MODEL_TENSOR.FFN_GATE_UP_EXP,
MODEL_TENSOR.FFN_DOWN_EXP,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_POST_NORM,
MODEL_TENSOR.FFN_GATE_INP,
MODEL_TENSOR.FFN_PRE_NORM,
MODEL_TENSOR.FFN_PRE_NORM_2,
MODEL_TENSOR.FFN_POST_NORM,
MODEL_TENSOR.FFN_POST_NORM_1,
MODEL_TENSOR.FFN_POST_NORM_2,
MODEL_TENSOR.LAYER_OUT_SCALE,
MODEL_TENSOR.PER_LAYER_TOKEN_EMBD,
MODEL_TENSOR.PER_LAYER_MODEL_PROJ,
MODEL_TENSOR.PER_LAYER_INP_GATE,
MODEL_TENSOR.PER_LAYER_PROJ,
MODEL_TENSOR.PER_LAYER_PROJ_NORM,
MODEL_TENSOR.PER_LAYER_POST_NORM,
],
MODEL_ARCH.GEMMA_EMBEDDING: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT,
@ -4010,6 +4085,8 @@ class VisionProjectorType:
GEMMA3 = "gemma3"
GEMMA3NV = "gemma3nv"
GEMMA3NA = "gemma3na"
GEMMA4V = "gemma4v"
GEMMA4A = "gemma4a"
PHI4 = "phi4"
IDEFICS3 = "idefics3"
PIXTRAL = "pixtral"

View file

@ -799,6 +799,7 @@ class GGUFWriter:
def add_shared_kv_layers(self, value: int) -> None:
self.add_uint32(Keys.Attention.SHARED_KV_LAYERS.format(arch=self.arch), value)
# if input is array, true means SWA and false means full_attention for each layer
def add_sliding_window_pattern(self, value: int | Sequence[bool]) -> None:
key = Keys.Attention.SLIDING_WINDOW_PATTERN.format(arch=self.arch)
if isinstance(value, int):

View file

@ -401,6 +401,10 @@ class TensorNameMap:
"model.layers.{bid}.pre_mlp_layernorm", # afmoe
),
MODEL_TENSOR.FFN_PRE_NORM_2: (
"model.layers.{bid}.pre_feedforward_layernorm_2", # gemma4
),
# Post feed-forward norm
MODEL_TENSOR.FFN_POST_NORM: (
"model.layers.{bid}.post_feedforward_layernorm", # gemma2 olmo2
@ -411,6 +415,14 @@ class TensorNameMap:
"model.layers.{bid}.post_moe_norm", # grok-2
),
MODEL_TENSOR.FFN_POST_NORM_1: (
"model.layers.{bid}.post_feedforward_layernorm_1", # gemma4
),
MODEL_TENSOR.FFN_POST_NORM_2: (
"model.layers.{bid}.post_feedforward_layernorm_2", # gemma4
),
MODEL_TENSOR.FFN_GATE_INP: (
"layers.{bid}.feed_forward.gate", # mixtral
"model.layers.{bid}.block_sparse_moe.gate", # mixtral phimoe
@ -428,6 +440,7 @@ class TensorNameMap:
"layers.{bid}.gate", # mistral-large
"backbone.layers.{bid}.mixer.gate", # nemotron-h-moe
"model.layers.{bid}.moe.gate", # step3.5
"model.layers.{bid}.router.proj", # gemma4
),
MODEL_TENSOR.FFN_GATE_INP_SHEXP: (
@ -570,6 +583,7 @@ class TensorNameMap:
MODEL_TENSOR.FFN_GATE_UP_EXP: (
"model.layers.{bid}.mlp.experts.gate_up_proj",
"model.layers.{bid}.experts.gate_up_proj", # gemma4
),
MODEL_TENSOR.MOE_LATENT_DOWN: (
@ -629,6 +643,7 @@ class TensorNameMap:
"encoder.layers.{bid}.mlp.experts.mlp.w2", # nomic-bert-moe
"model.layers.{bid}.block_sparse_moe.experts.down", # smallthinker
"model.layers.{bid}.moe.down_proj", # step3.5
"model.layers.{bid}.experts.down_proj", # gemma4
),
MODEL_TENSOR.FFN_DOWN_SHEXP: (
@ -693,6 +708,10 @@ class TensorNameMap:
"model.layers.{bid}.final_layernorm", # bailingmoe2
),
MODEL_TENSOR.LAYER_OUT_SCALE: (
"model.layers.{bid}.layer_scalar", # gemma4
),
MODEL_TENSOR.PER_LAYER_TOKEN_EMBD: (
"model.embed_tokens_per_layer", # gemma3n
),
@ -1383,6 +1402,7 @@ class TensorNameMap:
"model.vision_model.embeddings.patch_embedding", # Deepseek-OCR CLIP
"siglip2.vision_model.embeddings.patch_embedding",
"vision_model.radio_model.model.patch_generator.embedder", # Nemotron Nano v2 VL
"model.vision_tower.patch_embedder.input_proj", # gemma4
),
MODEL_TENSOR.V_ENC_EMBD_NORM: (
@ -1400,6 +1420,7 @@ class TensorNameMap:
"model.vision.patch_embedding.position_embedding", # cogvlm
"visual.embeddings.position_embedding", # glm4v
"vision_model.radio_model.model.patch_generator.pos_embed", # Nemotron Nano v2 VL
"model.vision_tower.patch_embedder.position_embedding_table", # gemma4
),
MODEL_TENSOR.V_ENC_EMBD_IMGNL: (
@ -1430,12 +1451,14 @@ class TensorNameMap:
"vision_tower.encoder.blocks.{bid}.wq", # kimi-vl, generated
"siglip2.vision_model.encoder.layers.{bid}.self_attn.q_proj", # youtuvl
"model.vision_model.transformer.layers.{bid}.self_attn.q_proj", # Deepseek-OCR CLIP, generated
"vision_model.model.layers.{bid}.self_attn.q_proj.linear", # gemma4
),
MODEL_TENSOR.V_ENC_ATTN_Q_NORM: (
"vision_tower.vision_model.encoder.layers.{bid}.attn.q_norm", # InternVL
"model.vision_tower.encoder.layer.{bid}.attention.q_norm", # Intern-S1
"visual.blocks.{bid}.attn.q_norm", # GLM-OCR
"vision_model.model.layers.{bid}.self_attn.q_norm", # gemma4
),
MODEL_TENSOR.V_ENC_ATTN_K: (
@ -1450,12 +1473,14 @@ class TensorNameMap:
"vision_tower.encoder.blocks.{bid}.wk", # kimi-vl, generated
"model.vision_model.transformer.layers.{bid}.self_attn.k_proj", # Deepseek-OCR CLIP, generated
"siglip2.vision_model.encoder.layers.{bid}.self_attn.k_proj",
"vision_model.model.layers.{bid}.self_attn.k_proj.linear", # gemma4
),
MODEL_TENSOR.V_ENC_ATTN_K_NORM: (
"vision_tower.vision_model.encoder.layers.{bid}.attn.k_norm", # InternVL
"model.vision_tower.encoder.layer.{bid}.attention.k_norm", # Intern-S1
"visual.blocks.{bid}.attn.k_norm", # GLM-OCR
"vision_model.model.layers.{bid}.self_attn.k_norm", # gemma4
),
MODEL_TENSOR.V_ENC_ATTN_V: (
@ -1470,6 +1495,7 @@ class TensorNameMap:
"vision_tower.encoder.blocks.{bid}.wv", # kimi-vl, generated
"siglip2.vision_model.encoder.layers.{bid}.self_attn.v_proj",
"model.vision_model.transformer.layers.{bid}.self_attn.v_proj", # Deepseek-OCR CLIP, generated
"vision_model.model.layers.{bid}.self_attn.v_proj.linear", # gemma4
),
MODEL_TENSOR.V_ENC_INPUT_NORM: (
@ -1480,7 +1506,7 @@ class TensorNameMap:
"model.vision_model.encoder.layers.{bid}.layer_norm1", # SmolVLM
"vision_tower.transformer.layers.{bid}.attention_norm", # pixtral-hf
"vision_encoder.transformer.layers.{bid}.attention_norm", # pixtral
"vision_model.model.layers.{bid}.input_layernorm", # llama4
"vision_model.model.layers.{bid}.input_layernorm", # llama4, gemma4
"visual.blocks.{bid}.norm1", # qwen2vl
"vision_tower.encoder.blocks.{bid}.norm0", # kimi-vl (norm0/norm1)
"model.vision.transformer.layers.{bid}.input_layernorm", # cogvlm
@ -1505,6 +1531,7 @@ class TensorNameMap:
"model.vision_model.transformer.layers.{bid}.self_attn.out_proj", # Deepseek-OCR CLIP
"siglip2.vision_model.encoder.layers.{bid}.self_attn.out_proj", # youtuvl
"vision_model.radio_model.model.blocks.{bid}.attn.proj", # Nemotron Nano v2 VL
"vision_model.model.layers.{bid}.self_attn.o_proj.linear", # gemma4
),
MODEL_TENSOR.V_ENC_POST_ATTN_NORM: (
@ -1522,6 +1549,7 @@ class TensorNameMap:
"model.vision_model.transformer.layers.{bid}.layer_norm2", # Deepseek-OCR CLIP
"siglip2.vision_model.encoder.layers.{bid}.layer_norm2",
"vision_model.radio_model.model.blocks.{bid}.norm2", # Nemotron Nano v2 VL
"vision_model.model.layers.{bid}.pre_feedforward_layernorm", # gemma4
),
MODEL_TENSOR.V_ENC_FFN_UP: (
@ -1540,12 +1568,14 @@ class TensorNameMap:
"model.vision.transformer.layers.{bid}.mlp.fc1", # cogvlm
"siglip2.vision_model.encoder.layers.{bid}.mlp.fc1",
"vision_model.radio_model.model.blocks.{bid}.mlp.fc1", # Nemotron Nano v2 VL
"vision_model.model.layers.{bid}.mlp.up_proj", # gemma4
),
MODEL_TENSOR.V_ENC_FFN_GATE: (
"vision_tower.transformer.layers.{bid}.feed_forward.gate_proj", # pixtral-hf
"vision_encoder.transformer.layers.{bid}.feed_forward.w1", # pixtral
"visual.blocks.{bid}.mlp.gate_proj", # qwen2.5vl
"vision_model.model.layers.{bid}.mlp.gate_proj", # gemma4
),
MODEL_TENSOR.V_ENC_FFN_DOWN: (
@ -1564,6 +1594,15 @@ class TensorNameMap:
"model.vision_model.transformer.layers.{bid}.mlp.fc2", # Deepseek-OCR CLIP
"siglip2.vision_model.encoder.layers.{bid}.mlp.fc2",
"vision_model.radio_model.model.blocks.{bid}.mlp.fc2", # Nemotron Nano v2 VL
"vision_model.model.layers.{bid}.mlp.down_proj", # gemma4
),
MODEL_TENSOR.V_ENC_ATTN_POST_NORM: (
"vision_model.model.layers.{bid}.post_attention_layernorm", # gemma4
),
MODEL_TENSOR.V_ENC_FFN_POST_NORM: (
"vision_model.model.layers.{bid}.post_feedforward_layernorm", # gemma4
),
MODEL_TENSOR.V_LAYER_SCALE_1: (
@ -1576,6 +1615,10 @@ class TensorNameMap:
"model.vision_tower.encoder.layer.{bid}.lambda_2", # Intern-S1
),
MODEL_TENSOR.V_LAYER_OUT_SCALE: (
"vision_model.model.layers.{bid}.layer_scalar", # gemma4
),
MODEL_TENSOR.V_PRE_NORM: (
"vision_tower.vision_model.pre_layrnorm",
"vision_tower.ln_pre", # pixtral-hf
@ -1763,6 +1806,14 @@ class TensorNameMap:
"model.vision.eoi", # cogvlm
),
MODEL_TENSOR.V_STD_BIAS: (
"model.vision_tower.std_bias", # gemma4
),
MODEL_TENSOR.V_STD_SCALE: (
"model.vision_tower.std_scale", # gemma4
),
# audio (mtmd)
MODEL_TENSOR.A_ENC_EMBD_POS: (
@ -1782,10 +1833,15 @@ class TensorNameMap:
"audio_tower.conv{bid}", # ultravox
"conformer.pre_encode.conv.{bid}", # lfm2
"model.audio_tower.subsample_conv_projection.conv_{bid}.conv", # gemma3n
"conformer.subsample_conv_projection.layer{bid}.conv", # gemma4
),
MODEL_TENSOR.A_ENC_CONV1D_NORM: (
"model.audio_tower.subsample_conv_projection.conv_{bid}.norm", # gemma3n
"conformer.subsample_conv_projection.layer{bid}.norm", # gemma4
),
MODEL_TENSOR.A_ENC_INP_PROJ: (
"conformer.subsample_conv_projection.input_proj_linear", # gemma4
),
MODEL_TENSOR.A_PRE_NORM: (),
@ -1799,22 +1855,38 @@ class TensorNameMap:
"audio_tower.layers.{bid}.self_attn.q_proj", # ultravox
"conformer.layers.{bid}.self_attn.linear_q", # lfm2
"conformer.layers.{bid}.attention.attn.q_proj", # gemma3n
"conformer.layers.{bid}.self_attn.q_proj", # gemma4
),
MODEL_TENSOR.A_ENC_ATTN_K: (
"audio_tower.layers.{bid}.self_attn.k_proj", # ultravox
"conformer.layers.{bid}.self_attn.linear_k", # lfm2
"conformer.layers.{bid}.attention.attn.k_proj", # gemma3n
"conformer.layers.{bid}.self_attn.k_proj", # gemma4
),
MODEL_TENSOR.A_ENC_ATTN_V: (
"audio_tower.layers.{bid}.self_attn.v_proj", # ultravox
"conformer.layers.{bid}.self_attn.linear_v", # lfm2
"conformer.layers.{bid}.attention.attn.v_proj", # gemma3n
"conformer.layers.{bid}.self_attn.v_proj", # gemma4
),
MODEL_TENSOR.A_ENC_ATTN_K_REL: (
"conformer.layers.{bid}.self_attn.relative_k_proj", # gemma4
),
MODEL_TENSOR.A_ENC_ATTN_POST_NORM: (
"conformer.layers.{bid}.norm_post_attn", # gemma4
),
MODEL_TENSOR.A_ENC_ATTN_PRE_NORM: (
"conformer.layers.{bid}.norm_pre_attn", # gemma4
),
MODEL_TENSOR.A_ENC_PER_DIM_SCALE: (
"conformer.layers.{bid}.attention.attn.per_dim_scale", # gemma3n
"conformer.layers.{bid}.self_attn.per_dim_scale", # gemma3n
),
MODEL_TENSOR.A_ENC_LAYER_PRE_NORM: (
@ -1831,6 +1903,7 @@ class TensorNameMap:
"audio_tower.layers.{bid}.self_attn.out_proj", # ultravox
"conformer.layers.{bid}.self_attn.linear_out", # lfm2
"conformer.layers.{bid}.attention.post", # gemma3n
"conformer.layers.{bid}.self_attn.post", # gemma4
),
MODEL_TENSOR.A_ENC_OUTPUT_NORM: (
@ -1842,10 +1915,12 @@ class TensorNameMap:
MODEL_TENSOR.A_ENC_FFN_NORM: (
"conformer.layers.{bid}.norm_feed_forward1", # lfm2
"conformer.layers.{bid}.ffw_layer_start.pre_layer_norm", # gemma3n
"conformer.layers.{bid}.feed_forward1.pre_layer_norm", # gemma4
),
MODEL_TENSOR.A_ENC_FFN_POST_NORM: (
"conformer.layers.{bid}.ffw_layer_start.post_layer_norm", # gemma3n
"conformer.layers.{bid}.feed_forward1.post_layer_norm", # gemma4
),
MODEL_TENSOR.A_ENC_FFN_SCALE: (
@ -1856,6 +1931,7 @@ class TensorNameMap:
"audio_tower.layers.{bid}.fc1", # ultravox
"conformer.layers.{bid}.feed_forward1.linear1", # lfm2
"conformer.layers.{bid}.ffw_layer_start.ffw_layer_1", # gemma3n
"conformer.layers.{bid}.feed_forward1.ffw_layer_1", # gemma4
),
MODEL_TENSOR.A_ENC_FFN_GATE: (),
@ -1864,25 +1940,30 @@ class TensorNameMap:
"audio_tower.layers.{bid}.fc2", # ultravox
"conformer.layers.{bid}.feed_forward1.linear2", # lfm2
"conformer.layers.{bid}.ffw_layer_start.ffw_layer_2", # gemma3n
"conformer.layers.{bid}.feed_forward1.ffw_layer_2", # gemma4
),
MODEL_TENSOR.A_ENC_FFN_UP_1: (
"conformer.layers.{bid}.feed_forward2.linear1", # lfm2
"conformer.layers.{bid}.ffw_layer_end.ffw_layer_1", # gemma3n
"conformer.layers.{bid}.feed_forward2.ffw_layer_1", # gemma4
),
MODEL_TENSOR.A_ENC_FFN_DOWN_1: (
"conformer.layers.{bid}.feed_forward2.linear2", # lfm2
"conformer.layers.{bid}.ffw_layer_end.ffw_layer_2", # gemma3n
"conformer.layers.{bid}.feed_forward2.ffw_layer_2", # gemma4
),
MODEL_TENSOR.A_ENC_FFN_NORM_1: (
"conformer.layers.{bid}.norm_feed_forward2", # lfm2
"conformer.layers.{bid}.ffw_layer_end.pre_layer_norm", # gemma3n
"conformer.layers.{bid}.feed_forward2.pre_layer_norm", # gemma4
),
MODEL_TENSOR.A_ENC_FFN_POST_NORM_1: (
"conformer.layers.{bid}.ffw_layer_end.post_layer_norm", # gemma3n
"conformer.layers.{bid}.feed_forward2.post_layer_norm", # gemma4
),
MODEL_TENSOR.A_ENC_FFN_SCALE_1: (
@ -1904,7 +1985,8 @@ class TensorNameMap:
MODEL_TENSOR.A_ENC_OUT: (
"conformer.pre_encode.out", # lfm2
"model.audio_tower.subsample_conv_projection.input_proj_linear", # gemma3n
"model.audio_tower.subsample_conv_projection.input_proj_linear", # gemma3n (note: it should be A_ENC_INP_PROJ, this is a mistake; it should be corrected in C++ code when it's supported)
"conformer.output_proj", # gemma4
),
# note: some tensors below has "audio." pseudo-prefix, to prevent conflicts with vision tensors
@ -1918,6 +2000,7 @@ class TensorNameMap:
MODEL_TENSOR.A_MMPROJ_FC: (
"audio.multi_modal_projector.linear", # qwen2audio
"audio_tower.proj", # qwen2omni
"model.audio_tower.output_proj" # gemma4
),
MODEL_TENSOR.A_MM_NORM_PRE: (
@ -1953,6 +2036,14 @@ class TensorNameMap:
"conformer.layers.{bid}.lconv1d.conv_norm", # gemma3n
),
MODEL_TENSOR.A_PER_DIM_K_SCALE: (
"conformer.layers.{bid}.attention.attn.per_dim_key_scale", # gemma4
),
MODEL_TENSOR.A_PER_DIM_SCALE: (
"conformer.layers.{bid}.attention.attn.per_dim_scale", # gemma4
),
MODEL_TENSOR.A_MM_EMBEDDING: (
"model.embed_audio.embedding", # gemma3n
),