model, mtmd: fix gguf conversion for audio/vision mmproj (#21309)

* fix gguf conversion for audio/vision mmproj
* fix test

commit 63f8fe0ef4, parent 223373742b
27 changed files with 1462 additions and 41 deletions
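For orientation before the diff: the three structures it touches cooperate during conversion. The MODEL_TENSOR enum names a tensor role, TENSOR_NAMES assigns that role its canonical GGUF name, and TensorNameMap matches source-checkpoint names against per-architecture patterns. A minimal self-contained sketch of that lookup, using entries taken from this patch rather than the real gguf-py tables:

    from enum import Enum, auto

    # Toy stand-ins for the gguf-py tables (illustrative only).
    class MODEL_TENSOR(Enum):
        FFN_PRE_NORM_2 = auto()

    # Canonical GGUF name per tensor role; "{bid}" is the block index.
    TENSOR_NAMES = {
        MODEL_TENSOR.FFN_PRE_NORM_2: "blk.{bid}.pre_ffw_norm_2",
    }

    # Source-checkpoint name patterns per role, as in TensorNameMap.
    block_mappings_cfg = {
        MODEL_TENSOR.FFN_PRE_NORM_2: (
            "model.layers.{bid}.pre_feedforward_layernorm_2",  # gemma4
        ),
    }

    def map_name(src_name: str, n_blocks: int = 64) -> str | None:
        """Resolve a source tensor name to its GGUF name, or None if unmapped."""
        for bid in range(n_blocks):
            for tensor, patterns in block_mappings_cfg.items():
                for pat in patterns:
                    if src_name == pat.format(bid=bid):
                        return TENSOR_NAMES[tensor].format(bid=bid)
        return None

    print(map_name("model.layers.3.pre_feedforward_layernorm_2"))  # blk.3.pre_ffw_norm_2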
@@ -419,6 +419,7 @@ class MODEL_ARCH(IntEnum):
     GEMMA2 = auto()
     GEMMA3 = auto()
     GEMMA3N = auto()
+    GEMMA4 = auto()
     GEMMA_EMBEDDING = auto()
     STARCODER2 = auto()
     RWKV6 = auto()
@@ -535,8 +536,11 @@ class MODEL_TENSOR(IntEnum):
     FFN_GATE_INP = auto()
     FFN_GATE_INP_SHEXP = auto()
     FFN_NORM = auto()
-    FFN_PRE_NORM = auto()
+    FFN_PRE_NORM = auto() # alias of FFN_NORM
+    FFN_PRE_NORM_2 = auto() # gemma4
     FFN_POST_NORM = auto()
+    FFN_POST_NORM_1 = auto() # gemma4
+    FFN_POST_NORM_2 = auto() # gemma4
     FFN_GATE = auto()
     FFN_DOWN = auto()
     FFN_UP = auto()
@@ -558,6 +562,7 @@ class MODEL_TENSOR(IntEnum):
     ATTN_Q_NORM = auto()
     ATTN_K_NORM = auto()
     LAYER_OUT_NORM = auto()
+    LAYER_OUT_SCALE = auto()
     PER_LAYER_TOKEN_EMBD = auto() # gemma3n
     PER_LAYER_MODEL_PROJ = auto() # gemma3n
     PER_LAYER_INP_GATE = auto() # gemma3n
@@ -722,8 +727,11 @@ class MODEL_TENSOR(IntEnum):
     V_ENC_FFN_UP = auto()
     V_ENC_FFN_GATE = auto()
     V_ENC_FFN_DOWN = auto()
+    V_ENC_ATTN_POST_NORM = auto() # gemma4
+    V_ENC_FFN_POST_NORM = auto()
     V_LAYER_SCALE_1 = auto()
     V_LAYER_SCALE_2 = auto()
+    V_LAYER_OUT_SCALE = auto()
     V_PRE_NORM = auto()
     V_POST_NORM = auto()
     V_MM_POST_NORM = auto()
@@ -761,6 +769,8 @@ class MODEL_TENSOR(IntEnum):
     V_MM_GATE = auto() # cogvlm
     V_TOK_BOI = auto() # cogvlm
     V_TOK_EOI = auto() # cogvlm
+    V_STD_BIAS = auto() # gemma4
+    V_STD_SCALE = auto() # gemma4
     V_SAM_POS_EMBD = auto() # Deepseek-OCR
     V_SAM_PATCH_EMBD = auto() # Deepseek-OCR
     V_SAM_PRE_NORM = auto() # Deepseek-OCR
@@ -781,6 +791,7 @@ class MODEL_TENSOR(IntEnum):
     A_ENC_EMBD_POS = auto()
     A_ENC_EMBD_NORM = auto()
     A_ENC_EMBD_TO_LOGITS = auto() # lfm2
+    A_ENC_INP_PROJ = auto() # gemma4
     A_ENC_CONV1D = auto()
     A_ENC_CONV1D_NORM = auto() # gemma3n
     A_PRE_NORM = auto()
@@ -789,10 +800,13 @@ class MODEL_TENSOR(IntEnum):
     A_ENC_ATTN_Q = auto()
     A_ENC_ATTN_K = auto()
     A_ENC_ATTN_V = auto()
+    A_ENC_ATTN_POST_NORM = auto()
+    A_ENC_ATTN_PRE_NORM = auto()
+    A_ENC_ATTN_K_REL = auto() # gemma4
     A_ENC_PER_DIM_SCALE = auto() # gemma3n
     A_ENC_INPUT_NORM = auto()
-    A_ENC_OUTPUT = auto()
-    A_ENC_OUTPUT_NORM = auto()
+    A_ENC_OUTPUT = auto() # TODO @ngxson: rename to ATTN_OUT
+    A_ENC_OUTPUT_NORM = auto() # TODO @ngxson: rename to ATTN_OUT
     A_ENC_FFN_UP = auto()
     A_ENC_FFN_NORM = auto()
     A_ENC_FFN_POST_NORM = auto() # gemma3n
@@ -813,6 +827,8 @@ class MODEL_TENSOR(IntEnum):
     A_MM_HARD_EMB_NORM = auto() # gemma3n
     A_MM_SOFT_EMB_NORM = auto() # gemma3n
     A_MM_INP_PROJ = auto() # gemma3n
+    A_PER_DIM_K_SCALE = auto() # gemma4
+    A_PER_DIM_SCALE = auto() # gemma4
     # nextn/mtp
     NEXTN_EH_PROJ = auto()
     NEXTN_EMBED_TOKENS = auto()
@@ -882,6 +898,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
     MODEL_ARCH.GEMMA2: "gemma2",
     MODEL_ARCH.GEMMA3: "gemma3",
     MODEL_ARCH.GEMMA3N: "gemma3n",
+    MODEL_ARCH.GEMMA4: "gemma4",
     MODEL_ARCH.GEMMA_EMBEDDING: "gemma-embedding",
     MODEL_ARCH.STARCODER2: "starcoder2",
     MODEL_ARCH.RWKV6: "rwkv6",
@@ -1000,6 +1017,9 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
     MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
     MODEL_TENSOR.FFN_PRE_NORM: "blk.{bid}.ffn_norm",
     MODEL_TENSOR.FFN_POST_NORM: "blk.{bid}.post_ffw_norm",
+    MODEL_TENSOR.FFN_PRE_NORM_2: "blk.{bid}.pre_ffw_norm_2", # gemma4
+    MODEL_TENSOR.FFN_POST_NORM_1: "blk.{bid}.post_ffw_norm_1", # gemma4
+    MODEL_TENSOR.FFN_POST_NORM_2: "blk.{bid}.post_ffw_norm_2", # gemma4
     MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
     MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
     MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
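The apparent duplicate above is intentional: FFN_NORM and FFN_PRE_NORM are distinct roles that serialize to the same on-disk name, so a GGUF reader sees a single "blk.N.ffn_norm" tensor either way. Assuming a recent gguf-py package layout:

    from gguf.constants import MODEL_TENSOR, TENSOR_NAMES

    print(TENSOR_NAMES[MODEL_TENSOR.FFN_NORM].format(bid=0))      # blk.0.ffn_norm
    print(TENSOR_NAMES[MODEL_TENSOR.FFN_PRE_NORM].format(bid=0))  # blk.0.ffn_norm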
@@ -1019,6 +1039,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
     MODEL_TENSOR.MOE_LATENT_DOWN: "blk.{bid}.ffn_latent_down", # nemotron 3 super
     MODEL_TENSOR.MOE_LATENT_UP: "blk.{bid}.ffn_latent_up", # nemotron 3 super
     MODEL_TENSOR.LAYER_OUT_NORM: "blk.{bid}.layer_output_norm",
+    MODEL_TENSOR.LAYER_OUT_SCALE: "blk.{bid}.layer_output_scale",
     MODEL_TENSOR.PER_LAYER_TOKEN_EMBD: "per_layer_token_embd", # gemma3n
     MODEL_TENSOR.PER_LAYER_MODEL_PROJ: "per_layer_model_proj", # gemma3n
     MODEL_TENSOR.PER_LAYER_PROJ_NORM: "per_layer_proj_norm", # gemma3n
@@ -1183,8 +1204,11 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
     MODEL_TENSOR.V_ENC_FFN_UP: "v.blk.{bid}.ffn_up",
     MODEL_TENSOR.V_ENC_FFN_GATE: "v.blk.{bid}.ffn_gate",
     MODEL_TENSOR.V_ENC_FFN_DOWN: "v.blk.{bid}.ffn_down",
+    MODEL_TENSOR.V_ENC_ATTN_POST_NORM: "v.blk.{bid}.attn_post_norm",
+    MODEL_TENSOR.V_ENC_FFN_POST_NORM: "v.blk.{bid}.ffn_post_norm",
     MODEL_TENSOR.V_LAYER_SCALE_1: "v.blk.{bid}.ls1",
     MODEL_TENSOR.V_LAYER_SCALE_2: "v.blk.{bid}.ls2",
+    MODEL_TENSOR.V_LAYER_OUT_SCALE: "v.blk.{bid}.out_scale",
     MODEL_TENSOR.V_PRE_NORM: "v.pre_ln",
     MODEL_TENSOR.V_POST_NORM: "v.post_ln",
     MODEL_TENSOR.V_MM_POST_NORM: "mm.post_norm",
@@ -1222,6 +1246,8 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
     MODEL_TENSOR.V_MM_GATE: "mm.gate",
     MODEL_TENSOR.V_TOK_BOI: "v.boi",
     MODEL_TENSOR.V_TOK_EOI: "v.eoi",
+    MODEL_TENSOR.V_STD_BIAS: "v.std_bias", # gemma4
+    MODEL_TENSOR.V_STD_SCALE: "v.std_scale", # gemma4
     # DeepSeek-OCR SAM
     MODEL_TENSOR.V_SAM_POS_EMBD: "v.sam.pos_embd",
     MODEL_TENSOR.V_SAM_PATCH_EMBD: "v.sam.patch_embd",
@@ -1243,6 +1269,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
     MODEL_TENSOR.A_ENC_EMBD_POS: "a.position_embd",
     MODEL_TENSOR.A_ENC_EMBD_NORM: "a.position_embd_norm",
     MODEL_TENSOR.A_ENC_EMBD_TO_LOGITS: "a.embd_to_logits",
+    MODEL_TENSOR.A_ENC_INP_PROJ: "a.input_projection",
     MODEL_TENSOR.A_ENC_CONV1D: "a.conv1d.{bid}",
     MODEL_TENSOR.A_ENC_CONV1D_NORM: "a.conv1d.{bid}.norm",
     MODEL_TENSOR.A_PRE_NORM: "a.pre_ln",
@@ -1251,6 +1278,9 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
     MODEL_TENSOR.A_ENC_ATTN_Q: "a.blk.{bid}.attn_q",
     MODEL_TENSOR.A_ENC_ATTN_K: "a.blk.{bid}.attn_k",
     MODEL_TENSOR.A_ENC_ATTN_V: "a.blk.{bid}.attn_v",
+    MODEL_TENSOR.A_ENC_ATTN_POST_NORM: "a.blk.{bid}.attn_post_norm",
+    MODEL_TENSOR.A_ENC_ATTN_PRE_NORM: "a.blk.{bid}.attn_pre_norm",
+    MODEL_TENSOR.A_ENC_ATTN_K_REL: "a.blk.{bid}.attn_k_rel",
     MODEL_TENSOR.A_ENC_PER_DIM_SCALE: "a.blk.{bid}.per_dim_scale",
     MODEL_TENSOR.A_ENC_INPUT_NORM: "a.blk.{bid}.ln1",
     MODEL_TENSOR.A_ENC_OUTPUT: "a.blk.{bid}.attn_out",
@@ -1275,6 +1305,8 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
     MODEL_TENSOR.A_MM_SOFT_EMB_NORM: "mm.a.soft_emb_norm", # gemma3n
     MODEL_TENSOR.A_MM_EMBEDDING: "mm.a.embedding", # gemma3n
     MODEL_TENSOR.A_MM_HARD_EMB_NORM: "mm.a.hard_emb_norm", # gemma3n
+    MODEL_TENSOR.A_PER_DIM_K_SCALE: "a.blk.{bid}.per_dim_k_scale", # gemma4
+    MODEL_TENSOR.A_PER_DIM_SCALE: "a.blk.{bid}.per_dim_scale", # gemma4
     # lfm2 audio
     MODEL_TENSOR.A_ENC_NORM_CONV: "a.blk.{bid}.norm_conv",
     MODEL_TENSOR.A_ENC_LINEAR_POS: "a.blk.{bid}.linear_pos",
@@ -1319,8 +1351,11 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.V_ENC_FFN_UP,
         MODEL_TENSOR.V_ENC_FFN_GATE,
         MODEL_TENSOR.V_ENC_FFN_DOWN,
+        MODEL_TENSOR.V_ENC_ATTN_POST_NORM,
+        MODEL_TENSOR.V_ENC_FFN_POST_NORM,
         MODEL_TENSOR.V_LAYER_SCALE_1,
         MODEL_TENSOR.V_LAYER_SCALE_2,
+        MODEL_TENSOR.V_LAYER_OUT_SCALE,
         MODEL_TENSOR.V_PRE_NORM,
         MODEL_TENSOR.V_POST_NORM,
         MODEL_TENSOR.V_MM_POST_NORM,
@@ -1358,6 +1393,8 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.V_MM_GATE,
         MODEL_TENSOR.V_TOK_BOI,
         MODEL_TENSOR.V_TOK_EOI,
+        MODEL_TENSOR.V_STD_BIAS,
+        MODEL_TENSOR.V_STD_SCALE,
         MODEL_TENSOR.V_SAM_POS_EMBD,
         MODEL_TENSOR.V_SAM_PATCH_EMBD,
         MODEL_TENSOR.V_SAM_PRE_NORM,
@@ -1375,6 +1412,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.A_ENC_EMBD_POS,
         MODEL_TENSOR.A_ENC_EMBD_NORM,
         MODEL_TENSOR.A_ENC_EMBD_TO_LOGITS,
+        MODEL_TENSOR.A_ENC_INP_PROJ,
         MODEL_TENSOR.A_ENC_CONV1D,
         MODEL_TENSOR.A_ENC_CONV1D_NORM,
         MODEL_TENSOR.A_PRE_NORM,
@@ -1383,6 +1421,9 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.A_ENC_ATTN_Q,
         MODEL_TENSOR.A_ENC_ATTN_K,
         MODEL_TENSOR.A_ENC_ATTN_V,
+        MODEL_TENSOR.A_ENC_ATTN_POST_NORM,
+        MODEL_TENSOR.A_ENC_ATTN_PRE_NORM,
+        MODEL_TENSOR.A_ENC_ATTN_K_REL,
         MODEL_TENSOR.A_ENC_PER_DIM_SCALE,
         MODEL_TENSOR.A_ENC_INPUT_NORM,
         MODEL_TENSOR.A_ENC_OUTPUT,
@@ -1416,6 +1457,8 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.A_MM_SOFT_EMB_NORM,
         MODEL_TENSOR.A_MM_EMBEDDING,
         MODEL_TENSOR.A_MM_HARD_EMB_NORM,
+        MODEL_TENSOR.A_PER_DIM_K_SCALE,
+        MODEL_TENSOR.A_PER_DIM_SCALE,
     ],
     MODEL_ARCH.LLAMA: [
         MODEL_TENSOR.TOKEN_EMBD,
@@ -2273,6 +2316,38 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.LAUREL_R,
         MODEL_TENSOR.LAUREL_POST_NORM,
     ],
+    MODEL_ARCH.GEMMA4: [
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_Q_NORM,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_K_NORM,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.FFN_GATE_UP_EXP,
+        MODEL_TENSOR.FFN_DOWN_EXP,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_POST_NORM,
+        MODEL_TENSOR.FFN_GATE_INP,
+        MODEL_TENSOR.FFN_PRE_NORM,
+        MODEL_TENSOR.FFN_PRE_NORM_2,
+        MODEL_TENSOR.FFN_POST_NORM,
+        MODEL_TENSOR.FFN_POST_NORM_1,
+        MODEL_TENSOR.FFN_POST_NORM_2,
+        MODEL_TENSOR.LAYER_OUT_SCALE,
+        MODEL_TENSOR.PER_LAYER_TOKEN_EMBD,
+        MODEL_TENSOR.PER_LAYER_MODEL_PROJ,
+        MODEL_TENSOR.PER_LAYER_INP_GATE,
+        MODEL_TENSOR.PER_LAYER_PROJ,
+        MODEL_TENSOR.PER_LAYER_PROJ_NORM,
+        MODEL_TENSOR.PER_LAYER_POST_NORM,
+    ],
     MODEL_ARCH.GEMMA_EMBEDDING: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.OUTPUT,
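The per-architecture list above also acts as a gate: gguf-py builds the active name map from only the tensors registered for the target arch, so a role missing from MODEL_TENSORS[MODEL_ARCH.GEMMA4] would stay unmapped even when TensorNameMap knows its pattern. A toy illustration with hypothetical data:

    # Only patterns whose role is registered for the arch take part in lookup.
    MODEL_TENSORS = {"gemma4": ["FFN_PRE_NORM_2"]}
    block_mappings_cfg = {
        "FFN_PRE_NORM_2": ("model.layers.{bid}.pre_feedforward_layernorm_2",),
        "SOME_OTHER_ROLE": ("model.layers.{bid}.something_else",),  # not registered above
    }

    def active_patterns(arch: str) -> dict[str, tuple[str, ...]]:
        return {t: block_mappings_cfg[t] for t in MODEL_TENSORS[arch] if t in block_mappings_cfg}

    print(active_patterns("gemma4"))
    # {'FFN_PRE_NORM_2': ('model.layers.{bid}.pre_feedforward_layernorm_2',)}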
@@ -4010,6 +4085,8 @@ class VisionProjectorType:
    GEMMA3 = "gemma3"
    GEMMA3NV = "gemma3nv"
    GEMMA3NA = "gemma3na"
+   GEMMA4V = "gemma4v"
+   GEMMA4A = "gemma4a"
    PHI4 = "phi4"
    IDEFICS3 = "idefics3"
    PIXTRAL = "pixtral"
@@ -799,6 +799,7 @@ class GGUFWriter:
     def add_shared_kv_layers(self, value: int) -> None:
         self.add_uint32(Keys.Attention.SHARED_KV_LAYERS.format(arch=self.arch), value)

+    # if input is array, true means SWA and false means full_attention for each layer
     def add_sliding_window_pattern(self, value: int | Sequence[bool]) -> None:
         key = Keys.Attention.SLIDING_WINDOW_PATTERN.format(arch=self.arch)
         if isinstance(value, int):
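These setters record typed key-value metadata on the writer. A hedged usage fragment against gguf-py's GGUFWriter (values made up; a real conversion would also add tensors and call the write_*_to_file methods):

    from gguf import GGUFWriter

    writer = GGUFWriter("model.gguf", arch="gemma3n")  # illustrative path/arch
    writer.add_shared_kv_layers(2)
    # One bool per layer: True = sliding-window attention, False = full attention.
    writer.add_sliding_window_pattern([True, True, True, False])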
@@ -401,6 +401,10 @@ class TensorNameMap:
             "model.layers.{bid}.pre_mlp_layernorm", # afmoe
         ),

+        MODEL_TENSOR.FFN_PRE_NORM_2: (
+            "model.layers.{bid}.pre_feedforward_layernorm_2", # gemma4
+        ),
+
         # Post feed-forward norm
         MODEL_TENSOR.FFN_POST_NORM: (
             "model.layers.{bid}.post_feedforward_layernorm", # gemma2 olmo2
@@ -411,6 +415,14 @@ class TensorNameMap:
             "model.layers.{bid}.post_moe_norm", # grok-2
         ),

+        MODEL_TENSOR.FFN_POST_NORM_1: (
+            "model.layers.{bid}.post_feedforward_layernorm_1", # gemma4
+        ),
+
+        MODEL_TENSOR.FFN_POST_NORM_2: (
+            "model.layers.{bid}.post_feedforward_layernorm_2", # gemma4
+        ),
+
         MODEL_TENSOR.FFN_GATE_INP: (
             "layers.{bid}.feed_forward.gate", # mixtral
             "model.layers.{bid}.block_sparse_moe.gate", # mixtral phimoe
@@ -428,6 +440,7 @@ class TensorNameMap:
             "layers.{bid}.gate", # mistral-large
             "backbone.layers.{bid}.mixer.gate", # nemotron-h-moe
             "model.layers.{bid}.moe.gate", # step3.5
+            "model.layers.{bid}.router.proj", # gemma4
         ),

         MODEL_TENSOR.FFN_GATE_INP_SHEXP: (
@@ -570,6 +583,7 @@ class TensorNameMap:

         MODEL_TENSOR.FFN_GATE_UP_EXP: (
             "model.layers.{bid}.mlp.experts.gate_up_proj",
+            "model.layers.{bid}.experts.gate_up_proj", # gemma4
         ),

         MODEL_TENSOR.MOE_LATENT_DOWN: (
@@ -629,6 +643,7 @@ class TensorNameMap:
             "encoder.layers.{bid}.mlp.experts.mlp.w2", # nomic-bert-moe
             "model.layers.{bid}.block_sparse_moe.experts.down", # smallthinker
             "model.layers.{bid}.moe.down_proj", # step3.5
+            "model.layers.{bid}.experts.down_proj", # gemma4
         ),

         MODEL_TENSOR.FFN_DOWN_SHEXP: (
@@ -693,6 +708,10 @@ class TensorNameMap:
             "model.layers.{bid}.final_layernorm", # bailingmoe2
         ),

+        MODEL_TENSOR.LAYER_OUT_SCALE: (
+            "model.layers.{bid}.layer_scalar", # gemma4
+        ),
+
         MODEL_TENSOR.PER_LAYER_TOKEN_EMBD: (
             "model.embed_tokens_per_layer", # gemma3n
         ),
@@ -1383,6 +1402,7 @@ class TensorNameMap:
             "model.vision_model.embeddings.patch_embedding", # Deepseek-OCR CLIP
             "siglip2.vision_model.embeddings.patch_embedding",
             "vision_model.radio_model.model.patch_generator.embedder", # Nemotron Nano v2 VL
+            "model.vision_tower.patch_embedder.input_proj", # gemma4
         ),

         MODEL_TENSOR.V_ENC_EMBD_NORM: (
@@ -1400,6 +1420,7 @@ class TensorNameMap:
             "model.vision.patch_embedding.position_embedding", # cogvlm
             "visual.embeddings.position_embedding", # glm4v
             "vision_model.radio_model.model.patch_generator.pos_embed", # Nemotron Nano v2 VL
+            "model.vision_tower.patch_embedder.position_embedding_table", # gemma4
         ),

         MODEL_TENSOR.V_ENC_EMBD_IMGNL: (
@@ -1430,12 +1451,14 @@ class TensorNameMap:
             "vision_tower.encoder.blocks.{bid}.wq", # kimi-vl, generated
             "siglip2.vision_model.encoder.layers.{bid}.self_attn.q_proj", # youtuvl
             "model.vision_model.transformer.layers.{bid}.self_attn.q_proj", # Deepseek-OCR CLIP, generated
+            "vision_model.model.layers.{bid}.self_attn.q_proj.linear", # gemma4
         ),

         MODEL_TENSOR.V_ENC_ATTN_Q_NORM: (
             "vision_tower.vision_model.encoder.layers.{bid}.attn.q_norm", # InternVL
             "model.vision_tower.encoder.layer.{bid}.attention.q_norm", # Intern-S1
             "visual.blocks.{bid}.attn.q_norm", # GLM-OCR
+            "vision_model.model.layers.{bid}.self_attn.q_norm", # gemma4
         ),

         MODEL_TENSOR.V_ENC_ATTN_K: (
@@ -1450,12 +1473,14 @@ class TensorNameMap:
             "vision_tower.encoder.blocks.{bid}.wk", # kimi-vl, generated
             "model.vision_model.transformer.layers.{bid}.self_attn.k_proj", # Deepseek-OCR CLIP, generated
             "siglip2.vision_model.encoder.layers.{bid}.self_attn.k_proj",
+            "vision_model.model.layers.{bid}.self_attn.k_proj.linear", # gemma4
         ),

         MODEL_TENSOR.V_ENC_ATTN_K_NORM: (
             "vision_tower.vision_model.encoder.layers.{bid}.attn.k_norm", # InternVL
             "model.vision_tower.encoder.layer.{bid}.attention.k_norm", # Intern-S1
             "visual.blocks.{bid}.attn.k_norm", # GLM-OCR
+            "vision_model.model.layers.{bid}.self_attn.k_norm", # gemma4
         ),

         MODEL_TENSOR.V_ENC_ATTN_V: (
@@ -1470,6 +1495,7 @@ class TensorNameMap:
             "vision_tower.encoder.blocks.{bid}.wv", # kimi-vl, generated
             "siglip2.vision_model.encoder.layers.{bid}.self_attn.v_proj",
             "model.vision_model.transformer.layers.{bid}.self_attn.v_proj", # Deepseek-OCR CLIP, generated
+            "vision_model.model.layers.{bid}.self_attn.v_proj.linear", # gemma4
         ),

         MODEL_TENSOR.V_ENC_INPUT_NORM: (
@@ -1480,7 +1506,7 @@ class TensorNameMap:
             "model.vision_model.encoder.layers.{bid}.layer_norm1", # SmolVLM
             "vision_tower.transformer.layers.{bid}.attention_norm", # pixtral-hf
             "vision_encoder.transformer.layers.{bid}.attention_norm", # pixtral
-            "vision_model.model.layers.{bid}.input_layernorm", # llama4
+            "vision_model.model.layers.{bid}.input_layernorm", # llama4, gemma4
             "visual.blocks.{bid}.norm1", # qwen2vl
             "vision_tower.encoder.blocks.{bid}.norm0", # kimi-vl (norm0/norm1)
             "model.vision.transformer.layers.{bid}.input_layernorm", # cogvlm
@@ -1505,6 +1531,7 @@ class TensorNameMap:
             "model.vision_model.transformer.layers.{bid}.self_attn.out_proj", # Deepseek-OCR CLIP
             "siglip2.vision_model.encoder.layers.{bid}.self_attn.out_proj", # youtuvl
             "vision_model.radio_model.model.blocks.{bid}.attn.proj", # Nemotron Nano v2 VL
+            "vision_model.model.layers.{bid}.self_attn.o_proj.linear", # gemma4
         ),

         MODEL_TENSOR.V_ENC_POST_ATTN_NORM: (
@@ -1522,6 +1549,7 @@ class TensorNameMap:
             "model.vision_model.transformer.layers.{bid}.layer_norm2", # Deepseek-OCR CLIP
             "siglip2.vision_model.encoder.layers.{bid}.layer_norm2",
             "vision_model.radio_model.model.blocks.{bid}.norm2", # Nemotron Nano v2 VL
+            "vision_model.model.layers.{bid}.pre_feedforward_layernorm", # gemma4
         ),

         MODEL_TENSOR.V_ENC_FFN_UP: (
@@ -1540,12 +1568,14 @@ class TensorNameMap:
             "model.vision.transformer.layers.{bid}.mlp.fc1", # cogvlm
             "siglip2.vision_model.encoder.layers.{bid}.mlp.fc1",
             "vision_model.radio_model.model.blocks.{bid}.mlp.fc1", # Nemotron Nano v2 VL
+            "vision_model.model.layers.{bid}.mlp.up_proj", # gemma4
         ),

         MODEL_TENSOR.V_ENC_FFN_GATE: (
             "vision_tower.transformer.layers.{bid}.feed_forward.gate_proj", # pixtral-hf
             "vision_encoder.transformer.layers.{bid}.feed_forward.w1", # pixtral
             "visual.blocks.{bid}.mlp.gate_proj", # qwen2.5vl
+            "vision_model.model.layers.{bid}.mlp.gate_proj", # gemma4
         ),

         MODEL_TENSOR.V_ENC_FFN_DOWN: (
@@ -1564,6 +1594,15 @@ class TensorNameMap:
             "model.vision_model.transformer.layers.{bid}.mlp.fc2", # Deepseek-OCR CLIP
             "siglip2.vision_model.encoder.layers.{bid}.mlp.fc2",
             "vision_model.radio_model.model.blocks.{bid}.mlp.fc2", # Nemotron Nano v2 VL
+            "vision_model.model.layers.{bid}.mlp.down_proj", # gemma4
         ),

+        MODEL_TENSOR.V_ENC_ATTN_POST_NORM: (
+            "vision_model.model.layers.{bid}.post_attention_layernorm", # gemma4
+        ),
+
+        MODEL_TENSOR.V_ENC_FFN_POST_NORM: (
+            "vision_model.model.layers.{bid}.post_feedforward_layernorm", # gemma4
+        ),
+
         MODEL_TENSOR.V_LAYER_SCALE_1: (
@@ -1576,6 +1615,10 @@ class TensorNameMap:
             "model.vision_tower.encoder.layer.{bid}.lambda_2", # Intern-S1
         ),

+        MODEL_TENSOR.V_LAYER_OUT_SCALE: (
+            "vision_model.model.layers.{bid}.layer_scalar", # gemma4
+        ),
+
         MODEL_TENSOR.V_PRE_NORM: (
             "vision_tower.vision_model.pre_layrnorm",
             "vision_tower.ln_pre", # pixtral-hf
@@ -1763,6 +1806,14 @@ class TensorNameMap:
             "model.vision.eoi", # cogvlm
         ),

+        MODEL_TENSOR.V_STD_BIAS: (
+            "model.vision_tower.std_bias", # gemma4
+        ),
+
+        MODEL_TENSOR.V_STD_SCALE: (
+            "model.vision_tower.std_scale", # gemma4
+        ),
+
         # audio (mtmd)

         MODEL_TENSOR.A_ENC_EMBD_POS: (
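Entries without a "{bid}" placeholder, like the std_bias/std_scale pair above, are resolved once per model rather than once per block; in gguf-py they live in TensorNameMap's non-block table (mappings_cfg) while "{bid}" patterns are expanded for every block index. A toy contrast with illustrative names:

    mappings_cfg = {"V_STD_BIAS": ("model.vision_tower.std_bias",)}  # formatted once
    block_mappings_cfg = {
        "V_ENC_ATTN_Q": ("vision_model.model.layers.{bid}.self_attn.q_proj.linear",),
    }

    expanded = {pat: role for role, pats in mappings_cfg.items() for pat in pats}
    for bid in range(2):  # pretend the model has two blocks
        for role, pats in block_mappings_cfg.items():
            for pat in pats:
                expanded[pat.format(bid=bid)] = role
    print(sorted(expanded))
    # ['model.vision_tower.std_bias',
    #  'vision_model.model.layers.0.self_attn.q_proj.linear',
    #  'vision_model.model.layers.1.self_attn.q_proj.linear']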
@@ -1782,10 +1833,15 @@ class TensorNameMap:
             "audio_tower.conv{bid}", # ultravox
             "conformer.pre_encode.conv.{bid}", # lfm2
             "model.audio_tower.subsample_conv_projection.conv_{bid}.conv", # gemma3n
+            "conformer.subsample_conv_projection.layer{bid}.conv", # gemma4
         ),

         MODEL_TENSOR.A_ENC_CONV1D_NORM: (
             "model.audio_tower.subsample_conv_projection.conv_{bid}.norm", # gemma3n
+            "conformer.subsample_conv_projection.layer{bid}.norm", # gemma4
         ),

+        MODEL_TENSOR.A_ENC_INP_PROJ: (
+            "conformer.subsample_conv_projection.input_proj_linear", # gemma4
+        ),
+
         MODEL_TENSOR.A_PRE_NORM: (),
@@ -1799,22 +1855,38 @@ class TensorNameMap:
             "audio_tower.layers.{bid}.self_attn.q_proj", # ultravox
             "conformer.layers.{bid}.self_attn.linear_q", # lfm2
             "conformer.layers.{bid}.attention.attn.q_proj", # gemma3n
+            "conformer.layers.{bid}.self_attn.q_proj", # gemma4
         ),

         MODEL_TENSOR.A_ENC_ATTN_K: (
             "audio_tower.layers.{bid}.self_attn.k_proj", # ultravox
             "conformer.layers.{bid}.self_attn.linear_k", # lfm2
             "conformer.layers.{bid}.attention.attn.k_proj", # gemma3n
+            "conformer.layers.{bid}.self_attn.k_proj", # gemma4
         ),

         MODEL_TENSOR.A_ENC_ATTN_V: (
             "audio_tower.layers.{bid}.self_attn.v_proj", # ultravox
             "conformer.layers.{bid}.self_attn.linear_v", # lfm2
             "conformer.layers.{bid}.attention.attn.v_proj", # gemma3n
+            "conformer.layers.{bid}.self_attn.v_proj", # gemma4
         ),

+        MODEL_TENSOR.A_ENC_ATTN_K_REL: (
+            "conformer.layers.{bid}.self_attn.relative_k_proj", # gemma4
+        ),
+
+        MODEL_TENSOR.A_ENC_ATTN_POST_NORM: (
+            "conformer.layers.{bid}.norm_post_attn", # gemma4
+        ),
+
+        MODEL_TENSOR.A_ENC_ATTN_PRE_NORM: (
+            "conformer.layers.{bid}.norm_pre_attn", # gemma4
+        ),
+
         MODEL_TENSOR.A_ENC_PER_DIM_SCALE: (
             "conformer.layers.{bid}.attention.attn.per_dim_scale", # gemma3n
+            "conformer.layers.{bid}.self_attn.per_dim_scale", # gemma4
         ),

         MODEL_TENSOR.A_ENC_LAYER_PRE_NORM: (
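When these patterns are matched, the converter tolerates parameter suffixes: an incoming name is tried verbatim, then with ".weight"/".bias" stripped and re-appended after the lookup (TensorNameMap.get_name does this via its try_suffixes argument). A hypothetical stand-in for that logic:

    def lookup(mapping: dict[str, str], name: str,
               suffixes: tuple[str, ...] = (".weight", ".bias")) -> str | None:
        if name in mapping:
            return mapping[name]
        for suf in suffixes:
            if name.endswith(suf) and name[: -len(suf)] in mapping:
                return mapping[name[: -len(suf)]] + suf
        return None

    table = {"conformer.layers.7.self_attn.q_proj": "a.blk.7.attn_q"}
    print(lookup(table, "conformer.layers.7.self_attn.q_proj.weight"))  # a.blk.7.attn_q.weight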
@@ -1831,6 +1903,7 @@ class TensorNameMap:
             "audio_tower.layers.{bid}.self_attn.out_proj", # ultravox
             "conformer.layers.{bid}.self_attn.linear_out", # lfm2
             "conformer.layers.{bid}.attention.post", # gemma3n
+            "conformer.layers.{bid}.self_attn.post", # gemma4
         ),

         MODEL_TENSOR.A_ENC_OUTPUT_NORM: (
@@ -1842,10 +1915,12 @@ class TensorNameMap:
         MODEL_TENSOR.A_ENC_FFN_NORM: (
             "conformer.layers.{bid}.norm_feed_forward1", # lfm2
             "conformer.layers.{bid}.ffw_layer_start.pre_layer_norm", # gemma3n
+            "conformer.layers.{bid}.feed_forward1.pre_layer_norm", # gemma4
         ),

         MODEL_TENSOR.A_ENC_FFN_POST_NORM: (
             "conformer.layers.{bid}.ffw_layer_start.post_layer_norm", # gemma3n
+            "conformer.layers.{bid}.feed_forward1.post_layer_norm", # gemma4
         ),

         MODEL_TENSOR.A_ENC_FFN_SCALE: (
@@ -1856,6 +1931,7 @@ class TensorNameMap:
             "audio_tower.layers.{bid}.fc1", # ultravox
             "conformer.layers.{bid}.feed_forward1.linear1", # lfm2
             "conformer.layers.{bid}.ffw_layer_start.ffw_layer_1", # gemma3n
+            "conformer.layers.{bid}.feed_forward1.ffw_layer_1", # gemma4
         ),

         MODEL_TENSOR.A_ENC_FFN_GATE: (),
@@ -1864,25 +1940,30 @@ class TensorNameMap:
             "audio_tower.layers.{bid}.fc2", # ultravox
             "conformer.layers.{bid}.feed_forward1.linear2", # lfm2
             "conformer.layers.{bid}.ffw_layer_start.ffw_layer_2", # gemma3n
+            "conformer.layers.{bid}.feed_forward1.ffw_layer_2", # gemma4
         ),

         MODEL_TENSOR.A_ENC_FFN_UP_1: (
             "conformer.layers.{bid}.feed_forward2.linear1", # lfm2
             "conformer.layers.{bid}.ffw_layer_end.ffw_layer_1", # gemma3n
+            "conformer.layers.{bid}.feed_forward2.ffw_layer_1", # gemma4
         ),

         MODEL_TENSOR.A_ENC_FFN_DOWN_1: (
             "conformer.layers.{bid}.feed_forward2.linear2", # lfm2
             "conformer.layers.{bid}.ffw_layer_end.ffw_layer_2", # gemma3n
+            "conformer.layers.{bid}.feed_forward2.ffw_layer_2", # gemma4
         ),

         MODEL_TENSOR.A_ENC_FFN_NORM_1: (
             "conformer.layers.{bid}.norm_feed_forward2", # lfm2
             "conformer.layers.{bid}.ffw_layer_end.pre_layer_norm", # gemma3n
+            "conformer.layers.{bid}.feed_forward2.pre_layer_norm", # gemma4
         ),

         MODEL_TENSOR.A_ENC_FFN_POST_NORM_1: (
             "conformer.layers.{bid}.ffw_layer_end.post_layer_norm", # gemma3n
+            "conformer.layers.{bid}.feed_forward2.post_layer_norm", # gemma4
         ),

         MODEL_TENSOR.A_ENC_FFN_SCALE_1: (
@@ -1904,7 +1985,8 @@ class TensorNameMap:

         MODEL_TENSOR.A_ENC_OUT: (
             "conformer.pre_encode.out", # lfm2
-            "model.audio_tower.subsample_conv_projection.input_proj_linear", # gemma3n
+            "model.audio_tower.subsample_conv_projection.input_proj_linear", # gemma3n (note: it should be A_ENC_INP_PROJ, this is a mistake; it should be corrected in C++ code when it's supported)
+            "conformer.output_proj", # gemma4
         ),

         # note: some tensors below has "audio." pseudo-prefix, to prevent conflicts with vision tensors
@@ -1918,6 +2000,7 @@ class TensorNameMap:
         MODEL_TENSOR.A_MMPROJ_FC: (
             "audio.multi_modal_projector.linear", # qwen2audio
             "audio_tower.proj", # qwen2omni
+            "model.audio_tower.output_proj" # gemma4
         ),

         MODEL_TENSOR.A_MM_NORM_PRE: (
@@ -1953,6 +2036,14 @@ class TensorNameMap:
             "conformer.layers.{bid}.lconv1d.conv_norm", # gemma3n
         ),

+        MODEL_TENSOR.A_PER_DIM_K_SCALE: (
+            "conformer.layers.{bid}.attention.attn.per_dim_key_scale", # gemma4
+        ),
+
+        MODEL_TENSOR.A_PER_DIM_SCALE: (
+            "conformer.layers.{bid}.attention.attn.per_dim_scale", # gemma4
+        ),
+
         MODEL_TENSOR.A_MM_EMBEDDING: (
             "model.embed_audio.embedding", # gemma3n
         ),
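End to end, a converter builds one TensorNameMap per architecture and funnels every checkpoint name through it. A hedged example against gguf-py's public helpers, using an arch that exists in released packages (the gemma4 entries added above may not be in your installed version):

    from gguf.constants import MODEL_ARCH
    from gguf.tensor_mapping import get_tensor_name_map

    # Name map for an architecture with 32 transformer blocks.
    tmap = get_tensor_name_map(MODEL_ARCH.GEMMA3, 32)

    name = tmap.get_name("model.layers.3.mlp.down_proj.weight",
                         try_suffixes=(".weight", ".bias"))
    print(name)  # blk.3.ffn_down.weight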