model, mtmd: fix gguf conversion for audio/vision mmproj (#21309)

* fix gguf conversion for audio/vision mmproj

* fix test
This commit is contained in:
Author: Xuan-Son Nguyen — 2026-04-02 17:10:32 +02:00, committed via GitHub
Parent commit: 223373742b
Commit: 63f8fe0ef4
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
27 changed files with 1462 additions and 41 deletions

View file

@ -24,6 +24,7 @@
#include <limits>
#include <array>
#include <functional>
#include <float.h>
struct clip_logger_state g_logger_state = {clip_log_callback_default, NULL};
@ -379,19 +380,34 @@ ggml_tensor * clip_graph::build_vit(
Vcur = ggml_add(ctx0, Vcur, layer.v_b);
}
if (layer.q_norm) {
Qcur = build_norm(Qcur, layer.q_norm, NULL, norm_t, eps, il);
cb(Qcur, "Qcur_norm", il);
}
// if true, norm must be applied after reshaping to (d_head, n_head, n_pos)
bool norm_per_head = layer.q_norm && layer.q_norm->ne[0] == d_head;
if (layer.k_norm) {
Kcur = build_norm(Kcur, layer.k_norm, NULL, norm_t, eps, il);
cb(Kcur, "Kcur_norm", il);
if (!norm_per_head) {
if (layer.q_norm) {
Qcur = build_norm(Qcur, layer.q_norm, NULL, norm_t, eps, il);
cb(Qcur, "Qcur_norm", il);
}
if (layer.k_norm) {
Kcur = build_norm(Kcur, layer.k_norm, NULL, norm_t, eps, il);
cb(Kcur, "Kcur_norm", il);
}
}
Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos);
Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos);
Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos);
if (norm_per_head) {
if (layer.q_norm) {
Qcur = build_norm(Qcur, layer.q_norm, NULL, norm_t, eps, il);
cb(Qcur, "Qcur_norm_per_head", il);
}
if (layer.k_norm) {
Kcur = build_norm(Kcur, layer.k_norm, NULL, norm_t, eps, il);
cb(Kcur, "Kcur_norm_per_head", il);
}
}
}
cb(Qcur, "Qcur", il);
@ -405,6 +421,11 @@ ggml_tensor * clip_graph::build_vit(
cb(Kcur, "Kcur_pos", il);
}
if (proj_type == PROJECTOR_TYPE_GEMMA4V) {
Vcur = ggml_rms_norm(ctx0, Vcur, eps);
cb(Vcur, "Vcur_normed", il);
}
cur = build_attn(layer.o_w, layer.o_b,
Qcur, Kcur, Vcur, nullptr, kq_scale, il);
cb(cur, "attn_out", il);
@ -415,6 +436,11 @@ ggml_tensor * clip_graph::build_vit(
cb(cur, "attn_out_scaled", il);
}
if (layer.attn_post_norm_w) {
cur = build_norm(cur, layer.attn_post_norm_w, nullptr, norm_t, eps, il);
cb(cur, "attn_post_normed", il);
}
// re-add the layer input, e.g., residual
cur = ggml_add(ctx0, cur, inpL);
@ -422,7 +448,7 @@ ggml_tensor * clip_graph::build_vit(
cb(cur, "ffn_inp", il);
// layernorm2
// layernorm2 (pre-ffn norm)
cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il);
cb(cur, "ffn_inp_normed", il);
@ -435,6 +461,11 @@ ggml_tensor * clip_graph::build_vit(
cb(cur, "ffn_out", il);
if (layer.ff_post_norm_w) {
cur = build_norm(cur, layer.ff_post_norm_w, nullptr, norm_t, eps, il);
cb(cur, "ffn_post_normed", il);
}
if (layer.ls_2_w) {
cur = ggml_mul(ctx0, cur, layer.ls_2_w);
cb(cur, "ffn_out_scaled", il);
@ -444,6 +475,11 @@ ggml_tensor * clip_graph::build_vit(
cur = ggml_add(ctx0, inpL, cur);
cb(cur, "layer_out", il);
if (layer.ls_out_w) {
cur = ggml_mul(ctx0, cur, layer.ls_out_w);
cb(cur, "layer_out_scaled", il);
}
inpL = cur;
}
@ -808,6 +844,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
{
builder = std::make_unique<clip_graph_mobilenetv5>(ctx, img);
} break;
case PROJECTOR_TYPE_GEMMA4V:
{
builder = std::make_unique<clip_graph_gemma4v>(ctx, img);
} break;
case PROJECTOR_TYPE_PIXTRAL:
case PROJECTOR_TYPE_LIGHTONOCR:
{
@ -1257,6 +1297,17 @@ struct clip_model_loader {
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
} break;
case PROJECTOR_TYPE_GEMMA4V:
{
hparams.rope_theta = 100.0f;
hparams.n_merge = 3; // pooling_kernel_size
hparams.image_resize_algo = RESIZE_ALGO_BILINEAR;
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
// @ngxson : the model performs quite poorly with small images; bump the minimum image tokens to avoid that
hparams.set_limit_image_tokens(252, 280);
hparams.set_warmup_n_tokens(256); // avoid OOM on warmup
} break;
case PROJECTOR_TYPE_GEMMA3NV:
{
// Gemma3n uses MobileNetV5 which produces 256 tokens (16x16)
@ -1442,6 +1493,11 @@ struct clip_model_loader {
std::map<std::string, size_t> tensor_offset;
std::vector<ggml_tensor *> tensors_to_load;
auto fin = std::ifstream(fname, std::ios::binary);
if (!fin) {
throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str()));
}
// TODO @ngxson : support both audio and video in the future
const char * prefix = model.modality == CLIP_MODALITY_AUDIO ? "a" : "v";
@ -1478,6 +1534,18 @@ struct clip_model_loader {
return cur;
};
auto get_scalar = [&](const std::string & name, float default_val) {
auto it = tensor_offset.find(name);
if (it == tensor_offset.end()) {
return default_val;
}
size_t offset = it->second;
fin.seekg(offset, std::ios::beg);
float value;
fin.read(reinterpret_cast<char*>(&value), sizeof(float));
return value;
};
model.class_embedding = get_tensor(TN_CLASS_EMBD, false);
model.pre_ln_w = get_tensor(string_format(TN_LN_PRE, prefix, "weight"), false);
@ -1512,8 +1580,11 @@ struct clip_model_loader {
layer.q_norm = get_tensor(string_format(TN_ATTN_Q_NORM, prefix, il, "weight"), false);
layer.ln_1_w = get_tensor(string_format(TN_LN_1, prefix, il, "weight"), false);
layer.ln_2_w = get_tensor(string_format(TN_LN_2, prefix, il, "weight"), false);
layer.ls_1_w = get_tensor(string_format(TN_LS_1, prefix, il, "weight"), false); // no bias
layer.ls_2_w = get_tensor(string_format(TN_LS_2, prefix, il, "weight"), false); // no bias
layer.ls_1_w = get_tensor(string_format(TN_LS_1, prefix, il, "weight"), false); // no bias
layer.ls_2_w = get_tensor(string_format(TN_LS_2, prefix, il, "weight"), false); // no bias
layer.ls_out_w = get_tensor(string_format(TN_LS_OUT, prefix, il, "weight"), false); // no bias
layer.attn_post_norm_w = get_tensor(string_format(TN_ATTN_POST_NORM, prefix, il, "weight"), false); // no bias
layer.ff_post_norm_w = get_tensor(string_format(TN_FFN_POST_NORM, prefix, il, "weight"), false); // no bias
layer.k_b = get_tensor(string_format(TN_ATTN_K, prefix, il, "bias"), false);
layer.q_b = get_tensor(string_format(TN_ATTN_Q, prefix, il, "bias"), false);
@ -1713,6 +1784,32 @@ struct clip_model_loader {
model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ);
model.mm_soft_emb_norm_w = get_tensor(TN_MM_SOFT_EMB_N);
} break;
case PROJECTOR_TYPE_GEMMA4V:
{
model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ);
model.std_bias = get_tensor(TN_STD_BIAS, false);
model.std_scale = get_tensor(TN_STD_SCALE, false);
// load scalar for Gemma4ClippableLinear
for (auto * tensor : tensors_to_load) {
std::string name = tensor->name;
if (string_ends_with(name, ".weight")) {
std::string name_inp_max = name;
std::string name_inp_min = name;
std::string name_out_max = name;
std::string name_out_min = name;
string_replace_all(name_inp_max, ".weight", ".input_max");
string_replace_all(name_inp_min, ".weight", ".input_min");
string_replace_all(name_out_max, ".weight", ".output_max");
string_replace_all(name_out_min, ".weight", ".output_min");
model.clamp_info_map[name] = {
get_scalar(name_inp_max, FLT_MAX),
get_scalar(name_inp_min, -FLT_MAX),
get_scalar(name_out_max, FLT_MAX),
get_scalar(name_out_min, -FLT_MAX)
};
}
}
} break;
case PROJECTOR_TYPE_GEMMA3NV:
{
model.mobilenet_stem_conv_w = get_tensor(TN_MNV5_STEM_CONV, false);
@ -2042,11 +2139,6 @@ struct clip_model_loader {
{
std::vector<uint8_t> read_buf;
auto fin = std::ifstream(fname, std::ios::binary);
if (!fin) {
throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str()));
}
// alloc memory and offload data
ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(ctx_clip.backend);
ctx_clip.buf.reset(ggml_backend_alloc_ctx_tensors_from_buft(ctx_clip.ctx_data.get(), buft));
@ -2345,7 +2437,8 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params
// TODO: we don't support audio for Gemma 3N, but GGUF contains audio tensors
// we can remove this check when we implement audio support for Gemma 3N
skip_audio = ctx_vision->model.proj_type == PROJECTOR_TYPE_GEMMA3NV;
skip_audio = ctx_vision->model.proj_type == PROJECTOR_TYPE_GEMMA3NV
|| ctx_vision->model.proj_type == PROJECTOR_TYPE_GEMMA4V;
}
if (loader.has_audio && !skip_audio) {
@ -2581,6 +2674,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
n_patches = x_patch * y_patch;
} break;
case PROJECTOR_TYPE_GEMMA3:
case PROJECTOR_TYPE_GEMMA4V:
case PROJECTOR_TYPE_IDEFICS3:
case PROJECTOR_TYPE_INTERNVL:
case PROJECTOR_TYPE_NEMOTRON_V2_VL:
@ -3031,6 +3125,18 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
}
set_input_i32("patches", patches);
} break;
case PROJECTOR_TYPE_GEMMA4V:
{
// set (col, row) patch positions for learned positional embedding
const int n_cols = image_size_width / patch_size;
std::vector<int> pos_x(num_patches), pos_y(num_patches);
for (int i = 0; i < num_patches; i++) {
pos_x[i] = i % n_cols;
pos_y[i] = i / n_cols;
}
set_input_i32("pos_x", pos_x);
set_input_i32("pos_y", pos_y);
} break;
case PROJECTOR_TYPE_DEEPSEEKOCR:
{
GGML_ASSERT(pos_w == pos_h);
@ -3218,6 +3324,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
case PROJECTOR_TYPE_GEMMA3:
case PROJECTOR_TYPE_GEMMA3NV:
return ctx->model.mm_input_proj_w->ne[0];
case PROJECTOR_TYPE_GEMMA4V:
return ctx->model.mm_input_proj_w->ne[1];
case PROJECTOR_TYPE_IDEFICS3:
return ctx->model.mm_fc_w->ne[1];
case PROJECTOR_TYPE_ULTRAVOX: