model, mtmd: fix gguf conversion for audio/vision mmproj (#21309)

* fix gguf conversion for audio/vision mmproj

* fix test
This commit is contained in:
Author: Xuan-Son Nguyen — 2026-04-02 17:10:32 +02:00, committed via GitHub
Parent commit: 223373742b
Commit: 63f8fe0ef4
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
27 changed files with 1462 additions and 41 deletions

View file

@ -24,6 +24,7 @@
#include <limits>
#include <array>
#include <functional>
#include <float.h>
struct clip_logger_state g_logger_state = {clip_log_callback_default, NULL};
@ -379,19 +380,34 @@ ggml_tensor * clip_graph::build_vit(
Vcur = ggml_add(ctx0, Vcur, layer.v_b);
}
if (layer.q_norm) {
Qcur = build_norm(Qcur, layer.q_norm, NULL, norm_t, eps, il);
cb(Qcur, "Qcur_norm", il);
}
// if true, norm must be applied after reshaping to (d_head, n_head, n_pos)
bool norm_per_head = layer.q_norm && layer.q_norm->ne[0] == d_head;
if (layer.k_norm) {
Kcur = build_norm(Kcur, layer.k_norm, NULL, norm_t, eps, il);
cb(Kcur, "Kcur_norm", il);
if (!norm_per_head) {
if (layer.q_norm) {
Qcur = build_norm(Qcur, layer.q_norm, NULL, norm_t, eps, il);
cb(Qcur, "Qcur_norm", il);
}
if (layer.k_norm) {
Kcur = build_norm(Kcur, layer.k_norm, NULL, norm_t, eps, il);
cb(Kcur, "Kcur_norm", il);
}
}
Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos);
Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos);
Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos);
if (norm_per_head) {
if (layer.q_norm) {
Qcur = build_norm(Qcur, layer.q_norm, NULL, norm_t, eps, il);
cb(Qcur, "Qcur_norm_per_head", il);
}
if (layer.k_norm) {
Kcur = build_norm(Kcur, layer.k_norm, NULL, norm_t, eps, il);
cb(Kcur, "Kcur_norm_per_head", il);
}
}
}
cb(Qcur, "Qcur", il);
@ -405,6 +421,11 @@ ggml_tensor * clip_graph::build_vit(
cb(Kcur, "Kcur_pos", il);
}
if (proj_type == PROJECTOR_TYPE_GEMMA4V) {
Vcur = ggml_rms_norm(ctx0, Vcur, eps);
cb(Vcur, "Vcur_normed", il);
}
cur = build_attn(layer.o_w, layer.o_b,
Qcur, Kcur, Vcur, nullptr, kq_scale, il);
cb(cur, "attn_out", il);
@ -415,6 +436,11 @@ ggml_tensor * clip_graph::build_vit(
cb(cur, "attn_out_scaled", il);
}
if (layer.attn_post_norm_w) {
cur = build_norm(cur, layer.attn_post_norm_w, nullptr, norm_t, eps, il);
cb(cur, "attn_post_normed", il);
}
// re-add the layer input, e.g., residual
cur = ggml_add(ctx0, cur, inpL);
@ -422,7 +448,7 @@ ggml_tensor * clip_graph::build_vit(
cb(cur, "ffn_inp", il);
// layernorm2
// layernorm2 (pre-ffn norm)
cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il);
cb(cur, "ffn_inp_normed", il);
@ -435,6 +461,11 @@ ggml_tensor * clip_graph::build_vit(
cb(cur, "ffn_out", il);
if (layer.ff_post_norm_w) {
cur = build_norm(cur, layer.ff_post_norm_w, nullptr, norm_t, eps, il);
cb(cur, "ffn_post_normed", il);
}
if (layer.ls_2_w) {
cur = ggml_mul(ctx0, cur, layer.ls_2_w);
cb(cur, "ffn_out_scaled", il);
@ -444,6 +475,11 @@ ggml_tensor * clip_graph::build_vit(
cur = ggml_add(ctx0, inpL, cur);
cb(cur, "layer_out", il);
if (layer.ls_out_w) {
cur = ggml_mul(ctx0, cur, layer.ls_out_w);
cb(cur, "layer_out_scaled", il);
}
inpL = cur;
}
@ -808,6 +844,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
{
builder = std::make_unique<clip_graph_mobilenetv5>(ctx, img);
} break;
case PROJECTOR_TYPE_GEMMA4V:
{
builder = std::make_unique<clip_graph_gemma4v>(ctx, img);
} break;
case PROJECTOR_TYPE_PIXTRAL:
case PROJECTOR_TYPE_LIGHTONOCR:
{
@ -1257,6 +1297,17 @@ struct clip_model_loader {
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
} break;
case PROJECTOR_TYPE_GEMMA4V:
{
hparams.rope_theta = 100.0f;
hparams.n_merge = 3; // pooling_kernel_size
hparams.image_resize_algo = RESIZE_ALGO_BILINEAR;
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
// @ngxson : the model performs quite poorly with small images; bump the minimum image tokens to avoid that
hparams.set_limit_image_tokens(252, 280);
hparams.set_warmup_n_tokens(256); // avoid OOM on warmup
} break;
case PROJECTOR_TYPE_GEMMA3NV:
{
// Gemma3n uses MobileNetV5 which produces 256 tokens (16x16)
@ -1442,6 +1493,11 @@ struct clip_model_loader {
std::map<std::string, size_t> tensor_offset;
std::vector<ggml_tensor *> tensors_to_load;
auto fin = std::ifstream(fname, std::ios::binary);
if (!fin) {
throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str()));
}
// TODO @ngxson : support both audio and video in the future
const char * prefix = model.modality == CLIP_MODALITY_AUDIO ? "a" : "v";
@ -1478,6 +1534,18 @@ struct clip_model_loader {
return cur;
};
auto get_scalar = [&](const std::string & name, float default_val) {
auto it = tensor_offset.find(name);
if (it == tensor_offset.end()) {
return default_val;
}
size_t offset = it->second;
fin.seekg(offset, std::ios::beg);
float value;
fin.read(reinterpret_cast<char*>(&value), sizeof(float));
return value;
};
model.class_embedding = get_tensor(TN_CLASS_EMBD, false);
model.pre_ln_w = get_tensor(string_format(TN_LN_PRE, prefix, "weight"), false);
@ -1512,8 +1580,11 @@ struct clip_model_loader {
layer.q_norm = get_tensor(string_format(TN_ATTN_Q_NORM, prefix, il, "weight"), false);
layer.ln_1_w = get_tensor(string_format(TN_LN_1, prefix, il, "weight"), false);
layer.ln_2_w = get_tensor(string_format(TN_LN_2, prefix, il, "weight"), false);
layer.ls_1_w = get_tensor(string_format(TN_LS_1, prefix, il, "weight"), false); // no bias
layer.ls_2_w = get_tensor(string_format(TN_LS_2, prefix, il, "weight"), false); // no bias
layer.ls_1_w = get_tensor(string_format(TN_LS_1, prefix, il, "weight"), false); // no bias
layer.ls_2_w = get_tensor(string_format(TN_LS_2, prefix, il, "weight"), false); // no bias
layer.ls_out_w = get_tensor(string_format(TN_LS_OUT, prefix, il, "weight"), false); // no bias
layer.attn_post_norm_w = get_tensor(string_format(TN_ATTN_POST_NORM, prefix, il, "weight"), false); // no bias
layer.ff_post_norm_w = get_tensor(string_format(TN_FFN_POST_NORM, prefix, il, "weight"), false); // no bias
layer.k_b = get_tensor(string_format(TN_ATTN_K, prefix, il, "bias"), false);
layer.q_b = get_tensor(string_format(TN_ATTN_Q, prefix, il, "bias"), false);
@ -1713,6 +1784,32 @@ struct clip_model_loader {
model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ);
model.mm_soft_emb_norm_w = get_tensor(TN_MM_SOFT_EMB_N);
} break;
case PROJECTOR_TYPE_GEMMA4V:
{
model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ);
model.std_bias = get_tensor(TN_STD_BIAS, false);
model.std_scale = get_tensor(TN_STD_SCALE, false);
// load scalar for Gemma4ClippableLinear
for (auto * tensor : tensors_to_load) {
std::string name = tensor->name;
if (string_ends_with(name, ".weight")) {
std::string name_inp_max = name;
std::string name_inp_min = name;
std::string name_out_max = name;
std::string name_out_min = name;
string_replace_all(name_inp_max, ".weight", ".input_max");
string_replace_all(name_inp_min, ".weight", ".input_min");
string_replace_all(name_out_max, ".weight", ".output_max");
string_replace_all(name_out_min, ".weight", ".output_min");
model.clamp_info_map[name] = {
get_scalar(name_inp_max, FLT_MAX),
get_scalar(name_inp_min, -FLT_MAX),
get_scalar(name_out_max, FLT_MAX),
get_scalar(name_out_min, -FLT_MAX)
};
}
}
} break;
case PROJECTOR_TYPE_GEMMA3NV:
{
model.mobilenet_stem_conv_w = get_tensor(TN_MNV5_STEM_CONV, false);
@ -2042,11 +2139,6 @@ struct clip_model_loader {
{
std::vector<uint8_t> read_buf;
auto fin = std::ifstream(fname, std::ios::binary);
if (!fin) {
throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str()));
}
// alloc memory and offload data
ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(ctx_clip.backend);
ctx_clip.buf.reset(ggml_backend_alloc_ctx_tensors_from_buft(ctx_clip.ctx_data.get(), buft));
@ -2345,7 +2437,8 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params
// TODO: we don't support audio for Gemma 3N, but GGUF contains audio tensors
// we can remove this check when we implement audio support for Gemma 3N
skip_audio = ctx_vision->model.proj_type == PROJECTOR_TYPE_GEMMA3NV;
skip_audio = ctx_vision->model.proj_type == PROJECTOR_TYPE_GEMMA3NV
|| ctx_vision->model.proj_type == PROJECTOR_TYPE_GEMMA4V;
}
if (loader.has_audio && !skip_audio) {
@ -2581,6 +2674,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
n_patches = x_patch * y_patch;
} break;
case PROJECTOR_TYPE_GEMMA3:
case PROJECTOR_TYPE_GEMMA4V:
case PROJECTOR_TYPE_IDEFICS3:
case PROJECTOR_TYPE_INTERNVL:
case PROJECTOR_TYPE_NEMOTRON_V2_VL:
@ -3031,6 +3125,18 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
}
set_input_i32("patches", patches);
} break;
case PROJECTOR_TYPE_GEMMA4V:
{
// set (col, row) patch positions for learned positional embedding
const int n_cols = image_size_width / patch_size;
std::vector<int> pos_x(num_patches), pos_y(num_patches);
for (int i = 0; i < num_patches; i++) {
pos_x[i] = i % n_cols;
pos_y[i] = i / n_cols;
}
set_input_i32("pos_x", pos_x);
set_input_i32("pos_y", pos_y);
} break;
case PROJECTOR_TYPE_DEEPSEEKOCR:
{
GGML_ASSERT(pos_w == pos_h);
@ -3218,6 +3324,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
case PROJECTOR_TYPE_GEMMA3:
case PROJECTOR_TYPE_GEMMA3NV:
return ctx->model.mm_input_proj_w->ne[0];
case PROJECTOR_TYPE_GEMMA4V:
return ctx->model.mm_input_proj_w->ne[1];
case PROJECTOR_TYPE_IDEFICS3:
return ctx->model.mm_fc_w->ne[1];
case PROJECTOR_TYPE_ULTRAVOX: