model, mtmd: fix gguf conversion for audio/vision mmproj (#21309)

* fix gguf conversion for audio/vision mmproj

* fix test
This commit is contained in:
Xuan-Son Nguyen 2026-04-02 17:10:32 +02:00 committed by GitHub
parent 223373742b
commit 63f8fe0ef4
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
27 changed files with 1462 additions and 41 deletions

View file

@ -17,6 +17,7 @@ add_library(mtmd
models/models.h
models/cogvlm.cpp
models/conformer.cpp
models/gemma4v.cpp
models/glm4v.cpp
models/internvl.cpp
models/kimivl.cpp

View file

@ -29,7 +29,7 @@ struct clip_graph {
const int n_layer;
const int n_mmproj_embd;
const float eps;
const float kq_scale;
float kq_scale; // TODO: maybe move this to hparams
const clip_flash_attn_type flash_attn_type;
ggml_context_ptr ctx0_ptr;

View file

@ -88,8 +88,11 @@
#define TN_FFN_GATE "%s.blk.%d.ffn_gate.%s"
#define TN_LN_1 "%s.blk.%d.ln1.%s" // layer norm
#define TN_LN_2 "%s.blk.%d.ln2.%s" // layer norm
#define TN_LS_1 "%s.blk.%d.ls1.%s" // layer scale
#define TN_LS_2 "%s.blk.%d.ls2.%s" // layer scale
#define TN_LS_1 "%s.blk.%d.ls1.%s" // layer scale
#define TN_LS_2 "%s.blk.%d.ls2.%s" // layer scale
#define TN_LS_OUT "%s.blk.%d.out_scale.%s" // layer out scale (gemma4)
#define TN_ATTN_POST_NORM "%s.blk.%d.attn_post_norm.%s" // post-attn norm (gemma4)
#define TN_FFN_POST_NORM "%s.blk.%d.ffn_post_norm.%s" // post-FFN norm (gemma4)
#define TN_LN_PRE "%s.pre_ln.%s"
#define TN_LN_POST "%s.post_ln.%s"
#define TN_LLAVA_PROJ "mm.%d.%s"
@ -213,6 +216,10 @@
#define TN_MNV5_MSFA_FFN_PROJ_BN "v.msfa.ffn.pw_proj.bn.weight"
#define TN_MNV5_MSFA_NORM "v.msfa.norm.weight"
// gemma4
#define TN_STD_BIAS "v.std_bias"
#define TN_STD_SCALE "v.std_scale"
// align x to upper multiple of n
#define CLIP_ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n))
@ -233,6 +240,8 @@ enum projector_type {
PROJECTOR_TYPE_GEMMA3,
PROJECTOR_TYPE_GEMMA3NV,
PROJECTOR_TYPE_GEMMA3NA,
PROJECTOR_TYPE_GEMMA4V,
PROJECTOR_TYPE_GEMMA4A,
PROJECTOR_TYPE_PHI4,
PROJECTOR_TYPE_IDEFICS3,
PROJECTOR_TYPE_PIXTRAL,
@ -272,6 +281,8 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
{ PROJECTOR_TYPE_GEMMA3, "gemma3"},
{ PROJECTOR_TYPE_GEMMA3NV, "gemma3nv"},
{ PROJECTOR_TYPE_GEMMA3NA, "gemma3na"},
{ PROJECTOR_TYPE_GEMMA4V, "gemma4v"},
{ PROJECTOR_TYPE_GEMMA4A, "gemma4a"},
{ PROJECTOR_TYPE_PHI4, "phi4"},
{ PROJECTOR_TYPE_IDEFICS3, "idefics3"},
{ PROJECTOR_TYPE_PIXTRAL, "pixtral"},
@ -476,6 +487,18 @@ static std::vector<std::string> string_split_str(std::string s, const std::strin
return tokens;
}
// remove when moving to c++20 (std::string_view::starts_with)
// returns true if `str` begins with `prefix`; an empty prefix always matches
inline bool string_starts_with(std::string_view str, std::string_view prefix) {
    if (prefix.size() > str.size()) {
        return false;
    }
    return str.substr(0, prefix.size()) == prefix;
}
// remove when moving to c++20 (std::string_view::ends_with)
// returns true if `str` ends with `suffix`; an empty suffix always matches
inline bool string_ends_with(std::string_view str, std::string_view suffix) {
    if (suffix.size() > str.size()) {
        return false;
    }
    return str.substr(str.size() - suffix.size()) == suffix;
}
//
// gguf utils
//

View file

@ -143,6 +143,10 @@ struct clip_hparams {
};
struct clip_layer {
// layernorm 1 (or layer input norm, or pre-attention norm)
ggml_tensor * ln_1_w = nullptr;
ggml_tensor * ln_1_b = nullptr;
// attention
ggml_tensor * k_w = nullptr;
ggml_tensor * k_b = nullptr;
@ -159,9 +163,7 @@ struct clip_layer {
ggml_tensor * k_norm = nullptr;
ggml_tensor * q_norm = nullptr;
// layernorm 1
ggml_tensor * ln_1_w = nullptr;
ggml_tensor * ln_1_b = nullptr;
ggml_tensor * attn_post_norm_w = nullptr;
ggml_tensor * ff_up_w = nullptr;
ggml_tensor * ff_up_b = nullptr;
@ -170,13 +172,16 @@ struct clip_layer {
ggml_tensor * ff_down_w = nullptr;
ggml_tensor * ff_down_b = nullptr;
// layernorm 2
// layernorm 2 (or pre-FFN norm)
ggml_tensor * ln_2_w = nullptr;
ggml_tensor * ln_2_b = nullptr;
ggml_tensor * ff_post_norm_w = nullptr;
// layer scale (no bias)
ggml_tensor * ls_1_w = nullptr;
ggml_tensor * ls_2_w = nullptr;
ggml_tensor * ls_1_w = nullptr;
ggml_tensor * ls_2_w = nullptr;
ggml_tensor * ls_out_w = nullptr; // gemma4
// qwen3vl deepstack merger
ggml_tensor * deepstack_norm_w = nullptr;
@ -437,6 +442,18 @@ struct clip_model {
ggml_tensor * pre_encode_out_w = nullptr;
ggml_tensor * pre_encode_out_b = nullptr;
// gemma4
ggml_tensor * std_bias = nullptr;
ggml_tensor * std_scale = nullptr;
// Gemma4ClippableLinear
// Per-weight clamp ranges, loaded from scalar tensors in the GGUF
// (".input_max"/".input_min"/".output_max"/".output_min" siblings of a
// ".weight" tensor). Entries missing from the file default to +/-FLT_MAX,
// i.e. no clamping.
struct clamp_info {
float inp_max; // upper clamp bound applied to the matmul input
float inp_min; // lower clamp bound applied to the matmul input
float out_max; // upper clamp bound applied to the matmul output
float out_min; // lower clamp bound applied to the matmul output
};
// keyed by the weight tensor's name; consulted when projecting (build_mm)
std::map<std::string, clamp_info> clamp_info_map;
bool audio_has_avgpool() const {
return proj_type == PROJECTOR_TYPE_QWEN2A
|| proj_type == PROJECTOR_TYPE_VOXTRAL

View file

@ -24,6 +24,7 @@
#include <limits>
#include <array>
#include <functional>
#include <float.h>
struct clip_logger_state g_logger_state = {clip_log_callback_default, NULL};
@ -379,19 +380,34 @@ ggml_tensor * clip_graph::build_vit(
Vcur = ggml_add(ctx0, Vcur, layer.v_b);
}
if (layer.q_norm) {
Qcur = build_norm(Qcur, layer.q_norm, NULL, norm_t, eps, il);
cb(Qcur, "Qcur_norm", il);
}
// if true, norm must be applied after reshaping to (d_head, n_head, n_pos)
bool norm_per_head = layer.q_norm && layer.q_norm->ne[0] == d_head;
if (layer.k_norm) {
Kcur = build_norm(Kcur, layer.k_norm, NULL, norm_t, eps, il);
cb(Kcur, "Kcur_norm", il);
if (!norm_per_head) {
if (layer.q_norm) {
Qcur = build_norm(Qcur, layer.q_norm, NULL, norm_t, eps, il);
cb(Qcur, "Qcur_norm", il);
}
if (layer.k_norm) {
Kcur = build_norm(Kcur, layer.k_norm, NULL, norm_t, eps, il);
cb(Kcur, "Kcur_norm", il);
}
}
Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos);
Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos);
Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos);
if (norm_per_head) {
if (layer.q_norm) {
Qcur = build_norm(Qcur, layer.q_norm, NULL, norm_t, eps, il);
cb(Qcur, "Qcur_norm_per_head", il);
}
if (layer.k_norm) {
Kcur = build_norm(Kcur, layer.k_norm, NULL, norm_t, eps, il);
cb(Kcur, "Kcur_norm_per_head", il);
}
}
}
cb(Qcur, "Qcur", il);
@ -405,6 +421,11 @@ ggml_tensor * clip_graph::build_vit(
cb(Kcur, "Kcur_pos", il);
}
if (proj_type == PROJECTOR_TYPE_GEMMA4V) {
Vcur = ggml_rms_norm(ctx0, Vcur, eps);
cb(Vcur, "Vcur_normed", il);
}
cur = build_attn(layer.o_w, layer.o_b,
Qcur, Kcur, Vcur, nullptr, kq_scale, il);
cb(cur, "attn_out", il);
@ -415,6 +436,11 @@ ggml_tensor * clip_graph::build_vit(
cb(cur, "attn_out_scaled", il);
}
if (layer.attn_post_norm_w) {
cur = build_norm(cur, layer.attn_post_norm_w, nullptr, norm_t, eps, il);
cb(cur, "attn_post_normed", il);
}
// re-add the layer input, e.g., residual
cur = ggml_add(ctx0, cur, inpL);
@ -422,7 +448,7 @@ ggml_tensor * clip_graph::build_vit(
cb(cur, "ffn_inp", il);
// layernorm2
// layernorm2 (pre-ffn norm)
cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il);
cb(cur, "ffn_inp_normed", il);
@ -435,6 +461,11 @@ ggml_tensor * clip_graph::build_vit(
cb(cur, "ffn_out", il);
if (layer.ff_post_norm_w) {
cur = build_norm(cur, layer.ff_post_norm_w, nullptr, norm_t, eps, il);
cb(cur, "ffn_post_normed", il);
}
if (layer.ls_2_w) {
cur = ggml_mul(ctx0, cur, layer.ls_2_w);
cb(cur, "ffn_out_scaled", il);
@ -444,6 +475,11 @@ ggml_tensor * clip_graph::build_vit(
cur = ggml_add(ctx0, inpL, cur);
cb(cur, "layer_out", il);
if (layer.ls_out_w) {
cur = ggml_mul(ctx0, cur, layer.ls_out_w);
cb(cur, "layer_out_scaled", il);
}
inpL = cur;
}
@ -808,6 +844,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
{
builder = std::make_unique<clip_graph_mobilenetv5>(ctx, img);
} break;
case PROJECTOR_TYPE_GEMMA4V:
{
builder = std::make_unique<clip_graph_gemma4v>(ctx, img);
} break;
case PROJECTOR_TYPE_PIXTRAL:
case PROJECTOR_TYPE_LIGHTONOCR:
{
@ -1257,6 +1297,17 @@ struct clip_model_loader {
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
} break;
case PROJECTOR_TYPE_GEMMA4V:
{
hparams.rope_theta = 100.0f;
hparams.n_merge = 3; // pooling_kernel_size
hparams.image_resize_algo = RESIZE_ALGO_BILINEAR;
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
// @ngxson : the model performs quite poorly with small images, we need to bump the minimum image tokens to 40 to avoid that
hparams.set_limit_image_tokens(252, 280);
hparams.set_warmup_n_tokens(256); // avoid OOM on warmup
} break;
case PROJECTOR_TYPE_GEMMA3NV:
{
// Gemma3n uses MobileNetV5 which produces 256 tokens (16x16)
@ -1442,6 +1493,11 @@ struct clip_model_loader {
std::map<std::string, size_t> tensor_offset;
std::vector<ggml_tensor *> tensors_to_load;
auto fin = std::ifstream(fname, std::ios::binary);
if (!fin) {
throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str()));
}
// TODO @ngxson : support both audio and video in the future
const char * prefix = model.modality == CLIP_MODALITY_AUDIO ? "a" : "v";
@ -1478,6 +1534,18 @@ struct clip_model_loader {
return cur;
};
auto get_scalar = [&](const std::string & name, float default_val) {
auto it = tensor_offset.find(name);
if (it == tensor_offset.end()) {
return default_val;
}
size_t offset = it->second;
fin.seekg(offset, std::ios::beg);
float value;
fin.read(reinterpret_cast<char*>(&value), sizeof(float));
return value;
};
model.class_embedding = get_tensor(TN_CLASS_EMBD, false);
model.pre_ln_w = get_tensor(string_format(TN_LN_PRE, prefix, "weight"), false);
@ -1512,8 +1580,11 @@ struct clip_model_loader {
layer.q_norm = get_tensor(string_format(TN_ATTN_Q_NORM, prefix, il, "weight"), false);
layer.ln_1_w = get_tensor(string_format(TN_LN_1, prefix, il, "weight"), false);
layer.ln_2_w = get_tensor(string_format(TN_LN_2, prefix, il, "weight"), false);
layer.ls_1_w = get_tensor(string_format(TN_LS_1, prefix, il, "weight"), false); // no bias
layer.ls_2_w = get_tensor(string_format(TN_LS_2, prefix, il, "weight"), false); // no bias
layer.ls_1_w = get_tensor(string_format(TN_LS_1, prefix, il, "weight"), false); // no bias
layer.ls_2_w = get_tensor(string_format(TN_LS_2, prefix, il, "weight"), false); // no bias
layer.ls_out_w = get_tensor(string_format(TN_LS_OUT, prefix, il, "weight"), false); // no bias
layer.attn_post_norm_w = get_tensor(string_format(TN_ATTN_POST_NORM, prefix, il, "weight"), false); // no bias
layer.ff_post_norm_w = get_tensor(string_format(TN_FFN_POST_NORM, prefix, il, "weight"), false); // no bias
layer.k_b = get_tensor(string_format(TN_ATTN_K, prefix, il, "bias"), false);
layer.q_b = get_tensor(string_format(TN_ATTN_Q, prefix, il, "bias"), false);
@ -1713,6 +1784,32 @@ struct clip_model_loader {
model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ);
model.mm_soft_emb_norm_w = get_tensor(TN_MM_SOFT_EMB_N);
} break;
case PROJECTOR_TYPE_GEMMA4V:
{
model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ);
model.std_bias = get_tensor(TN_STD_BIAS, false);
model.std_scale = get_tensor(TN_STD_SCALE, false);
// load scalar for Gemma4ClippableLinear
for (auto * tensor : tensors_to_load) {
std::string name = tensor->name;
if (string_ends_with(name, ".weight")) {
std::string name_inp_max = name;
std::string name_inp_min = name;
std::string name_out_max = name;
std::string name_out_min = name;
string_replace_all(name_inp_max, ".weight", ".input_max");
string_replace_all(name_inp_min, ".weight", ".input_min");
string_replace_all(name_out_max, ".weight", ".output_max");
string_replace_all(name_out_min, ".weight", ".output_min");
model.clamp_info_map[name] = {
get_scalar(name_inp_max, FLT_MAX),
get_scalar(name_inp_min, -FLT_MAX),
get_scalar(name_out_max, FLT_MAX),
get_scalar(name_out_min, -FLT_MAX)
};
}
}
} break;
case PROJECTOR_TYPE_GEMMA3NV:
{
model.mobilenet_stem_conv_w = get_tensor(TN_MNV5_STEM_CONV, false);
@ -2042,11 +2139,6 @@ struct clip_model_loader {
{
std::vector<uint8_t> read_buf;
auto fin = std::ifstream(fname, std::ios::binary);
if (!fin) {
throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str()));
}
// alloc memory and offload data
ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(ctx_clip.backend);
ctx_clip.buf.reset(ggml_backend_alloc_ctx_tensors_from_buft(ctx_clip.ctx_data.get(), buft));
@ -2345,7 +2437,8 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params
// TODO: we don't support audio for Gemma 3N, but GGUF contains audio tensors
// we can remove this check when we implement audio support for Gemma 3N
skip_audio = ctx_vision->model.proj_type == PROJECTOR_TYPE_GEMMA3NV;
skip_audio = ctx_vision->model.proj_type == PROJECTOR_TYPE_GEMMA3NV
|| ctx_vision->model.proj_type == PROJECTOR_TYPE_GEMMA4V;
}
if (loader.has_audio && !skip_audio) {
@ -2581,6 +2674,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
n_patches = x_patch * y_patch;
} break;
case PROJECTOR_TYPE_GEMMA3:
case PROJECTOR_TYPE_GEMMA4V:
case PROJECTOR_TYPE_IDEFICS3:
case PROJECTOR_TYPE_INTERNVL:
case PROJECTOR_TYPE_NEMOTRON_V2_VL:
@ -3031,6 +3125,18 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
}
set_input_i32("patches", patches);
} break;
case PROJECTOR_TYPE_GEMMA4V:
{
// set (col, row) patch positions for learned positional embedding
const int n_cols = image_size_width / patch_size;
std::vector<int> pos_x(num_patches), pos_y(num_patches);
for (int i = 0; i < num_patches; i++) {
pos_x[i] = i % n_cols;
pos_y[i] = i / n_cols;
}
set_input_i32("pos_x", pos_x);
set_input_i32("pos_y", pos_y);
} break;
case PROJECTOR_TYPE_DEEPSEEKOCR:
{
GGML_ASSERT(pos_w == pos_h);
@ -3218,6 +3324,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
case PROJECTOR_TYPE_GEMMA3:
case PROJECTOR_TYPE_GEMMA3NV:
return ctx->model.mm_input_proj_w->ne[0];
case PROJECTOR_TYPE_GEMMA4V:
return ctx->model.mm_input_proj_w->ne[1];
case PROJECTOR_TYPE_IDEFICS3:
return ctx->model.mm_fc_w->ne[1];
case PROJECTOR_TYPE_ULTRAVOX:

View file

@ -0,0 +1,151 @@
#include "models.h"
#include <cmath>
// Build the compute graph for the Gemma 4 vision tower (gemma4v projector):
// conv patchify -> learned (x, y) positional embeddings -> ViT blocks with
// 2D NeoX-style RoPE -> average-pool merger -> optional standardization ->
// multimodal projection (with per-weight clamping) + RMS norm.
ggml_cgraph * clip_graph_gemma4v::build() {
    ggml_tensor * inp_raw = build_inp_raw();

    // patches = 2 * (patches - 0.5)
    // equivalent to: patches * 2 - 1
    inp_raw = ggml_scale_bias(ctx0, inp_raw, 2.0f, -1.0f);
    ggml_set_name(inp_raw, "inp_raw_scaled");

    // non-overlapping conv patchify, then lay out as [n_embd, n_patches]
    ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
    inp = ggml_reshape_2d(ctx0, inp, n_patches, n_embd);
    inp = ggml_cont(ctx0, ggml_transpose(ctx0, inp));
    ggml_set_name(inp, "inp");
    // note: no patch bias

    // per-patch (col, row) indices; filled in at encode time by clip_image_batch_encode
    ggml_tensor * pos_x = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
    ggml_set_name(pos_x, "pos_x");
    ggml_set_input(pos_x);

    ggml_tensor * pos_y = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
    ggml_set_name(pos_y, "pos_y");
    ggml_set_input(pos_y);

    // additive learned positional embedding, factored into x and y lookup tables
    {
        const int64_t pos_size = model.position_embeddings->ne[1];
        const size_t nb1 = ggml_row_size(model.position_embeddings->type, n_embd);
        // positional embeddings are stored as lookup tables (one for x, one for y)
        // NOTE(review): assumes the x table (pos_size rows) is followed contiguously
        // by the y table at byte offset pos_size * nb1 — verify the tensor layout
        ggml_tensor * tbl_x = ggml_view_2d(ctx0, model.position_embeddings,
            n_embd, pos_size, nb1, 0);
        ggml_tensor * tbl_y = ggml_view_2d(ctx0, model.position_embeddings,
            n_embd, pos_size, nb1, pos_size * nb1);
        // ggml_get_rows: [n_embd, n_patches]
        ggml_tensor * emb_x = ggml_get_rows(ctx0, tbl_x, pos_x);
        ggml_tensor * emb_y = ggml_get_rows(ctx0, tbl_y, pos_y);
        inp = ggml_add(ctx0, inp, emb_x);
        inp = ggml_add(ctx0, inp, emb_y);
        cb(inp, "pos_embd", -1);
    }

    // similar to build_rope_2d, but use neox ordering
    // rotates the first half of each head dim by pos_x, the second half by pos_y
    auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
        const int64_t n_dim = cur->ne[0];
        const int64_t n_head = cur->ne[1];
        const int64_t n_pos = cur->ne[2];

        // first half
        ggml_tensor * first;
        {
            first = ggml_view_3d(ctx0, cur,
                n_dim/2, n_head, n_pos,
                cur->nb[1],
                cur->nb[2],
                0);
            first = ggml_rope_ext(
                ctx0,
                first,
                pos_x, // positions
                nullptr, // freq factors
                n_dim/2, // n_dims
                GGML_ROPE_TYPE_NEOX, 0, hparams.rope_theta,
                1.0f, 0.0f, 1.0f, 0.0f, 0.0f
            );
        }

        // second half
        ggml_tensor * second;
        {
            second = ggml_view_3d(ctx0, cur,
                n_dim/2, n_head, n_pos,
                cur->nb[1],
                cur->nb[2],
                n_dim/2 * ggml_element_size(cur));
            second = ggml_rope_ext(
                ctx0,
                second,
                pos_y, // positions
                nullptr, // freq factors
                n_dim/2, // n_dims
                GGML_ROPE_TYPE_NEOX, 0, hparams.rope_theta,
                1.0f, 0.0f, 1.0f, 0.0f, 0.0f
            );
        }

        cur = ggml_concat(ctx0, first, second, 0);
        return cur;
    };

    // gemma4v uses an unscaled QK product in attention
    // NOTE(review): this overrides whatever kq_scale was precomputed in the
    // clip_graph constructor — confirm against the reference implementation
    kq_scale = 1.0f;

    ggml_tensor * cur = build_vit(
        inp, n_patches,
        NORM_TYPE_RMS,
        hparams.ffn_op,
        nullptr, // pos embd is already handled above
        add_pos);

    // Gemma4VisionPooler
    // average-pool the patch grid by kernel_size in both directions,
    // then rescale by sqrt(n_embd)
    {
        const int kernel_size = hparams.n_merge;
        GGML_ASSERT(kernel_size > 0);
        // [n_embd, n_patches] -> [n_patches_x, n_patches_y, n_embd, 1]
        cur = ggml_cont_4d(ctx0, ggml_transpose(ctx0, cur), n_patches_x, n_patches_y, n_embd, 1);
        cur = ggml_pool_2d(ctx0, cur, GGML_OP_POOL_AVG,
            kernel_size, kernel_size, kernel_size, kernel_size, 0, 0);
        const int out_x = n_patches_x / kernel_size;
        const int out_y = n_patches_y / kernel_size;
        // [out_x, out_y, n_embd, 1] -> [n_embd, out_x * out_y]
        cur = ggml_reshape_3d(ctx0, cur, out_x * out_y, n_embd, 1);
        cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
        cur = ggml_scale(ctx0, cur, sqrtf((float)n_embd));
        cb(cur, "pooled", -1);
    }

    // hidden_states = (hidden_states - self.std_bias) * self.std_scale
    // optional standardization; both tensors may be absent from the GGUF
    if (model.std_bias && model.std_scale) {
        cur = ggml_sub(ctx0, cur, model.std_bias);
        cur = ggml_mul(ctx0, cur, model.std_scale);
        cb(cur, "std_scaled", -1);
    }

    // Gemma4MultimodalEmbedder
    // goes through build_mm so per-weight clamping (Gemma4ClippableLinear) applies
    cur = build_mm(model.mm_input_proj_w, cur);
    cb(cur, "projected", -1);

    // embedding_post_projection_norm
    // NOTE(review): plain RMS norm, no learned weight applied — presumably the
    // post-projection norm has no scale parameter; confirm
    cur = ggml_rms_norm(ctx0, cur, hparams.eps);
    cb(cur, "projected_normed", -1);

    ggml_build_forward_expand(gf, cur);
    return gf;
}
// Gemma4ClippableLinear
// Linear projection with optional clamping: when clamp ranges were loaded for
// this weight tensor, clamp the input before and the output after the matmul;
// otherwise fall back to a plain matmul.
ggml_tensor * clip_graph_gemma4v::build_mm(ggml_tensor * w, ggml_tensor * x) const {
    const auto it = model.clamp_info_map.find(w->name);
    if (it == model.clamp_info_map.end()) {
        // no clamp info registered for this weight
        return ggml_mul_mat(ctx0, w, x);
    }
    const auto & ci = it->second;
    ggml_tensor * cur = ggml_clamp(ctx0, x, ci.inp_min, ci.inp_max);
    cur = ggml_mul_mat(ctx0, w, cur);
    return ggml_clamp(ctx0, cur, ci.out_min, ci.out_max);
}

View file

@ -12,6 +12,12 @@ struct clip_graph_siglip : clip_graph {
ggml_cgraph * build() override;
};
// Graph builder for the Gemma 4 vision projector (PROJECTOR_TYPE_GEMMA4V).
struct clip_graph_gemma4v : clip_graph {
clip_graph_gemma4v(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
// Gemma4ClippableLinear: matmul with optional input/output clamping
ggml_tensor * build_mm(ggml_tensor * w, ggml_tensor * x) const override;
};
struct clip_graph_pixtral : clip_graph {
clip_graph_pixtral(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;

View file

@ -394,6 +394,13 @@ struct mtmd_context {
img_end = "<|IMAGE_END|>";
image_preproc = std::make_unique<mtmd_image_preprocessor_dyn_size>(ctx_v);
} break;
case PROJECTOR_TYPE_GEMMA4V:
{
// <|image> ... (image embeddings) ... <image|>
img_beg = "<|image>";
img_end = "<image|>";
image_preproc = std::make_unique<mtmd_image_preprocessor_dyn_size>(ctx_v);
} break;
case PROJECTOR_TYPE_DEEPSEEKOCR:
{
img_end = "\n"; // prevent empty batch on llama-server
@ -974,6 +981,7 @@ float * mtmd_get_output_embd(mtmd_context * ctx) {
bool mtmd_decode_use_non_causal(mtmd_context * ctx) {
switch (ctx->proj_type_v()) {
case PROJECTOR_TYPE_GEMMA3:
case PROJECTOR_TYPE_GEMMA4V:
return true;
default:
return false;