model, mtmd: fix gguf conversion for audio/vision mmproj (#21309)

* fix gguf conversion for audio/vision mmproj
* fix test
This commit is contained in:
parent 223373742b
commit 63f8fe0ef4

27 changed files with 1462 additions and 41 deletions
@@ -17,6 +17,7 @@ add_library(mtmd
     models/models.h
     models/cogvlm.cpp
     models/conformer.cpp
+    models/gemma4v.cpp
     models/glm4v.cpp
     models/internvl.cpp
     models/kimivl.cpp
@@ -29,7 +29,7 @@ struct clip_graph {
     const int n_layer;
     const int n_mmproj_embd;
     const float eps;
-    const float kq_scale;
+    float kq_scale; // TODO: maybe move this to hparams
     const clip_flash_attn_type flash_attn_type;

     ggml_context_ptr ctx0_ptr;
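Dropping const here is what lets a subclass graph override the attention scale at build time: the new gemma4v graph (see tools/mtmd/models/gemma4v.cpp below) sets kq_scale = 1.0f before calling build_vit.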
@@ -88,8 +88,11 @@
 #define TN_FFN_GATE "%s.blk.%d.ffn_gate.%s"
 #define TN_LN_1 "%s.blk.%d.ln1.%s" // layer norm
 #define TN_LN_2 "%s.blk.%d.ln2.%s" // layer norm
 #define TN_LS_1 "%s.blk.%d.ls1.%s" // layer scale
 #define TN_LS_2 "%s.blk.%d.ls2.%s" // layer scale
+#define TN_LS_OUT "%s.blk.%d.out_scale.%s" // layer out scale (gemma4)
+#define TN_ATTN_POST_NORM "%s.blk.%d.attn_post_norm.%s" // post-attn norm (gemma4)
+#define TN_FFN_POST_NORM "%s.blk.%d.ffn_post_norm.%s" // post-FFN norm (gemma4)
 #define TN_LN_PRE "%s.pre_ln.%s"
 #define TN_LN_POST "%s.post_ln.%s"
 #define TN_LLAVA_PROJ "mm.%d.%s"
@@ -213,6 +216,10 @@
 #define TN_MNV5_MSFA_FFN_PROJ_BN "v.msfa.ffn.pw_proj.bn.weight"
 #define TN_MNV5_MSFA_NORM "v.msfa.norm.weight"
 
+// gemma4
+#define TN_STD_BIAS "v.std_bias"
+#define TN_STD_SCALE "v.std_scale"
+
 // align x to upper multiple of n
 #define CLIP_ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n))
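For readers skimming the macro: CLIP_ALIGN rounds x up to the nearest multiple of n using pure integer arithmetic. A minimal self-contained check (illustrative, not part of the commit):

#include <cassert>

// Same macro as in the header above; quick sanity check of the rounding.
#define CLIP_ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n))

int main() {
    assert(CLIP_ALIGN(10, 4) == 12); // ((10 + 3) / 4) * 4 rounds up to 12
    assert(CLIP_ALIGN(12, 4) == 12); // already-aligned values are unchanged
    return 0;
}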
@@ -233,6 +240,8 @@ enum projector_type {
     PROJECTOR_TYPE_GEMMA3,
     PROJECTOR_TYPE_GEMMA3NV,
     PROJECTOR_TYPE_GEMMA3NA,
+    PROJECTOR_TYPE_GEMMA4V,
+    PROJECTOR_TYPE_GEMMA4A,
     PROJECTOR_TYPE_PHI4,
     PROJECTOR_TYPE_IDEFICS3,
     PROJECTOR_TYPE_PIXTRAL,
@@ -272,6 +281,8 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
     { PROJECTOR_TYPE_GEMMA3, "gemma3"},
     { PROJECTOR_TYPE_GEMMA3NV, "gemma3nv"},
     { PROJECTOR_TYPE_GEMMA3NA, "gemma3na"},
+    { PROJECTOR_TYPE_GEMMA4V, "gemma4v"},
+    { PROJECTOR_TYPE_GEMMA4A, "gemma4a"},
     { PROJECTOR_TYPE_PHI4, "phi4"},
     { PROJECTOR_TYPE_IDEFICS3, "idefics3"},
     { PROJECTOR_TYPE_PIXTRAL, "pixtral"},
@@ -476,6 +487,18 @@ static std::vector<std::string> string_split_str(std::string s, const std::string
     return tokens;
 }
 
+// remove when moving to c++20
+inline bool string_starts_with(std::string_view str, std::string_view prefix) {
+    return str.size() >= prefix.size() &&
+           str.compare(0, prefix.size(), prefix) == 0;
+}
+
+// remove when moving to c++20
+inline bool string_ends_with(std::string_view str, std::string_view suffix) {
+    return str.size() >= suffix.size() &&
+           str.compare(str.size() - suffix.size(), suffix.size(), suffix) == 0;
+}
+
 //
 // gguf utils
 //
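As the comments note, these helpers duplicate functionality C++20 provides natively; a minimal sketch of the eventual replacement, assuming a C++20 toolchain:

#include <string_view>

// C++20: std::string_view ships constexpr starts_with/ends_with, so the
// two helpers above can be dropped once the project moves to that standard.
static_assert(std::string_view("v.blk.0.ls1.weight").ends_with(".weight"));
static_assert(std::string_view("v.blk.0.ls1.weight").starts_with("v.blk."));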
@@ -143,6 +143,10 @@ struct clip_hparams {
 };
 
 struct clip_layer {
+    // layernorm 1 (or layer input norm, or pre-attention norm)
+    ggml_tensor * ln_1_w = nullptr;
+    ggml_tensor * ln_1_b = nullptr;
+
     // attention
     ggml_tensor * k_w = nullptr;
     ggml_tensor * k_b = nullptr;
@@ -159,9 +163,7 @@ struct clip_layer {
     ggml_tensor * k_norm = nullptr;
     ggml_tensor * q_norm = nullptr;
 
-    // layernorm 1
-    ggml_tensor * ln_1_w = nullptr;
-    ggml_tensor * ln_1_b = nullptr;
+    ggml_tensor * attn_post_norm_w = nullptr;
 
     ggml_tensor * ff_up_w = nullptr;
     ggml_tensor * ff_up_b = nullptr;
@@ -170,13 +172,16 @@ struct clip_layer {
     ggml_tensor * ff_down_w = nullptr;
     ggml_tensor * ff_down_b = nullptr;
 
-    // layernorm 2
+    // layernorm 2 (or pre-FFN norm)
     ggml_tensor * ln_2_w = nullptr;
     ggml_tensor * ln_2_b = nullptr;
 
+    ggml_tensor * ff_post_norm_w = nullptr;
+
     // layer scale (no bias)
     ggml_tensor * ls_1_w = nullptr;
     ggml_tensor * ls_2_w = nullptr;
+    ggml_tensor * ls_out_w = nullptr; // gemma4
 
     // qwen3vl deepstack merger
     ggml_tensor * deepstack_norm_w = nullptr;
@@ -437,6 +442,18 @@ struct clip_model {
     ggml_tensor * pre_encode_out_w = nullptr;
     ggml_tensor * pre_encode_out_b = nullptr;
 
+    // gemma4
+    ggml_tensor * std_bias = nullptr;
+    ggml_tensor * std_scale = nullptr;
+    // Gemma4ClippableLinear
+    struct clamp_info {
+        float inp_max;
+        float inp_min;
+        float out_max;
+        float out_min;
+    };
+    std::map<std::string, clamp_info> clamp_info_map;
+
     bool audio_has_avgpool() const {
         return proj_type == PROJECTOR_TYPE_QWEN2A
             || proj_type == PROJECTOR_TYPE_VOXTRAL
@@ -24,6 +24,7 @@
 #include <limits>
 #include <array>
 #include <functional>
+#include <float.h>
 
 struct clip_logger_state g_logger_state = {clip_log_callback_default, NULL};
@@ -379,19 +380,34 @@ ggml_tensor * clip_graph::build_vit(
             Vcur = ggml_add(ctx0, Vcur, layer.v_b);
         }
 
-        if (layer.q_norm) {
-            Qcur = build_norm(Qcur, layer.q_norm, NULL, norm_t, eps, il);
-            cb(Qcur, "Qcur_norm", il);
-        }
-
-        if (layer.k_norm) {
-            Kcur = build_norm(Kcur, layer.k_norm, NULL, norm_t, eps, il);
-            cb(Kcur, "Kcur_norm", il);
+        // if true, norm must be applied after reshaping to (d_head, n_head, n_pos)
+        bool norm_per_head = layer.q_norm && layer.q_norm->ne[0] == d_head;
+
+        if (!norm_per_head) {
+            if (layer.q_norm) {
+                Qcur = build_norm(Qcur, layer.q_norm, NULL, norm_t, eps, il);
+                cb(Qcur, "Qcur_norm", il);
+            }
+            if (layer.k_norm) {
+                Kcur = build_norm(Kcur, layer.k_norm, NULL, norm_t, eps, il);
+                cb(Kcur, "Kcur_norm", il);
+            }
         }
 
         Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos);
         Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos);
         Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos);
 
+        if (norm_per_head) {
+            if (layer.q_norm) {
+                Qcur = build_norm(Qcur, layer.q_norm, NULL, norm_t, eps, il);
+                cb(Qcur, "Qcur_norm_per_head", il);
+            }
+            if (layer.k_norm) {
+                Kcur = build_norm(Kcur, layer.k_norm, NULL, norm_t, eps, il);
+                cb(Kcur, "Kcur_norm_per_head", il);
+            }
+        }
+
         cb(Qcur, "Qcur", il);
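The new norm_per_head flag handles models whose Q/K norm weights are sized per attention head rather than per full embedding: when q_norm->ne[0] == d_head, the norm can only be applied after the reshape to (d_head, n_head, n_pos), so the same build_norm call moves below the reshape. Weights of full width keep the old pre-reshape path.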
@@ -405,6 +421,11 @@ ggml_tensor * clip_graph::build_vit(
             cb(Kcur, "Kcur_pos", il);
         }
 
+        if (proj_type == PROJECTOR_TYPE_GEMMA4V) {
+            Vcur = ggml_rms_norm(ctx0, Vcur, eps);
+            cb(Vcur, "Vcur_normed", il);
+        }
+
         cur = build_attn(layer.o_w, layer.o_b,
                 Qcur, Kcur, Vcur, nullptr, kq_scale, il);
         cb(cur, "attn_out", il);
@@ -415,6 +436,11 @@ ggml_tensor * clip_graph::build_vit(
             cb(cur, "attn_out_scaled", il);
         }
 
+        if (layer.attn_post_norm_w) {
+            cur = build_norm(cur, layer.attn_post_norm_w, nullptr, norm_t, eps, il);
+            cb(cur, "attn_post_normed", il);
+        }
+
         // re-add the layer input, e.g., residual
         cur = ggml_add(ctx0, cur, inpL);
@@ -422,7 +448,7 @@ ggml_tensor * clip_graph::build_vit(
 
         cb(cur, "ffn_inp", il);
 
-        // layernorm2
+        // layernorm2 (pre-ffn norm)
         cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il);
         cb(cur, "ffn_inp_normed", il);
@@ -435,6 +461,11 @@ ggml_tensor * clip_graph::build_vit(
 
         cb(cur, "ffn_out", il);
 
+        if (layer.ff_post_norm_w) {
+            cur = build_norm(cur, layer.ff_post_norm_w, nullptr, norm_t, eps, il);
+            cb(cur, "ffn_post_normed", il);
+        }
+
         if (layer.ls_2_w) {
             cur = ggml_mul(ctx0, cur, layer.ls_2_w);
             cb(cur, "ffn_out_scaled", il);
@@ -444,6 +475,11 @@ ggml_tensor * clip_graph::build_vit(
         cur = ggml_add(ctx0, inpL, cur);
         cb(cur, "layer_out", il);
 
+        if (layer.ls_out_w) {
+            cur = ggml_mul(ctx0, cur, layer.ls_out_w);
+            cb(cur, "layer_out_scaled", il);
+        }
+
         inpL = cur;
     }
@@ -808,6 +844,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             {
                 builder = std::make_unique<clip_graph_mobilenetv5>(ctx, img);
             } break;
+        case PROJECTOR_TYPE_GEMMA4V:
+            {
+                builder = std::make_unique<clip_graph_gemma4v>(ctx, img);
+            } break;
         case PROJECTOR_TYPE_PIXTRAL:
         case PROJECTOR_TYPE_LIGHTONOCR:
             {
@@ -1257,6 +1297,17 @@ struct clip_model_loader {
                     get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
                 } break;
 
+            case PROJECTOR_TYPE_GEMMA4V:
+                {
+                    hparams.rope_theta = 100.0f;
+                    hparams.n_merge = 3; // pooling_kernel_size
+                    hparams.image_resize_algo = RESIZE_ALGO_BILINEAR;
+                    get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
+                    // @ngxson : the model performs quite poorly with small images, we need to bump minimum image tokens to 40 to avoid that
+                    hparams.set_limit_image_tokens(252, 280);
+                    hparams.set_warmup_n_tokens(256); // avoid OOM on warmup
+                } break;
+
             case PROJECTOR_TYPE_GEMMA3NV:
                 {
                     // Gemma3n uses MobileNetV5 which produces 256 tokens (16x16)
@@ -1442,6 +1493,11 @@ struct clip_model_loader {
         std::map<std::string, size_t> tensor_offset;
         std::vector<ggml_tensor *> tensors_to_load;
 
+        auto fin = std::ifstream(fname, std::ios::binary);
+        if (!fin) {
+            throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str()));
+        }
+
         // TODO @ngxson : support both audio and video in the future
         const char * prefix = model.modality == CLIP_MODALITY_AUDIO ? "a" : "v";
@@ -1478,6 +1534,18 @@ struct clip_model_loader {
             return cur;
         };
 
+        auto get_scalar = [&](const std::string & name, float default_val) {
+            auto it = tensor_offset.find(name);
+            if (it == tensor_offset.end()) {
+                return default_val;
+            }
+            size_t offset = it->second;
+            fin.seekg(offset, std::ios::beg);
+            float value;
+            fin.read(reinterpret_cast<char*>(&value), sizeof(float));
+            return value;
+        };
+
         model.class_embedding = get_tensor(TN_CLASS_EMBD, false);
 
         model.pre_ln_w = get_tensor(string_format(TN_LN_PRE, prefix, "weight"), false);
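The lambda avoids allocating backend tensors for one-element values: it seeks to the tensor's recorded data offset inside the GGUF file and reads the raw float directly. A minimal standalone illustration of the same pattern (the path and offset are placeholders, not the project's API):

#include <fstream>

// Read a single float stored at a known byte offset in a binary file.
float read_scalar_at(const char * path, size_t offset) {
    std::ifstream f(path, std::ios::binary);
    f.seekg(offset, std::ios::beg);      // jump straight to the scalar's data
    float v = 0.0f;
    f.read(reinterpret_cast<char *>(&v), sizeof(v));
    return v;                            // caller supplies a default on miss
}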
@@ -1512,8 +1580,11 @@ struct clip_model_loader {
             layer.q_norm = get_tensor(string_format(TN_ATTN_Q_NORM, prefix, il, "weight"), false);
             layer.ln_1_w = get_tensor(string_format(TN_LN_1, prefix, il, "weight"), false);
             layer.ln_2_w = get_tensor(string_format(TN_LN_2, prefix, il, "weight"), false);
             layer.ls_1_w = get_tensor(string_format(TN_LS_1, prefix, il, "weight"), false); // no bias
             layer.ls_2_w = get_tensor(string_format(TN_LS_2, prefix, il, "weight"), false); // no bias
+            layer.ls_out_w = get_tensor(string_format(TN_LS_OUT, prefix, il, "weight"), false); // no bias
+            layer.attn_post_norm_w = get_tensor(string_format(TN_ATTN_POST_NORM, prefix, il, "weight"), false); // no bias
+            layer.ff_post_norm_w = get_tensor(string_format(TN_FFN_POST_NORM, prefix, il, "weight"), false); // no bias
 
             layer.k_b = get_tensor(string_format(TN_ATTN_K, prefix, il, "bias"), false);
             layer.q_b = get_tensor(string_format(TN_ATTN_Q, prefix, il, "bias"), false);
@@ -1713,6 +1784,32 @@ struct clip_model_loader {
                     model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ);
                     model.mm_soft_emb_norm_w = get_tensor(TN_MM_SOFT_EMB_N);
                 } break;
+            case PROJECTOR_TYPE_GEMMA4V:
+                {
+                    model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ);
+                    model.std_bias = get_tensor(TN_STD_BIAS, false);
+                    model.std_scale = get_tensor(TN_STD_SCALE, false);
+                    // load scalar for Gemma4ClippableLinear
+                    for (auto * tensor : tensors_to_load) {
+                        std::string name = tensor->name;
+                        if (string_ends_with(name, ".weight")) {
+                            std::string name_inp_max = name;
+                            std::string name_inp_min = name;
+                            std::string name_out_max = name;
+                            std::string name_out_min = name;
+                            string_replace_all(name_inp_max, ".weight", ".input_max");
+                            string_replace_all(name_inp_min, ".weight", ".input_min");
+                            string_replace_all(name_out_max, ".weight", ".output_max");
+                            string_replace_all(name_out_min, ".weight", ".output_min");
+                            model.clamp_info_map[name] = {
+                                get_scalar(name_inp_max, FLT_MAX),
+                                get_scalar(name_inp_min, -FLT_MAX),
+                                get_scalar(name_out_max, FLT_MAX),
+                                get_scalar(name_out_min, -FLT_MAX)
+                            };
+                        }
+                    }
+                } break;
            case PROJECTOR_TYPE_GEMMA3NV:
                {
                    model.mobilenet_stem_conv_w = get_tensor(TN_MNV5_STEM_CONV, false);
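The naming convention here: for every tensor named foo.weight (foo being a placeholder), the converted GGUF may carry four sibling scalar tensors — foo.input_max, foo.input_min, foo.output_max, foo.output_min — holding the clamp ranges of the corresponding Gemma4ClippableLinear. A missing scalar falls back to ±FLT_MAX, which turns the eventual ggml_clamp into a no-op for that weight.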
@@ -2042,11 +2139,6 @@ struct clip_model_loader {
         {
             std::vector<uint8_t> read_buf;
 
-            auto fin = std::ifstream(fname, std::ios::binary);
-            if (!fin) {
-                throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str()));
-            }
-
             // alloc memory and offload data
             ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(ctx_clip.backend);
             ctx_clip.buf.reset(ggml_backend_alloc_ctx_tensors_from_buft(ctx_clip.ctx_data.get(), buft));
@@ -2345,7 +2437,8 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params
 
         // TODO: we don't support audio for Gemma 3N, but GGUF contains audio tensors
         // we can remove this check when we implement audio support for Gemma 3N
-        skip_audio = ctx_vision->model.proj_type == PROJECTOR_TYPE_GEMMA3NV;
+        skip_audio = ctx_vision->model.proj_type == PROJECTOR_TYPE_GEMMA3NV
+                  || ctx_vision->model.proj_type == PROJECTOR_TYPE_GEMMA4V;
     }
 
     if (loader.has_audio && !skip_audio) {
@@ -2581,6 +2674,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
             n_patches = x_patch * y_patch;
         } break;
     case PROJECTOR_TYPE_GEMMA3:
+    case PROJECTOR_TYPE_GEMMA4V:
     case PROJECTOR_TYPE_IDEFICS3:
     case PROJECTOR_TYPE_INTERNVL:
     case PROJECTOR_TYPE_NEMOTRON_V2_VL:
@@ -3031,6 +3125,18 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
                 }
                 set_input_i32("patches", patches);
             } break;
+        case PROJECTOR_TYPE_GEMMA4V:
+            {
+                // set (col, row) patch positions for learned positional embedding
+                const int n_cols = image_size_width / patch_size;
+                std::vector<int> pos_x(num_patches), pos_y(num_patches);
+                for (int i = 0; i < num_patches; i++) {
+                    pos_x[i] = i % n_cols;
+                    pos_y[i] = i / n_cols;
+                }
+                set_input_i32("pos_x", pos_x);
+                set_input_i32("pos_y", pos_y);
+            } break;
         case PROJECTOR_TYPE_DEEPSEEKOCR:
             {
                 GGML_ASSERT(pos_w == pos_h);
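For reference, the loop decomposes a flat patch index row-major: with n_cols = 4, patch i = 6 yields pos_x = 6 % 4 = 2 and pos_y = 6 / 4 = 1, i.e. the third column of the second row. These two streams index the x and y positional-embedding lookup tables and drive the 2D RoPE in the gemma4v graph below.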
@@ -3218,6 +3324,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
     case PROJECTOR_TYPE_GEMMA3:
     case PROJECTOR_TYPE_GEMMA3NV:
         return ctx->model.mm_input_proj_w->ne[0];
+    case PROJECTOR_TYPE_GEMMA4V:
+        return ctx->model.mm_input_proj_w->ne[1];
     case PROJECTOR_TYPE_IDEFICS3:
         return ctx->model.mm_fc_w->ne[1];
     case PROJECTOR_TYPE_ULTRAVOX:
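The different index is consistent with how gemma4v applies the projection: ggml_mul_mat(w, x) produces an output whose first dimension is w->ne[1], so the output embedding size lives in ne[1] here, which suggests the GEMMA3 and GEMMA4V converters store mm_input_proj_w in opposite orientations.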
151  tools/mtmd/models/gemma4v.cpp  (new file)
@@ -0,0 +1,151 @@
#include "models.h"
#include <cmath>

ggml_cgraph * clip_graph_gemma4v::build() {
    ggml_tensor * inp_raw = build_inp_raw();

    // patches = 2 * (patches - 0.5)
    // equivalent to: patches * 2 - 1
    inp_raw = ggml_scale_bias(ctx0, inp_raw, 2.0f, -1.0f);
    ggml_set_name(inp_raw, "inp_raw_scaled");

    ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
    inp = ggml_reshape_2d(ctx0, inp, n_patches, n_embd);
    inp = ggml_cont(ctx0, ggml_transpose(ctx0, inp));
    ggml_set_name(inp, "inp");
    // note: no patch bias

    ggml_tensor * pos_x = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
    ggml_set_name(pos_x, "pos_x");
    ggml_set_input(pos_x);

    ggml_tensor * pos_y = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
    ggml_set_name(pos_y, "pos_y");
    ggml_set_input(pos_y);

    {
        const int64_t pos_size = model.position_embeddings->ne[1];
        const size_t nb1 = ggml_row_size(model.position_embeddings->type, n_embd);

        // positional embeddings are stored as lookup tables (one for x, one for y)
        ggml_tensor * tbl_x = ggml_view_2d(ctx0, model.position_embeddings,
                                           n_embd, pos_size, nb1, 0);
        ggml_tensor * tbl_y = ggml_view_2d(ctx0, model.position_embeddings,
                                           n_embd, pos_size, nb1, pos_size * nb1);

        // ggml_get_rows: [n_embd, n_patches]
        ggml_tensor * emb_x = ggml_get_rows(ctx0, tbl_x, pos_x);
        ggml_tensor * emb_y = ggml_get_rows(ctx0, tbl_y, pos_y);

        inp = ggml_add(ctx0, inp, emb_x);
        inp = ggml_add(ctx0, inp, emb_y);
        cb(inp, "pos_embd", -1);
    }

    // similar to build_rope_2d, but use neox ordering
    auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
        const int64_t n_dim  = cur->ne[0];
        const int64_t n_head = cur->ne[1];
        const int64_t n_pos  = cur->ne[2];

        // first half
        ggml_tensor * first;
        {
            first = ggml_view_3d(ctx0, cur,
                n_dim/2, n_head, n_pos,
                cur->nb[1],
                cur->nb[2],
                0);
            first = ggml_rope_ext(
                ctx0,
                first,
                pos_x,   // positions
                nullptr, // freq factors
                n_dim/2, // n_dims
                GGML_ROPE_TYPE_NEOX, 0, hparams.rope_theta,
                1.0f, 0.0f, 1.0f, 0.0f, 0.0f
            );
        }

        // second half
        ggml_tensor * second;
        {
            second = ggml_view_3d(ctx0, cur,
                n_dim/2, n_head, n_pos,
                cur->nb[1],
                cur->nb[2],
                n_dim/2 * ggml_element_size(cur));
            second = ggml_rope_ext(
                ctx0,
                second,
                pos_y,   // positions
                nullptr, // freq factors
                n_dim/2, // n_dims
                GGML_ROPE_TYPE_NEOX, 0, hparams.rope_theta,
                1.0f, 0.0f, 1.0f, 0.0f, 0.0f
            );
        }

        cur = ggml_concat(ctx0, first, second, 0);
        return cur;
    };

    kq_scale = 1.0f;
    ggml_tensor * cur = build_vit(
        inp, n_patches,
        NORM_TYPE_RMS,
        hparams.ffn_op,
        nullptr, // pos embd is already handled above
        add_pos);

    // Gemma4VisionPooler
    {
        const int kernel_size = hparams.n_merge;
        GGML_ASSERT(kernel_size > 0);

        // [n_embd, n_patches] -> [n_patches_x, n_patches_y, n_embd, 1]
        cur = ggml_cont_4d(ctx0, ggml_transpose(ctx0, cur), n_patches_x, n_patches_y, n_embd, 1);
        cur = ggml_pool_2d(ctx0, cur, GGML_OP_POOL_AVG,
                           kernel_size, kernel_size, kernel_size, kernel_size, 0, 0);
        const int out_x = n_patches_x / kernel_size;
        const int out_y = n_patches_y / kernel_size;
        // [out_x, out_y, n_embd, 1] -> [n_embd, out_x * out_y]
        cur = ggml_reshape_3d(ctx0, cur, out_x * out_y, n_embd, 1);
        cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
        cur = ggml_scale(ctx0, cur, sqrtf((float)n_embd));
        cb(cur, "pooled", -1);
    }

    // hidden_states = (hidden_states - self.std_bias) * self.std_scale
    if (model.std_bias && model.std_scale) {
        cur = ggml_sub(ctx0, cur, model.std_bias);
        cur = ggml_mul(ctx0, cur, model.std_scale);
        cb(cur, "std_scaled", -1);
    }

    // Gemma4MultimodalEmbedder
    cur = build_mm(model.mm_input_proj_w, cur);
    cb(cur, "projected", -1);

    // embedding_post_projection_norm
    cur = ggml_rms_norm(ctx0, cur, hparams.eps);
    cb(cur, "projected_normed", -1);

    ggml_build_forward_expand(gf, cur);
    return gf;
}

ggml_tensor * clip_graph_gemma4v::build_mm(ggml_tensor * w, ggml_tensor * x) const {
    // Gemma4ClippableLinear

    auto it = model.clamp_info_map.find(w->name);
    if (it == model.clamp_info_map.end()) {
        return ggml_mul_mat(ctx0, w, x);
    } else {
        const auto & clamp_info = it->second;
        ggml_tensor * clamped = ggml_clamp(ctx0, x, clamp_info.inp_min, clamp_info.inp_max);
        ggml_tensor * out = ggml_mul_mat(ctx0, w, clamped);
        out = ggml_clamp(ctx0, out, clamp_info.out_min, clamp_info.out_max);
        return out;
    }
}
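Two details in this file are worth spelling out. The add_pos lambda splits each head's channels in half and rotates the first half by the patch's column position (pos_x) and the second half by its row position (pos_y), giving a 2D RoPE in NeoX ordering. And build_mm implements the Gemma4ClippableLinear: if clamp ranges were recorded at conversion time, activations are clamped before and after the matrix multiply, otherwise it degrades to a plain ggml_mul_mat. A scalar sketch of the same math (illustrative only, not the ggml path):

#include <algorithm>

// y = clamp(w * clamp(x, inp_min, inp_max), out_min, out_max)
float clippable_linear_1x1(float w, float x,
                           float inp_min, float inp_max,
                           float out_min, float out_max) {
    const float xc = std::clamp(x, inp_min, inp_max); // pre-matmul clamp
    const float y  = w * xc;                          // the "linear" part
    return std::clamp(y, out_min, out_max);           // post-matmul clamp
}

With the ±FLT_MAX defaults used by the loader, both clamps pass values through unchanged.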
@@ -12,6 +12,12 @@ struct clip_graph_siglip : clip_graph {
     ggml_cgraph * build() override;
 };
 
+struct clip_graph_gemma4v : clip_graph {
+    clip_graph_gemma4v(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
+    ggml_cgraph * build() override;
+    ggml_tensor * build_mm(ggml_tensor * w, ggml_tensor * x) const override;
+};
+
 struct clip_graph_pixtral : clip_graph {
     clip_graph_pixtral(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
     ggml_cgraph * build() override;
@@ -394,6 +394,13 @@ struct mtmd_context {
                 img_end = "<|IMAGE_END|>";
                 image_preproc = std::make_unique<mtmd_image_preprocessor_dyn_size>(ctx_v);
             } break;
+        case PROJECTOR_TYPE_GEMMA4V:
+            {
+                // <|image> ... (image embeddings) ... <image|>
+                img_beg = "<|image>";
+                img_end = "<image|>";
+                image_preproc = std::make_unique<mtmd_image_preprocessor_dyn_size>(ctx_v);
+            } break;
         case PROJECTOR_TYPE_DEEPSEEKOCR:
             {
                 img_end = "\n"; // prevent empty batch on llama-server
@@ -974,6 +981,7 @@ float * mtmd_get_output_embd(mtmd_context * ctx) {
 bool mtmd_decode_use_non_causal(mtmd_context * ctx) {
     switch (ctx->proj_type_v()) {
         case PROJECTOR_TYPE_GEMMA3:
+        case PROJECTOR_TYPE_GEMMA4V:
             return true;
         default:
             return false;