* mtmd: llama.cpp DeepSeekOCR support init commit * loading sam tensors * mtmd: fix vision model processing * deepseek-ocr clip-vit model impl * mtmd: add DeepSeek-OCR LM support with standard attention * mtmd: successfully runs DeepSeek-OCR LM in llama-cli * mtmd: Fix RoPE type for DeepSeek-OCR LM. * loading LM testing Vision model loading * sam warmup working * sam erroneous return corrected * clip-vit: corrected cls_embd concat * clip-vit: model convert qkv_proj split * corrected combining of image encoders' results * fix: update callback for ffn_moe_weighted and add callback for attn_out in deepseek2 model * concat image_newline and image_seperator tokens * visual_model warmup (technically) works * window partitioning using standard ggml ops * sam implementation without using CPU only ops * clip: fixed warnings * Merge branch 'sf/deepseek-ocr' of github.com:sfallah/llama.cpp into sf/deepseek-ocr * mtmd: fix get_rel_pos * mtmd: fixed the wrong scaler for get_rel_pos * image encoding technically works but the output can't be checked singe image decoding fails * mtmd: minor changed * mtmd: add native resolution support * - image encoding debugged - issues fixed mainly related wrong config like n_patches etc. 
- configs need to be corrected in the converter * mtmd: correct token order * - dynamic resizing - changes are concerning PR https://github.com/sfallah/llama.cpp/pull/4 * mtmd: quick fix token order * mtmd: fix danling pointer * mtmd: SAM numerically works * mtmd: debug CLIP-L (vit_pre_ln) * mtmd: debug CLIP-L & first working DeepSeek-OCR model * mtmd : add --dsocr-mode CLI argument for DeepSeek-OCR resolution control & all native resolution modes work * mtmd: simplify SAM patch embedding * mtmd: adapt Pillow image resizing function * mtmd: simplify DeepSeek-OCR dynamic resolution preprocessing * mtmd: remove --dsocr-mode argument * mtmd: refactor code & remove unused helper functions * mtmd: fix tensor names for image newlines and view separator * clean up * reverting automatically removed spaces * reverting automatically removed spaces * mtmd: fixed bad ocr check in Deepseek2 (LM) * mtmd: support combined QKV projection in buid_vit * using common build_attn in sam * corrected code-branch when flash-attn disabled enabling usage of --flash-attn option * mtmd: minor fix * minor formatting and style * fixed flake8 lint issues * minor editorconfig-check fixes * minor editorconfig-check fixes * mtmd: simplify get_rel_pos * mtmd: make sam hparams configurable * mtmd: add detailed comments for resize_bicubic_pillow * mtmd: fixed wrong input setting * mtmd: convert model in FP16 * mtmd: minor fix * mtmd: remove tweak to llama-mtmd-cli & deepseek-ocr template * fix: test-1.jpg ORC issue with small (640) resolution setting min-resolution base (1024) max large (1280) for dynamic-resolution * minor: editconfig-check fix * merge with changes from https://github.com/ggml-org/llama.cpp/pull/17909 added new opt to tests.sh to disable flash-attn * minor: editconfig-check fix * testing deepseek-ocr quick and dirty test script comparing results of Qwen2.5-VL vs DeepSeek-OCR * quick and (potential) dirty merge with https://github.com/ggml-org/llama.cpp/pull/17909 * refactoring, one 
single builder function and static helpers * added deepseek-ocr test to tests.sh * minor formatting fixes * check with fixed expected resutls * minor formatting * editorconfig-check fix * merge with changes from https://github.com/ggml-org/llama.cpp/pull/18042 * minor - added GLM-4.6V to big tests - added missing deps for python test * convert: minor fix * mtmd: format code * convert: quick fix * convert: quick fix * minor python formatting * fixed merge build issue * merge resolved - fixed issues in convert - tested several deepseek models * minor fix * minor * Update convert_hf_to_gguf.py Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> * - removed clip_is_deepseekocr - removed redundant RESIZE_ALGO_BICUBIC_PILLOW resize-algo - simplified image-preprocessing - removed/simplified debug functions * - cleaning commented out code * fixing instabilities issues reintroducing resize_bicubic_pillow * - use f16 model for deepseek-ocr test - ignore llama-arch test for deepseek-ocr * rename fc_w --> mm_fc_w * add links to OCR discussion * cleaner loading code * add missing .weight to some tensors * add default jinja template (to be used by server) * move test model to ggml-org * rolling back upscale change * Update convert_hf_to_gguf.py Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> --------- Co-authored-by: bluebread <hotbread70127@gmail.com> Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> Co-authored-by: Xuan Son Nguyen <son@huggingface.co> Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com>
434 lines
14 KiB
C++
#pragma once
|
|
|
|
#include "ggml.h"
|
|
#include "clip.h"
|
|
#include "clip-impl.h"
|
|
|
|
#include <array>
|
|
#include <vector>
|
|
#include <unordered_set>
|
|
#include <cstdint>
|
|
#include <cmath>
|
|
|
|
// Activation function applied inside the encoder's feed-forward (FFN) blocks.
enum ffn_op_type {
    FFN_GELU,       // tanh-approximated GELU
    FFN_GELU_ERF,   // exact (erf-based) GELU
    FFN_SILU,       // SiLU / swish
    FFN_GELU_QUICK, // sigmoid-approximated "quick" GELU
    FFN_RELU_SQR,   // squared ReLU: relu(x)^2
};
|
|
|
|
// Normalization flavor used by the encoder layers.
enum norm_type {
    NORM_TYPE_NORMAL, // standard layernorm (mean-centered, with bias)
    NORM_TYPE_RMS,    // RMS norm (no mean subtraction)
};
|
|
|
|
// How image patch embeddings are arranged/merged before projection.
enum patch_merge_type {
    PATCH_MERGE_FLAT,         // patches are used as-is, in scan order
    PATCH_MERGE_SPATIAL_UNPAD, // LLaVA-UHD-style spatial layout — presumably drops padding patches; confirm against the graph builder
};
|
|
|
|
struct clip_hparams {
|
|
int32_t image_size = 0;
|
|
int32_t patch_size = 0;
|
|
int32_t n_embd = 0;
|
|
int32_t n_ff = 0;
|
|
int32_t projection_dim = 0;
|
|
int32_t n_head = 0;
|
|
int32_t n_layer = 0;
|
|
// idefics3
|
|
int32_t image_longest_edge = 0;
|
|
int32_t image_min_pixels = -1;
|
|
int32_t image_max_pixels = -1;
|
|
int32_t n_merge = 0; // number of patch merges **per-side**
|
|
|
|
int32_t preproc_min_tiles = 0;
|
|
int32_t preproc_max_tiles = 0;
|
|
|
|
float image_mean[3];
|
|
float image_std[3];
|
|
|
|
// for models using dynamic image size, we need to have a smaller image size to warmup
|
|
// otherwise, user will get OOM every time they load the model
|
|
int32_t warmup_image_size = 0;
|
|
int32_t warmup_audio_size = 3000;
|
|
|
|
ffn_op_type ffn_op = FFN_GELU;
|
|
|
|
patch_merge_type mm_patch_merge_type = PATCH_MERGE_FLAT;
|
|
|
|
float eps = 1e-6;
|
|
float rope_theta = 0.0;
|
|
|
|
std::vector<clip_image_size> image_res_candidates; // for llava-uhd style models
|
|
int32_t image_crop_resolution;
|
|
std::unordered_set<int32_t> vision_feature_layer;
|
|
int32_t attn_window_size = 0;
|
|
int32_t n_wa_pattern = 0;
|
|
std::unordered_set<int32_t> wa_layer_indexes; // explicit layer indexes that use full attention (for irregular patterns like YoutuVL)
|
|
|
|
// deepseek-ocr (sam)
|
|
int32_t sam_n_layer = 0;
|
|
int32_t sam_n_head = 0;
|
|
int32_t sam_n_embd = 0;
|
|
|
|
// audio
|
|
int32_t n_mel_bins = 0; // whisper preprocessor
|
|
int32_t proj_stack_factor = 0; // ultravox
|
|
|
|
// audio-to-mel preprocessor params
|
|
int32_t audio_chunk_len = -1; // in seconds
|
|
int32_t audio_sample_rate = -1;
|
|
int32_t audio_n_fft = -1;
|
|
int32_t audio_window_len = -1;
|
|
int32_t audio_hop_len = -1;
|
|
|
|
// legacy
|
|
bool has_llava_projector = false;
|
|
int minicpmv_version = 0;
|
|
int32_t minicpmv_query_num = 0; // MiniCPM-V query number
|
|
|
|
// custom value provided by user, can be undefined if not set
|
|
int32_t custom_image_min_tokens = -1;
|
|
int32_t custom_image_max_tokens = -1;
|
|
|
|
void set_limit_image_tokens(int n_tokens_min, int n_tokens_max) {
|
|
const int cur_merge = n_merge == 0 ? 1 : n_merge;
|
|
const int patch_area = patch_size * patch_size * cur_merge * cur_merge;
|
|
image_min_pixels = (custom_image_min_tokens > 0 ? custom_image_min_tokens : n_tokens_min) * patch_area;
|
|
image_max_pixels = (custom_image_max_tokens > 0 ? custom_image_max_tokens : n_tokens_max) * patch_area;
|
|
warmup_image_size = static_cast<int>(std::sqrt(image_max_pixels));
|
|
}
|
|
|
|
void set_warmup_n_tokens(int n_tokens) {
|
|
int n_tok_per_side = static_cast<int>(std::sqrt(n_tokens));
|
|
GGML_ASSERT(n_tok_per_side * n_tok_per_side == n_tokens && "n_tokens must be n*n");
|
|
const int cur_merge = n_merge == 0 ? 1 : n_merge;
|
|
warmup_image_size = n_tok_per_side * patch_size * cur_merge;
|
|
// TODO: support warmup size for custom token numbers
|
|
}
|
|
// sam vit deepseek-ocr
|
|
std::vector<int32_t> global_attn_indices() const {
|
|
return { 2, 5, 8, 11 };
|
|
}
|
|
bool is_global_attn(int32_t layer) const {
|
|
const auto indices = global_attn_indices();
|
|
|
|
for (const auto & idx : indices) {
|
|
if (layer == idx) {
|
|
return true;
|
|
}
|
|
}
|
|
|
|
return false;
|
|
}
|
|
};
|
|
|
|
// Tensor slots for one encoder layer. A superset of all supported
// architectures' per-layer tensors: slots the loaded model does not
// provide stay nullptr.
struct clip_layer {
    // attention
    ggml_tensor * k_w = nullptr;
    ggml_tensor * k_b = nullptr;
    ggml_tensor * q_w = nullptr;
    ggml_tensor * q_b = nullptr;
    ggml_tensor * v_w = nullptr;
    ggml_tensor * v_b = nullptr;
    // combined QKV projection (used instead of split q/k/v by some models)
    ggml_tensor * qkv_w = nullptr;
    ggml_tensor * qkv_b = nullptr;

    // attention output projection
    ggml_tensor * o_w = nullptr;
    ggml_tensor * o_b = nullptr;

    // per-head QK normalization
    ggml_tensor * k_norm = nullptr;
    ggml_tensor * q_norm = nullptr;

    // layernorm 1
    ggml_tensor * ln_1_w = nullptr;
    ggml_tensor * ln_1_b = nullptr;

    // feed-forward (FFN); gate is only present for gated variants (e.g. SiLU)
    ggml_tensor * ff_up_w = nullptr;
    ggml_tensor * ff_up_b = nullptr;
    ggml_tensor * ff_gate_w = nullptr;
    ggml_tensor * ff_gate_b = nullptr;
    ggml_tensor * ff_down_w = nullptr;
    ggml_tensor * ff_down_b = nullptr;

    // layernorm 2
    ggml_tensor * ln_2_w = nullptr;
    ggml_tensor * ln_2_b = nullptr;

    // layer scale (no bias)
    ggml_tensor * ls_1_w = nullptr;
    ggml_tensor * ls_2_w = nullptr;

    // qwen3vl deepstack merger
    ggml_tensor * deepstack_norm_w = nullptr;
    ggml_tensor * deepstack_norm_b = nullptr;
    ggml_tensor * deepstack_fc1_w = nullptr;
    ggml_tensor * deepstack_fc1_b = nullptr;
    ggml_tensor * deepstack_fc2_w = nullptr;
    ggml_tensor * deepstack_fc2_b = nullptr;

    // sam rel_pos (decomposed relative position embeddings, per axis)
    ggml_tensor * rel_pos_w = nullptr;
    ggml_tensor * rel_pos_h = nullptr;

    // lfm2
    ggml_tensor * ff_norm_w = nullptr;
    ggml_tensor * ff_norm_b = nullptr;
    ggml_tensor * ff_norm_1_w = nullptr;
    ggml_tensor * ff_norm_1_b = nullptr;
    ggml_tensor * ff_up_1_w = nullptr;
    ggml_tensor * ff_up_1_b = nullptr;
    ggml_tensor * ff_down_1_w = nullptr;
    ggml_tensor * ff_down_1_b = nullptr;
    ggml_tensor * pos_bias_u = nullptr;
    ggml_tensor * pos_bias_v = nullptr;
    ggml_tensor * norm_conv_w = nullptr;
    ggml_tensor * norm_conv_b = nullptr;
    ggml_tensor * linear_pos_w = nullptr;

    // convolution sub-block (depthwise + two pointwise convs)
    ggml_tensor * conv_norm_w = nullptr;
    ggml_tensor * conv_norm_b = nullptr;
    ggml_tensor * conv_dw_w = nullptr;
    ggml_tensor * conv_dw_b = nullptr;
    ggml_tensor * conv_pw1_w = nullptr;
    ggml_tensor * conv_pw1_b = nullptr;
    ggml_tensor * conv_pw2_w = nullptr;
    ggml_tensor * conv_pw2_b = nullptr;

    // true when this layer carries Qwen3-VL deepstack merger tensors
    bool has_deepstack() const {
        return deepstack_fc1_w != nullptr;
    }
};
|
|
|
|
// Expanded MobileNetV5 block structure for Gemma3n vision encoder.
// Like clip_layer, this is a superset of slots; tensors a given block
// does not have stay nullptr.
struct mobilenetv5_block {
    // Stage 0 (Edge Residual)
    ggml_tensor * s0_conv_exp_w = nullptr; // expansion conv
    ggml_tensor * s0_bn1_w = nullptr;      // batch-norm after expansion
    ggml_tensor * s0_conv_pwl_w = nullptr; // pointwise-linear conv
    ggml_tensor * s0_bn2_w = nullptr;      // batch-norm after pointwise-linear

    // Stage 1+ (Universal Inverted Residual)
    ggml_tensor * dw_start_w = nullptr;    // leading depthwise conv
    ggml_tensor * dw_start_bn_w = nullptr;

    ggml_tensor * pw_exp_w = nullptr;      // pointwise expansion
    ggml_tensor * pw_exp_bn_w = nullptr;

    ggml_tensor * dw_mid_w = nullptr;      // middle depthwise conv
    ggml_tensor * dw_mid_bn_w = nullptr;

    ggml_tensor * pw_proj_w = nullptr;     // pointwise projection
    ggml_tensor * pw_proj_bn_w = nullptr;

    ggml_tensor * layer_scale_w = nullptr;

    // Attention (MQA) components
    ggml_tensor * attn_q_w = nullptr;
    ggml_tensor * attn_k_w = nullptr;
    ggml_tensor * attn_v_w = nullptr;
    ggml_tensor * attn_o_w = nullptr;

    // Optional downsampling/norm in attention
    ggml_tensor * attn_k_dw_w = nullptr;
    ggml_tensor * attn_k_norm_w = nullptr;
    ggml_tensor * attn_v_dw_w = nullptr;
    ggml_tensor * attn_v_norm_w = nullptr;

    // Block norm (often present in attention blocks)
    ggml_tensor * attn_norm_w = nullptr;
};
|
|
|
|
struct clip_model {
|
|
clip_modality modality = CLIP_MODALITY_VISION;
|
|
projector_type proj_type = PROJECTOR_TYPE_MLP;
|
|
clip_hparams hparams;
|
|
|
|
// embeddings
|
|
ggml_tensor * class_embedding = nullptr;
|
|
ggml_tensor * patch_embeddings_0 = nullptr;
|
|
ggml_tensor * patch_embeddings_1 = nullptr; // second Conv2D kernel when we decouple Conv3D along temporal dimension (Qwen2VL)
|
|
ggml_tensor * patch_bias = nullptr;
|
|
ggml_tensor * position_embeddings = nullptr;
|
|
ggml_tensor * norm_embd_w = nullptr;
|
|
ggml_tensor * norm_embd_b = nullptr;
|
|
|
|
ggml_tensor * pre_ln_w = nullptr;
|
|
ggml_tensor * pre_ln_b = nullptr;
|
|
|
|
std::vector<clip_layer> layers;
|
|
|
|
int32_t n_deepstack_layers = 0; // used by Qwen3-VL, calculated from clip_layer
|
|
|
|
ggml_tensor * post_ln_w;
|
|
ggml_tensor * post_ln_b;
|
|
|
|
ggml_tensor * mm_fc_w;
|
|
ggml_tensor * mm_fc_b;
|
|
ggml_tensor * mm_ffn_up_w = nullptr;
|
|
ggml_tensor * mm_ffn_up_b = nullptr;
|
|
ggml_tensor * mm_ffn_gate_w = nullptr;
|
|
ggml_tensor * mm_ffn_gate_b = nullptr;
|
|
ggml_tensor * mm_ffn_down_w = nullptr;
|
|
ggml_tensor * mm_ffn_down_b = nullptr;
|
|
ggml_tensor * mm_post_norm_w = nullptr;
|
|
ggml_tensor * mm_post_norm_b = nullptr;
|
|
|
|
// LLaVA projection
|
|
ggml_tensor * mm_input_norm_w = nullptr;
|
|
ggml_tensor * mm_input_norm_b = nullptr;
|
|
ggml_tensor * mm_0_w = nullptr;
|
|
ggml_tensor * mm_0_b = nullptr;
|
|
ggml_tensor * mm_2_w = nullptr;
|
|
ggml_tensor * mm_2_b = nullptr;
|
|
|
|
ggml_tensor * image_newline = nullptr;
|
|
ggml_tensor * view_seperator = nullptr;
|
|
|
|
|
|
// Yi type models with mlp+normalization projection
|
|
ggml_tensor * mm_1_w = nullptr; // Yi type models have 0, 1, 3, 4
|
|
ggml_tensor * mm_1_b = nullptr;
|
|
ggml_tensor * mm_3_w = nullptr;
|
|
ggml_tensor * mm_3_b = nullptr;
|
|
ggml_tensor * mm_4_w = nullptr;
|
|
ggml_tensor * mm_4_b = nullptr;
|
|
|
|
// GLMV-Edge projection
|
|
ggml_tensor * mm_model_adapter_conv_w = nullptr;
|
|
ggml_tensor * mm_model_adapter_conv_b = nullptr;
|
|
|
|
// MobileVLM projection
|
|
ggml_tensor * mm_model_mlp_1_w = nullptr;
|
|
ggml_tensor * mm_model_mlp_1_b = nullptr;
|
|
ggml_tensor * mm_model_mlp_3_w = nullptr;
|
|
ggml_tensor * mm_model_mlp_3_b = nullptr;
|
|
ggml_tensor * mm_model_block_1_block_0_0_w = nullptr;
|
|
ggml_tensor * mm_model_block_1_block_0_1_w = nullptr;
|
|
ggml_tensor * mm_model_block_1_block_0_1_b = nullptr;
|
|
ggml_tensor * mm_model_block_1_block_1_fc1_w = nullptr;
|
|
ggml_tensor * mm_model_block_1_block_1_fc1_b = nullptr;
|
|
ggml_tensor * mm_model_block_1_block_1_fc2_w = nullptr;
|
|
ggml_tensor * mm_model_block_1_block_1_fc2_b = nullptr;
|
|
ggml_tensor * mm_model_block_1_block_2_0_w = nullptr;
|
|
ggml_tensor * mm_model_block_1_block_2_1_w = nullptr;
|
|
ggml_tensor * mm_model_block_1_block_2_1_b = nullptr;
|
|
ggml_tensor * mm_model_block_2_block_0_0_w = nullptr;
|
|
ggml_tensor * mm_model_block_2_block_0_1_w = nullptr;
|
|
ggml_tensor * mm_model_block_2_block_0_1_b = nullptr;
|
|
ggml_tensor * mm_model_block_2_block_1_fc1_w = nullptr;
|
|
ggml_tensor * mm_model_block_2_block_1_fc1_b = nullptr;
|
|
ggml_tensor * mm_model_block_2_block_1_fc2_w = nullptr;
|
|
ggml_tensor * mm_model_block_2_block_1_fc2_b = nullptr;
|
|
ggml_tensor * mm_model_block_2_block_2_0_w = nullptr;
|
|
ggml_tensor * mm_model_block_2_block_2_1_w = nullptr;
|
|
ggml_tensor * mm_model_block_2_block_2_1_b = nullptr;
|
|
|
|
// MobileVLM_V2 projection
|
|
ggml_tensor * mm_model_mlp_0_w = nullptr;
|
|
ggml_tensor * mm_model_mlp_0_b = nullptr;
|
|
ggml_tensor * mm_model_mlp_2_w = nullptr;
|
|
ggml_tensor * mm_model_mlp_2_b = nullptr;
|
|
ggml_tensor * mm_model_peg_0_w = nullptr;
|
|
ggml_tensor * mm_model_peg_0_b = nullptr;
|
|
|
|
// MINICPMV projection
|
|
ggml_tensor * mm_model_pos_embed_k = nullptr;
|
|
ggml_tensor * mm_model_query = nullptr;
|
|
ggml_tensor * mm_model_proj = nullptr;
|
|
ggml_tensor * mm_model_kv_proj = nullptr;
|
|
ggml_tensor * mm_model_attn_q_w = nullptr;
|
|
ggml_tensor * mm_model_attn_q_b = nullptr;
|
|
ggml_tensor * mm_model_attn_k_w = nullptr;
|
|
ggml_tensor * mm_model_attn_k_b = nullptr;
|
|
ggml_tensor * mm_model_attn_v_w = nullptr;
|
|
ggml_tensor * mm_model_attn_v_b = nullptr;
|
|
ggml_tensor * mm_model_attn_o_w = nullptr;
|
|
ggml_tensor * mm_model_attn_o_b = nullptr;
|
|
ggml_tensor * mm_model_ln_q_w = nullptr;
|
|
ggml_tensor * mm_model_ln_q_b = nullptr;
|
|
ggml_tensor * mm_model_ln_kv_w = nullptr;
|
|
ggml_tensor * mm_model_ln_kv_b = nullptr;
|
|
ggml_tensor * mm_model_ln_post_w = nullptr;
|
|
ggml_tensor * mm_model_ln_post_b = nullptr;
|
|
|
|
// gemma3
|
|
ggml_tensor * mm_input_proj_w = nullptr;
|
|
ggml_tensor * mm_soft_emb_norm_w = nullptr;
|
|
|
|
// mobilenetv5 for gemma3n
|
|
std::vector<mobilenetv5_block> mobilenet_blocks;
|
|
std::vector<int> mobilenet_stage_ends;
|
|
ggml_tensor * mobilenet_stem_conv_w = nullptr;
|
|
ggml_tensor * mobilenet_stem_conv_b = nullptr;
|
|
ggml_tensor * mobilenet_stem_norm_w = nullptr;
|
|
ggml_tensor * mm_post_proj_norm_w = nullptr;
|
|
|
|
// Multi-Scale Fusion Adapter (MSFA) components
|
|
ggml_tensor * msfa_concat_conv_w = nullptr;
|
|
ggml_tensor * msfa_concat_norm_w = nullptr;
|
|
ggml_tensor * msfa_ffn_expand_w = nullptr;
|
|
ggml_tensor * msfa_ffn_project_w = nullptr;
|
|
ggml_tensor * msfa_ffn_expand_bn = nullptr;
|
|
ggml_tensor * msfa_ffn_project_bn = nullptr;
|
|
|
|
|
|
// pixtral, glm4v
|
|
ggml_tensor * token_embd_img_break = nullptr;
|
|
ggml_tensor * mm_patch_merger_w = nullptr;
|
|
ggml_tensor * mm_patch_merger_b = nullptr;
|
|
|
|
// ultravox / whisper encoder
|
|
ggml_tensor * conv1d_1_w = nullptr;
|
|
ggml_tensor * conv1d_1_b = nullptr;
|
|
ggml_tensor * conv1d_2_w = nullptr;
|
|
ggml_tensor * conv1d_2_b = nullptr;
|
|
ggml_tensor * mm_norm_pre_w = nullptr;
|
|
ggml_tensor * mm_norm_pre_b = nullptr;
|
|
ggml_tensor * mm_norm_mid_w = nullptr;
|
|
|
|
// cogvlm
|
|
ggml_tensor * mm_post_fc_norm_w = nullptr;
|
|
ggml_tensor * mm_post_fc_norm_b = nullptr;
|
|
ggml_tensor * mm_h_to_4h_w = nullptr;
|
|
ggml_tensor * mm_gate_w = nullptr;
|
|
ggml_tensor * mm_4h_to_h_w = nullptr;
|
|
ggml_tensor * mm_boi = nullptr;
|
|
ggml_tensor * mm_eoi = nullptr;
|
|
|
|
// deepseek ocr sam
|
|
ggml_tensor * patch_embed_proj_w = nullptr;
|
|
ggml_tensor * patch_embed_proj_b = nullptr;
|
|
ggml_tensor * pos_embed = nullptr;
|
|
|
|
ggml_tensor * neck_0_w;
|
|
ggml_tensor * neck_1_w;
|
|
ggml_tensor * neck_1_b;
|
|
ggml_tensor * neck_2_w;
|
|
ggml_tensor * neck_3_w;
|
|
ggml_tensor * neck_3_b;
|
|
ggml_tensor * net_2;
|
|
ggml_tensor * net_3;
|
|
|
|
int32_t n_sam_layers = 12; // used by deepseek-ocr sam encoder
|
|
|
|
std::vector<clip_layer> sam_layers;
|
|
// lfm2 audio
|
|
std::array<ggml_tensor *, 7> pre_encode_conv_X_w = {nullptr};
|
|
std::array<ggml_tensor *, 7> pre_encode_conv_X_b = {nullptr};
|
|
ggml_tensor * pre_encode_out_w = nullptr;
|
|
ggml_tensor * pre_encode_out_b = nullptr;
|
|
|
|
bool audio_has_avgpool() const {
|
|
return proj_type == PROJECTOR_TYPE_QWEN2A
|
|
|| proj_type == PROJECTOR_TYPE_VOXTRAL
|
|
|| proj_type == PROJECTOR_TYPE_MUSIC_FLAMINGO;
|
|
}
|
|
|
|
bool audio_has_stack_frames() const {
|
|
return proj_type == PROJECTOR_TYPE_ULTRAVOX
|
|
|| proj_type == PROJECTOR_TYPE_VOXTRAL;
|
|
}
|
|
};
|
|
|
|
// Read-only access to the hyperparameters of a loaded clip context.
const clip_hparams * clip_get_hparams(const struct clip_ctx * ctx);