model: Add PaddleOCR-VL model support (#18825)
* support PaddleOCR-VL * clip: update PaddleOCR model loader parameters to prevent OOM during warmup * [update] add paddleocr vl text model instead of ernie4.5 * [update] restore change of minicpmv * [update] format * [update] format * [update] positions and patch merge permute * [update] mtmd_decode_use_mrope for paddleocr * [update] image min/max pixels * [update] remove set_limit_image_tokens * upate: preprocess without padding * clean up * Update convert_hf_to_gguf.py Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> * Update convert_hf_to_gguf.py Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> --------- Co-authored-by: Xuan Son Nguyen <son@huggingface.co> Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
This commit is contained in:
parent
abb9f3c42b
commit
237958db33
16 changed files with 327 additions and 0 deletions
|
|
@ -24,6 +24,7 @@ add_library(mtmd
|
|||
models/llama4.cpp
|
||||
models/llava.cpp
|
||||
models/minicpmv.cpp
|
||||
models/paddleocr.cpp
|
||||
models/pixtral.cpp
|
||||
models/qwen2vl.cpp
|
||||
models/qwen3vl.cpp
|
||||
|
|
|
|||
|
|
@ -229,6 +229,7 @@ enum projector_type {
|
|||
PROJECTOR_TYPE_MUSIC_FLAMINGO,
|
||||
PROJECTOR_TYPE_LFM2,
|
||||
PROJECTOR_TYPE_KIMIVL,
|
||||
PROJECTOR_TYPE_PADDLEOCR,
|
||||
PROJECTOR_TYPE_LIGHTONOCR,
|
||||
PROJECTOR_TYPE_COGVLM,
|
||||
PROJECTOR_TYPE_JANUS_PRO,
|
||||
|
|
@ -264,6 +265,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
|
|||
{ PROJECTOR_TYPE_MUSIC_FLAMINGO, "musicflamingo"},
|
||||
{ PROJECTOR_TYPE_LFM2, "lfm2"},
|
||||
{ PROJECTOR_TYPE_KIMIVL, "kimivl"},
|
||||
{ PROJECTOR_TYPE_PADDLEOCR, "paddleocr"},
|
||||
{ PROJECTOR_TYPE_LIGHTONOCR,"lightonocr"},
|
||||
{ PROJECTOR_TYPE_COGVLM, "cogvlm"},
|
||||
{ PROJECTOR_TYPE_JANUS_PRO, "janus_pro"},
|
||||
|
|
|
|||
|
|
@ -841,6 +841,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
|||
{
|
||||
builder = std::make_unique<clip_graph_kimivl>(ctx, img);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_PADDLEOCR:
|
||||
{
|
||||
builder = std::make_unique<clip_graph_paddleocr>(ctx, img);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_KIMIK25:
|
||||
{
|
||||
builder = std::make_unique<clip_graph_kimik25>(ctx, img);
|
||||
|
|
@ -1256,6 +1260,14 @@ struct clip_model_loader {
|
|||
hparams.audio_window_len = 400;
|
||||
hparams.audio_hop_len = 160;
|
||||
} break;
|
||||
case PROJECTOR_TYPE_PADDLEOCR:
|
||||
{
|
||||
hparams.n_merge = 2;
|
||||
get_u32(KEY_IMAGE_MIN_PIXELS, hparams.image_min_pixels);
|
||||
get_u32(KEY_IMAGE_MAX_PIXELS, hparams.image_max_pixels);
|
||||
|
||||
hparams.set_warmup_n_tokens(28*28); // avoid OOM on warmup
|
||||
} break;
|
||||
case PROJECTOR_TYPE_LFM2A:
|
||||
{
|
||||
// audio preprocessing params
|
||||
|
|
@ -1704,6 +1716,7 @@ struct clip_model_loader {
|
|||
model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
|
||||
} break;
|
||||
case PROJECTOR_TYPE_KIMIVL:
|
||||
case PROJECTOR_TYPE_PADDLEOCR:
|
||||
case PROJECTOR_TYPE_KIMIK25:
|
||||
{
|
||||
model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM);
|
||||
|
|
@ -2990,6 +3003,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
|
|||
case PROJECTOR_TYPE_QWEN25VL:
|
||||
case PROJECTOR_TYPE_QWEN3VL:
|
||||
case PROJECTOR_TYPE_GLM4V:
|
||||
case PROJECTOR_TYPE_PADDLEOCR:
|
||||
{
|
||||
GGML_ASSERT(params.image_min_pixels > 0 && params.image_max_pixels > 0);
|
||||
clip_image_u8 resized;
|
||||
|
|
@ -3330,6 +3344,7 @@ int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 *
|
|||
case PROJECTOR_TYPE_QWEN25VL:
|
||||
case PROJECTOR_TYPE_QWEN3VL:
|
||||
case PROJECTOR_TYPE_GLM4V:
|
||||
case PROJECTOR_TYPE_PADDLEOCR:
|
||||
case PROJECTOR_TYPE_YOUTUVL:
|
||||
return (img->nx / params.patch_size) / 2;
|
||||
default:
|
||||
|
|
@ -3346,6 +3361,7 @@ int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 *
|
|||
case PROJECTOR_TYPE_QWEN25VL:
|
||||
case PROJECTOR_TYPE_QWEN3VL:
|
||||
case PROJECTOR_TYPE_GLM4V:
|
||||
case PROJECTOR_TYPE_PADDLEOCR:
|
||||
case PROJECTOR_TYPE_YOUTUVL:
|
||||
return (img->ny / params.patch_size) / 2;
|
||||
default:
|
||||
|
|
@ -3443,6 +3459,13 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
|
|||
int y_patch = CLIP_ALIGN(img->ny, out_patch_size) / out_patch_size;
|
||||
n_patches = x_patch * y_patch;
|
||||
} break;
|
||||
case PROJECTOR_TYPE_PADDLEOCR:
|
||||
{
|
||||
// dynamic size
|
||||
int n_merge = ctx->model.hparams.n_merge;
|
||||
int stride = n_merge * n_merge;
|
||||
n_patches = CLIP_ALIGN(n_patches, stride) / stride;
|
||||
} break;
|
||||
case PROJECTOR_TYPE_PIXTRAL:
|
||||
case PROJECTOR_TYPE_LIGHTONOCR:
|
||||
{
|
||||
|
|
@ -3690,6 +3713,30 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
|||
}
|
||||
}
|
||||
|
||||
set_input_i32("positions", positions);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_PADDLEOCR:
|
||||
{
|
||||
const int merge_ratio = hparams.n_merge;
|
||||
const int pw = image_size_width / patch_size;
|
||||
const int ph = image_size_height / patch_size;
|
||||
std::vector<int> positions(n_pos * 4);
|
||||
int ptr = 0;
|
||||
// NOTE: same as Qwen-VL, but x and y are swapped
|
||||
for (int y = 0; y < ph; y += merge_ratio) {
|
||||
for (int dy = 0; dy < 2; dy++) {
|
||||
for (int x = 0; x < pw; x += merge_ratio) {
|
||||
for (int dx = 0; dx < 2; dx++) {
|
||||
positions[ ptr] = y + dy;
|
||||
positions[ num_patches + ptr] = x + dx;
|
||||
positions[2 * num_patches + ptr] = y + dy;
|
||||
positions[3 * num_patches + ptr] = x + dx;
|
||||
ptr++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
set_input_i32("positions", positions);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_QWEN25VL:
|
||||
|
|
@ -4003,6 +4050,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
|
|||
return ctx->model.mm_2_w->ne[1];
|
||||
case PROJECTOR_TYPE_LFM2:
|
||||
case PROJECTOR_TYPE_KIMIVL:
|
||||
case PROJECTOR_TYPE_PADDLEOCR:
|
||||
case PROJECTOR_TYPE_KIMIK25:
|
||||
return ctx->model.mm_2_w->ne[1];
|
||||
case PROJECTOR_TYPE_COGVLM:
|
||||
|
|
|
|||
|
|
@ -57,6 +57,11 @@ struct clip_graph_kimivl : clip_graph {
|
|||
ggml_cgraph * build() override;
|
||||
};
|
||||
|
||||
struct clip_graph_paddleocr : clip_graph {
|
||||
clip_graph_paddleocr(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
|
||||
ggml_cgraph * build() override;
|
||||
};
|
||||
|
||||
struct clip_graph_cogvlm : clip_graph {
|
||||
clip_graph_cogvlm(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
|
||||
ggml_cgraph * build() override;
|
||||
|
|
|
|||
52
tools/mtmd/models/paddleocr.cpp
Normal file
52
tools/mtmd/models/paddleocr.cpp
Normal file
|
|
@ -0,0 +1,52 @@
|
|||
#include "models.h"
|
||||
|
||||
ggml_cgraph * clip_graph_paddleocr::build() {
|
||||
const int n_pos = n_patches;
|
||||
const int num_position_ids = n_pos * 4; // m-rope requires 4 dim per position
|
||||
|
||||
int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
|
||||
|
||||
ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids);
|
||||
ggml_set_name(positions, "positions");
|
||||
ggml_set_input(positions);
|
||||
|
||||
auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
|
||||
return ggml_rope_multi(
|
||||
ctx0, cur, positions, nullptr,
|
||||
d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION,
|
||||
32768, 10000, 1, 0, 1, 32, 1);
|
||||
};
|
||||
|
||||
ggml_tensor * learned_pos_embd = resize_position_embeddings();
|
||||
ggml_tensor * inp = build_inp();
|
||||
ggml_tensor * cur = build_vit(
|
||||
inp, n_patches,
|
||||
NORM_TYPE_NORMAL,
|
||||
hparams.ffn_op,
|
||||
learned_pos_embd,
|
||||
add_pos);
|
||||
|
||||
cb(cur, "vit_out", -1);
|
||||
|
||||
{
|
||||
// mlp_AR paddleocr projector
|
||||
float proj_norm_eps = 1e-5;
|
||||
cur = build_norm(cur,
|
||||
model.mm_input_norm_w, model.mm_input_norm_b,
|
||||
NORM_TYPE_NORMAL, proj_norm_eps, -1);
|
||||
|
||||
const int scale_factor = model.hparams.n_merge;
|
||||
cur = build_patch_merge_permute(cur, scale_factor);
|
||||
cur = build_ffn(cur,
|
||||
model.mm_1_w, model.mm_1_b,
|
||||
nullptr, nullptr,
|
||||
model.mm_2_w, model.mm_2_b,
|
||||
hparams.ffn_op, -1);
|
||||
cb(cur, "mlp_out", -1);
|
||||
}
|
||||
|
||||
// build the graph
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
|
||||
return gf;
|
||||
}
|
||||
|
|
@ -325,6 +325,10 @@ struct mtmd_context {
|
|||
img_beg = "<|begin_of_image|>";
|
||||
img_end = "<|end_of_image|>";
|
||||
|
||||
} else if (proj == PROJECTOR_TYPE_PADDLEOCR) {
|
||||
// <|IMAGE_START|> ... (image embeddings) ... <|IMAGE_END|>
|
||||
img_beg = "<|IMAGE_START|>";
|
||||
img_end = "<|IMAGE_END|>";
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -890,6 +894,7 @@ bool mtmd_decode_use_mrope(mtmd_context * ctx) {
|
|||
case PROJECTOR_TYPE_QWEN25VL:
|
||||
case PROJECTOR_TYPE_QWEN3VL:
|
||||
case PROJECTOR_TYPE_GLM4V:
|
||||
case PROJECTOR_TYPE_PADDLEOCR:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue