model: support GLM-OCR (#19677)

* model: support GLM-OCR

* Update convert_hf_to_gguf.py

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

---------

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
2026-02-18 17:51:40 +01:00 · 2026-02-18 17:51:40 +01:00 · eeef3cfced
commit eeef3cfced
parent e99f1083a0
8 changed files with 122 additions and 43 deletions
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -4584,7 +4584,7 @@ class Qwen3VLVisionModel(MmprojModel):
        yield from super().modify_tensors(data_torch, name, bid)


-@ModelBase.register("Glm4vForConditionalGeneration", "Glm4vMoeForConditionalGeneration")
+@ModelBase.register("Glm4vForConditionalGeneration", "Glm4vMoeForConditionalGeneration", "GlmOcrForConditionalGeneration")
 class Glm4VVisionModel(Qwen3VLVisionModel):
    def set_gguf_parameters(self):
        MmprojModel.set_gguf_parameters(self) # skip Qwen3VLVisionModel parameters
@@ -8776,7 +8776,7 @@ class Glm4Model(TextModel):
            n_head = self.hparams["num_attention_heads"]
            n_kv_head = self.hparams["num_key_value_heads"]
            n_embd = self.hparams["hidden_size"]
-            head_dim = n_embd // n_head
+            head_dim = self.hparams.get("head_dim", n_embd // n_head)
            # because llama.cpp M-RoPE kernel only supports Neox ordering, we have to permute the weights here
            if name.endswith(("q_proj.weight", "q_proj.bias")):
                data_torch = Glm4Model.normal_to_neox(data_torch, n_head, n_head, head_dim, self.partial_rotary_factor)
@@ -8785,6 +8785,27 @@ class Glm4Model(TextModel):
        yield from super().modify_tensors(data_torch, name, bid)


+@ModelBase.register("GlmOcrForConditionalGeneration")
+class GlmOCRModel(Glm4Model):
+    model_arch = gguf.MODEL_ARCH.GLM4
+    use_mrope = False
+    partial_rotary_factor = 0.5
+
+    # Note: GLM-OCR is converted as GLM4 (see model_arch), plus the optional NextN/MTP prediction layers counted by "num_nextn_predict_layers"
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # actual layer count = num_hidden_layers + num_nextn_predict_layers (treated as 0 when the key is absent), i.e. including the NextN layers
+        self.block_count = self.hparams["num_hidden_layers"] + self.hparams.get("num_nextn_predict_layers", 0)
+        self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)  # name map requested for the adjusted block count
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        # write the NextN/MTP prediction layer count only when hparams provides a non-None value
+        if (num_nextn_predict_layers := self.hparams.get("num_nextn_predict_layers")) is not None:
+            self.gguf_writer.add_nextn_predict_layers(num_nextn_predict_layers)
+
+
@ModelBase.register("Glm4MoeForCausalLM", "Glm4vMoeForConditionalGeneration")
 class Glm4MoeModel(TextModel):
    model_arch = gguf.MODEL_ARCH.GLM4_MOE