ggml: add GATED_DELTA_NET op (#19504)

* ggml: add GATED_DELTA_NET op * remove the transpose * add KDA * add qwen35 dense * llama : check for fused gated delta net backend support --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2026-03-07 15:41:10 +08:00 · 2026-03-07 15:41:10 +08:00 · c5a778891b
commit c5a778891b
parent 6fce5c6a7d
15 changed files with 627 additions and 10 deletions
--- a/src/models/delta-net-base.cpp
+++ b/src/models/delta-net-base.cpp
@ -1,5 +1,7 @@
 #include "models.h"

+#include "llama-impl.h"
+
 // utility to get one slice from the third dimension
 // input dim:  [x, y, c, b]
 // output dim: [x, y, 1, b]
@ -39,6 +41,13 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_delta_net_base::build_delta_ne
    GGML_ASSERT(b->ne[0] == 1   && b->ne[1] == H_v && b->ne[2] == n_tokens && b->ne[3] == n_seqs);
    GGML_ASSERT(s->ne[0] == S_v && s->ne[1] == S_v && s->ne[2] == H_v      && s->ne[3] == n_seqs);

+    if (cparams.fused_gdn_ch) {
+        //ggml_tensor * result = ggml_gated_delta_net(ctx0, q, k, v, g, b, s);
+        //cb(result, LLAMA_TENSOR_NAME_FGDNCH, il);
+
+        GGML_ABORT("not implemented yet");
+    }
+
    const float scale = 1.0f / sqrtf(S_k);

    q = ggml_scale(ctx0, q, scale);
@ -316,6 +325,26 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_delta_net_base::build_delta_ne
    GGML_ASSERT(b->ne[0] == 1   && b->ne[1] == H_v && b->ne[2] == n_tokens && b->ne[3] == n_seqs);
    GGML_ASSERT(s->ne[0] == S_v && s->ne[1] == S_v && s->ne[2] == H_v      && s->ne[3] == n_seqs);

+    if (cparams.fused_gdn_ar) {
+        ggml_tensor * result = ggml_gated_delta_net(ctx0, q, k, v, g, b, s);
+        cb(result, LLAMA_TENSOR_NAME_FGDNAR, il);
+
+        ggml_tensor * output = ggml_view_4d(ctx0, result,
+            S_v, H_v, n_tokens, n_seqs,
+            ggml_row_size(result->type, S_v),
+            ggml_row_size(result->type, S_v * H_v),
+            ggml_row_size(result->type, S_v * H_v * n_tokens), 0);
+
+        ggml_tensor * new_state = ggml_view_4d(ctx0, result,
+            S_v, S_v, H_v, n_seqs,
+            ggml_row_size(result->type, S_v),
+            ggml_row_size(result->type, S_v * S_v),
+            ggml_row_size(result->type, S_v * S_v * H_v),
+            ggml_row_size(result->type, S_v * H_v * n_tokens * n_seqs));
+
+        return {output, new_state};
+    }
+
    const float scale = 1.0f / sqrtf(S_k);

    q = ggml_scale(ctx0, q, scale);