ggml: add GATED_DELTA_NET op (#19504)

* ggml: add GATED_DELTA_NET op

* remove the transpose

* add KDA

* add qwen35 dense

* llama : check for fused gated delta net backend support

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
Aman Gupta 2026-03-07 15:41:10 +08:00 committed by GitHub
parent 6fce5c6a7d
commit c5a778891b
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
15 changed files with 627 additions and 10 deletions

View file

@ -1,5 +1,7 @@
#include "models.h"
#include "llama-impl.h"
// utility to get one slice from the third dimension
// input dim: [x, y, c, b]
// output dim: [x, y, 1, b]
@ -39,6 +41,13 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_delta_net_base::build_delta_ne
GGML_ASSERT(b->ne[0] == 1 && b->ne[1] == H_v && b->ne[2] == n_tokens && b->ne[3] == n_seqs);
GGML_ASSERT(s->ne[0] == S_v && s->ne[1] == S_v && s->ne[2] == H_v && s->ne[3] == n_seqs);
// Fused path selected by cparams.fused_gdn_ch. The fused op call is still
// commented out below, so enabling this flag aborts graph construction here
// rather than falling through to the non-fused code that follows.
if (cparams.fused_gdn_ch) {
//ggml_tensor * result = ggml_gated_delta_net(ctx0, q, k, v, g, b, s);
//cb(result, LLAMA_TENSOR_NAME_FGDNCH, il);
GGML_ABORT("not implemented yet");
}
const float scale = 1.0f / sqrtf(S_k);
q = ggml_scale(ctx0, q, scale);
@ -316,6 +325,26 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_delta_net_base::build_delta_ne
GGML_ASSERT(b->ne[0] == 1 && b->ne[1] == H_v && b->ne[2] == n_tokens && b->ne[3] == n_seqs);
GGML_ASSERT(s->ne[0] == S_v && s->ne[1] == S_v && s->ne[2] == H_v && s->ne[3] == n_seqs);
// Fused path selected by cparams.fused_gdn_ar: emit a single
// ggml_gated_delta_net op and return early, bypassing the non-fused graph
// built by the code after this block.
if (cparams.fused_gdn_ar) {
ggml_tensor * result = ggml_gated_delta_net(ctx0, q, k, v, g, b, s);
cb(result, LLAMA_TENSOR_NAME_FGDNAR, il);
// Both results are carved out of the one `result` tensor as views.
// NOTE(review): this relies on the op packing the per-token output first and
// the updated state immediately after it — confirm against the
// ggml_gated_delta_net definition.
// Output view: [S_v, H_v, n_tokens, n_seqs], densely strided, starting at byte offset 0.
ggml_tensor * output = ggml_view_4d(ctx0, result,
S_v, H_v, n_tokens, n_seqs,
ggml_row_size(result->type, S_v),
ggml_row_size(result->type, S_v * H_v),
ggml_row_size(result->type, S_v * H_v * n_tokens), 0);
// State view: [S_v, S_v, H_v, n_seqs], densely strided, starting right after
// the S_v * H_v * n_tokens * n_seqs elements covered by the output view.
ggml_tensor * new_state = ggml_view_4d(ctx0, result,
S_v, S_v, H_v, n_seqs,
ggml_row_size(result->type, S_v),
ggml_row_size(result->type, S_v * S_v),
ggml_row_size(result->type, S_v * S_v * H_v),
ggml_row_size(result->type, S_v * H_v * n_tokens * n_seqs));
return {output, new_state};
}
const float scale = 1.0f / sqrtf(S_k);
q = ggml_scale(ctx0, q, scale);