ggml: add GATED_DELTA_NET op (#19504)
* ggml: add GATED_DELTA_NET op
* remove the transpose
* add KDA
* add qwen35 dense
* llama : check for fused gated delta net backend support
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
parent
6fce5c6a7d
commit
c5a778891b
15 changed files with 627 additions and 10 deletions
|
|
@ -1,5 +1,7 @@
|
|||
#include "models.h"
|
||||
|
||||
#include "llama-impl.h"
|
||||
|
||||
// utility to get one slice from the third dimension
|
||||
// input dim: [x, y, c, b]
|
||||
// output dim: [x, y, 1, b]
|
||||
|
|
@ -39,6 +41,13 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_delta_net_base::build_delta_ne
|
|||
GGML_ASSERT(b->ne[0] == 1 && b->ne[1] == H_v && b->ne[2] == n_tokens && b->ne[3] == n_seqs);
|
||||
GGML_ASSERT(s->ne[0] == S_v && s->ne[1] == S_v && s->ne[2] == H_v && s->ne[3] == n_seqs);
|
||||
|
||||
if (cparams.fused_gdn_ch) {
|
||||
//ggml_tensor * result = ggml_gated_delta_net(ctx0, q, k, v, g, b, s);
|
||||
//cb(result, LLAMA_TENSOR_NAME_FGDNCH, il);
|
||||
|
||||
GGML_ABORT("not implemented yet");
|
||||
}
|
||||
|
||||
const float scale = 1.0f / sqrtf(S_k);
|
||||
|
||||
q = ggml_scale(ctx0, q, scale);
|
||||
|
|
@ -316,6 +325,26 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_delta_net_base::build_delta_ne
|
|||
GGML_ASSERT(b->ne[0] == 1 && b->ne[1] == H_v && b->ne[2] == n_tokens && b->ne[3] == n_seqs);
|
||||
GGML_ASSERT(s->ne[0] == S_v && s->ne[1] == S_v && s->ne[2] == H_v && s->ne[3] == n_seqs);
|
||||
|
||||
if (cparams.fused_gdn_ar) {
|
||||
ggml_tensor * result = ggml_gated_delta_net(ctx0, q, k, v, g, b, s);
|
||||
cb(result, LLAMA_TENSOR_NAME_FGDNAR, il);
|
||||
|
||||
ggml_tensor * output = ggml_view_4d(ctx0, result,
|
||||
S_v, H_v, n_tokens, n_seqs,
|
||||
ggml_row_size(result->type, S_v),
|
||||
ggml_row_size(result->type, S_v * H_v),
|
||||
ggml_row_size(result->type, S_v * H_v * n_tokens), 0);
|
||||
|
||||
ggml_tensor * new_state = ggml_view_4d(ctx0, result,
|
||||
S_v, S_v, H_v, n_seqs,
|
||||
ggml_row_size(result->type, S_v),
|
||||
ggml_row_size(result->type, S_v * S_v),
|
||||
ggml_row_size(result->type, S_v * S_v * H_v),
|
||||
ggml_row_size(result->type, S_v * H_v * n_tokens * n_seqs));
|
||||
|
||||
return {output, new_state};
|
||||
}
|
||||
|
||||
const float scale = 1.0f / sqrtf(S_k);
|
||||
|
||||
q = ggml_scale(ctx0, q, scale);
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue