cuda : fix nkvo, offload and cuda graph node properties matching (#19165)
* cuda : fix nkvo
* cont : more robust cuda graph node property matching
* cont : restore pre-leafs implementation
* cont : comments + static_assert
This commit is contained in:
parent
7b7ae857f6
commit
4fdbc1e4db
4 changed files with 59 additions and 32 deletions
|
|
@@ -1630,11 +1630,6 @@ ggml_tensor * llm_graph_context::build_attn_mha(
|
|||
hparams.attn_soft_cap ? hparams.f_attn_logit_softcapping : 0.0f);
|
||||
cb(cur, LLAMA_TENSOR_NAME_FATTN, il);
|
||||
|
||||
if (!cparams.offload_kqv) {
|
||||
// all nodes between the KV store and the attention output are run on the CPU
|
||||
ggml_backend_sched_set_tensor_backend(sched, cur, backend_cpu);
|
||||
}
|
||||
|
||||
ggml_flash_attn_ext_add_sinks(cur, sinks);
|
||||
ggml_flash_attn_ext_set_prec (cur, GGML_PREC_F32);
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue