ggml-cpu: flash attention (FA) split across KV for faster token generation (TG) (#19209)

* ggml-cpu: split across kv for faster TG

* simplify sinks application

* add reference implementation (ref impl)
This commit is contained in:
Aman Gupta 2026-02-03 01:19:55 +08:00 committed by GitHub
parent a3fa035822
commit 9f682fb640
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 220 additions and 69 deletions

View file

@ -8591,6 +8591,13 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
output_printer->print_operation(info);
return false;
}
// Use reference implementation on the CPU backend for comparison
using ggml_backend_cpu_set_use_ref_t = void (*)(ggml_backend_t, bool);
auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend_cpu));
auto * set_use_ref = (ggml_backend_cpu_set_use_ref_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_set_use_ref");
if (set_use_ref) {
set_use_ref(backend_cpu, true);
}
size_t n_ok = 0;
size_t tests_run = 0;