ggml-webgpu: move from parameter buffer pool to single buffer with offsets (#21278)

* Work towards removing bitcast

* Move rest of existing types over

* Add timeout back to wait and remove synchronous set_tensor/memset_tensor

* move to unpackf16 for wider compatibility

* cleanup

* Remove deadlock condition in free_bufs

* Start work on removing parameter buffer pools

* Simplify and optimize further

* simplify profile futures

* Fix stride

* Try using a single command buffer per batch

* formatting
This commit is contained in:
Reese Levine 2026-04-03 11:40:14 -07:00 committed by GitHub
parent e439700992
commit d006858316
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 373 additions and 416 deletions

View file

@ -437,12 +437,18 @@ inline uint32_t ggml_webgpu_flash_attn_pick_vec_ne(const ggml_webgpu_flash_attn_
// Head-dim specializations used by the tuned vec f16 path.
switch (key.head_dim_qk) {
case 64: return 2u;
case 96: return 4u;
case 128: return 1u;
case 192: return 2u;
case 576: return 2u;
default: return 1u;
case 64:
return 2u;
case 96:
return 4u;
case 128:
return 1u;
case 192:
return 2u;
case 576:
return 2u;
default:
return 1u;
}
}
@ -1857,8 +1863,7 @@ class ggml_webgpu_shader_lib {
defines.push_back(std::string("SG_MAT_K=") + std::to_string(context.sg_mat_k));
uint32_t q_tile = context.sg_mat_m;
uint32_t kv_tile =
std::min(ggml_webgpu_flash_attn_max_kv_tile(context),
uint32_t kv_tile = std::min(ggml_webgpu_flash_attn_max_kv_tile(context),
context.sg_mat_n * GGML_WEBGPU_FLASH_ATTN_PREFERRED_KV_SG_TILES);
if (context.key.use_vec) {
q_tile = 1;

File diff suppressed because it is too large Load diff