From cd18a50ea573c081e7fa5604c85e4d571fd6ae4f Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Sun, 8 Mar 2026 06:33:48 -0500 Subject: [PATCH] vulkan: Fix data races in coopmat1 mul_mat(_id) (#20084) * vulkan: Fix data races in coopmat1 mul_mat(_id) Add barriers between coopmat store and regular loads. We sort of got away with this because it was the same subgroup accessing the values, but it's still a race and may not work. * switch to subgroup control barriers --- ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp index 79344d330..23f3bd8d6 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp @@ -377,6 +377,7 @@ void main() { [[unroll]] for (uint cm_col = 0; cm_col < cms_per_col; cm_col++) { coopMatStore(sums[cm_col * cms_per_row + cm_row], coopmat_stage, warp_i * TM * TN, TM, gl_CooperativeMatrixLayoutColumnMajor); + barrier(); [[unroll]] for (uint col = 0; col < TN; col += storestride) { const uint row_i = dc + cm_col * TN + col + store_c; if (row_i >= _ne1) break; @@ -387,6 +388,7 @@ void main() { data_d[row_idx.y * p.batch_stride_d + row_idx.x * p.stride_d + dr + cm_row * TM + store_r] = D_TYPE(coopmat_stage[warp_i * TM * TN + (col + store_c) * TM + store_r]); } } + barrier(); } } #else @@ -404,18 +406,22 @@ void main() { // Full coopMat is within bounds, but stride_d is not aligned coopMatStore(sums[cm_col * cms_per_row + cm_row], coopmat_stage, warp_i * TM * TN, TM, gl_CooperativeMatrixLayoutColumnMajor); + controlBarrier(gl_ScopeSubgroup, gl_ScopeSubgroup, gl_StorageSemanticsShared, gl_SemanticsAcquireRelease); [[unroll]] for (uint col = 0; col < TN; col += storestride) { data_d[offsets + (dc + cm_col * TN + col + store_c) * p.stride_d + dr + cm_row * TM + store_r] = D_TYPE(coopmat_stage[warp_i * TM * TN + (col + store_c) * TM + store_r]); } + controlBarrier(gl_ScopeSubgroup, gl_ScopeSubgroup, gl_StorageSemanticsShared, gl_SemanticsAcquireRelease); } else if (dr + cm_row * TM < p.M && dc + cm_col * TN < p.N) { // Partial coopMat is within bounds coopMatStore(sums[cm_col * cms_per_row + cm_row], coopmat_stage, warp_i * TM * TN, TM, gl_CooperativeMatrixLayoutColumnMajor); + controlBarrier(gl_ScopeSubgroup, gl_ScopeSubgroup, gl_StorageSemanticsShared, gl_SemanticsAcquireRelease); [[unroll]] for (uint col = 0; col < TN; col += storestride) { if (dr + cm_row * TM + store_r < p.M && dc + cm_col * TN + col + store_c < p.N) { data_d[offsets + (dc + cm_col * TN + col + store_c) * p.stride_d + dr + cm_row * TM + store_r] = D_TYPE(coopmat_stage[warp_i * TM * TN + (col + store_c) * TM + store_r]); } } + controlBarrier(gl_ScopeSubgroup, gl_ScopeSubgroup, gl_StorageSemanticsShared, gl_SemanticsAcquireRelease); } } }