ggml : remove GGML_KQ_MASK_PAD constant (#17910)
* ggml : remove GGML_KQ_MASK_PAD constant
* cont : remove comment
This commit is contained in:
parent 4df6e859e9
commit 4dff236a52
7 changed files with 19 additions and 36 deletions
```diff
@@ -93,14 +93,6 @@ llama_context::llama_context(
     // with causal attention, the batch size is limited by the context size
     cparams.n_batch = cparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;
 
-    // the batch has to be at least GGML_KQ_MASK_PAD because we will be padding the KQ_mask
-    // this is required by GPU kernels in order to avoid out-of-bounds accesses (e.g. ggml_flash_attn_ext)
-    // ref: https://github.com/ggerganov/llama.cpp/pull/5021
-    // TODO: this padding is not needed for the cache-less context so we should probably move it to llama_memory
-    if (cparams.n_batch < GGML_KQ_MASK_PAD) {
-        LLAMA_LOG_WARN("%s: n_batch is less than GGML_KQ_MASK_PAD - increasing to %d\n", __func__, GGML_KQ_MASK_PAD);
-        cparams.n_batch = GGML_KQ_MASK_PAD;
-    }
-
     cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
 
     cparams.op_offload = params.op_offload;
```
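For readers skimming the hunk: the only behavioral change here is that cparams.n_batch is no longer raised to a minimum of GGML_KQ_MASK_PAD; the causal-attention clamp to the context size stays. Below is a minimal, self-contained sketch of the before/after logic. The struct and function names are illustrative stand-ins, not the real llama.cpp types, and the pad value used in main() is only an example, not necessarily the value GGML_KQ_MASK_PAD had.

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>

// illustrative stand-in for the relevant llama.cpp parameters (not the real structs)
struct batch_params {
    uint32_t n_ctx;
    uint32_t n_batch;
    bool     causal_attn;
};

// old behavior: n_batch was clamped to the context size and then raised to at
// least the KQ-mask padding, so flash-attention kernels (ggml_flash_attn_ext)
// would not read past the end of the padded KQ mask
uint32_t n_batch_before(const batch_params & p, uint32_t kq_mask_pad /* formerly GGML_KQ_MASK_PAD */) {
    uint32_t n_batch = p.causal_attn ? std::min(p.n_ctx, p.n_batch) : p.n_batch;
    return std::max(n_batch, kq_mask_pad);
}

// new behavior after this commit: only the causal-attention clamp remains
uint32_t n_batch_after(const batch_params & p) {
    return p.causal_attn ? std::min(p.n_ctx, p.n_batch) : p.n_batch;
}

int main() {
    const batch_params p = { /*n_ctx=*/4096, /*n_batch=*/8, /*causal_attn=*/true };
    // with a small requested batch, the old code would bump it up to the pad size
    // (64 here is just an example pad value)
    printf("before: %u, after: %u\n", n_batch_before(p, 64), n_batch_after(p));
    return 0;
}
```

The removed TODO suggested moving this padding concern into llama_memory; presumably the KQ-mask padding needed by the GPU kernels is now handled elsewhere in the graph/memory code, which is why the explicit lower bound on n_batch could be dropped, but the hunk shown here does not itself show where that happens.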