ggml : remove GGML_KQ_MASK_PAD constant (#17910)

* ggml : remove GGML_KQ_MASK_PAD constant

* cont : remove comment
Georgi Gerganov 2025-12-10 20:53:16 +02:00 committed by GitHub
parent 4df6e859e9
commit 4dff236a52
7 changed files with 19 additions and 36 deletions

src/llama-context.cpp

@@ -93,14 +93,6 @@ llama_context::llama_context(
     // with causal attention, the batch size is limited by the context size
     cparams.n_batch = cparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;
-    // the batch has to be at least GGML_KQ_MASK_PAD because we will be padding the KQ_mask
-    // this is required by GPU kernels in order to avoid out-of-bounds accesses (e.g. ggml_flash_attn_ext)
-    // ref: https://github.com/ggerganov/llama.cpp/pull/5021
-    // TODO: this padding is not needed for the cache-less context so we should probably move it to llama_memory
-    if (cparams.n_batch < GGML_KQ_MASK_PAD) {
-        LLAMA_LOG_WARN("%s: n_batch is less than GGML_KQ_MASK_PAD - increasing to %d\n", __func__, GGML_KQ_MASK_PAD);
-        cparams.n_batch = GGML_KQ_MASK_PAD;
-    }
     cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
     cparams.op_offload = params.op_offload;
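
For reference, below is a minimal standalone sketch of the clamp this hunk removes, showing its effect on a small batch. It is an illustration, not the upstream code: the value 64 for GGML_KQ_MASK_PAD (assumed to match its ggml.h definition prior to this commit) and the n_ctx/n_batch values are made up for the example.

// sketch only: reproduces the behavior of the removed clamp in isolation
#include <algorithm>
#include <cstdint>
#include <cstdio>

static constexpr uint32_t GGML_KQ_MASK_PAD = 64; // assumed pre-removal value

int main() {
    const uint32_t n_ctx       = 4096;
    const bool     causal_attn = true;

    uint32_t n_batch = 32; // user-requested batch size

    // with causal attention, the batch size is limited by the context size
    n_batch = causal_attn ? std::min(n_ctx, n_batch) : n_batch;

    // the clamp removed by this commit: round the batch up so the padded
    // KQ mask is never smaller than what GPU kernels may read
    if (n_batch < GGML_KQ_MASK_PAD) {
        fprintf(stderr, "n_batch is less than GGML_KQ_MASK_PAD - increasing to %u\n", GGML_KQ_MASK_PAD);
        n_batch = GGML_KQ_MASK_PAD;
    }

    printf("effective n_batch = %u\n", n_batch); // prints 64
    return 0;
}

As the removed TODO notes, this padding only matters for the KQ mask consumed by GPU kernels such as ggml_flash_attn_ext, so after this commit the concern no longer has to be handled by inflating n_batch at context creation.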