graph : utilize ggml_build_forward_select() to avoid reallocations (#18898)

* graph : avoid branches between embedding and token inputs
* models : make deepstack graphs (e.g. Qwen3 VL) have constant topology
* ci : enable -DGGML_SCHED_NO_REALLOC=ON for server CI
* cont : pad token embeddings to n_embd_inp
2026-01-23 18:22:34 +02:00 · 2026-01-23 18:22:34 +02:00 · 557515be1e
commit 557515be1e
parent cb6caca191
7 changed files with 69 additions and 53 deletions
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -2903,7 +2903,7 @@ void llama_context::opt_epoch_iter(
                };
                ctx_compute_opt = ggml_init(params);
            }
-            ggml_opt_prepare_alloc(opt_ctx, ctx_compute_opt, gf, res->get_tokens(), res->get_logits());
+            ggml_opt_prepare_alloc(opt_ctx, ctx_compute_opt, gf, res->get_inp_tokens(), res->get_logits());
            ggml_opt_alloc(opt_ctx, train);

            res->set_inputs(&ubatch);