common/parser: handle reasoning budget (#20297)

* v1

* Finished!

* Handlie cli

* Reasoning sampler

* Apply suggestions from code review

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Less explosive terminology :)

* Add utf-8 case and tests

* common : migrate reasoning budget sampler to common

* cont : clean up

* cont : expose state and allow passing as initial state

* cont : remove unused imports

* cont : update state machine doc string

---------

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
Co-authored-by: Alde Rojas <hello@alde.dev>
This commit is contained in:
Piotr Wilkin (ilintar) 2026-03-11 10:26:12 +01:00 committed by GitHub
parent 5f91b1d5d5
commit acb7c79069
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
18 changed files with 670 additions and 10 deletions

View file

@ -235,6 +235,14 @@ struct common_params_sampling {
std::vector<llama_logit_bias> logit_bias; // logit biases to apply
std::vector<llama_logit_bias> logit_bias_eog; // pre-calculated logit biases for EOG tokens
// reasoning budget sampler parameters
// these are populated by the server/CLI based on chat template params
int32_t reasoning_budget_tokens = -1; // -1 = disabled, >= 0 = token budget
bool reasoning_budget_activate_immediately = false;
std::vector<llama_token> reasoning_budget_start; // start tag token sequence
std::vector<llama_token> reasoning_budget_end; // end tag token sequence
std::vector<llama_token> reasoning_budget_forced; // forced sequence (message + end tag)
bool backend_sampling = false;
bool has_logit_bias() const {
@ -536,7 +544,9 @@ struct common_params {
bool use_jinja = true; // NOLINT
bool enable_chat_template = true;
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
int enable_reasoning = -1; // -1 = auto, 0 = disable, 1 = enable
int reasoning_budget = -1;
std::string reasoning_budget_message; // message injected before end tag when budget exhausted
bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
int sleep_idle_seconds = -1; // if >0, server will sleep after this many seconds of idle time