common/parser: handle reasoning budget (#20297)

* v1

* Finished!

* Handlie cli

* Reasoning sampler

* Apply suggestions from code review

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Less explosive terminology :)

* Add utf-8 case and tests

* common : migrate reasoning budget sampler to common

* cont : clean up

* cont : expose state and allow passing as initial state

* cont : remove unused imports

* cont : update state machine doc string

---------

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
Co-authored-by: Alde Rojas <hello@alde.dev>
This commit is contained in:
Piotr Wilkin (ilintar) 2026-03-11 10:26:12 +01:00 committed by GitHub
parent 5f91b1d5d5
commit acb7c79069
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
18 changed files with 670 additions and 10 deletions

View file

@ -57,6 +57,8 @@ struct cli_context {
std::vector<raw_buffer> input_files;
task_params defaults;
bool verbose_prompt;
int reasoning_budget = -1;
std::string reasoning_budget_message;
// thread for showing "loading" animation
std::atomic<bool> loading_show;
@ -73,6 +75,8 @@ struct cli_context {
// defaults.return_progress = true; // TODO: show progress
verbose_prompt = params.verbose_prompt;
reasoning_budget = params.reasoning_budget;
reasoning_budget_message = params.reasoning_budget_message;
}
std::string generate_completion(result_timings & out_timings) {
@ -95,6 +99,24 @@ struct cli_context {
task.params.chat_parser_params.parser.load(chat_params.parser);
}
// reasoning budget sampler
if (reasoning_budget >= 0 && !chat_params.thinking_end_tag.empty()) {
const llama_vocab * vocab = llama_model_get_vocab(
llama_get_model(ctx_server.get_llama_context()));
task.params.sampling.reasoning_budget_tokens = reasoning_budget;
task.params.sampling.reasoning_budget_activate_immediately = chat_params.thinking_forced_open;
if (!chat_params.thinking_start_tag.empty()) {
task.params.sampling.reasoning_budget_start =
common_tokenize(vocab, chat_params.thinking_start_tag, false, true);
}
task.params.sampling.reasoning_budget_end =
common_tokenize(vocab, chat_params.thinking_end_tag, false, true);
task.params.sampling.reasoning_budget_forced =
common_tokenize(vocab, reasoning_budget_message + chat_params.thinking_end_tag, false, true);
}
rd.post_task({std::move(task)});
}