common : inhibit lazy grammar sampler while reasoning is active (#20970)

* common : inhibit grammar while reasoning budget is active

* cont : update force_pos in accept

* cont : fix tests

* cont : tweak should apply logic

* cont : return early not using grammar sampler

* Add tests

* cont : prevent backend sampling when reasoning budget enabled

* cont : fix typo

---------

Co-authored-by: Piotr Wilkin <piotr.wilkin@syndatis.com>
This commit is contained in:
Aldehir Rojas 2026-03-27 12:30:40 -05:00 committed by GitHub
parent ff934e29bc
commit 59d840209a
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
8 changed files with 295 additions and 106 deletions

View file

@ -478,19 +478,17 @@ task_params server_task::params_from_json_cmpl(
// Parse reasoning budget sampler parameters
{
const int32_t budget = json_value(data, "reasoning_budget_tokens", (int32_t) -1);
if (budget >= 0) {
const auto start_tag = json_value(data, "reasoning_budget_start_tag", std::string());
const auto end_tag = json_value(data, "reasoning_budget_end_tag", std::string());
const auto message = json_value(data, "reasoning_budget_message", std::string());
params.sampling.reasoning_budget_tokens = budget;
const auto start_tag = json_value(data, "reasoning_budget_start_tag", std::string());
const auto end_tag = json_value(data, "reasoning_budget_end_tag", std::string());
const auto message = json_value(data, "reasoning_budget_message", std::string());
params.sampling.reasoning_budget_tokens = budget;
if (!start_tag.empty()) {
params.sampling.reasoning_budget_start = common_tokenize(vocab, start_tag, false, true);
}
if (!end_tag.empty()) {
params.sampling.reasoning_budget_end = common_tokenize(vocab, end_tag, false, true);
params.sampling.reasoning_budget_forced = common_tokenize(vocab, message + end_tag, false, true);
}
if (!start_tag.empty()) {
params.sampling.reasoning_budget_start = common_tokenize(vocab, start_tag, false, true);
}
if (!end_tag.empty()) {
params.sampling.reasoning_budget_end = common_tokenize(vocab, end_tag, false, true);
params.sampling.reasoning_budget_forced = common_tokenize(vocab, message + end_tag, false, true);
SRV_DBG("reasoning budget: tokens=%d, generation_prompt='%s', start=%zu toks, end=%zu toks, forced=%zu toks\n",
budget, params.sampling.generation_prompt.c_str(),