common/parser: add proper reasoning tag prefill reading (#20424)

* Implement proper prefill extraction

* Refactor cli parameters, update docs, move reasoning budget sampler part to common/reasoning-budget.cpp

* Update tools/server/server-task.cpp

* refactor: move grammars to variant, remove grammar_external, handle exception internally

* Make code less C++y

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Piotr Wilkin (ilintar) 2026-03-19 16:58:21 +01:00 committed by GitHub
parent c1258830b2
commit 5e54d51b19
33 changed files with 651 additions and 454 deletions
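
The heart of the change: instead of threading a boolean `thinking_forced_open` flag through the server, the chat layer now forwards the exact string the template prefilled into the prompt as `generation_prompt`, and the parser prepends it to the model output. A minimal sketch of that flow, assuming an example `<think>` tag (the helper and tag value are illustrative; only the `generation_prompt` field name comes from this diff):

```cpp
// Illustrative only: shows why a string prefill subsumes the old boolean flag.
#include <iostream>
#include <string>

struct sampling_params {
    // string prefilled by the chat template, e.g. an opening reasoning tag;
    // empty when the template added nothing (the old flag's "false" case)
    std::string generation_prompt;
};

// Prepend the prefill so the reasoning parser sees a well-formed tag pair.
static std::string assemble_for_parsing(const sampling_params & sp, const std::string & raw_output) {
    return sp.generation_prompt + raw_output;
}

int main() {
    sampling_params sp;
    sp.generation_prompt = "<think>\n";  // assumed example tag
    std::cout << assemble_for_parsing(sp, "step 1...</think>Answer: 4\n");
}
```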

@@ -105,7 +105,7 @@ struct cli_context {
         llama_get_model(ctx_server.get_llama_context()));
     task.params.sampling.reasoning_budget_tokens = reasoning_budget;
-    task.params.sampling.reasoning_budget_activate_immediately = chat_params.thinking_forced_open;
+    task.params.sampling.generation_prompt = chat_params.generation_prompt;
     if (!chat_params.thinking_start_tag.empty()) {
         task.params.sampling.reasoning_budget_start =

@@ -282,7 +282,7 @@ static void render_scenario(const common_chat_template & tmpl,
     LOG_ERR("Messages:\n%s\n", final_messages.dump(2).c_str());
     try {
-        autoparser::templates_params inputs;
+        autoparser::generation_params inputs;
         inputs.messages = final_messages;
         inputs.add_generation_prompt = add_generation_prompt;
         inputs.extra_context["enable_thinking"] = enable_thinking;
@@ -395,7 +395,7 @@ int main(int argc, char ** argv) {
     analysis.analyze_template(chat_template);
     // Generate Parser
-    autoparser::templates_params params;
+    autoparser::generation_params params;
     params.messages = json::array({ build_user_message() });
     params.reasoning_format =
         opts.enable_reasoning ? COMMON_REASONING_FORMAT_DEEPSEEK : COMMON_REASONING_FORMAT_NONE;

@@ -400,12 +400,12 @@ static void analyze_template(const std::string & template_path) {
     {
         json user_msg = make_user_msg();
-        autoparser::templates_params params_no_tools;
+        autoparser::generation_params params_no_tools;
         params_no_tools.messages = json::array({ user_msg });
         params_no_tools.add_generation_prompt = false;
         params_no_tools.tools = json::array();
-        autoparser::templates_params params_with_tools = params_no_tools;
+        autoparser::generation_params params_with_tools = params_no_tools;
         params_with_tools.tools = tools;
         std::string output_no_tools = common_chat_template_direct_apply(chat_template, params_no_tools);
@@ -419,12 +419,12 @@ static void analyze_template(const std::string & template_path) {
     {
         json user_msg = make_user_msg();
-        autoparser::templates_params params_no_prompt;
+        autoparser::generation_params params_no_prompt;
         params_no_prompt.messages = json::array({ user_msg });
         params_no_prompt.add_generation_prompt = false;
         params_no_prompt.tools = json::array();
-        autoparser::templates_params params_with_prompt = params_no_prompt;
+        autoparser::generation_params params_with_prompt = params_no_prompt;
         params_with_prompt.add_generation_prompt = true;
         std::string output_no_prompt = common_chat_template_direct_apply(chat_template, params_no_prompt);
@@ -438,12 +438,12 @@ static void analyze_template(const std::string & template_path) {
     {
         json user_msg = make_user_msg();
-        autoparser::templates_params params_no_reasoning;
+        autoparser::generation_params params_no_reasoning;
         params_no_reasoning.messages = json::array({ user_msg, make_assistant_no_reasoning() });
         params_no_reasoning.add_generation_prompt = false;
         params_no_reasoning.enable_thinking = true;
-        autoparser::templates_params params_with_reasoning = params_no_reasoning;
+        autoparser::generation_params params_with_reasoning = params_no_reasoning;
         params_with_reasoning.messages = json::array({ user_msg, make_assistant_with_reasoning() });
         std::string output_no_reasoning = common_chat_template_direct_apply(chat_template, params_no_reasoning);
@@ -458,12 +458,12 @@ static void analyze_template(const std::string & template_path) {
         json user_msg = make_user_msg();
         json user_msg2 = make_user_msg2();
-        autoparser::templates_params params_no_reasoning;
+        autoparser::generation_params params_no_reasoning;
         params_no_reasoning.messages = json::array({ user_msg, make_assistant_no_reasoning(), user_msg2 });
         params_no_reasoning.add_generation_prompt = false;
         params_no_reasoning.enable_thinking = true;
-        autoparser::templates_params params_with_reasoning = params_no_reasoning;
+        autoparser::generation_params params_with_reasoning = params_no_reasoning;
         params_with_reasoning.messages = json::array({ user_msg, make_assistant_with_reasoning(), user_msg2 });
         std::string output_no_reasoning = common_chat_template_direct_apply(chat_template, params_no_reasoning);
@@ -477,12 +477,12 @@ static void analyze_template(const std::string & template_path) {
     {
         json user_msg = make_user_msg();
-        autoparser::templates_params params_no_tool;
+        autoparser::generation_params params_no_tool;
         params_no_tool.messages = json::array({ user_msg, make_assistant_no_tool() });
         params_no_tool.add_generation_prompt = false;
         params_no_tool.tools = tools;
-        autoparser::templates_params params_with_tool = params_no_tool;
+        autoparser::generation_params params_with_tool = params_no_tool;
         params_with_tool.messages = json::array({ user_msg, make_assistant_one_tool() });
         std::string output_no_tool = common_chat_template_direct_apply(chat_template, params_no_tool);
@@ -497,12 +497,12 @@ static void analyze_template(const std::string & template_path) {
         json user_msg = make_user_msg();
         json user_msg2 = make_user_msg2_continue();
-        autoparser::templates_params params_no_tool;
+        autoparser::generation_params params_no_tool;
         params_no_tool.messages = json::array({ user_msg, make_assistant_no_tool(), user_msg2 });
         params_no_tool.add_generation_prompt = false;
         params_no_tool.tools = tools;
-        autoparser::templates_params params_with_tool = params_no_tool;
+        autoparser::generation_params params_with_tool = params_no_tool;
         params_with_tool.messages = json::array({ user_msg, make_assistant_one_tool(), user_msg2 });
         std::string output_no_tool = common_chat_template_direct_apply(chat_template, params_no_tool);
@@ -516,12 +516,12 @@ static void analyze_template(const std::string & template_path) {
     {
         json user_msg = make_user_msg();
-        autoparser::templates_params params_one_tool;
+        autoparser::generation_params params_one_tool;
         params_one_tool.messages = json::array({ user_msg, make_assistant_one_tool() });
         params_one_tool.add_generation_prompt = false;
         params_one_tool.tools = tools;
-        autoparser::templates_params params_two_tools = params_one_tool;
+        autoparser::generation_params params_two_tools = params_one_tool;
         params_two_tools.messages = json::array({ user_msg, make_assistant_two_tools() });
         std::string output_one_tool = common_chat_template_direct_apply(chat_template, params_one_tool);
@@ -536,12 +536,12 @@ static void analyze_template(const std::string & template_path) {
         json user_msg = make_user_msg();
         json user_msg2 = make_user_msg2_continue();
-        autoparser::templates_params params_one_tool;
+        autoparser::generation_params params_one_tool;
         params_one_tool.messages = json::array({ user_msg, make_assistant_one_tool(), user_msg2 });
         params_one_tool.add_generation_prompt = false;
         params_one_tool.tools = tools;
-        autoparser::templates_params params_two_tools = params_one_tool;
+        autoparser::generation_params params_two_tools = params_one_tool;
         params_two_tools.messages = json::array({ user_msg, make_assistant_two_tools(), user_msg2 });
         std::string output_one_tool = common_chat_template_direct_apply(chat_template, params_one_tool);
@@ -555,13 +555,13 @@ static void analyze_template(const std::string & template_path) {
     {
         json user_msg = make_user_msg();
-        autoparser::templates_params params_no_reasoning;
+        autoparser::generation_params params_no_reasoning;
         params_no_reasoning.messages = json::array({ user_msg, make_assistant_one_tool() });
         params_no_reasoning.add_generation_prompt = false;
        params_no_reasoning.tools = tools;
         params_no_reasoning.enable_thinking = true;
-        autoparser::templates_params params_with_reasoning = params_no_reasoning;
+        autoparser::generation_params params_with_reasoning = params_no_reasoning;
         params_with_reasoning.messages = json::array({ user_msg, make_assistant_one_tool_with_reasoning() });
         std::string output_no_reasoning = common_chat_template_direct_apply(chat_template, params_no_reasoning);
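
All of these hunks are the same mechanical rename, `autoparser::templates_params` to `autoparser::generation_params`. For orientation, a sketch of the shape the call sites rely on, with field types inferred from usage in this diff rather than from the real header:

```cpp
#include <nlohmann/json.hpp>
using json = nlohmann::json;

namespace autoparser {
// formerly autoparser::templates_params; layout inferred from the call sites above
struct generation_params {
    json messages              = json::array();   // chat messages to render
    json tools                 = json::array();   // tool definitions, if any
    bool add_generation_prompt = false;           // append the assistant prefix
    bool enable_thinking       = false;           // allow reasoning output
    json extra_context         = json::object();  // e.g. extra_context["enable_thinking"]
    int  reasoning_format      = 0;               // a COMMON_REASONING_FORMAT_* value
};
} // namespace autoparser

int main() {
    autoparser::generation_params p;
    p.messages.push_back({ {"role", "user"}, {"content", "hi"} });
    p.add_generation_prompt = true;
}
```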

@@ -907,7 +907,7 @@ If query param `?fail_on_no_slot=1` is set, this endpoint will respond with stat
     "chat_format": "GPT-OSS",
     "reasoning_format": "none",
     "reasoning_in_content": false,
-    "thinking_forced_open": false,
+    "generation_prompt": "",
     "samplers": [
       "penalties",
       "dry",
@@ -972,7 +972,7 @@ If query param `?fail_on_no_slot=1` is set, this endpoint will respond with stat
     "chat_format": "GPT-OSS",
     "reasoning_format": "none",
     "reasoning_in_content": false,
-    "thinking_forced_open": false,
+    "generation_prompt": "",
     "samplers": [
       "penalties",
       "dry",
@@ -1193,7 +1193,7 @@ The `response_format` parameter supports both plain JSON output (e.g. `{"type":
 `reasoning_format`: The reasoning format to be parsed. If set to `none`, it will output the raw generated text.
-`thinking_forced_open`: Force a reasoning model to always output the reasoning. Only works on certain models.
+`generation_prompt`: The generation prompt that was prefilled in by the template. Prepended to model output before parsing.
 `parse_tool_calls`: Whether to parse the generated tool call.
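
Per the new `generation_prompt` description above, the slot now reports the prefilled string instead of a flag, and the server prepends it to the raw output before parsing. A hedged round-trip example (values are made up; only the field names come from the docs above):

```cpp
#include <nlohmann/json.hpp>
#include <iostream>
using json = nlohmann::json;

int main() {
    // Shape of the updated /slots entry (values illustrative)
    json slot = {
        {"chat_format",          "GPT-OSS"},
        {"reasoning_format",     "none"},
        {"reasoning_in_content", false},
        {"generation_prompt",    "<think>\n"},  // "" when nothing was prefilled
    };
    // Parsing side: prepend the prefill to the raw completion
    std::string raw      = "reasoning...</think>final answer";
    std::string to_parse = slot["generation_prompt"].get<std::string>() + raw;
    std::cout << to_parse << "\n";
}
```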

Binary file not shown.

@@ -1081,20 +1081,21 @@ json oaicompat_chat_params_parse(
         }
     }
-    llama_params["chat_format"] = static_cast<int>(chat_params.format);
-    llama_params["prompt"] = chat_params.prompt;
+    llama_params["chat_format"]      = static_cast<int>(chat_params.format);
+    llama_params["prompt"]           = chat_params.prompt;
     if (!chat_params.grammar.empty()) {
-        llama_params["grammar"] = chat_params.grammar;
+        llama_params["grammar"]      = chat_params.grammar;
+        llama_params["grammar_type"] = std::string("tool_calls");
     }
-    llama_params["grammar_lazy"] = chat_params.grammar_lazy;
-    auto grammar_triggers = json::array();
+    llama_params["grammar_lazy"]     = chat_params.grammar_lazy;
+    auto grammar_triggers            = json::array();
     for (const auto & trigger : chat_params.grammar_triggers) {
         server_grammar_trigger ct(trigger);
         grammar_triggers.push_back(ct.to_json());
     }
-    llama_params["grammar_triggers"] = grammar_triggers;
-    llama_params["preserved_tokens"] = chat_params.preserved_tokens;
-    llama_params["thinking_forced_open"] = chat_params.thinking_forced_open;
+    llama_params["grammar_triggers"]  = grammar_triggers;
+    llama_params["preserved_tokens"]  = chat_params.preserved_tokens;
+    llama_params["generation_prompt"] = chat_params.generation_prompt;
     for (const auto & stop : chat_params.additional_stops) {
         llama_params["stop"].push_back(stop);
     }
@@ -1114,7 +1115,6 @@ json oaicompat_chat_params_parse(
     llama_params["reasoning_budget_start_tag"] = chat_params.thinking_start_tag;
     llama_params["reasoning_budget_end_tag"] = chat_params.thinking_end_tag;
     llama_params["reasoning_budget_message"] = opt.reasoning_budget_message;
-    llama_params["reasoning_budget_activate_immediately"] = chat_params.thinking_forced_open;
     }
 }
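
Note the new `grammar_type` key: the chat layer tags template-generated tool-call grammars so the task parser (next file) can tell them apart from a user-supplied `grammar` field. A stand-alone sketch of the tagging side (everything but the two key names is illustrative):

```cpp
#include <nlohmann/json.hpp>
#include <string>
using json = nlohmann::json;

// Chat layer: attach the grammar plus its provenance tag to the task JSON.
json tag_template_grammar(const std::string & grammar_from_template) {
    json llama_params;
    if (!grammar_from_template.empty()) {
        llama_params["grammar"]      = grammar_from_template;
        llama_params["grammar_type"] = "tool_calls";  // template-generated, not user input
    }
    return llama_params;
}

int main() {
    json t = tag_template_grammar("root ::= tool_call");  // illustrative GBNF
    (void) t;
}
```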

@@ -15,6 +15,7 @@
 #include <algorithm>
 #include <cstddef>
 #include <cinttypes>
+#include <exception>
 #include <memory>
 #include <filesystem>
@@ -1152,11 +1153,11 @@ private:
     // initialize samplers
     if (task.need_sampling()) {
-        slot.smpl.reset(common_sampler_init(model, task.params.sampling));
-        if (slot.smpl == nullptr) {
-            // for now, the only error that may happen here is invalid grammar
-            send_error(task, "Failed to parse grammar", ERROR_TYPE_INVALID_REQUEST);
+        try {
+            slot.smpl.reset(common_sampler_init(model, task.params.sampling));
+        } catch (std::exception & e) {
+            std::string err_msg = std::string("Failed to initialize samplers: ") + e.what();
+            send_error(task, err_msg, ERROR_TYPE_INVALID_REQUEST);
             return false;
         }
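
The nullptr check gives way to exception propagation: sampler initialization can now throw (e.g. from grammar parsing), and the error text carries the underlying cause. A self-contained sketch of the pattern, with stand-ins for `common_sampler_init` and `send_error`:

```cpp
#include <iostream>
#include <memory>
#include <stdexcept>
#include <string>

struct sampler {};

// Stand-in for common_sampler_init: throws instead of returning nullptr.
std::unique_ptr<sampler> init_sampler(const std::string & grammar) {
    if (grammar == "not a grammar") {
        throw std::runtime_error("failed to parse grammar");
    }
    return std::make_unique<sampler>();
}

bool launch_slot(const std::string & grammar) {
    std::unique_ptr<sampler> smpl;
    try {
        smpl = init_sampler(grammar);
    } catch (const std::exception & e) {
        // the client now sees the specific cause, not a fixed message
        std::cerr << "Failed to initialize samplers: " << e.what() << "\n";
        return false;
    }
    return true;
}

int main() {
    launch_slot("not a grammar");    // rejected, with the underlying cause
    launch_slot("root ::= \"ok\"");  // accepted
}
```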

@@ -72,7 +72,7 @@ json task_params::to_json(bool only_metrics) const {
     {"chat_format", common_chat_format_name(chat_parser_params.format)},
     {"reasoning_format", common_reasoning_format_name(chat_parser_params.reasoning_format)},
     {"reasoning_in_content", chat_parser_params.reasoning_in_content},
-    {"thinking_forced_open", chat_parser_params.thinking_forced_open},
+    {"generation_prompt", chat_parser_params.generation_prompt},
     {"samplers", samplers},
     {"speculative.n_max", speculative.n_max},
     {"speculative.n_min", speculative.n_min},
@@ -128,14 +128,14 @@ json task_params::to_json(bool only_metrics) const {
     {"logit_bias", format_logit_bias(sampling.logit_bias)},
     {"n_probs", sampling.n_probs},
     {"min_keep", sampling.min_keep},
-    {"grammar", sampling.grammar},
+    {"grammar", common_grammar_value(sampling.grammar)},
     {"grammar_lazy", sampling.grammar_lazy},
     {"grammar_triggers", grammar_triggers},
     {"preserved_tokens", sampling.preserved_tokens},
     {"chat_format", common_chat_format_name(chat_parser_params.format)},
     {"reasoning_format", common_reasoning_format_name(chat_parser_params.reasoning_format)},
     {"reasoning_in_content", chat_parser_params.reasoning_in_content},
-    {"thinking_forced_open", chat_parser_params.thinking_forced_open},
+    {"generation_prompt", chat_parser_params.generation_prompt},
     {"samplers", samplers},
     {"speculative.n_max", speculative.n_max},
     {"speculative.n_min", speculative.n_min},
@@ -376,14 +376,25 @@ task_params server_task::params_from_json_cmpl(
     try {
         auto schema = json_value(data, "json_schema", json::object());
         SRV_DBG("JSON schema: %s\n", schema.dump(2).c_str());
-        params.sampling.grammar = json_schema_to_grammar(schema);
-        SRV_DBG("Converted grammar: %s\n", params.sampling.grammar.c_str());
+        std::string grammar_str = json_schema_to_grammar(schema);
+        SRV_DBG("Converted grammar: %s\n", grammar_str.c_str());
+        params.sampling.grammar = {COMMON_GRAMMAR_TYPE_OUTPUT_FORMAT, std::move(grammar_str)};
     } catch (const std::exception & e) {
         throw std::runtime_error(std::string("\"json_schema\": ") + e.what());
     }
 } else {
-    params.sampling.grammar = json_value(data, "grammar", defaults.sampling.grammar);
-    SRV_DBG("Grammar: %s\n", params.sampling.grammar.c_str());
+    std::string grammar_str = json_value(data, "grammar", std::string());
+    if (!grammar_str.empty()) {
+        // grammar_type key is set by the server when converting chat template grammars
+        std::string grammar_type = json_value(data, "grammar_type", std::string());
+        if (grammar_type == "tool_calls") {
+            params.sampling.grammar = {COMMON_GRAMMAR_TYPE_TOOL_CALLS, std::move(grammar_str)};
+        } else {
+            // explicit grammar from the user (API field "grammar")
+            params.sampling.grammar = {COMMON_GRAMMAR_TYPE_USER, std::move(grammar_str)};
+        }
+        SRV_DBG("Grammar (%s): %s\n", grammar_type.c_str(), common_grammar_value(params.sampling.grammar).c_str());
+    }
     params.sampling.grammar_lazy = json_value(data, "grammar_lazy", defaults.sampling.grammar_lazy);
     SRV_DBG("Grammar lazy: %s\n", params.sampling.grammar_lazy ? "true" : "false");
 }
@@ -402,7 +413,8 @@ task_params server_task::params_from_json_cmpl(
     }
     params.chat_parser_params.reasoning_format = reasoning_format;
     params.chat_parser_params.reasoning_in_content = params.stream && (reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY);
-    params.chat_parser_params.thinking_forced_open = json_value(data, "thinking_forced_open", false);
+    params.chat_parser_params.generation_prompt = json_value(data, "generation_prompt", std::string());
+    params.sampling.generation_prompt = params.chat_parser_params.generation_prompt;
     params.chat_parser_params.parse_tool_calls = json_value(data, "parse_tool_calls", false);
     if (data.contains("chat_parser")) {
         params.chat_parser_params.parser.load(data.at("chat_parser").get<std::string>());
@@ -469,10 +481,7 @@ task_params server_task::params_from_json_cmpl(
     const auto start_tag = json_value(data, "reasoning_budget_start_tag", std::string());
     const auto end_tag = json_value(data, "reasoning_budget_end_tag", std::string());
     const auto message = json_value(data, "reasoning_budget_message", std::string());
-    const bool activate_imm = json_value(data, "reasoning_budget_activate_immediately", false);
     params.sampling.reasoning_budget_tokens = budget;
-    params.sampling.reasoning_budget_activate_immediately = activate_imm;
     if (!start_tag.empty()) {
         params.sampling.reasoning_budget_start = common_tokenize(vocab, start_tag, false, true);
@@ -482,8 +491,8 @@ task_params server_task::params_from_json_cmpl(
         params.sampling.reasoning_budget_forced = common_tokenize(vocab, message + end_tag, false, true);
     }
-    SRV_DBG("reasoning budget: tokens=%d, activate_immediately=%s, start=%zu toks, end=%zu toks, forced=%zu toks\n",
-        budget, activate_imm ? "true" : "false",
+    SRV_DBG("reasoning budget: tokens=%d, generation_prompt='%s', start=%zu toks, end=%zu toks, forced=%zu toks\n",
+        budget, params.sampling.generation_prompt.c_str(),
         params.sampling.reasoning_budget_start.size(),
         params.sampling.reasoning_budget_end.size(),
         params.sampling.reasoning_budget_forced.size());
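
Per the commit message, `sampling.grammar` is now a variant-like value rather than a bare string, which is why the brace-initializations above pair a `COMMON_GRAMMAR_TYPE_*` tag with the GBNF text and why serialization goes through `common_grammar_value`. A sketch inferred from those call sites (the real definition in common/ may differ):

```cpp
#include <string>
#include <utility>

// The three grammar provenances visible in this diff.
enum common_grammar_type {
    COMMON_GRAMMAR_TYPE_USER,           // explicit "grammar" field in the request
    COMMON_GRAMMAR_TYPE_TOOL_CALLS,     // chat-template grammar ("grammar_type": "tool_calls")
    COMMON_GRAMMAR_TYPE_OUTPUT_FORMAT,  // converted from "json_schema"
};

// Tagged value matching the brace-initializations above; an approximation of
// the variant mentioned in the commit message.
struct common_grammar {
    common_grammar_type type = COMMON_GRAMMAR_TYPE_USER;
    std::string         text;  // the GBNF grammar itself
};

// Accessor used when logging and serializing back to JSON.
static const std::string & common_grammar_value(const common_grammar & g) {
    return g.text;
}

int main() {
    common_grammar g = { COMMON_GRAMMAR_TYPE_TOOL_CALLS, std::string("root ::= call") };
    (void) common_grammar_value(g);
}
```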

@@ -210,6 +210,7 @@ def test_completion_with_response_format(response_format: dict, n_predicted: int
 def test_completion_with_json_schema(jinja: bool, json_schema: dict, n_predicted: int, re_content: str):
     global server
     server.jinja = jinja
+    server.debug = True
     server.start()
     res = server.make_request("POST", "/chat/completions", data={
         "max_tokens": n_predicted,

@@ -51,7 +51,7 @@ describe('ParameterSyncService', () => {
     chat_format: '',
     reasoning_format: '',
     reasoning_in_content: false,
-    thinking_forced_open: false,
+    generation_prompt: '',
     'speculative.n_max': 0,
     'speculative.n_min': 0,
     'speculative.p_min': 0.0,
@@ -116,7 +116,7 @@ describe('ParameterSyncService', () => {
     chat_format: '',
     reasoning_format: '',
     reasoning_in_content: false,
-    thinking_forced_open: false,
+    generation_prompt: '',
     'speculative.n_max': 0,
     'speculative.n_min': 0,
     'speculative.p_min': 0.0,

@@ -164,7 +164,7 @@ export interface ApiLlamaCppServerProps {
     chat_format: string;
     reasoning_format: string;
     reasoning_in_content: boolean;
-    thinking_forced_open: boolean;
+    generation_prompt: string;
     samplers: string[];
     backend_sampling: boolean;
     'speculative.n_max': number;
@@ -332,7 +332,7 @@ export interface ApiSlotData {
     chat_format: string;
     reasoning_format: string;
     reasoning_in_content: boolean;
-    thinking_forced_open: boolean;
+    generation_prompt: string;
     samplers: string[];
     backend_sampling: boolean;
     'speculative.n_max': number;