common/parser: add proper reasoning tag prefill reading (#20424)
* Implement proper prefill extraction
* Refactor cli parameters, update docs, move reasoning budget sampler part to common/reasoning-budget.cpp
* Update tools/server/server-task.cpp
* refactor: move grammars to variant, remove grammar_external, handle exception internally
* Make code less C++y

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
parent
c1258830b2
commit
5e54d51b19
33 changed files with 651 additions and 454 deletions
|
|
@ -105,7 +105,7 @@ struct cli_context {
|
|||
llama_get_model(ctx_server.get_llama_context()));
|
||||
|
||||
task.params.sampling.reasoning_budget_tokens = reasoning_budget;
|
||||
task.params.sampling.reasoning_budget_activate_immediately = chat_params.thinking_forced_open;
|
||||
task.params.sampling.generation_prompt = chat_params.generation_prompt;
|
||||
|
||||
if (!chat_params.thinking_start_tag.empty()) {
|
||||
task.params.sampling.reasoning_budget_start =
|
||||
|
|
|
|||
|
|
@ -282,7 +282,7 @@ static void render_scenario(const common_chat_template & tmpl,
|
|||
LOG_ERR("Messages:\n%s\n", final_messages.dump(2).c_str());
|
||||
|
||||
try {
|
||||
autoparser::templates_params inputs;
|
||||
autoparser::generation_params inputs;
|
||||
inputs.messages = final_messages;
|
||||
inputs.add_generation_prompt = add_generation_prompt;
|
||||
inputs.extra_context["enable_thinking"] = enable_thinking;
|
||||
|
|
@ -395,7 +395,7 @@ int main(int argc, char ** argv) {
|
|||
analysis.analyze_template(chat_template);
|
||||
|
||||
// Generate Parser
|
||||
autoparser::templates_params params;
|
||||
autoparser::generation_params params;
|
||||
params.messages = json::array({ build_user_message() });
|
||||
params.reasoning_format =
|
||||
opts.enable_reasoning ? COMMON_REASONING_FORMAT_DEEPSEEK : COMMON_REASONING_FORMAT_NONE;
|
||||
|
|
|
|||
|
|
@ -400,12 +400,12 @@ static void analyze_template(const std::string & template_path) {
|
|||
{
|
||||
json user_msg = make_user_msg();
|
||||
|
||||
autoparser::templates_params params_no_tools;
|
||||
autoparser::generation_params params_no_tools;
|
||||
params_no_tools.messages = json::array({ user_msg });
|
||||
params_no_tools.add_generation_prompt = false;
|
||||
params_no_tools.tools = json::array();
|
||||
|
||||
autoparser::templates_params params_with_tools = params_no_tools;
|
||||
autoparser::generation_params params_with_tools = params_no_tools;
|
||||
params_with_tools.tools = tools;
|
||||
|
||||
std::string output_no_tools = common_chat_template_direct_apply(chat_template, params_no_tools);
|
||||
|
|
@ -419,12 +419,12 @@ static void analyze_template(const std::string & template_path) {
|
|||
{
|
||||
json user_msg = make_user_msg();
|
||||
|
||||
autoparser::templates_params params_no_prompt;
|
||||
autoparser::generation_params params_no_prompt;
|
||||
params_no_prompt.messages = json::array({ user_msg });
|
||||
params_no_prompt.add_generation_prompt = false;
|
||||
params_no_prompt.tools = json::array();
|
||||
|
||||
autoparser::templates_params params_with_prompt = params_no_prompt;
|
||||
autoparser::generation_params params_with_prompt = params_no_prompt;
|
||||
params_with_prompt.add_generation_prompt = true;
|
||||
|
||||
std::string output_no_prompt = common_chat_template_direct_apply(chat_template, params_no_prompt);
|
||||
|
|
@ -438,12 +438,12 @@ static void analyze_template(const std::string & template_path) {
|
|||
{
|
||||
json user_msg = make_user_msg();
|
||||
|
||||
autoparser::templates_params params_no_reasoning;
|
||||
autoparser::generation_params params_no_reasoning;
|
||||
params_no_reasoning.messages = json::array({ user_msg, make_assistant_no_reasoning() });
|
||||
params_no_reasoning.add_generation_prompt = false;
|
||||
params_no_reasoning.enable_thinking = true;
|
||||
|
||||
autoparser::templates_params params_with_reasoning = params_no_reasoning;
|
||||
autoparser::generation_params params_with_reasoning = params_no_reasoning;
|
||||
params_with_reasoning.messages = json::array({ user_msg, make_assistant_with_reasoning() });
|
||||
|
||||
std::string output_no_reasoning = common_chat_template_direct_apply(chat_template, params_no_reasoning);
|
||||
|
|
@ -458,12 +458,12 @@ static void analyze_template(const std::string & template_path) {
|
|||
json user_msg = make_user_msg();
|
||||
json user_msg2 = make_user_msg2();
|
||||
|
||||
autoparser::templates_params params_no_reasoning;
|
||||
autoparser::generation_params params_no_reasoning;
|
||||
params_no_reasoning.messages = json::array({ user_msg, make_assistant_no_reasoning(), user_msg2 });
|
||||
params_no_reasoning.add_generation_prompt = false;
|
||||
params_no_reasoning.enable_thinking = true;
|
||||
|
||||
autoparser::templates_params params_with_reasoning = params_no_reasoning;
|
||||
autoparser::generation_params params_with_reasoning = params_no_reasoning;
|
||||
params_with_reasoning.messages = json::array({ user_msg, make_assistant_with_reasoning(), user_msg2 });
|
||||
|
||||
std::string output_no_reasoning = common_chat_template_direct_apply(chat_template, params_no_reasoning);
|
||||
|
|
@ -477,12 +477,12 @@ static void analyze_template(const std::string & template_path) {
|
|||
{
|
||||
json user_msg = make_user_msg();
|
||||
|
||||
autoparser::templates_params params_no_tool;
|
||||
autoparser::generation_params params_no_tool;
|
||||
params_no_tool.messages = json::array({ user_msg, make_assistant_no_tool() });
|
||||
params_no_tool.add_generation_prompt = false;
|
||||
params_no_tool.tools = tools;
|
||||
|
||||
autoparser::templates_params params_with_tool = params_no_tool;
|
||||
autoparser::generation_params params_with_tool = params_no_tool;
|
||||
params_with_tool.messages = json::array({ user_msg, make_assistant_one_tool() });
|
||||
|
||||
std::string output_no_tool = common_chat_template_direct_apply(chat_template, params_no_tool);
|
||||
|
|
@ -497,12 +497,12 @@ static void analyze_template(const std::string & template_path) {
|
|||
json user_msg = make_user_msg();
|
||||
json user_msg2 = make_user_msg2_continue();
|
||||
|
||||
autoparser::templates_params params_no_tool;
|
||||
autoparser::generation_params params_no_tool;
|
||||
params_no_tool.messages = json::array({ user_msg, make_assistant_no_tool(), user_msg2 });
|
||||
params_no_tool.add_generation_prompt = false;
|
||||
params_no_tool.tools = tools;
|
||||
|
||||
autoparser::templates_params params_with_tool = params_no_tool;
|
||||
autoparser::generation_params params_with_tool = params_no_tool;
|
||||
params_with_tool.messages = json::array({ user_msg, make_assistant_one_tool(), user_msg2 });
|
||||
|
||||
std::string output_no_tool = common_chat_template_direct_apply(chat_template, params_no_tool);
|
||||
|
|
@ -516,12 +516,12 @@ static void analyze_template(const std::string & template_path) {
|
|||
{
|
||||
json user_msg = make_user_msg();
|
||||
|
||||
autoparser::templates_params params_one_tool;
|
||||
autoparser::generation_params params_one_tool;
|
||||
params_one_tool.messages = json::array({ user_msg, make_assistant_one_tool() });
|
||||
params_one_tool.add_generation_prompt = false;
|
||||
params_one_tool.tools = tools;
|
||||
|
||||
autoparser::templates_params params_two_tools = params_one_tool;
|
||||
autoparser::generation_params params_two_tools = params_one_tool;
|
||||
params_two_tools.messages = json::array({ user_msg, make_assistant_two_tools() });
|
||||
|
||||
std::string output_one_tool = common_chat_template_direct_apply(chat_template, params_one_tool);
|
||||
|
|
@ -536,12 +536,12 @@ static void analyze_template(const std::string & template_path) {
|
|||
json user_msg = make_user_msg();
|
||||
json user_msg2 = make_user_msg2_continue();
|
||||
|
||||
autoparser::templates_params params_one_tool;
|
||||
autoparser::generation_params params_one_tool;
|
||||
params_one_tool.messages = json::array({ user_msg, make_assistant_one_tool(), user_msg2 });
|
||||
params_one_tool.add_generation_prompt = false;
|
||||
params_one_tool.tools = tools;
|
||||
|
||||
autoparser::templates_params params_two_tools = params_one_tool;
|
||||
autoparser::generation_params params_two_tools = params_one_tool;
|
||||
params_two_tools.messages = json::array({ user_msg, make_assistant_two_tools(), user_msg2 });
|
||||
|
||||
std::string output_one_tool = common_chat_template_direct_apply(chat_template, params_one_tool);
|
||||
|
|
@ -555,13 +555,13 @@ static void analyze_template(const std::string & template_path) {
|
|||
{
|
||||
json user_msg = make_user_msg();
|
||||
|
||||
autoparser::templates_params params_no_reasoning;
|
||||
autoparser::generation_params params_no_reasoning;
|
||||
params_no_reasoning.messages = json::array({ user_msg, make_assistant_one_tool() });
|
||||
params_no_reasoning.add_generation_prompt = false;
|
||||
params_no_reasoning.tools = tools;
|
||||
params_no_reasoning.enable_thinking = true;
|
||||
|
||||
autoparser::templates_params params_with_reasoning = params_no_reasoning;
|
||||
autoparser::generation_params params_with_reasoning = params_no_reasoning;
|
||||
params_with_reasoning.messages = json::array({ user_msg, make_assistant_one_tool_with_reasoning() });
|
||||
|
||||
std::string output_no_reasoning = common_chat_template_direct_apply(chat_template, params_no_reasoning);
|
||||
|
|
|
|||
|
|
@ -907,7 +907,7 @@ If query param `?fail_on_no_slot=1` is set, this endpoint will respond with stat
|
|||
"chat_format": "GPT-OSS",
|
||||
"reasoning_format": "none",
|
||||
"reasoning_in_content": false,
|
||||
"thinking_forced_open": false,
|
||||
"generation_prompt": "",
|
||||
"samplers": [
|
||||
"penalties",
|
||||
"dry",
|
||||
|
|
@ -972,7 +972,7 @@ If query param `?fail_on_no_slot=1` is set, this endpoint will respond with stat
|
|||
"chat_format": "GPT-OSS",
|
||||
"reasoning_format": "none",
|
||||
"reasoning_in_content": false,
|
||||
"thinking_forced_open": false,
|
||||
"generation_prompt": "",
|
||||
"samplers": [
|
||||
"penalties",
|
||||
"dry",
|
||||
|
|
@ -1193,7 +1193,7 @@ The `response_format` parameter supports both plain JSON output (e.g. `{"type":
|
|||
|
||||
`reasoning_format`: The reasoning format to be parsed. If set to `none`, it will output the raw generated text.
|
||||
|
||||
`thinking_forced_open`: Force a reasoning model to always output the reasoning. Only works on certain models.
|
||||
`generation_prompt`: The generation prompt that the template prefilled. It is prepended to the model output before parsing.
|
||||
|
||||
`parse_tool_calls`: Whether to parse the generated tool call.
|
||||
|
||||
|
|
|
|||
Binary file not shown.
|
|
@ -1081,20 +1081,21 @@ json oaicompat_chat_params_parse(
|
|||
}
|
||||
}
|
||||
|
||||
llama_params["chat_format"] = static_cast<int>(chat_params.format);
|
||||
llama_params["prompt"] = chat_params.prompt;
|
||||
llama_params["chat_format"] = static_cast<int>(chat_params.format);
|
||||
llama_params["prompt"] = chat_params.prompt;
|
||||
if (!chat_params.grammar.empty()) {
|
||||
llama_params["grammar"] = chat_params.grammar;
|
||||
llama_params["grammar"] = chat_params.grammar;
|
||||
llama_params["grammar_type"] = std::string("tool_calls");
|
||||
}
|
||||
llama_params["grammar_lazy"] = chat_params.grammar_lazy;
|
||||
auto grammar_triggers = json::array();
|
||||
llama_params["grammar_lazy"] = chat_params.grammar_lazy;
|
||||
auto grammar_triggers = json::array();
|
||||
for (const auto & trigger : chat_params.grammar_triggers) {
|
||||
server_grammar_trigger ct(trigger);
|
||||
grammar_triggers.push_back(ct.to_json());
|
||||
}
|
||||
llama_params["grammar_triggers"] = grammar_triggers;
|
||||
llama_params["preserved_tokens"] = chat_params.preserved_tokens;
|
||||
llama_params["thinking_forced_open"] = chat_params.thinking_forced_open;
|
||||
llama_params["grammar_triggers"] = grammar_triggers;
|
||||
llama_params["preserved_tokens"] = chat_params.preserved_tokens;
|
||||
llama_params["generation_prompt"] = chat_params.generation_prompt;
|
||||
for (const auto & stop : chat_params.additional_stops) {
|
||||
llama_params["stop"].push_back(stop);
|
||||
}
|
||||
|
|
@ -1114,7 +1115,6 @@ json oaicompat_chat_params_parse(
|
|||
llama_params["reasoning_budget_start_tag"] = chat_params.thinking_start_tag;
|
||||
llama_params["reasoning_budget_end_tag"] = chat_params.thinking_end_tag;
|
||||
llama_params["reasoning_budget_message"] = opt.reasoning_budget_message;
|
||||
llama_params["reasoning_budget_activate_immediately"] = chat_params.thinking_forced_open;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -15,6 +15,7 @@
|
|||
#include <algorithm>
|
||||
#include <cstddef>
|
||||
#include <cinttypes>
|
||||
#include <exception>
|
||||
#include <memory>
|
||||
#include <filesystem>
|
||||
|
||||
|
|
@ -1152,11 +1153,11 @@ private:
|
|||
|
||||
// initialize samplers
|
||||
if (task.need_sampling()) {
|
||||
slot.smpl.reset(common_sampler_init(model, task.params.sampling));
|
||||
|
||||
if (slot.smpl == nullptr) {
|
||||
// for now, the only error that may happen here is invalid grammar
|
||||
send_error(task, "Failed to parse grammar", ERROR_TYPE_INVALID_REQUEST);
|
||||
try {
|
||||
slot.smpl.reset(common_sampler_init(model, task.params.sampling));
|
||||
} catch (std::exception & e) {
|
||||
std::string err_msg = std::string("Failed to initialize samplers: ") + e.what();
|
||||
send_error(task, err_msg, ERROR_TYPE_INVALID_REQUEST);
|
||||
return false;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -72,7 +72,7 @@ json task_params::to_json(bool only_metrics) const {
|
|||
{"chat_format", common_chat_format_name(chat_parser_params.format)},
|
||||
{"reasoning_format", common_reasoning_format_name(chat_parser_params.reasoning_format)},
|
||||
{"reasoning_in_content", chat_parser_params.reasoning_in_content},
|
||||
{"thinking_forced_open", chat_parser_params.thinking_forced_open},
|
||||
{"generation_prompt", chat_parser_params.generation_prompt},
|
||||
{"samplers", samplers},
|
||||
{"speculative.n_max", speculative.n_max},
|
||||
{"speculative.n_min", speculative.n_min},
|
||||
|
|
@ -128,14 +128,14 @@ json task_params::to_json(bool only_metrics) const {
|
|||
{"logit_bias", format_logit_bias(sampling.logit_bias)},
|
||||
{"n_probs", sampling.n_probs},
|
||||
{"min_keep", sampling.min_keep},
|
||||
{"grammar", sampling.grammar},
|
||||
{"grammar", common_grammar_value(sampling.grammar)},
|
||||
{"grammar_lazy", sampling.grammar_lazy},
|
||||
{"grammar_triggers", grammar_triggers},
|
||||
{"preserved_tokens", sampling.preserved_tokens},
|
||||
{"chat_format", common_chat_format_name(chat_parser_params.format)},
|
||||
{"reasoning_format", common_reasoning_format_name(chat_parser_params.reasoning_format)},
|
||||
{"reasoning_in_content", chat_parser_params.reasoning_in_content},
|
||||
{"thinking_forced_open", chat_parser_params.thinking_forced_open},
|
||||
{"generation_prompt", chat_parser_params.generation_prompt},
|
||||
{"samplers", samplers},
|
||||
{"speculative.n_max", speculative.n_max},
|
||||
{"speculative.n_min", speculative.n_min},
|
||||
|
|
@ -376,14 +376,25 @@ task_params server_task::params_from_json_cmpl(
|
|||
try {
|
||||
auto schema = json_value(data, "json_schema", json::object());
|
||||
SRV_DBG("JSON schema: %s\n", schema.dump(2).c_str());
|
||||
params.sampling.grammar = json_schema_to_grammar(schema);
|
||||
SRV_DBG("Converted grammar: %s\n", params.sampling.grammar.c_str());
|
||||
std::string grammar_str = json_schema_to_grammar(schema);
|
||||
SRV_DBG("Converted grammar: %s\n", grammar_str.c_str());
|
||||
params.sampling.grammar = {COMMON_GRAMMAR_TYPE_OUTPUT_FORMAT, std::move(grammar_str)};
|
||||
} catch (const std::exception & e) {
|
||||
throw std::runtime_error(std::string("\"json_schema\": ") + e.what());
|
||||
}
|
||||
} else {
|
||||
params.sampling.grammar = json_value(data, "grammar", defaults.sampling.grammar);
|
||||
SRV_DBG("Grammar: %s\n", params.sampling.grammar.c_str());
|
||||
std::string grammar_str = json_value(data, "grammar", std::string());
|
||||
if (!grammar_str.empty()) {
|
||||
// grammar_type key is set by the server when converting chat template grammars
|
||||
std::string grammar_type = json_value(data, "grammar_type", std::string());
|
||||
if (grammar_type == "tool_calls") {
|
||||
params.sampling.grammar = {COMMON_GRAMMAR_TYPE_TOOL_CALLS, std::move(grammar_str)};
|
||||
} else {
|
||||
// explicit grammar from the user (API field "grammar")
|
||||
params.sampling.grammar = {COMMON_GRAMMAR_TYPE_USER, std::move(grammar_str)};
|
||||
}
|
||||
SRV_DBG("Grammar (%s): %s\n", grammar_type.c_str(), common_grammar_value(params.sampling.grammar).c_str());
|
||||
}
|
||||
params.sampling.grammar_lazy = json_value(data, "grammar_lazy", defaults.sampling.grammar_lazy);
|
||||
SRV_DBG("Grammar lazy: %s\n", params.sampling.grammar_lazy ? "true" : "false");
|
||||
}
|
||||
|
|
@ -402,7 +413,8 @@ task_params server_task::params_from_json_cmpl(
|
|||
}
|
||||
params.chat_parser_params.reasoning_format = reasoning_format;
|
||||
params.chat_parser_params.reasoning_in_content = params.stream && (reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY);
|
||||
params.chat_parser_params.thinking_forced_open = json_value(data, "thinking_forced_open", false);
|
||||
params.chat_parser_params.generation_prompt = json_value(data, "generation_prompt", std::string());
|
||||
params.sampling.generation_prompt = params.chat_parser_params.generation_prompt;
|
||||
params.chat_parser_params.parse_tool_calls = json_value(data, "parse_tool_calls", false);
|
||||
if (data.contains("chat_parser")) {
|
||||
params.chat_parser_params.parser.load(data.at("chat_parser").get<std::string>());
|
||||
|
|
@ -469,10 +481,7 @@ task_params server_task::params_from_json_cmpl(
|
|||
const auto start_tag = json_value(data, "reasoning_budget_start_tag", std::string());
|
||||
const auto end_tag = json_value(data, "reasoning_budget_end_tag", std::string());
|
||||
const auto message = json_value(data, "reasoning_budget_message", std::string());
|
||||
const bool activate_imm = json_value(data, "reasoning_budget_activate_immediately", false);
|
||||
|
||||
params.sampling.reasoning_budget_tokens = budget;
|
||||
params.sampling.reasoning_budget_activate_immediately = activate_imm;
|
||||
|
||||
if (!start_tag.empty()) {
|
||||
params.sampling.reasoning_budget_start = common_tokenize(vocab, start_tag, false, true);
|
||||
|
|
@ -482,8 +491,8 @@ task_params server_task::params_from_json_cmpl(
|
|||
params.sampling.reasoning_budget_forced = common_tokenize(vocab, message + end_tag, false, true);
|
||||
}
|
||||
|
||||
SRV_DBG("reasoning budget: tokens=%d, activate_immediately=%s, start=%zu toks, end=%zu toks, forced=%zu toks\n",
|
||||
budget, activate_imm ? "true" : "false",
|
||||
SRV_DBG("reasoning budget: tokens=%d, generation_prompt='%s', start=%zu toks, end=%zu toks, forced=%zu toks\n",
|
||||
budget, params.sampling.generation_prompt.c_str(),
|
||||
params.sampling.reasoning_budget_start.size(),
|
||||
params.sampling.reasoning_budget_end.size(),
|
||||
params.sampling.reasoning_budget_forced.size());
|
||||
|
|
|
|||
|
|
@ -210,6 +210,7 @@ def test_completion_with_response_format(response_format: dict, n_predicted: int
|
|||
def test_completion_with_json_schema(jinja: bool, json_schema: dict, n_predicted: int, re_content: str):
|
||||
global server
|
||||
server.jinja = jinja
|
||||
server.debug = True
|
||||
server.start()
|
||||
res = server.make_request("POST", "/chat/completions", data={
|
||||
"max_tokens": n_predicted,
|
||||
|
|
|
|||
|
|
@ -51,7 +51,7 @@ describe('ParameterSyncService', () => {
|
|||
chat_format: '',
|
||||
reasoning_format: '',
|
||||
reasoning_in_content: false,
|
||||
thinking_forced_open: false,
|
||||
generation_prompt: '',
|
||||
'speculative.n_max': 0,
|
||||
'speculative.n_min': 0,
|
||||
'speculative.p_min': 0.0,
|
||||
|
|
@ -116,7 +116,7 @@ describe('ParameterSyncService', () => {
|
|||
chat_format: '',
|
||||
reasoning_format: '',
|
||||
reasoning_in_content: false,
|
||||
thinking_forced_open: false,
|
||||
generation_prompt: '',
|
||||
'speculative.n_max': 0,
|
||||
'speculative.n_min': 0,
|
||||
'speculative.p_min': 0.0,
|
||||
|
|
|
|||
4
tools/server/webui/src/lib/types/api.d.ts
vendored
4
tools/server/webui/src/lib/types/api.d.ts
vendored
|
|
@ -164,7 +164,7 @@ export interface ApiLlamaCppServerProps {
|
|||
chat_format: string;
|
||||
reasoning_format: string;
|
||||
reasoning_in_content: boolean;
|
||||
thinking_forced_open: boolean;
|
||||
generation_prompt: string;
|
||||
samplers: string[];
|
||||
backend_sampling: boolean;
|
||||
'speculative.n_max': number;
|
||||
|
|
@ -332,7 +332,7 @@ export interface ApiSlotData {
|
|||
chat_format: string;
|
||||
reasoning_format: string;
|
||||
reasoning_in_content: boolean;
|
||||
thinking_forced_open: boolean;
|
||||
generation_prompt: string;
|
||||
samplers: string[];
|
||||
backend_sampling: boolean;
|
||||
'speculative.n_max': number;
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue