server: /v1/responses (partial) (#18486)

* from previous PR

* Make instruction (system) the first message

* Convert [input_message] (text/image/file)

* Rename convert_responses_to_chatcmpl(body) -> response_body

* Initial tool call support

* Erase instructions field from chatcmpl body

* Feed reasoning texts to chat template

* Use std::vector instead of opaque json array

* Make output_item.added events consistent

* Move `server_task_result_cmpl_partial::update` from header to source

* Match ID of output_item.added and .done events

* Add function_call only if there is no "fc_" prefix

* Add function call output at non-streaming API

* Test if ID is persistent

* Add doc

* Fix style - use trailing comma

* Rewrite state management

* catch up with upstream/master

* Fix style - "type" is the first item of SSE data

* Explicitly check "instructions" from response_body

* Make lambdas static

* Check if reasoning content exists

* Add `oai_resp_id` to task_result_state(also initialized at ctor), server_task_result_cmpl_partial, and server_task_result_cmpl_final

* Reject `input_file` since it is not supported by chatcmpl

* Add "fc_" prefix to non-streaming function call id as coderabbit pointed out

---------

Co-authored-by: openingnow <>
This commit is contained in:
손희준 2026-01-22 01:47:23 +09:00 committed by GitHub
parent 33f890e579
commit fbbf3ad190
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
11 changed files with 836 additions and 40 deletions

View file

@ -584,6 +584,8 @@ json server_task_result_cmpl_final::to_json() {
return to_json_oaicompat();
case TASK_RESPONSE_TYPE_OAI_CHAT:
return stream ? to_json_oaicompat_chat_stream() : to_json_oaicompat_chat();
case TASK_RESPONSE_TYPE_OAI_RESP:
return stream ? to_json_oaicompat_resp_stream() : to_json_oaicompat_resp();
case TASK_RESPONSE_TYPE_ANTHROPIC:
return stream ? to_json_anthropic_stream() : to_json_anthropic();
default:
@ -801,6 +803,186 @@ json server_task_result_cmpl_final::to_json_oaicompat_chat_stream() {
return deltas;
}
// Build the non-streaming OpenAI /v1/responses payload for a finished completion.
// Emits up to three kinds of output items, in order: a "reasoning" item (when the
// model produced reasoning content), a "message" item (when there is assistant
// text), and one "function_call" item per parsed tool call.
json server_task_result_cmpl_final::to_json_oaicompat_resp() {
    // Prefer the parsed chat message; fall back to a plain assistant message
    // built from the raw content when no parsed message is available.
    common_chat_msg msg;
    if (!oaicompat_msg.empty()) {
        msg = oaicompat_msg;
    } else {
        msg.role    = "assistant";
        msg.content = content;
    }
    std::vector<json> output;
    if (!msg.reasoning_content.empty()) {
        output.push_back(json {
            {"id", "rs_" + random_string()},
            {"summary", json::array()},
            {"type", "reasoning"},
            {"content", json::array({ json {
                {"text", msg.reasoning_content},
                {"type", "reasoning_text"},
            }})},
            {"encrypted_content", ""},
            {"status", "completed"},
        });
    }
    if (!msg.content.empty()) {
        output.push_back(json {
            {"content", json::array({ json {
                {"type", "output_text"},
                {"annotations", json::array()},
                {"logprobs", json::array()},
                {"text", msg.content},
            }})},
            {"id", "msg_" + random_string()},
            {"role", msg.role},
            {"status", "completed"},
            {"type", "message"},
        });
    }
    // Iterate msg.tool_calls for consistency with the reasoning/content checks
    // above; it is identical to oaicompat_msg.tool_calls when oaicompat_msg is
    // non-empty, and empty otherwise, so behavior is unchanged.
    for (const common_chat_tool_call & tool_call : msg.tool_calls) {
        output.push_back(json {
            {"type", "function_call"},
            {"status", "completed"},
            {"arguments", tool_call.arguments},
            // "fc_" prefix keeps call ids in the Responses-API id namespace
            {"call_id", "fc_" + tool_call.id},
            {"name", tool_call.name},
        });
    }
    const std::time_t t = std::time(nullptr);
    json res = {
        // NOTE(review): created_at == completed_at since only the final
        // timestamp is known here
        {"completed_at", t},
        {"created_at", t},
        {"id", oai_resp_id},
        {"model", oaicompat_model},
        {"object", "response"},
        {"output", output},
        {"status", "completed"},
        {"usage", json {
            {"input_tokens", n_prompt_tokens},
            {"output_tokens", n_decoded},
            {"total_tokens", n_decoded + n_prompt_tokens},
        }},
    };
    return res;
}
// Build the terminal SSE events for a streamed OpenAI /v1/responses request.
// Emits the ".done" events for each output item that was streamed (reasoning,
// message, function calls), followed by the final "response.completed" event
// that carries the full response object with usage.
json server_task_result_cmpl_final::to_json_oaicompat_resp_stream() {
    std::vector<json> server_sent_events;
    std::vector<json> output; // accumulated output items for response.completed
    if (!oaicompat_msg.reasoning_content.empty()) {
        // NOTE(review): unlike the non-streaming variant this item carries no
        // "status" field — confirm whether clients rely on it
        const json output_item = json {
            {"id", oai_resp_reasoning_id},
            {"summary", json::array()},
            {"type", "reasoning"},
            {"content", json::array({ json {
                {"text", oaicompat_msg.reasoning_content},
                {"type", "reasoning_text"},
            }})},
            {"encrypted_content", ""},
        };
        server_sent_events.push_back(json {
            {"event", "response.output_item.done"},
            {"data", json {
                {"type", "response.output_item.done"},
                {"item", output_item}
            }}
        });
        output.push_back(output_item);
    }
    if (!oaicompat_msg.content.empty()) {
        // done-events are emitted innermost-first: text, then content part,
        // then the enclosing message item
        server_sent_events.push_back(json {
            {"event", "response.output_text.done"},
            {"data", json {
                {"type", "response.output_text.done"},
                {"item_id", oai_resp_message_id},
                {"text", oaicompat_msg.content}
            }}
        });
        const json content_part = {
            {"type", "output_text"},
            {"annotations", json::array()},
            {"logprobs", json::array()},
            {"text", oaicompat_msg.content}
        };
        server_sent_events.push_back(json {
            {"event", "response.content_part.done"},
            {"data", json {
                {"type", "response.content_part.done"},
                {"item_id", oai_resp_message_id},
                {"part", content_part}
            }}
        });
        const json output_item = {
            {"type", "message"},
            {"status", "completed"},
            {"id", oai_resp_message_id},
            {"content", json::array({content_part})},
            {"role", "assistant"}
        };
        server_sent_events.push_back(json {
            {"event", "response.output_item.done"},
            {"data", json {
                {"type", "response.output_item.done"},
                {"item", output_item}
            }}
        });
        output.push_back(output_item);
    }
    for (const common_chat_tool_call & tool_call : oaicompat_msg.tool_calls) {
        const json output_item = {
            {"type", "function_call"},
            {"status", "completed"},
            {"arguments", tool_call.arguments},
            {"call_id", "fc_" + tool_call.id},
            {"name", tool_call.name}
        };
        server_sent_events.push_back(json {
            {"event", "response.output_item.done"},
            {"data", json {
                {"type", "response.output_item.done"},
                {"item", output_item}
            }}
        });
        output.push_back(output_item);
    }
    const std::time_t t = std::time(nullptr);
    server_sent_events.push_back(json {
        {"event", "response.completed"},
        {"data", json {
            {"type", "response.completed"},
            {"response", json {
                {"id", oai_resp_id},
                {"object", "response"},
                {"created_at", t},
                {"status", "completed"},
                {"model", oaicompat_model},
                {"output", output},
                {"usage", json {
                    {"input_tokens", n_prompt_tokens},
                    {"output_tokens", n_decoded},
                    {"total_tokens", n_decoded + n_prompt_tokens}
                }}
            }},
        }}
    });
    return server_sent_events;
}
json server_task_result_cmpl_final::to_json_anthropic() {
std::string stop_reason = "max_tokens";
if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) {
@ -1057,6 +1239,36 @@ json server_task_result_cmpl_final::to_json_anthropic_stream() {
//
// server_task_result_cmpl_partial
//
// Merge this partial result into the shared per-task streaming state and take a
// snapshot of that state for the subsequent to_json_*() call.
//
// Ordering is load-bearing: the state fields are copied into this result
// (snapshot of the state BEFORE this chunk) and only afterwards are the
// pre-computed updates from the diffs written back into `state` for the next
// chunk. Do not reorder.
void server_task_result_cmpl_partial::update(task_result_state & state) {
is_updated = true;
// presumably parses `content` and fills oaicompat_msg_diffs — TODO confirm
// against update_chat_msg's definition
state.update_chat_msg(content, true, oaicompat_msg_diffs);
// Copy current state for use in to_json_*() (reflects state BEFORE this chunk)
thinking_block_started = state.thinking_block_started;
text_block_started = state.text_block_started;
oai_resp_id = state.oai_resp_id;
oai_resp_reasoning_id = state.oai_resp_reasoning_id;
oai_resp_message_id = state.oai_resp_message_id;
oai_resp_fc_id = state.oai_resp_fc_id;
// track if the accumulated message has any reasoning content
anthropic_has_reasoning = !state.chat_msg.reasoning_content.empty();
// Pre-compute state updates based on diffs (for next chunk)
for (const common_chat_msg_diff & diff : oaicompat_msg_diffs) {
// first reasoning delta opens the thinking block
if (!diff.reasoning_content_delta.empty() && !state.thinking_block_started) {
state.thinking_block_started = true;
}
// first content delta opens the text block
if (!diff.content_delta.empty() && !state.text_block_started) {
state.text_block_started = true;
}
// a named tool-call delta starts a new function call; remember its id so
// later argument deltas can reference it
if (!diff.tool_call_delta.name.empty()) {
state.oai_resp_fc_id = diff.tool_call_delta.id;
}
}
}
json server_task_result_cmpl_partial::to_json() {
GGML_ASSERT(is_updated && "update() must be called before to_json()");
switch (res_type) {
@ -1066,6 +1278,8 @@ json server_task_result_cmpl_partial::to_json() {
return to_json_oaicompat();
case TASK_RESPONSE_TYPE_OAI_CHAT:
return to_json_oaicompat_chat();
case TASK_RESPONSE_TYPE_OAI_RESP:
return to_json_oaicompat_resp();
case TASK_RESPONSE_TYPE_ANTHROPIC:
return to_json_anthropic();
default:
@ -1190,6 +1404,132 @@ json server_task_result_cmpl_partial::to_json_oaicompat_chat() {
return deltas;
}
// Translate this partial result's message diffs into OpenAI /v1/responses SSE
// events. Event order matters to clients: lifecycle events first (created /
// in_progress on the first decoded token), then per-diff events in diff order,
// each output item preceded by its response.output_item.added event exactly
// once.
//
// thinking_block_started / text_block_started are per-result copies snapshotted
// from task_result_state in update(); mutating them here only suppresses
// duplicate .added events within this chunk — the authoritative state advance
// happens in update().
json server_task_result_cmpl_partial::to_json_oaicompat_resp() {
std::vector<json> events;
// first chunk of the stream: announce the response lifecycle
if (n_decoded == 1) {
events.push_back(json {
{"event", "response.created"},
{"data", json {
{"type", "response.created"},
{"response", json {
{"id", oai_resp_id},
{"object", "response"},
{"status", "in_progress"},
}},
}},
});
events.push_back(json {
{"event", "response.in_progress"},
{"data", json {
{"type", "response.in_progress"},
{"response", json {
{"id", oai_resp_id},
{"object", "response"},
{"status", "in_progress"},
}},
}},
});
}
for (const common_chat_msg_diff & diff : oaicompat_msg_diffs) {
// reasoning deltas: open the reasoning item once, then stream text deltas
if (!diff.reasoning_content_delta.empty()) {
if (!thinking_block_started) {
events.push_back(json {
{"event", "response.output_item.added"},
{"data", json {
{"type", "response.output_item.added"},
{"item", json {
{"id", oai_resp_reasoning_id},
{"summary", json::array()},
{"type", "reasoning"},
{"content", json::array()},
{"encrypted_content", ""},
{"status", "in_progress"},
}},
}},
});
thinking_block_started = true;
}
events.push_back(json {
{"event", "response.reasoning_text.delta"},
{"data", json {
{"type", "response.reasoning_text.delta"},
{"delta", diff.reasoning_content_delta},
{"item_id", oai_resp_reasoning_id},
}},
});
}
// content deltas: open the message item and its content part once, then
// stream output_text deltas
if (!diff.content_delta.empty()) {
if (!text_block_started) {
events.push_back(json {
{"event", "response.output_item.added"},
{"data", json {
{"type", "response.output_item.added"},
{"item", json {
{"content", json::array()},
{"id", oai_resp_message_id},
{"role", "assistant"},
{"status", "in_progress"},
{"type", "message"},
}},
}},
});
events.push_back(json {
{"event", "response.content_part.added"},
{"data", json {
{"type", "response.content_part.added"},
{"item_id", oai_resp_message_id},
{"part", json {
{"type", "output_text"},
{"text", ""},
}},
}},
});
text_block_started = true;
}
events.push_back(json {
{"event", "response.output_text.delta"},
{"data", json {
{"type", "response.output_text.delta"},
{"item_id", oai_resp_message_id},
{"delta", diff.content_delta},
}},
});
}
// a named tool-call delta opens a new function_call item
if (!diff.tool_call_delta.name.empty()) {
events.push_back(json {
{"event", "response.output_item.added"},
{"data", json {
{"type", "response.output_item.added"},
{"item", json {
{"arguments", ""},
{"call_id", "fc_" + diff.tool_call_delta.id},
{"name", diff.tool_call_delta.name},
{"type", "function_call"},
{"status", "in_progress"},
}},
}},
});
// remember the id so argument deltas below target the right item
oai_resp_fc_id = diff.tool_call_delta.id;
}
// argument deltas for the most recently opened function call
if (!diff.tool_call_delta.arguments.empty()) {
events.push_back(json {
{"event", "response.function_call_arguments.delta"},
{"data", json {
{"type", "response.function_call_arguments.delta"},
{"delta", diff.tool_call_delta.arguments},
{"item_id", "fc_" + oai_resp_fc_id},
}},
});
}
}
return events;
}
//
// server_task_result_embd
//
@ -1260,8 +1600,8 @@ json server_task_result_cmpl_partial::to_json_anthropic() {
// use local copies of streaming state (copied from task_result_state in update())
// these reflect the state BEFORE this chunk was processed
bool thinking_started = anthropic_thinking_block_started;
bool text_started = anthropic_text_block_started;
bool thinking_started = thinking_block_started;
bool text_started = text_block_started;
for (const auto & diff : oaicompat_msg_diffs) {
// handle thinking/reasoning content