server: /v1/responses (partial) (#18486)
* from previous PR * Make instruction(system) as first message * Convert [input_message] (text/image/file) * Rename convert_responses_to_chatcmpl(body) -> response_body * Initial tool call support * Erase instructions field from chatcmpl body * Feed reasoning texts to chat template * Use std::vector instead of opaque json array * Make output_item.added events consistent * Move `server_task_result_cmpl_partial::update` from header to source * Match ID of output_item.added and .done events * Add function_call only if there is no "fc_" prefix * Add function call output at non-streaming API * Test if ID is persistent * Add doc * Fix style - use trailing comma * Rewrite state management * catch up with upstream/master * Fix style - "type" is the first item of SSE data * Explicitly check "instructions" from response_body * Make lambdas static * Check if reasoning content exists * Add `oai_resp_id` to task_result_state(also initialized at ctor), server_task_result_cmpl_partial, and server_task_result_cmpl_final * Reject `input_file` since it is not supported by chatcmpl * Add "fc_" prefix to non-streaming function call id as coderabbit pointed out --------- Co-authored-by: openingnow <>
This commit is contained in:
parent
33f890e579
commit
fbbf3ad190
11 changed files with 836 additions and 40 deletions
|
|
@ -584,6 +584,8 @@ json server_task_result_cmpl_final::to_json() {
|
|||
return to_json_oaicompat();
|
||||
case TASK_RESPONSE_TYPE_OAI_CHAT:
|
||||
return stream ? to_json_oaicompat_chat_stream() : to_json_oaicompat_chat();
|
||||
case TASK_RESPONSE_TYPE_OAI_RESP:
|
||||
return stream ? to_json_oaicompat_resp_stream() : to_json_oaicompat_resp();
|
||||
case TASK_RESPONSE_TYPE_ANTHROPIC:
|
||||
return stream ? to_json_anthropic_stream() : to_json_anthropic();
|
||||
default:
|
||||
|
|
@ -801,6 +803,186 @@ json server_task_result_cmpl_final::to_json_oaicompat_chat_stream() {
|
|||
return deltas;
|
||||
}
|
||||
|
||||
// Build the non-streaming OpenAI /v1/responses-compatible result object.
// Output items are emitted in order: reasoning (if any), message (if any),
// then one "function_call" item per parsed tool call.
json server_task_result_cmpl_final::to_json_oaicompat_resp() {
    // Prefer the parsed chat message; fall back to the raw generated text
    // when no parsed message is available.
    common_chat_msg msg;
    if (!oaicompat_msg.empty()) {
        msg = oaicompat_msg;
    } else {
        msg.role = "assistant";
        msg.content = content;
    }

    std::vector<json> output;

    if (!msg.reasoning_content.empty()) {
        output.push_back(json {
            {"id", "rs_" + random_string()},
            {"summary", json::array()},
            {"type", "reasoning"},
            {"content", json::array({ json {
                {"text", msg.reasoning_content},
                {"type", "reasoning_text"},
            }})},
            {"encrypted_content", ""},
            {"status", "completed"},
        });
    }

    if (!msg.content.empty()) {
        output.push_back(json {
            {"content", json::array({ json {
                {"type", "output_text"},
                {"annotations", json::array()},
                {"logprobs", json::array()},
                {"text", msg.content},
            }})},
            {"id", "msg_" + random_string()},
            {"role", msg.role},
            {"status", "completed"},
            {"type", "message"},
        });
    }

    // Iterate the same msg used above for consistency; the fallback branch
    // never carries tool calls, so this matches iterating oaicompat_msg.
    for (const common_chat_tool_call & tool_call : msg.tool_calls) {
        output.push_back(json {
            {"type", "function_call"},
            {"status", "completed"},
            {"arguments", tool_call.arguments},
            // "fc_" prefix per the Responses API function-call id convention
            {"call_id", "fc_" + tool_call.id},
            {"name", tool_call.name},
        });
    }

    std::time_t t = std::time(nullptr);
    json res = {
        {"completed_at", t},
        {"created_at", t},
        {"id", oai_resp_id},
        {"model", oaicompat_model},
        {"object", "response"},
        {"output", output},
        {"status", "completed"},
        {"usage", json {
            {"input_tokens", n_prompt_tokens},
            {"output_tokens", n_decoded},
            {"total_tokens", n_decoded + n_prompt_tokens},
        }},
    };

    return res;
}
|
||||
|
||||
// Build the final SSE events for a streaming OpenAI /v1/responses request:
// a "*.done" event per completed output item, followed by a single
// "response.completed" event carrying the full accumulated response.
json server_task_result_cmpl_final::to_json_oaicompat_resp_stream() {
    std::vector<json> server_sent_events;
    std::vector<json> output; // accumulated items for the final "response.completed"

    if (!oaicompat_msg.reasoning_content.empty()) {
        const json output_item = json {
            {"id", oai_resp_reasoning_id},
            {"summary", json::array()},
            {"type", "reasoning"},
            {"content", json::array({ json {
                {"text", oaicompat_msg.reasoning_content},
                {"type", "reasoning_text"},
            }})},
            {"encrypted_content", ""},
        };

        server_sent_events.push_back(json {
            {"event", "response.output_item.done"},
            {"data", json {
                {"type", "response.output_item.done"},
                {"item", output_item}
            }}
        });
        output.push_back(output_item);
    }

    if (!oaicompat_msg.content.empty()) {
        server_sent_events.push_back(json {
            {"event", "response.output_text.done"},
            {"data", json {
                {"type", "response.output_text.done"},
                {"item_id", oai_resp_message_id},
                {"text", oaicompat_msg.content}
            }}
        });

        const json content_part = {
            {"type", "output_text"},
            {"annotations", json::array()},
            {"logprobs", json::array()},
            {"text", oaicompat_msg.content}
        };

        server_sent_events.push_back(json {
            {"event", "response.content_part.done"},
            {"data", json {
                {"type", "response.content_part.done"},
                {"item_id", oai_resp_message_id},
                {"part", content_part}
            }}
        });

        const json output_item = {
            {"type", "message"},
            {"status", "completed"},
            {"id", oai_resp_message_id},
            {"content", json::array({content_part})},
            {"role", "assistant"}
        };

        server_sent_events.push_back(json {
            {"event", "response.output_item.done"},
            {"data", json {
                {"type", "response.output_item.done"},
                {"item", output_item}
            }}
        });
        output.push_back(output_item);
    }

    for (const common_chat_tool_call & tool_call : oaicompat_msg.tool_calls) {
        const json output_item = {
            {"type", "function_call"},
            {"status", "completed"},
            {"arguments", tool_call.arguments},
            // "fc_" prefix per the Responses API function-call id convention
            {"call_id", "fc_" + tool_call.id},
            {"name", tool_call.name}
        };
        server_sent_events.push_back(json {
            {"event", "response.output_item.done"},
            {"data", json {
                {"type", "response.output_item.done"},
                {"item", output_item}
            }}
        });
        output.push_back(output_item);
    }

    std::time_t t = std::time(nullptr);
    server_sent_events.push_back(json {
        {"event", "response.completed"},
        {"data", json {
            {"type", "response.completed"},
            {"response", json {
                {"id", oai_resp_id},
                {"object", "response"},
                {"created_at", t},
                {"status", "completed"},
                {"model", oaicompat_model},
                {"output", output},
                {"usage", json {
                    {"input_tokens", n_prompt_tokens},
                    {"output_tokens", n_decoded},
                    {"total_tokens", n_decoded + n_prompt_tokens}
                }}
            }},
        }}
    });

    return server_sent_events;
}
|
||||
|
||||
json server_task_result_cmpl_final::to_json_anthropic() {
|
||||
std::string stop_reason = "max_tokens";
|
||||
if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) {
|
||||
|
|
@ -1057,6 +1239,36 @@ json server_task_result_cmpl_final::to_json_anthropic_stream() {
|
|||
//
|
||||
// server_task_result_cmpl_partial
|
||||
//
|
||||
// Fold this partial chunk into the shared per-request streaming state and
// snapshot the fields the to_json_*() serializers read. The snapshot is
// taken BEFORE applying this chunk's diffs, so serializers can detect
// "first reasoning/text delta" transitions.
void server_task_result_cmpl_partial::update(task_result_state & state) {
    is_updated = true;
    // accumulate this chunk's text into the running message and compute
    // the per-chunk diffs consumed below and by the serializers
    state.update_chat_msg(content, true, oaicompat_msg_diffs);

    // Copy current state for use in to_json_*() (reflects state BEFORE this chunk)
    thinking_block_started = state.thinking_block_started;
    text_block_started = state.text_block_started;

    // response/output-item ids are kept in state so they stay identical
    // across every chunk of the same response
    oai_resp_id = state.oai_resp_id;
    oai_resp_reasoning_id = state.oai_resp_reasoning_id;
    oai_resp_message_id = state.oai_resp_message_id;
    oai_resp_fc_id = state.oai_resp_fc_id;

    // track if the accumulated message has any reasoning content
    anthropic_has_reasoning = !state.chat_msg.reasoning_content.empty();

    // Pre-compute state updates based on diffs (for next chunk)
    for (const common_chat_msg_diff & diff : oaicompat_msg_diffs) {
        if (!diff.reasoning_content_delta.empty() && !state.thinking_block_started) {
            state.thinking_block_started = true;
        }
        if (!diff.content_delta.empty() && !state.text_block_started) {
            state.text_block_started = true;
        }
        if (!diff.tool_call_delta.name.empty()) {
            // a new tool call began: remember its id so later argument
            // deltas can reference the correct function-call item
            state.oai_resp_fc_id = diff.tool_call_delta.id;
        }
    }
}
|
||||
|
||||
json server_task_result_cmpl_partial::to_json() {
|
||||
GGML_ASSERT(is_updated && "update() must be called before to_json()");
|
||||
switch (res_type) {
|
||||
|
|
@ -1066,6 +1278,8 @@ json server_task_result_cmpl_partial::to_json() {
|
|||
return to_json_oaicompat();
|
||||
case TASK_RESPONSE_TYPE_OAI_CHAT:
|
||||
return to_json_oaicompat_chat();
|
||||
case TASK_RESPONSE_TYPE_OAI_RESP:
|
||||
return to_json_oaicompat_resp();
|
||||
case TASK_RESPONSE_TYPE_ANTHROPIC:
|
||||
return to_json_anthropic();
|
||||
default:
|
||||
|
|
@ -1190,6 +1404,132 @@ json server_task_result_cmpl_partial::to_json_oaicompat_chat() {
|
|||
return deltas;
|
||||
}
|
||||
|
||||
// Translate this chunk's message diffs into streaming SSE events for the
// OpenAI /v1/responses API. Emits response.created/in_progress on the first
// decoded token, "*.output_item.added" events when a reasoning / message /
// function_call item first appears, and "*.delta" events for the content.
json server_task_result_cmpl_partial::to_json_oaicompat_resp() {
    std::vector<json> events;

    // first chunk of the response: announce creation + in-progress status
    if (n_decoded == 1) {
        events.push_back(json {
            {"event", "response.created"},
            {"data", json {
                {"type", "response.created"},
                {"response", json {
                    {"id", oai_resp_id},
                    {"object", "response"},
                    {"status", "in_progress"},
                }},
            }},
        });
        events.push_back(json {
            {"event", "response.in_progress"},
            {"data", json {
                {"type", "response.in_progress"},
                {"response", json {
                    {"id", oai_resp_id},
                    {"object", "response"},
                    {"status", "in_progress"},
                }},
            }},
        });
    }

    for (const common_chat_msg_diff & diff : oaicompat_msg_diffs) {
        if (!diff.reasoning_content_delta.empty()) {
            // thinking_block_started was snapshotted in update() and reflects
            // the state BEFORE this chunk; emit the item header exactly once
            if (!thinking_block_started) {
                events.push_back(json {
                    {"event", "response.output_item.added"},
                    {"data", json {
                        {"type", "response.output_item.added"},
                        {"item", json {
                            {"id", oai_resp_reasoning_id},
                            {"summary", json::array()},
                            {"type", "reasoning"},
                            {"content", json::array()},
                            {"encrypted_content", ""},
                            {"status", "in_progress"},
                        }},
                    }},
                });
                thinking_block_started = true;
            }
            events.push_back(json {
                {"event", "response.reasoning_text.delta"},
                {"data", json {
                    {"type", "response.reasoning_text.delta"},
                    {"delta", diff.reasoning_content_delta},
                    {"item_id", oai_resp_reasoning_id},
                }},
            });
        }

        if (!diff.content_delta.empty()) {
            // first text delta: open the message item and its text part
            if (!text_block_started) {
                events.push_back(json {
                    {"event", "response.output_item.added"},
                    {"data", json {
                        {"type", "response.output_item.added"},
                        {"item", json {
                            {"content", json::array()},
                            {"id", oai_resp_message_id},
                            {"role", "assistant"},
                            {"status", "in_progress"},
                            {"type", "message"},
                        }},
                    }},
                });
                events.push_back(json {
                    {"event", "response.content_part.added"},
                    {"data", json {
                        {"type", "response.content_part.added"},
                        {"item_id", oai_resp_message_id},
                        {"part", json {
                            {"type", "output_text"},
                            {"text", ""},
                        }},
                    }},
                });
                text_block_started = true;
            }
            events.push_back(json {
                {"event", "response.output_text.delta"},
                {"data", json {
                    {"type", "response.output_text.delta"},
                    {"item_id", oai_resp_message_id},
                    {"delta", diff.content_delta},
                }},
            });
        }

        // a diff carrying a tool-call name marks the start of a new function call
        if (!diff.tool_call_delta.name.empty()) {
            events.push_back(json {
                {"event", "response.output_item.added"},
                {"data", json {
                    {"type", "response.output_item.added"},
                    {"item", json {
                        {"arguments", ""},
                        {"call_id", "fc_" + diff.tool_call_delta.id},
                        {"name", diff.tool_call_delta.name},
                        {"type", "function_call"},
                        {"status", "in_progress"},
                    }},
                }},
            });
            // remember the id so subsequent argument deltas reference it
            oai_resp_fc_id = diff.tool_call_delta.id;
        }

        if (!diff.tool_call_delta.arguments.empty()) {
            events.push_back(json {
                {"event", "response.function_call_arguments.delta"},
                {"data", json {
                    {"type", "response.function_call_arguments.delta"},
                    {"delta", diff.tool_call_delta.arguments},
                    {"item_id", "fc_" + oai_resp_fc_id},
                }},
            });
        }
    }
    return events;
}
|
||||
|
||||
//
|
||||
// server_task_result_embd
|
||||
//
|
||||
|
|
@ -1260,8 +1600,8 @@ json server_task_result_cmpl_partial::to_json_anthropic() {
|
|||
|
||||
// use local copies of streaming state (copied from task_result_state in update())
|
||||
// these reflect the state BEFORE this chunk was processed
|
||||
bool thinking_started = anthropic_thinking_block_started;
|
||||
bool text_started = anthropic_text_block_started;
|
||||
bool thinking_started = thinking_block_started;
|
||||
bool text_started = text_block_started;
|
||||
|
||||
for (const auto & diff : oaicompat_msg_diffs) {
|
||||
// handle thinking/reasoning content
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue