server: move msg diffs tracking to HTTP thread (#17740)

* server: move msg diffs tracking to HTTP thread

* wip

* tool call tests ok

* minor : style

* cont : fix

* move states to server_response_reader

* add safe-guard

* fix

* fix 2

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
Xuan-Son Nguyen 2025-12-04 15:46:08 +01:00 committed by GitHub
parent 817d743cc1
commit c4c10bfb86
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 167 additions and 94 deletions

View file

@ -565,6 +565,7 @@ std::vector<unsigned char> completion_token_output::str_to_bytes(const std::stri
// server_task_result_cmpl_final
//
json server_task_result_cmpl_final::to_json() {
GGML_ASSERT(is_updated && "update() must be called before to_json()");
switch (res_type) {
case TASK_RESPONSE_TYPE_NONE:
return to_json_non_oaicompat();
@ -582,8 +583,8 @@ json server_task_result_cmpl_final::to_json() {
json server_task_result_cmpl_final::to_json_non_oaicompat() {
json res = json {
{"index", index},
{"content", stream ? "" : content}, // in stream mode, content is already in last partial chunk
{"tokens", stream ? llama_tokens {} : tokens},
{"content", content},
{"tokens", tokens},
{"id_slot", id_slot},
{"stop", true},
{"model", oaicompat_model},
@ -619,7 +620,7 @@ json server_task_result_cmpl_final::to_json_oaicompat() {
json res = json {
{"choices", json::array({
json{
{"text", stream ? "" : content}, // in stream mode, content is already in last partial chunk
{"text", content},
{"index", index},
{"logprobs", logprobs},
{"finish_reason", finish_reason},
@ -700,6 +701,25 @@ json server_task_result_cmpl_final::to_json_oaicompat_chat() {
return res;
}
// Append newly generated text to the accumulated output, re-parse the whole
// accumulated text as a chat message and report what changed.
//
// text_added : text to append to generated_text before parsing
// is_partial : forwarded to common_chat_parse()
// diffs      : output; overwritten with the diffs between the previously
//              stored chat_msg and the newly parsed one. Left untouched when
//              the parse result is empty, so callers that reuse the vector
//              across calls must clear it themselves.
//
// Returns a copy of the stored chat_msg (the new message if the parse
// produced a non-empty result, otherwise the previous one).
common_chat_msg task_result_state::update_chat_msg(
        const std::string & text_added,
        bool is_partial,
        std::vector<common_chat_msg_diff> & diffs) {
    generated_text += text_added;
    SRV_DBG("Parsing chat message: %s\n", generated_text.c_str());
    auto new_msg = common_chat_parse(
        generated_text,
        is_partial,
        oaicompat_chat_syntax);

    // nothing parsed yet: keep the previous message and do not touch diffs
    if (new_msg.empty()) {
        return chat_msg;
    }

    // NOTE(review): presumably assigns stable IDs to the parsed tool calls,
    // reusing generated_tool_call_ids across calls - confirm against
    // common_chat_msg::set_tool_call_ids
    new_msg.set_tool_call_ids(generated_tool_call_ids, gen_tool_call_id);

    // snapshot the previous message only on the path that needs it; the
    // stored message is replaced before the diffs are computed
    const auto msg_prv_copy = chat_msg;
    chat_msg = new_msg;
    diffs = common_chat_msg_diff::compute_diffs(msg_prv_copy, new_msg);
    return chat_msg;
}
json server_task_result_cmpl_final::to_json_oaicompat_chat_stream() {
std::time_t t = std::time(0);
std::string finish_reason = "length";
@ -956,6 +976,7 @@ json server_task_result_cmpl_final::to_json_anthropic_stream() {
// server_task_result_cmpl_partial
//
json server_task_result_cmpl_partial::to_json() {
GGML_ASSERT(is_updated && "update() must be called before to_json()");
switch (res_type) {
case TASK_RESPONSE_TYPE_NONE:
return to_json_non_oaicompat();