server: Add cached_tokens info to oaicompat responses (#19361)

* tests : fix fetch_server_test_models.py

* server: to_json_oaicompat cached_tokens

Adds OpenAI- and Anthropic-compatible information about the
number of cached prompt tokens used in a response.
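
As an illustration of the resulting accounting (a minimal sketch, not part of the patch): the snippet below builds the two usage shapes with nlohmann::json, the JSON library the server uses. Field names follow the diff; the token counts and the oai_usage/anthropic_usage variable names are made up for the example.

#include <cassert>
#include <nlohmann/json.hpp>

using json = nlohmann::ordered_json;

int main() {
    // Hypothetical example counts -- not taken from the patch or any real run.
    int n_prompt_tokens       = 128; // total prompt tokens, cached or not
    int n_prompt_tokens_cache = 96;  // prompt tokens reused from the prompt cache
    int n_decoded             = 42;  // generated tokens

    // OpenAI-compatible shape: prompt_tokens includes cached tokens;
    // the cached share is reported under prompt_tokens_details.cached_tokens.
    json oai_usage = {
        {"completion_tokens", n_decoded},
        {"prompt_tokens", n_prompt_tokens},
        {"total_tokens", n_decoded + n_prompt_tokens},
        {"prompt_tokens_details", {{"cached_tokens", n_prompt_tokens_cache}}},
    };

    // Anthropic-compatible shape: input_tokens excludes cached tokens,
    // which are reported separately as cache_read_input_tokens.
    json anthropic_usage = {
        {"cache_read_input_tokens", n_prompt_tokens_cache},
        {"input_tokens", n_prompt_tokens - n_prompt_tokens_cache},
        {"output_tokens", n_decoded},
    };

    // Both accountings describe the same prompt.
    assert(oai_usage["prompt_tokens"].get<int>() ==
           anthropic_usage["input_tokens"].get<int>() +
           anthropic_usage["cache_read_input_tokens"].get<int>());
    (void) oai_usage; (void) anthropic_usage; // keep used when NDEBUG strips the assert
    return 0;
}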
Ryan Goulden, 2026-03-19 11:09:33 -07:00, committed by GitHub
parent 76f2dc70c3
commit 26c9ce1288
6 changed files with 61 additions and 31 deletions


@@ -755,6 +755,15 @@ json server_task_result_cmpl_final::to_json_non_oaicompat() {
     return response_fields.empty() ? res : json_get_nested_values(response_fields, res);
 }
+json server_task_result_cmpl_final::usage_json_oaicompat() {
+    return json {
+        {"completion_tokens", n_decoded},
+        {"prompt_tokens", n_prompt_tokens},
+        {"total_tokens", n_decoded + n_prompt_tokens},
+        {"prompt_tokens_details", json { {"cached_tokens", n_prompt_tokens_cache} }},
+    };
+}
 json server_task_result_cmpl_final::to_json_oaicompat() {
     std::time_t t = std::time(0);
     json logprobs = json(nullptr); // OAI default to null
@@ -780,11 +789,7 @@ json server_task_result_cmpl_final::to_json_oaicompat() {
         {"model", oaicompat_model},
         {"system_fingerprint", build_info},
         {"object", "text_completion"},
-        {"usage", json {
-            {"completion_tokens", n_decoded},
-            {"prompt_tokens", n_prompt_tokens},
-            {"total_tokens", n_decoded + n_prompt_tokens}
-        }},
+        {"usage", usage_json_oaicompat()},
         {"id", oaicompat_cmpl_id}
     };
@@ -832,11 +837,7 @@ json server_task_result_cmpl_final::to_json_oaicompat_chat() {
         {"model", oaicompat_model},
         {"system_fingerprint", build_info},
         {"object", "chat.completion"},
-        {"usage", json {
-            {"completion_tokens", n_decoded},
-            {"prompt_tokens", n_prompt_tokens},
-            {"total_tokens", n_decoded + n_prompt_tokens}
-        }},
+        {"usage", usage_json_oaicompat()},
         {"id", oaicompat_cmpl_id}
     };
@@ -901,11 +902,7 @@ json server_task_result_cmpl_final::to_json_oaicompat_chat_stream() {
         {"model", oaicompat_model},
         {"system_fingerprint", build_info},
         {"object", "chat.completion.chunk"},
-        {"usage", json {
-            {"completion_tokens", n_decoded},
-            {"prompt_tokens", n_prompt_tokens},
-            {"total_tokens", n_decoded + n_prompt_tokens},
-        }},
+        {"usage", usage_json_oaicompat()},
     });
 }
@@ -984,6 +981,7 @@ json server_task_result_cmpl_final::to_json_oaicompat_resp() {
         {"input_tokens", n_prompt_tokens},
         {"output_tokens", n_decoded},
         {"total_tokens", n_decoded + n_prompt_tokens},
+        {"input_tokens_details", json { {"cached_tokens", n_prompt_tokens_cache} }},
     }},
 };
@@ -1092,7 +1090,8 @@ json server_task_result_cmpl_final::to_json_oaicompat_resp_stream() {
             {"usage", json {
                 {"input_tokens", n_prompt_tokens},
                 {"output_tokens", n_decoded},
-                {"total_tokens", n_decoded + n_prompt_tokens}
+                {"total_tokens", n_decoded + n_prompt_tokens},
+                {"input_tokens_details", json { {"cached_tokens", n_prompt_tokens_cache} }},
             }}
         }},
     }}
@@ -1158,7 +1157,8 @@ json server_task_result_cmpl_final::to_json_anthropic() {
         {"stop_reason", stop_reason},
         {"stop_sequence", stopping_word.empty() ? nullptr : json(stopping_word)},
         {"usage", {
-            {"input_tokens", n_prompt_tokens},
+            {"cache_read_input_tokens", n_prompt_tokens_cache},
+            {"input_tokens", n_prompt_tokens - n_prompt_tokens_cache},
             {"output_tokens", n_decoded}
         }}
     };
@@ -1668,7 +1668,8 @@ json server_task_result_cmpl_partial::to_json_anthropic() {
         {"stop_reason", nullptr},
         {"stop_sequence", nullptr},
         {"usage", {
-            {"input_tokens", n_prompt_tokens},
+            {"cache_read_input_tokens", n_prompt_tokens_cache},
+            {"input_tokens", n_prompt_tokens - n_prompt_tokens_cache},
             {"output_tokens", 0}
         }}
     }}