server: Add cached_tokens info to oaicompat responses (#19361)

* tests : fix fetch_server_test_models.py

* server: to_json_oaicompat cached_tokens

Adds OpenAI and Anthropic compatible information about the
number of cached prompt tokens used in a response.
This commit is contained in:
Ryan Goulden 2026-03-19 11:09:33 -07:00 committed by GitHub
parent 76f2dc70c3
commit 26c9ce1288
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 61 additions and 31 deletions

View file

@@ -1432,9 +1432,10 @@ private:
res->tokens = { tkn.tok };
}
res->n_decoded = slot.n_decoded;
res->n_prompt_tokens = slot.task->n_tokens();
res->post_sampling_probs = slot.task->params.post_sampling_probs;
res->n_decoded = slot.n_decoded;
res->n_prompt_tokens = slot.task->n_tokens();
res->n_prompt_tokens_cache = slot.n_prompt_tokens_cache;
res->post_sampling_probs = slot.task->params.post_sampling_probs;
res->verbose = slot.task->params.verbose;
res->res_type = slot.task->params.res_type;
@@ -1479,14 +1480,15 @@ private:
res->prompt = slot.task->tokens.detokenize(ctx, true);
res->response_fields = std::move(slot.task->params.response_fields);
res->truncated = slot.truncated;
res->n_decoded = slot.n_decoded;
res->n_prompt_tokens = slot.task->n_tokens();
res->n_tokens_cached = slot.prompt.n_tokens();
res->has_new_line = slot.has_new_line;
res->stopping_word = slot.stopping_word;
res->stop = slot.stop;
res->post_sampling_probs = slot.task->params.post_sampling_probs;
res->truncated = slot.truncated;
res->n_decoded = slot.n_decoded;
res->n_prompt_tokens = slot.task->n_tokens();
res->n_prompt_tokens_cache = slot.n_prompt_tokens_cache;
res->n_tokens_cached = slot.prompt.n_tokens();
res->has_new_line = slot.has_new_line;
res->stopping_word = slot.stopping_word;
res->stop = slot.stop;
res->post_sampling_probs = slot.task->params.post_sampling_probs;
res->verbose = slot.task->params.verbose;
res->stream = slot.task->params.stream;