server: Add cached_tokens info to oaicompat responses (#19361)
* tests: fix fetch_server_test_models.py. * server: to_json_oaicompat cached_tokens — adds OpenAI- and Anthropic-compatible information about the number of cached prompt tokens used in a response.
This commit is contained in:
parent
76f2dc70c3
commit
26c9ce1288
6 changed files with 61 additions and 31 deletions
|
|
@@ -1432,9 +1432,10 @@ private:
|
|||
res->tokens = { tkn.tok };
|
||||
}
|
||||
|
||||
res->n_decoded = slot.n_decoded;
|
||||
res->n_prompt_tokens = slot.task->n_tokens();
|
||||
res->post_sampling_probs = slot.task->params.post_sampling_probs;
|
||||
res->n_decoded = slot.n_decoded;
|
||||
res->n_prompt_tokens = slot.task->n_tokens();
|
||||
res->n_prompt_tokens_cache = slot.n_prompt_tokens_cache;
|
||||
res->post_sampling_probs = slot.task->params.post_sampling_probs;
|
||||
|
||||
res->verbose = slot.task->params.verbose;
|
||||
res->res_type = slot.task->params.res_type;
|
||||
|
|
@@ -1479,14 +1480,15 @@ private:
|
|||
res->prompt = slot.task->tokens.detokenize(ctx, true);
|
||||
res->response_fields = std::move(slot.task->params.response_fields);
|
||||
|
||||
res->truncated = slot.truncated;
|
||||
res->n_decoded = slot.n_decoded;
|
||||
res->n_prompt_tokens = slot.task->n_tokens();
|
||||
res->n_tokens_cached = slot.prompt.n_tokens();
|
||||
res->has_new_line = slot.has_new_line;
|
||||
res->stopping_word = slot.stopping_word;
|
||||
res->stop = slot.stop;
|
||||
res->post_sampling_probs = slot.task->params.post_sampling_probs;
|
||||
res->truncated = slot.truncated;
|
||||
res->n_decoded = slot.n_decoded;
|
||||
res->n_prompt_tokens = slot.task->n_tokens();
|
||||
res->n_prompt_tokens_cache = slot.n_prompt_tokens_cache;
|
||||
res->n_tokens_cached = slot.prompt.n_tokens();
|
||||
res->has_new_line = slot.has_new_line;
|
||||
res->stopping_word = slot.stopping_word;
|
||||
res->stop = slot.stop;
|
||||
res->post_sampling_probs = slot.task->params.post_sampling_probs;
|
||||
|
||||
res->verbose = slot.task->params.verbose;
|
||||
res->stream = slot.task->params.stream;
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue