server: prevent data race from HTTP threads (#18263)
* server: prevent data race from HTTP threads * fix params * fix default_generation_settings * nits: make handle_completions_impl looks less strange * stricter const * fix GGML_ASSERT(idx < states.size()) * move index to be managed by server_response_reader * http: make sure req & res lifecycle are tied together * fix compile * fix index handling buggy * fix data race for lora endpoint * nits: fix shadow variable * nits: revert redundant changes * nits: correct naming for json_webui_settings
This commit is contained in:
parent
3997c78e33
commit
6ce863c803
11 changed files with 459 additions and 366 deletions
|
|
@ -32,8 +32,8 @@ json task_params::to_json(bool only_metrics) const {
|
|||
}
|
||||
|
||||
json lora = json::array();
|
||||
for (size_t i = 0; i < this->lora.size(); ++i) {
|
||||
lora.push_back({{"id", i}, {"scale", this->lora[i].scale}});
|
||||
for (auto & it : this->lora) {
|
||||
lora.push_back({{"id", it.first}, {"scale", it.second}});
|
||||
}
|
||||
|
||||
if (only_metrics) {
|
||||
|
|
@ -145,12 +145,10 @@ json task_params::to_json(bool only_metrics) const {
|
|||
//
|
||||
|
||||
task_params server_task::params_from_json_cmpl(
|
||||
const llama_context * ctx,
|
||||
const llama_vocab * vocab,
|
||||
const common_params & params_base,
|
||||
const int n_ctx_slot,
|
||||
const json & data) {
|
||||
const llama_model * model = llama_get_model(ctx);
|
||||
const llama_vocab * vocab = llama_model_get_vocab(model);
|
||||
|
||||
task_params params;
|
||||
|
||||
// Sampling parameter defaults are loaded from the global server context (but individual requests can still them)
|
||||
|
|
@ -223,12 +221,12 @@ task_params server_task::params_from_json_cmpl(
|
|||
|
||||
if (data.contains("lora")) {
|
||||
if (data.at("lora").is_array()) {
|
||||
params.lora = parse_lora_request(params_base.lora_adapters, data.at("lora"));
|
||||
params.lora = parse_lora_request(data.at("lora"));
|
||||
} else {
|
||||
throw std::runtime_error("Error: 'lora' must be an array of objects with 'id' and 'scale' fields");
|
||||
}
|
||||
} else {
|
||||
params.lora = params_base.lora_adapters;
|
||||
params.lora = {};
|
||||
}
|
||||
|
||||
// TODO: add more sanity checks for the input parameters
|
||||
|
|
@ -243,11 +241,11 @@ task_params server_task::params_from_json_cmpl(
|
|||
|
||||
if (params.sampling.penalty_last_n == -1) {
|
||||
// note: should be the slot's context and not the full context, but it's ok
|
||||
params.sampling.penalty_last_n = llama_n_ctx(ctx);
|
||||
params.sampling.penalty_last_n = n_ctx_slot;
|
||||
}
|
||||
|
||||
if (params.sampling.dry_penalty_last_n == -1) {
|
||||
params.sampling.dry_penalty_last_n = llama_n_ctx(ctx);
|
||||
params.sampling.dry_penalty_last_n = n_ctx_slot;
|
||||
}
|
||||
|
||||
if (params.sampling.dry_base < 1.0f) {
|
||||
|
|
@ -1324,6 +1322,30 @@ json server_task_result_slot_erase::to_json() {
|
|||
};
|
||||
}
|
||||
|
||||
//
|
||||
// server_task_result_get_lora
|
||||
//
|
||||
|
||||
json server_task_result_get_lora::to_json() {
|
||||
json result = json::array();
|
||||
for (size_t i = 0; i < loras.size(); ++i) {
|
||||
auto & lora = loras[i];
|
||||
json entry = {
|
||||
{"id", i},
|
||||
{"path", lora.info.path},
|
||||
{"scale", lora.info.scale},
|
||||
{"task_name", lora.info.task_name},
|
||||
{"prompt_prefix", lora.info.prompt_prefix},
|
||||
};
|
||||
if (!lora.alora_invocation_tokens.empty()) {
|
||||
entry["alora_invocation_string"] = lora.alora_invocation_string;
|
||||
entry["alora_invocation_tokens"] = lora.alora_invocation_tokens;
|
||||
}
|
||||
result.push_back(std::move(entry));
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
//
|
||||
// server_task_result_apply_lora
|
||||
//
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue