server: prevent data race from HTTP threads (#18263)

* server: prevent data race from HTTP threads * fix params * fix default_generation_settings * nits: make handle_completions_impl looks less strange * stricter const * fix GGML_ASSERT(idx < states.size()) * move index to be managed by server_response_reader * http: make sure req & res lifecycle are tied together * fix compile * fix index handling buggy * fix data race for lora endpoint * nits: fix shadow variable * nits: revert redundant changes * nits: correct naming for json_webui_settings
2025-12-22 14:23:34 +01:00 · 2025-12-22 14:23:34 +01:00 · 6ce863c803
commit 6ce863c803
parent 3997c78e33
11 changed files with 459 additions and 366 deletions
--- a/tools/server/server-task.cpp
+++ b/tools/server/server-task.cpp
@ -32,8 +32,8 @@ json task_params::to_json(bool only_metrics) const {
    }

    json lora = json::array();
-    for (size_t i = 0; i < this->lora.size(); ++i) {
-        lora.push_back({{"id", i}, {"scale", this->lora[i].scale}});
+    for (auto & it : this->lora) {
+        lora.push_back({{"id", it.first}, {"scale", it.second}});
    }

    if (only_metrics) {
@ -145,12 +145,10 @@ json task_params::to_json(bool only_metrics) const {
 //

 task_params server_task::params_from_json_cmpl(
-        const llama_context * ctx,
+        const llama_vocab * vocab,
        const common_params & params_base,
+        const int n_ctx_slot,
        const json & data) {
-    const llama_model * model = llama_get_model(ctx);
-    const llama_vocab * vocab = llama_model_get_vocab(model);
-
    task_params params;

    // Sampling parameter defaults are loaded from the global server context (but individual requests can still override them)
@ -223,12 +221,12 @@ task_params server_task::params_from_json_cmpl(

    if (data.contains("lora")) {
        if (data.at("lora").is_array()) {
-            params.lora = parse_lora_request(params_base.lora_adapters, data.at("lora"));
+            params.lora = parse_lora_request(data.at("lora"));
        } else {
            throw std::runtime_error("Error: 'lora' must be an array of objects with 'id' and 'scale' fields");
        }
    } else {
-        params.lora = params_base.lora_adapters;
+        params.lora = {};
    }

    // TODO: add more sanity checks for the input parameters
@ -243,11 +241,11 @@ task_params server_task::params_from_json_cmpl(

    if (params.sampling.penalty_last_n == -1) {
        // note: should be the slot's context and not the full context, but it's ok
-        params.sampling.penalty_last_n = llama_n_ctx(ctx);
+        params.sampling.penalty_last_n = n_ctx_slot;
    }

    if (params.sampling.dry_penalty_last_n == -1) {
-        params.sampling.dry_penalty_last_n = llama_n_ctx(ctx);
+        params.sampling.dry_penalty_last_n = n_ctx_slot;
    }

    if (params.sampling.dry_base < 1.0f) {
@ -1324,6 +1322,30 @@ json server_task_result_slot_erase::to_json() {
    };
 }

+//
+// server_task_result_get_lora
+//
+
+json server_task_result_get_lora::to_json() { // serializes `loras` into a JSON array, one object per element
+    json result = json::array();
+    for (size_t i = 0; i < loras.size(); ++i) {
+        auto & lora = loras[i];
+        json entry = {
+            {"id",            i}, // "id" is the element's index within `loras`
+            {"path",          lora.info.path},
+            {"scale",         lora.info.scale},
+            {"task_name",     lora.info.task_name},
+            {"prompt_prefix", lora.info.prompt_prefix},
+        };
+        if (!lora.alora_invocation_tokens.empty()) { // the two alora_* keys are emitted only when invocation tokens are present
+            entry["alora_invocation_string"] = lora.alora_invocation_string;
+            entry["alora_invocation_tokens"] = lora.alora_invocation_tokens;
+        }
+        result.push_back(std::move(entry)); // `entry` is not used again after being moved into the array
+    }
+    return result; // an empty `loras` yields an empty JSON array
+}
+
 //
 // server_task_result_apply_lora
 //