server: save and clear idle slots on new task (--clear-idle) (#20993)

* server: clear idle slots KV from VRAM (LLAMA_KV_KEEP_ONLY_ACTIVE)

* server: move idle slot KV clearing to slot release

The save "cost" is now paid by the finishing request.

* server: add --kv-clear-idle flag, enable by default

* server: skip clearing last idle slot, clear on launch

* server: test --no-kv-clear-idle flag

* server: simplify on-release clearing loop

* server: remove on-release KV clearing, keep launch-only

* cont : clean-up

* tests: update log strings after --clear-idle rename

* tests: use debug tags instead of log message matching

* test: fix Windows CI by dropping temp log file unlink

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
Yes You Can Have Your Own 2026-04-03 20:02:27 +03:00 committed by GitHub
parent f1f793ad06
commit 50e0ad08fb
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
9 changed files with 187 additions and 18 deletions

View file

@@ -605,6 +605,17 @@ private:
llama_batch_free(batch);
}
// Persist an idle slot's prompt/KV state to the prompt cache, then release
// its KV cells so the memory can be reused by active slots.
// No-op for slots whose context is empty.
void slot_save_and_clear(server_slot & slot) {
    if (slot.prompt.n_tokens() > 0) {
        SLT_INF(slot, "%s", "saving idle slot to prompt cache\n");
        SLT_DBG(slot, "%s", "__TEST_TAG_CLEAR_IDLE_SLOT__\n");

        // save first, then drop the in-memory state and refresh the cache
        slot.prompt_save(*prompt_cache);
        slot.prompt_clear(false);

        prompt_cache->update();
    }
}
void handle_sleeping_state(bool new_state) {
GGML_ASSERT(sleeping != new_state);
if (new_state) {
@@ -864,6 +875,19 @@ private:
metrics.init();
if (params_base.clear_idle) {
if (!params_base.kv_unified) {
SRV_WRN("%s: --clear-idle requires --kv-unified, disabling\n", __func__);
params_base.clear_idle = false;
} else if (params_base.cache_ram_mib == 0) {
SRV_WRN("%s: --clear-idle requires --cache-ram, disabling\n", __func__);
params_base.clear_idle = false;
} else {
SRV_INF("%s: idle slots will be saved to prompt cache and cleared upon starting a new task\n", __func__);
SRV_DBG("%s", "__TEST_TAG_CLEAR_IDLE_ENABLED__\n");
}
}
// populate webui settings
{
if (!params_base.webui_config_json.empty()) {
@@ -1010,15 +1034,15 @@ private:
// cache prompts only for completion tasks
update_cache = update_cache && task.type == SERVER_TASK_TYPE_COMPLETION;
// don't update the cache if the slot's context is empty
update_cache = update_cache && tokens.size() > 0;
if (update_cache) {
SRV_WRN("%s", "updating prompt cache\n");
const int64_t t_start = ggml_time_us();
ret->prompt_save(*prompt_cache);
// don't save the slot's state if its context is empty
if (tokens.size() > 0) {
ret->prompt_save(*prompt_cache);
}
if (!ret->prompt_load(*prompt_cache, task.tokens)) {
ret->prompt_clear(false);
@@ -1692,9 +1716,7 @@ private:
const int id_slot = task.id_slot;
const int id_task = task.id;
server_slot * slot = id_slot != -1
? get_slot_by_id(id_slot)
: get_available_slot(task);
server_slot * slot = id_slot != -1 ? get_slot_by_id(id_slot) : get_available_slot(task);
//
// slot scheduling logic
@@ -1731,6 +1753,14 @@ private:
SRV_ERR("failed to launch slot with task, id_task = %d\n", id_task);
break; // drop the task
}
if (params_base.clear_idle) {
for (auto & s : slots) {
if (!s.is_processing()) {
slot_save_and_clear(s);
}
}
}
} break;
case SERVER_TASK_TYPE_CANCEL:
{