server: save and clear idle slots on new task (--clear-idle) (#20993)

* server: clear idle slots KV from VRAM (LLAMA_KV_KEEP_ONLY_ACTIVE)

* server: move idle slot KV clearing to slot release

The save "cost" is now paid by the finishing request.

* server: add --kv-clear-idle flag, enable by default

* server: skip clearing last idle slot, clear on launch

* server: test --no-kv-clear-idle flag

* server: simplify on-release clearing loop

* server: remove on-release KV clearing, keep launch-only

* cont : clean-up

* tests: update log strings after --clear-idle rename

* tests: use debug tags instead of log message matching

* test: fix Windows CI by dropping temp log file unlink

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
Yes You Can Have Your Own 2026-04-03 20:02:27 +03:00 committed by GitHub
parent f1f793ad06
commit 50e0ad08fb
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
9 changed files with 187 additions and 18 deletions

View file

@@ -605,6 +605,17 @@ private:
llama_batch_free(batch);
}
// Persist an idle slot's prompt/KV state to the prompt cache, then release
// its KV cells so the memory can be reused by active slots.
// No-op for slots whose context is empty.
void slot_save_and_clear(server_slot & slot) {
    if (slot.prompt.n_tokens() > 0) {
        SLT_INF(slot, "%s", "saving idle slot to prompt cache\n");
        SLT_DBG(slot, "%s", "__TEST_TAG_CLEAR_IDLE_SLOT__\n");

        // save first, then drop the in-memory state and refresh the cache
        slot.prompt_save(*prompt_cache);
        slot.prompt_clear(false);

        prompt_cache->update();
    }
}
void handle_sleeping_state(bool new_state) {
GGML_ASSERT(sleeping != new_state);
if (new_state) {
@@ -864,6 +875,19 @@ private:
metrics.init();
if (params_base.clear_idle) {
if (!params_base.kv_unified) {
SRV_WRN("%s: --clear-idle requires --kv-unified, disabling\n", __func__);
params_base.clear_idle = false;
} else if (params_base.cache_ram_mib == 0) {
SRV_WRN("%s: --clear-idle requires --cache-ram, disabling\n", __func__);
params_base.clear_idle = false;
} else {
SRV_INF("%s: idle slots will be saved to prompt cache and cleared upon starting a new task\n", __func__);
SRV_DBG("%s", "__TEST_TAG_CLEAR_IDLE_ENABLED__\n");
}
}
// populate webui settings
{
if (!params_base.webui_config_json.empty()) {
@@ -1010,15 +1034,15 @@ private:
// cache prompts only for completion tasks
update_cache = update_cache && task.type == SERVER_TASK_TYPE_COMPLETION;
// don't update the cache if the slot's context is empty
update_cache = update_cache && tokens.size() > 0;
if (update_cache) {
SRV_WRN("%s", "updating prompt cache\n");
const int64_t t_start = ggml_time_us();
ret->prompt_save(*prompt_cache);
// don't save the slot's state if its context is empty
if (tokens.size() > 0) {
ret->prompt_save(*prompt_cache);
}
if (!ret->prompt_load(*prompt_cache, task.tokens)) {
ret->prompt_clear(false);
@@ -1692,9 +1716,7 @@ private:
const int id_slot = task.id_slot;
const int id_task = task.id;
server_slot * slot = id_slot != -1
? get_slot_by_id(id_slot)
: get_available_slot(task);
server_slot * slot = id_slot != -1 ? get_slot_by_id(id_slot) : get_available_slot(task);
//
// slot scheduling logic
@@ -1731,6 +1753,14 @@ private:
SRV_ERR("failed to launch slot with task, id_task = %d\n", id_task);
break; // drop the task
}
if (params_base.clear_idle) {
for (auto & s : slots) {
if (!s.is_processing()) {
slot_save_and_clear(s);
}
}
}
} break;
case SERVER_TASK_TYPE_CANCEL:
{