server: save and clear idle slots on new task (--clear-idle) (#20993)
* server: clear idle slots KV from VRAM (LLAMA_KV_KEEP_ONLY_ACTIVE)
* server: move idle slot KV clearing to slot release
  The save "cost" is now paid by the finishing request.
* server: add --kv-clear-idle flag, enable by default
* server: skip clearing last idle slot, clear on launch
* server: test --no-kv-clear-idle flag
* server: simplify on-release clearing loop
* server: remove on-release KV clearing, keep launch-only
* cont : clean-up
* tests: update log strings after --clear-idle rename
* tests: use debug tags instead of log message matching
* test: fix Windows CI by dropping temp log file unlink
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
parent
f1f793ad06
commit
50e0ad08fb
9 changed files with 187 additions and 18 deletions
|
|
@ -605,6 +605,17 @@ private:
|
|||
llama_batch_free(batch);
|
||||
}
|
||||
|
||||
// Save a slot's prompt into the prompt cache and then clear the slot.
//
// Does nothing when the slot's prompt holds no tokens. Otherwise it logs the
// operation, calls slot.prompt_save() with the shared prompt cache, clears the
// slot's prompt via slot.prompt_clear(false), and finally calls
// prompt_cache->update().
//
// NOTE(review): prompt_cache is dereferenced without a null check - presumably
// callers only reach this when a prompt cache is configured; confirm.
void slot_save_and_clear(server_slot & slot) {
|
||||
// nothing to save for a slot whose prompt is empty
if (slot.prompt.n_tokens() == 0) {
|
||||
return;
|
||||
}
|
||||
SLT_INF(slot, "%s", "saving idle slot to prompt cache\n");
|
||||
// fixed debug tag; presumably matched by the test suite instead of the
// human-readable log message above - keep the string unchanged
SLT_DBG(slot, "%s", "__TEST_TAG_CLEAR_IDLE_SLOT__\n");
|
||||
// save first, then clear
slot.prompt_save(*prompt_cache);
|
||||
// NOTE(review): the meaning of the `false` argument is defined by
// server_slot::prompt_clear, which is not visible here
slot.prompt_clear(false);
|
||||
prompt_cache->update();
|
||||
}
|
||||
|
||||
void handle_sleeping_state(bool new_state) {
|
||||
GGML_ASSERT(sleeping != new_state);
|
||||
if (new_state) {
|
||||
|
|
@ -864,6 +875,19 @@ private:
|
|||
|
||||
metrics.init();
|
||||
|
||||
if (params_base.clear_idle) {
|
||||
if (!params_base.kv_unified) {
|
||||
SRV_WRN("%s: --clear-idle requires --kv-unified, disabling\n", __func__);
|
||||
params_base.clear_idle = false;
|
||||
} else if (params_base.cache_ram_mib == 0) {
|
||||
SRV_WRN("%s: --clear-idle requires --cache-ram, disabling\n", __func__);
|
||||
params_base.clear_idle = false;
|
||||
} else {
|
||||
SRV_INF("%s: idle slots will be saved to prompt cache and cleared upon starting a new task\n", __func__);
|
||||
SRV_DBG("%s", "__TEST_TAG_CLEAR_IDLE_ENABLED__\n");
|
||||
}
|
||||
}
|
||||
|
||||
// populate webui settings
|
||||
{
|
||||
if (!params_base.webui_config_json.empty()) {
|
||||
|
|
@ -1010,15 +1034,15 @@ private:
|
|||
// cache prompts only for completion tasks
|
||||
update_cache = update_cache && task.type == SERVER_TASK_TYPE_COMPLETION;
|
||||
|
||||
// don't update the cache if the slot's context is empty
|
||||
update_cache = update_cache && tokens.size() > 0;
|
||||
|
||||
if (update_cache) {
|
||||
SRV_WRN("%s", "updating prompt cache\n");
|
||||
|
||||
const int64_t t_start = ggml_time_us();
|
||||
|
||||
ret->prompt_save(*prompt_cache);
|
||||
// don't save the slot's state if its context is empty
|
||||
if (tokens.size() > 0) {
|
||||
ret->prompt_save(*prompt_cache);
|
||||
}
|
||||
|
||||
if (!ret->prompt_load(*prompt_cache, task.tokens)) {
|
||||
ret->prompt_clear(false);
|
||||
|
|
@ -1692,9 +1716,7 @@ private:
|
|||
const int id_slot = task.id_slot;
|
||||
const int id_task = task.id;
|
||||
|
||||
server_slot * slot = id_slot != -1
|
||||
? get_slot_by_id(id_slot)
|
||||
: get_available_slot(task);
|
||||
server_slot * slot = id_slot != -1 ? get_slot_by_id(id_slot) : get_available_slot(task);
|
||||
|
||||
//
|
||||
// slot scheduling logic
|
||||
|
|
@ -1731,6 +1753,14 @@ private:
|
|||
SRV_ERR("failed to launch slot with task, id_task = %d\n", id_task);
|
||||
break; // drop the task
|
||||
}
|
||||
|
||||
if (params_base.clear_idle) {
|
||||
for (auto & s : slots) {
|
||||
if (!s.is_processing()) {
|
||||
slot_save_and_clear(s);
|
||||
}
|
||||
}
|
||||
}
|
||||
} break;
|
||||
case SERVER_TASK_TYPE_CANCEL:
|
||||
{
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue