llama-cpp-turboquant/tools/server/tests/unit/test_kv_keep_only_active.py
Yes You Can Have Your Own 50e0ad08fb
server: save and clear idle slots on new task (--clear-idle) (#20993)
* server: clear idle slots KV from VRAM (LLAMA_KV_KEEP_ONLY_ACTIVE)

* server: move idle slot KV clearing to slot release

The save "cost" is now paid by the finishing request.

* server: add --kv-clear-idle flag, enable by default

* server: skip clearing last idle slot, clear on launch

* server: test --no-kv-clear-idle flag

* server: simplify on-release clearing loop

* server: remove on-release KV clearing, keep launch-only

* cont : clean-up

* tests: update log strings after --clear-idle rename

* tests: use debug tags instead of log message matching

* test: fix Windows CI by dropping temp log file unlink

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2026-04-03 19:02:27 +02:00

115 lines
3.4 KiB
Python

import os
import tempfile
import pytest
from utils import *
# Module-level server handle shared by the tests in this file; the autouse
# fixture create_server() rebinds it to a fresh preset before every test.
server = ServerPreset.tinyllama2()
class LogReader:
    """Incrementally read a log file that keeps growing.

    Each call to :meth:`drain` returns only the text appended since the
    previous call, so a test can inspect what a single step logged.
    """

    def __init__(self, path):
        """Remember the log location and start reading from the beginning.

        Args:
            path: Path of the log file to follow.
        """
        self.path = path
        # Stream position reported by ``tell()`` after the most recent read.
        self.pos = 0

    def drain(self) -> str:
        """Return the text written to the log since the previous call.

        Returns:
            The newly appended text, or an empty string if nothing was added.
        """
        # The file is written outside this class, so decode it explicitly as
        # UTF-8 rather than with the locale default, and substitute U+FFFD for
        # undecodable bytes instead of raising: a stray byte (or a multi-byte
        # character caught mid-write) must not abort the test that is only
        # looking for an ASCII tag.
        with open(self.path, encoding="utf-8", errors="replace") as f:
            f.seek(self.pos)
            content = f.read()
            self.pos = f.tell()
        return content
@pytest.fixture(autouse=True)
def create_server():
    """Rebuild the module-level ``server`` preset before every test.

    The preset is configured but not started; each test calls
    ``server.start()`` itself. A new temporary log file is created per test
    and its path is stored on the preset so the tests can read it back.
    The log file is not removed afterwards.
    """
    global server
    server = ServerPreset.tinyllama2()

    # Two slots, so the tests can address slot 0 and slot 1 separately.
    server.n_slots = 2
    server.kv_unified = True
    server.cache_ram = 100
    server.server_slots = True

    # Short generations at temperature 0 (presumably to keep runs quick and
    # repeatable -- confirm against the server defaults).
    server.temperature = 0.0
    server.n_predict = 4

    # Debug output is switched on; the tests look for __TEST_TAG_* markers
    # in the log.
    server.debug = True

    # Only the path is kept on the preset, so the descriptor returned by
    # mkstemp is closed straight away.
    log_fd, log_file = tempfile.mkstemp(suffix='.log')
    server.log_path = log_file
    os.close(log_fd)

    yield
# Multi-sentence prompt shared by both tests in this file. It is sent to
# slot 0 and later re-sent (and extended) to check prompt-cache reuse.
# NOTE(review): presumably it is this long so that cache_n / prompt_n
# differences are clearly measurable -- confirm.
LONG_PROMPT = (
"Once upon a time in a land far away, there lived a brave knight "
"who traveled across mountains and rivers to find the legendary "
"golden sword hidden deep within the enchanted forest of whispers. "
"He met many creatures along the way including dragons and fairies "
"and wizards who helped him on his noble quest to save the kingdom."
)
def test_clear_and_restore():
    """Idle slot cleared on launch should restore from cache-ram.

    Sequence exercised:
      1. Start the server and check the "clear idle enabled" tag was logged.
      2. Run LONG_PROMPT on slot 0; no slot-clearing tag may appear.
      3. Run a different prompt on slot 1; the slot-clearing tag must appear.
      4. Re-send LONG_PROMPT without a slot id; the log must mention the
         prompt cache, and the timings must show cached tokens and fewer
         processed prompt tokens than in step 2.
      5. Send LONG_PROMPT with an extra sentence; no slot-clearing tag may
         appear.
    """
    global server
    server.start()
    tail = LogReader(server.log_path)

    # verify feature is enabled
    startup_output = tail.drain()
    assert "__TEST_TAG_CLEAR_IDLE_ENABLED__" in startup_output

    long_on_slot0 = {
        "prompt": LONG_PROMPT,
        "id_slot": 0,
        "cache_prompt": True,
    }
    first = server.make_request("POST", "/completion", data=long_on_slot0)
    assert first.status_code == 200
    original_prompt_n = first.body["timings"]["prompt_n"]

    # Slot 0 is the only slot with KV -- should NOT be cleared
    after_first = tail.drain()
    assert "__TEST_TAG_CLEAR_IDLE_SLOT__" not in after_first

    # Launching slot 1 clears idle slot 0
    short_on_slot1 = {
        "prompt": "The quick brown fox",
        "id_slot": 1,
        "cache_prompt": True,
    }
    second = server.make_request("POST", "/completion", data=short_on_slot1)
    assert second.status_code == 200
    after_second = tail.drain()
    assert "__TEST_TAG_CLEAR_IDLE_SLOT__" in after_second

    # Re-send same prompt -- should restore from cache-ram
    long_any_slot = {
        "prompt": LONG_PROMPT,
        "cache_prompt": True,
    }
    restored = server.make_request("POST", "/completion", data=long_any_slot)
    assert restored.status_code == 200
    after_restore = tail.drain()
    assert "updating prompt cache" in after_restore
    restored_timings = restored.body["timings"]
    assert restored_timings["cache_n"] > 0
    assert restored_timings["prompt_n"] < original_prompt_n

    # Follow-up -- slot 0 kept its KV, no clearing needed
    extended = {
        "prompt": LONG_PROMPT + " The knight finally reached the castle gates.",
        "cache_prompt": True,
    }
    follow_up = server.make_request("POST", "/completion", data=extended)
    assert follow_up.status_code == 200
    after_follow_up = tail.drain()
    assert "__TEST_TAG_CLEAR_IDLE_SLOT__" not in after_follow_up
def test_disabled_with_flag():
    """With ``no_clear_idle`` set, no idle-slot clearing is logged.

    The preset's ``no_clear_idle`` attribute is set before start
    (presumably translated into the server's disabling command-line flag --
    confirm in utils). The test then checks that neither the "enabled" tag
    nor the slot-clearing tag shows up while two slots are used in turn.
    """
    global server
    server.no_clear_idle = True
    server.start()
    tail = LogReader(server.log_path)

    # Feature should not be enabled
    startup_output = tail.drain()
    assert "__TEST_TAG_CLEAR_IDLE_ENABLED__" not in startup_output

    long_on_slot0 = {
        "prompt": LONG_PROMPT,
        "id_slot": 0,
        "cache_prompt": True,
    }
    first = server.make_request("POST", "/completion", data=long_on_slot0)
    assert first.status_code == 200

    # Request on different slot -- should NOT trigger clearing
    short_on_slot1 = {
        "prompt": "The quick brown fox",
        "id_slot": 1,
        "cache_prompt": True,
    }
    second = server.make_request("POST", "/completion", data=short_on_slot1)
    assert second.status_code == 200
    logged_since_start = tail.drain()
    assert "__TEST_TAG_CLEAR_IDLE_SLOT__" not in logged_since_start