server: add auto-sleep after N seconds of idle (#18228)

* implement sleeping at queue level

* implement server-context suspend

* add test

* add docs

* optimization: add fast path

* make sure to free llama_init

* nits

* fix use-after-free

* allow /models to be accessed during sleeping, fix use-after-free

* don't allow accessing /models during sleep, as it is not thread-safe

* fix data race on accessing props and model_meta

* small clean up

* trailing whitespace

* rm outdated comments
This commit is contained in:
Xuan-Son Nguyen 2025-12-21 02:24:42 +01:00 committed by GitHub
parent 52ab19df63
commit ddcb75dd8a
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
12 changed files with 355 additions and 122 deletions

View file

@@ -252,7 +252,6 @@ int main(int argc, char ** argv, char ** envp) {
return 1;
}
ctx_server.init();
ctx_http.is_ready.store(true);
LOG_INF("%s: model loaded\n", __func__);
@@ -309,7 +308,11 @@ int main(int argc, char ** argv, char ** envp) {
if (monitor_thread.joinable()) {
monitor_thread.join();
}
llama_memory_breakdown_print(ctx_server.get_llama_context());
auto * ll_ctx = ctx_server.get_llama_context();
if (ll_ctx != nullptr) {
llama_memory_breakdown_print(ll_ctx);
}
}
return 0;