server: add auto-sleep after N seconds of idle (#18228)
* implement sleeping at queue level
* implement server-context suspend
* add test
* add docs
* optimization: add fast path
* make sure to free llama_init
* nits
* fix use-after-free
* allow /models to be accessed during sleeping, fix use-after-free
* don't allow accessing /models during sleep, it is not thread-safe
* fix data race on accessing props and model_meta
* small clean up
* trailing whitespace
* rm outdated comments
This commit is contained in:
parent
52ab19df63
commit
ddcb75dd8a
12 changed files with 355 additions and 122 deletions
|
|
@@ -252,7 +252,6 @@ int main(int argc, char ** argv, char ** envp) {
|
|||
return 1;
|
||||
}
|
||||
|
||||
ctx_server.init();
|
||||
ctx_http.is_ready.store(true);
|
||||
|
||||
LOG_INF("%s: model loaded\n", __func__);
|
||||
|
|
@@ -309,7 +308,11 @@ int main(int argc, char ** argv, char ** envp) {
|
|||
if (monitor_thread.joinable()) {
|
||||
monitor_thread.join();
|
||||
}
|
||||
llama_memory_breakdown_print(ctx_server.get_llama_context());
|
||||
|
||||
auto * ll_ctx = ctx_server.get_llama_context();
|
||||
if (ll_ctx != nullptr) {
|
||||
llama_memory_breakdown_print(ll_ctx);
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue