server: add auto-sleep after N seconds of idle (#18228)
* implement sleeping at queue level * implement server-context suspend * add test * add docs * optimization: add fast path * make sure to free llama_init * nits * fix use-after-free * allow /models to be accessed during sleeping, fix use-after-free * don't allow accessing /models during sleep, it is not thread-safe * fix data race on accessing props and model_meta * small clean up * trailing whitespace * rm outdated comments
This commit is contained in:
parent
52ab19df63
commit
ddcb75dd8a
12 changed files with 355 additions and 122 deletions
|
|
@ -33,6 +33,7 @@ int server_queue::post(server_task && task, bool front) {
|
|||
} else {
|
||||
queue_tasks.push_back(std::move(task));
|
||||
}
|
||||
time_last_task = ggml_time_ms();
|
||||
condition_tasks.notify_one();
|
||||
return task_id;
|
||||
}
|
||||
|
|
@ -54,6 +55,7 @@ int server_queue::post(std::vector<server_task> && tasks, bool front) {
|
|||
queue_tasks.push_back(std::move(task));
|
||||
}
|
||||
}
|
||||
time_last_task = ggml_time_ms();
|
||||
condition_tasks.notify_one();
|
||||
return 0;
|
||||
}
|
||||
|
|
@ -62,6 +64,7 @@ void server_queue::defer(server_task && task) {
|
|||
std::unique_lock<std::mutex> lock(mutex_tasks);
|
||||
QUE_DBG("defer task, id = %d\n", task.id);
|
||||
queue_tasks_deferred.push_back(std::move(task));
|
||||
time_last_task = ggml_time_ms();
|
||||
condition_tasks.notify_one();
|
||||
}
|
||||
|
||||
|
|
@ -71,31 +74,52 @@ int server_queue::get_new_id() {
|
|||
return new_id;
|
||||
}
|
||||
|
||||
void server_queue::on_new_task(std::function<void(server_task &&)> callback) {
|
||||
callback_new_task = std::move(callback);
|
||||
}
|
||||
|
||||
void server_queue::on_update_slots(std::function<void(void)> callback) {
|
||||
callback_update_slots = std::move(callback);
|
||||
}
|
||||
|
||||
void server_queue::pop_deferred_task() {
|
||||
std::unique_lock<std::mutex> lock(mutex_tasks);
|
||||
if (!queue_tasks_deferred.empty()) {
|
||||
queue_tasks.emplace_front(std::move(queue_tasks_deferred.front()));
|
||||
queue_tasks_deferred.pop_front();
|
||||
}
|
||||
time_last_task = ggml_time_ms();
|
||||
condition_tasks.notify_one();
|
||||
}
|
||||
|
||||
void server_queue::wait_until_no_sleep() {
|
||||
std::unique_lock<std::mutex> lock(mutex_tasks);
|
||||
if (!sleeping) {
|
||||
return;
|
||||
} else {
|
||||
if (!req_stop_sleeping) {
|
||||
QUE_DBG("%s", "requesting to stop sleeping\n");
|
||||
req_stop_sleeping = true;
|
||||
condition_tasks.notify_one(); // only main thread is waiting on this
|
||||
}
|
||||
QUE_DBG("%s", "waiting until no sleep\n");
|
||||
condition_tasks.wait(lock, [&]{
|
||||
return !sleeping;
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
void server_queue::terminate() {
|
||||
std::unique_lock<std::mutex> lock(mutex_tasks);
|
||||
running = false;
|
||||
condition_tasks.notify_all();
|
||||
}
|
||||
|
||||
void server_queue::start_loop() {
|
||||
void server_queue::start_loop(int64_t idle_sleep_ms) {
|
||||
running = true;
|
||||
time_last_task = ggml_time_ms();
|
||||
|
||||
constexpr auto max_wait_time = std::chrono::seconds(1);
|
||||
auto should_sleep = [&]() -> bool {
|
||||
// caller must hold mutex_tasks
|
||||
if (idle_sleep_ms < 0) {
|
||||
return false;
|
||||
}
|
||||
int64_t now = ggml_time_ms();
|
||||
return (now - time_last_task) >= idle_sleep_ms;
|
||||
};
|
||||
|
||||
while (true) {
|
||||
QUE_DBG("%s", "processing new tasks\n");
|
||||
|
|
@ -117,23 +141,53 @@ void server_queue::start_loop() {
|
|||
QUE_DBG("processing task, id = %d\n", task.id);
|
||||
callback_new_task(std::move(task));
|
||||
}
|
||||
|
||||
// all tasks in the current loop is processed, slots data is now ready
|
||||
QUE_DBG("%s", "update slots\n");
|
||||
|
||||
// this will run the main inference process for all slots
|
||||
callback_update_slots();
|
||||
{
|
||||
// update_slots() may take a while to finish, we need to make sure it's not counted as idle
|
||||
std::unique_lock<std::mutex> lock(mutex_tasks);
|
||||
time_last_task = ggml_time_ms();
|
||||
}
|
||||
|
||||
QUE_DBG("%s", "waiting for new tasks\n");
|
||||
{
|
||||
while (true) {
|
||||
std::unique_lock<std::mutex> lock(mutex_tasks);
|
||||
if (!running) {
|
||||
QUE_DBG("%s", "terminate\n");
|
||||
return;
|
||||
if (!running || !queue_tasks.empty()) {
|
||||
break; // go back to process new tasks or terminate
|
||||
}
|
||||
if (queue_tasks.empty()) {
|
||||
|
||||
// no tasks, check for sleeping state
|
||||
if (should_sleep()) {
|
||||
QUE_INF("%s", "entering sleeping state\n");
|
||||
sleeping = true;
|
||||
callback_sleeping_state(true);
|
||||
req_stop_sleeping = false;
|
||||
// wait until we are requested to exit sleeping state
|
||||
condition_tasks.wait(lock, [&]{
|
||||
return (!running || req_stop_sleeping);
|
||||
});
|
||||
if (!running) { // may changed during sleep
|
||||
break; // terminate
|
||||
}
|
||||
QUE_INF("%s", "exiting sleeping state\n");
|
||||
req_stop_sleeping = false;
|
||||
callback_sleeping_state(false);
|
||||
sleeping = false;
|
||||
time_last_task = ggml_time_ms();
|
||||
condition_tasks.notify_all(); // notify wait_until_no_sleep()
|
||||
break; // process new tasks
|
||||
} else {
|
||||
// wait for new tasks or timeout for checking sleeping condition
|
||||
bool res = condition_tasks.wait_for(lock, max_wait_time, [&]{
|
||||
return (!queue_tasks.empty() || !running);
|
||||
});
|
||||
if (res) {
|
||||
break; // new task arrived or terminate
|
||||
}
|
||||
// otherwise, loop again to check sleeping condition
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue