server: add auto-sleep after N seconds of idle (#18228)

* implement sleeping at queue level
* implement server-context suspend
* add test
* add docs
* optimization: add fast path
* make sure to free llama_init
* nits
* fix use-after-free
* allow /models to be accessed during sleeping, fix use-after-free
* don't allow accessing /models during sleep, it is not thread-safe
* fix data race on accessing props and model_meta
* small clean up
* trailing whitespace
* rm outdated comments
2025-12-21 02:24:42 +01:00 · 2025-12-21 02:24:42 +01:00 · ddcb75dd8a
commit ddcb75dd8a
parent 52ab19df63
12 changed files with 355 additions and 122 deletions
--- a/tools/server/server-queue.cpp
+++ b/tools/server/server-queue.cpp
@@ -33,6 +33,7 @@ int server_queue::post(server_task && task, bool front) {
    } else {
        queue_tasks.push_back(std::move(task));
    }
+    time_last_task = ggml_time_ms();
    condition_tasks.notify_one();
    return task_id;
 }
@@ -54,6 +55,7 @@ int server_queue::post(std::vector<server_task> && tasks, bool front) {
            queue_tasks.push_back(std::move(task));
        }
    }
+    time_last_task = ggml_time_ms();
    condition_tasks.notify_one();
    return 0;
 }
@@ -62,6 +64,7 @@ void server_queue::defer(server_task && task) {
    std::unique_lock<std::mutex> lock(mutex_tasks);
    QUE_DBG("defer task, id = %d\n", task.id);
    queue_tasks_deferred.push_back(std::move(task));
+    time_last_task = ggml_time_ms();
    condition_tasks.notify_one();
 }

@@ -71,31 +74,52 @@ int server_queue::get_new_id() {
    return new_id;
 }

-void server_queue::on_new_task(std::function<void(server_task &&)> callback) {
-    callback_new_task = std::move(callback);
-}
-
-void server_queue::on_update_slots(std::function<void(void)> callback) {
-    callback_update_slots = std::move(callback);
-}
-
 void server_queue::pop_deferred_task() {
    std::unique_lock<std::mutex> lock(mutex_tasks);
    if (!queue_tasks_deferred.empty()) {
        queue_tasks.emplace_front(std::move(queue_tasks_deferred.front()));
        queue_tasks_deferred.pop_front();
    }
+    time_last_task = ggml_time_ms(); // refresh the idle timestamp that start_loop()'s should_sleep() compares against; note this runs even when no deferred task was moved
    condition_tasks.notify_one();
 }

+void server_queue::wait_until_no_sleep() { // blocks the calling thread while the queue is in the sleeping state; returns immediately otherwise
+    std::unique_lock<std::mutex> lock(mutex_tasks);
+    if (!sleeping) {
+        return; // fast path: not sleeping, nothing to wait for
+    } else {
+        if (!req_stop_sleeping) { // raise the wake-up request only once; later callers just wait below
+            QUE_DBG("%s", "requesting to stop sleeping\n");
+            req_stop_sleeping = true;
+            condition_tasks.notify_one(); // only main thread is waiting on this
+        }
+        QUE_DBG("%s", "waiting until no sleep\n");
+        condition_tasks.wait(lock, [&]{ // released by the notify_all() that start_loop() issues after it clears `sleeping`
+            return !sleeping; // NOTE(review): start_loop() breaks out of its sleep wait on !running without clearing `sleeping` - confirm a waiter here cannot block forever after terminate()
+        });
+    }
+}
+
 void server_queue::terminate() {
    std::unique_lock<std::mutex> lock(mutex_tasks);
    running = false;
    condition_tasks.notify_all();
 }

-void server_queue::start_loop() {
+void server_queue::start_loop(int64_t idle_sleep_ms) {
    running = true;
+    time_last_task = ggml_time_ms();
+
+    constexpr auto max_wait_time = std::chrono::seconds(1);
+    auto should_sleep = [&]() -> bool {
+        // caller must hold mutex_tasks
+        if (idle_sleep_ms < 0) {
+            return false;
+        }
+        int64_t now = ggml_time_ms();
+        return (now - time_last_task) >= idle_sleep_ms;
+    };

    while (true) {
        QUE_DBG("%s", "processing new tasks\n");
@@ -117,23 +141,53 @@ void server_queue::start_loop() {
            QUE_DBG("processing task, id = %d\n", task.id);
            callback_new_task(std::move(task));
        }
-
        // all tasks in the current loop is processed, slots data is now ready
        QUE_DBG("%s", "update slots\n");

+        // this will run the main inference process for all slots
        callback_update_slots();
+        {
+            // update_slots() may take a while to finish, we need to make sure it's not counted as idle
+            std::unique_lock<std::mutex> lock(mutex_tasks);
+            time_last_task = ggml_time_ms();
+        }

        QUE_DBG("%s", "waiting for new tasks\n");
-        {
+        while (true) {
            std::unique_lock<std::mutex> lock(mutex_tasks);
-            if (!running) {
-                QUE_DBG("%s", "terminate\n");
-                return;
+            if (!running || !queue_tasks.empty()) {
+                break; // go back to process new tasks or terminate
            }
-            if (queue_tasks.empty()) {
+
+            // no tasks, check for sleeping state
+            if (should_sleep()) {
+                QUE_INF("%s", "entering sleeping state\n");
+                sleeping = true;
+                callback_sleeping_state(true);
+                req_stop_sleeping = false;
+                // wait until we are requested to exit sleeping state
                condition_tasks.wait(lock, [&]{
+                    return (!running || req_stop_sleeping);
+                });
+                if (!running) { // may changed during sleep
+                    break; // terminate
+                }
+                QUE_INF("%s", "exiting sleeping state\n");
+                req_stop_sleeping = false;
+                callback_sleeping_state(false);
+                sleeping = false;
+                time_last_task = ggml_time_ms();
+                condition_tasks.notify_all(); // notify wait_until_no_sleep()
+                break; // process new tasks
+            } else {
+                // wait for new tasks or timeout for checking sleeping condition
+                bool res = condition_tasks.wait_for(lock, max_wait_time, [&]{
                    return (!queue_tasks.empty() || !running);
                });
+                if (res) {
+                    break; // new task arrived or terminate
+                }
+                // otherwise, loop again to check sleeping condition
            }
        }
    }