llama-cpp-turboquant/tools/server/server-context.h
Xuan-Son Nguyen ddcb75dd8a
server: add auto-sleep after N seconds of idle (#18228)
* implement sleeping at queue level

* implement server-context suspend

* add test

* add docs

* optimization: add fast path

* make sure to free llama_init

* nits

* fix use-after-free

* allow /models to be accessed during sleeping, fix use-after-free

* don't allow accessing /models during sleep, it is not thread-safe

* fix data race on accessing props and model_meta

* small clean up

* trailing whitespace

* rm outdated comments
2025-12-21 02:24:42 +01:00

90 lines
3.2 KiB
C++

#include "server-http.h"
#include "server-task.h"
#include "server-queue.h"
#include <nlohmann/json_fwd.hpp>
#include <cstddef>
#include <memory>
struct server_context_impl; // private implementation
struct server_context_info {
std::string build_info;
std::string model_name;
bool has_inp_image;
bool has_inp_audio;
};
struct server_context {
std::unique_ptr<server_context_impl> impl;
server_context();
~server_context();
// load the model and initialize llama_context
// returns true on success
bool load_model(const common_params & params);
// this function will block main thread until termination
void start_loop();
// terminate main loop (will unblock start_loop)
void terminate();
// get the underlaying llama_context, can return nullptr if sleeping
llama_context * get_llama_context() const;
// get a new response reader, used by CLI application
server_response_reader get_response_reader();
// get server info
// used by CLI application
server_context_info get_info() const;
};
// forward declarations
struct server_res_generator;
struct server_routes {
server_routes(const common_params & params, server_context & ctx_server, std::function<bool()> is_ready = []() { return true; })
: params(params), ctx_server(*ctx_server.impl), is_ready(is_ready) {
init_routes();
}
void init_routes();
// handlers using lambda function, so that they can capture `this` without `std::bind`
server_http_context::handler_t get_health;
server_http_context::handler_t get_metrics;
server_http_context::handler_t get_slots;
server_http_context::handler_t post_slots;
server_http_context::handler_t get_props;
server_http_context::handler_t post_props;
server_http_context::handler_t get_api_show;
server_http_context::handler_t post_infill;
server_http_context::handler_t post_completions;
server_http_context::handler_t post_completions_oai;
server_http_context::handler_t post_chat_completions;
server_http_context::handler_t post_anthropic_messages;
server_http_context::handler_t post_anthropic_count_tokens;
server_http_context::handler_t post_apply_template;
server_http_context::handler_t get_models;
server_http_context::handler_t post_tokenize;
server_http_context::handler_t post_detokenize;
server_http_context::handler_t post_embeddings;
server_http_context::handler_t post_embeddings_oai;
server_http_context::handler_t post_rerank;
server_http_context::handler_t get_lora_adapters;
server_http_context::handler_t post_lora_adapters;
private:
// TODO: move these outside of server_routes?
std::unique_ptr<server_res_generator> handle_slots_save(const server_http_req & req, int id_slot);
std::unique_ptr<server_res_generator> handle_slots_restore(const server_http_req & req, int id_slot);
std::unique_ptr<server_res_generator> handle_slots_erase(const server_http_req &, int id_slot);
std::unique_ptr<server_res_generator> handle_embeddings_impl(const server_http_req & req, task_response_type res_type);
const common_params & params;
server_context_impl & ctx_server;
std::function<bool()> is_ready;
};