llama-fit-params: free memory target per device (#18679)

This commit is contained in:
Johannes Gäßler 2026-01-08 10:07:58 +01:00 committed by GitHub
parent 9a5724dee2
commit 64848deb18
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 83 additions and 39 deletions

View file

@ -2255,7 +2255,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
std::vector<std::string> split_arg{ it, {} };
if (split_arg.size() >= llama_max_devices()) {
throw std::invalid_argument(
string_format("got %d input configs, but system only has %d devices", (int)split_arg.size(), (int)llama_max_devices())
string_format("got %zu input configs, but system only has %zu devices", split_arg.size(), llama_max_devices())
);
}
for (size_t i = 0; i < llama_max_devices(); ++i) {
@ -2295,10 +2295,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_env("LLAMA_ARG_FIT"));
add_opt(common_arg(
{ "-fitt", "--fit-target" }, "MiB",
string_format("target margin per device for --fit option, default: %zu", params.fit_params_target/(1024*1024)),
[](common_params & params, int value) {
params.fit_params_target = value * size_t(1024*1024);
{ "-fitt", "--fit-target" }, "MiB0,MiB1,MiB2,...",
string_format("target margin per device for --fit, comma-separated list of values, "
"single value is broadcast across all devices, default: %zu", params.fit_params_target[0]/(1024*1024)),
[](common_params & params, const std::string & value) {
std::string arg_next = value;
// split string by , and /
const std::regex regex{ R"([,/]+)" };
std::sregex_token_iterator it{ arg_next.begin(), arg_next.end(), regex, -1 };
std::vector<std::string> split_arg{ it, {} };
if (split_arg.size() >= llama_max_devices()) {
throw std::invalid_argument(
string_format("got %zu input configs, but system only has %zu devices", split_arg.size(), llama_max_devices())
);
}
if (split_arg.size() == 1) {
std::fill(params.fit_params_target.begin(), params.fit_params_target.end(), std::stoul(split_arg[0]) * 1024*1024);
return;
}
for (size_t i = 0; i < split_arg.size(); i++) {
params.fit_params_target[i] = std::stoul(split_arg[i]) * 1024*1024;
}
}
).set_env("LLAMA_ARG_FIT_TARGET"));
add_opt(common_arg(

View file

@ -1097,7 +1097,7 @@ common_init_result::common_init_result(common_params & params) :
if (params.fit_params) {
LOG_INF("%s: fitting params to device memory, for bugs during this step try to reproduce them with -fit off, or provide --verbose logs if the bug only occurs with -fit on\n", __func__);
llama_params_fit(params.model.path.c_str(), &mparams, &cparams,
params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target, params.fit_params_min_ctx,
params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target.data(), params.fit_params_min_ctx,
params.verbosity >= 4 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);
}

View file

@ -332,12 +332,14 @@ struct common_params {
// offload params
std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
int32_t n_gpu_layers = -1; // number of layers to store in VRAM, -1 is auto, <= -2 is all
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
bool fit_params = true; // whether to fit unset model/context parameters to free device memory
size_t fit_params_target = 1024 * 1024*1024; // margin per device in bytes for fitting parameters to free memory
int32_t fit_params_min_ctx = 4096; // minimum context size to set when trying to reduce memory use
int32_t n_gpu_layers = -1; // number of layers to store in VRAM, -1 is auto, <= -2 is all
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
bool fit_params = true; // whether to fit unset model/context parameters to free device memory
int32_t fit_params_min_ctx = 4096; // minimum context size to set when trying to reduce memory use
// margin per device in bytes for fitting parameters to free memory:
std::vector<size_t> fit_params_target = std::vector<size_t>(llama_max_devices(), 1024 * 1024*1024);
enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs