arg: clarify auto kvu/np being set on server (#17997)
* arg: clarify auto kvu/np being set on server
* improve docs
* use invalid_argument
This commit is contained in:
parent
a5251ca11d
commit
7b1db3d3b7
6 changed files with 51 additions and 35 deletions
|
|
@ -835,6 +835,19 @@ bool common_arg_utils::is_autoy(const std::string & value) {
|
|||
}
|
||||
|
||||
common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
|
||||
// per-example default params
|
||||
// we define here to make sure it's included in llama-gen-docs
|
||||
if (ex == LLAMA_EXAMPLE_COMPLETION) {
|
||||
params.use_jinja = false; // disable jinja by default
|
||||
|
||||
} else if (ex == LLAMA_EXAMPLE_MTMD) {
|
||||
params.use_jinja = false; // disable jinja by default
|
||||
params.sampling.temp = 0.2; // lower temp by default for better quality
|
||||
|
||||
} else if (ex == LLAMA_EXAMPLE_SERVER) {
|
||||
params.n_parallel = -1; // auto by default
|
||||
}
|
||||
|
||||
params.use_color = tty_can_use_colors();
|
||||
|
||||
// load dynamic backends
|
||||
|
|
@ -1107,7 +1120,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|||
).set_env("LLAMA_ARG_SWA_FULL"));
|
||||
add_opt(common_arg(
|
||||
{"--ctx-checkpoints", "--swa-checkpoints"}, "N",
|
||||
string_format("max number of context checkpoints to create per slot (default: %d)\n"
|
||||
string_format("max number of context checkpoints to create per slot (default: %d)"
|
||||
"[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)", params.n_ctx_checkpoints),
|
||||
[](common_params & params, int value) {
|
||||
params.n_ctx_checkpoints = value;
|
||||
|
|
@ -1115,7 +1128,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|||
).set_env("LLAMA_ARG_CTX_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
|
||||
add_opt(common_arg(
|
||||
{"--cache-ram", "-cram"}, "N",
|
||||
string_format("set the maximum cache size in MiB (default: %d, -1 - no limit, 0 - disable)\n"
|
||||
string_format("set the maximum cache size in MiB (default: %d, -1 - no limit, 0 - disable)"
|
||||
"[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)", params.cache_ram_mib),
|
||||
[](common_params & params, int value) {
|
||||
params.cache_ram_mib = value;
|
||||
|
|
@ -1123,12 +1136,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|||
).set_env("LLAMA_ARG_CACHE_RAM").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
|
||||
add_opt(common_arg(
|
||||
{"--kv-unified", "-kvu"},
|
||||
string_format("use single unified KV buffer for the KV cache of all sequences (default: %s)\n"
|
||||
"[(more info)](https://github.com/ggml-org/llama.cpp/pull/14363)", params.kv_unified ? "true" : "false"),
|
||||
"use single unified KV buffer shared across all sequences (default: enabled if number of slots is auto)",
|
||||
[](common_params & params) {
|
||||
params.kv_unified = true;
|
||||
}
|
||||
).set_env("LLAMA_ARG_KV_UNIFIED"));
|
||||
).set_env("LLAMA_ARG_KV_UNIFIED").set_examples({LLAMA_EXAMPLE_SERVER}));
|
||||
add_opt(common_arg(
|
||||
{"--context-shift"},
|
||||
{"--no-context-shift"},
|
||||
|
|
@ -1888,13 +1900,27 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|||
LOG_WRN("DEPRECATED: --defrag-thold is deprecated and no longer necessary to specify\n");
|
||||
}
|
||||
).set_env("LLAMA_ARG_DEFRAG_THOLD"));
|
||||
add_opt(common_arg(
|
||||
{"-np", "--parallel"}, "N",
|
||||
string_format("number of parallel sequences to decode (default: %d)", params.n_parallel),
|
||||
[](common_params & params, int value) {
|
||||
params.n_parallel = value;
|
||||
}
|
||||
).set_env("LLAMA_ARG_N_PARALLEL"));
|
||||
if (ex == LLAMA_EXAMPLE_SERVER) {
|
||||
// this is to make sure this option appears in the server-specific section of the help message
|
||||
add_opt(common_arg(
|
||||
{"-np", "--parallel"}, "N",
|
||||
string_format("number of server slots (default: %d, -1 = auto)", params.n_parallel),
|
||||
[](common_params & params, int value) {
|
||||
if (value == 0) {
|
||||
throw std::invalid_argument("error: invalid value for n_parallel\n");
|
||||
}
|
||||
params.n_parallel = value;
|
||||
}
|
||||
).set_env("LLAMA_ARG_N_PARALLEL").set_examples({LLAMA_EXAMPLE_SERVER}));
|
||||
} else {
|
||||
add_opt(common_arg(
|
||||
{"-np", "--parallel"}, "N",
|
||||
string_format("number of parallel sequences to decode (default: %d)", params.n_parallel),
|
||||
[](common_params & params, int value) {
|
||||
params.n_parallel = value;
|
||||
}
|
||||
).set_env("LLAMA_ARG_N_PARALLEL"));
|
||||
}
|
||||
add_opt(common_arg(
|
||||
{"-ns", "--sequences"}, "N",
|
||||
string_format("number of sequences to decode (default: %d)", params.n_sequences),
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue