llama: use FA + max. GPU layers by default (#15434)
* llama: use max. GPU layers by default, auto -fa * ggml-backend: abort instead of segfault
This commit is contained in:
parent
38ad381f9f
commit
e81b8e4b7f
19 changed files with 235 additions and 72 deletions
|
|
@ -151,12 +151,6 @@ def benchmark(
|
|||
if os.environ.get("LLAMA_ARG_N_PARALLEL") is None:
|
||||
logger.info("LLAMA_ARG_N_PARALLEL not explicitly set, using 32")
|
||||
os.environ["LLAMA_ARG_N_PARALLEL"] = "32"
|
||||
if not external_server and os.environ.get("LLAMA_ARG_N_GPU_LAYERS") is None:
|
||||
logger.info("LLAMA_ARG_N_GPU_LAYERS not explicitly set, using 999")
|
||||
os.environ["LLAMA_ARG_N_GPU_LAYERS"] = "999"
|
||||
if not external_server and os.environ.get("LLAMA_ARG_FLASH_ATTN") is None:
|
||||
logger.info("LLAMA_ARG_FLASH_ATTN not explicitly set, using 'true'")
|
||||
os.environ["LLAMA_ARG_FLASH_ATTN"] = "true"
|
||||
|
||||
parallel: int = int(os.environ.get("LLAMA_ARG_N_PARALLEL")) # type: ignore
|
||||
prompts: Union[None, list[str], list[list[int]]] = get_prompts_text(prompt_source, n_prompts)
|
||||
|
|
|
|||
|
|
@ -323,7 +323,7 @@ def run(
|
|||
server.jinja = True
|
||||
server.ctk = ctk
|
||||
server.ctv = ctv
|
||||
server.fa = fa
|
||||
server.fa = "on" if fa else "off"
|
||||
server.n_predict = n_predict
|
||||
server.model_hf_repo = hf
|
||||
server.model_hf_file = None
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue