diff --git a/.devops/cann.Dockerfile b/.devops/cann.Dockerfile
index 83182c970..6de22215e 100644
--- a/.devops/cann.Dockerfile
+++ b/.devops/cann.Dockerfile
@@ -13,7 +13,7 @@ ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.3.rc2-${CHIP_TYPE}-openeuler24.03-py3.
FROM ${CANN_BASE_IMAGE} AS build
# -- Install build dependencies --
-RUN yum install -y gcc g++ cmake make git libcurl-devel python3 python3-pip && \
+RUN yum install -y gcc g++ cmake make git openssl-devel python3 python3-pip && \
yum clean all && \
rm -rf /var/cache/yum
@@ -42,6 +42,7 @@ RUN source /usr/local/Ascend/ascend-toolkit/set_env.sh --force \
-DGGML_CANN=ON \
-DCMAKE_BUILD_TYPE=Release \
-DSOC_TYPE=ascend${CHIP_TYPE} \
+ -DUSE_ACL_GRAPH=ON \
. && \
cmake --build build --config Release -j$(nproc)
@@ -107,7 +108,7 @@ ENTRYPOINT ["/app/tools.sh"]
# ENTRYPOINT ["/app/llama-server"]
### Target: light
-# Lightweight image containing only llama-cli
+# Lightweight image containing only llama-cli and llama-completion
# ==============================================================================
FROM base AS light
diff --git a/.devops/cpu.Dockerfile b/.devops/cpu.Dockerfile
index b9e84ab98..c70a2de56 100644
--- a/.devops/cpu.Dockerfile
+++ b/.devops/cpu.Dockerfile
@@ -5,7 +5,7 @@ FROM ubuntu:$UBUNTU_VERSION AS build
ARG TARGETARCH
RUN apt-get update && \
- apt-get install -y build-essential git cmake libcurl4-openssl-dev
+ apt-get install -y build-essential git cmake libssl-dev
WORKDIR /app
diff --git a/.devops/cuda-new.Dockerfile b/.devops/cuda-new.Dockerfile
new file mode 100644
index 000000000..98dc147d7
--- /dev/null
+++ b/.devops/cuda-new.Dockerfile
@@ -0,0 +1,95 @@
+ARG UBUNTU_VERSION=24.04
+# This needs to generally match the container host's environment.
+ARG CUDA_VERSION=13.1.0
+# Target the CUDA build image
+ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
+
+ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
+
+FROM ${BASE_CUDA_DEV_CONTAINER} AS build
+
+# CUDA architecture to build for (defaults to all supported archs)
+ARG CUDA_DOCKER_ARCH=default
+
+RUN apt-get update && \
+ apt-get install -y build-essential cmake python3 python3-pip git libssl-dev libgomp1
+
+WORKDIR /app
+
+COPY . .
+
+RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
+ export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
+ fi && \
+ cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_BUILD_TESTS=OFF ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+ cmake --build build --config Release -j$(nproc)
+
+RUN mkdir -p /app/lib && \
+ find build -name "*.so*" -exec cp -P {} /app/lib \;
+
+RUN mkdir -p /app/full \
+ && cp build/bin/* /app/full \
+ && cp *.py /app/full \
+ && cp -r gguf-py /app/full \
+ && cp -r requirements /app/full \
+ && cp requirements.txt /app/full \
+ && cp .devops/tools.sh /app/full/tools.sh
+
+## Base image
+FROM ${BASE_CUDA_RUN_CONTAINER} AS base
+
+RUN apt-get update \
+ && apt-get install -y libgomp1 curl\
+ && apt autoremove -y \
+ && apt clean -y \
+ && rm -rf /tmp/* /var/tmp/* \
+ && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
+ && find /var/cache -type f -delete
+
+COPY --from=build /app/lib/ /app
+
+### Full
+FROM base AS full
+
+COPY --from=build /app/full /app
+
+WORKDIR /app
+
+RUN apt-get update \
+ && apt-get install -y \
+ git \
+ python3 \
+ python3-pip \
+ python3-wheel \
+ && pip install --break-system-packages --upgrade setuptools \
+ && pip install --break-system-packages -r requirements.txt \
+ && apt autoremove -y \
+ && apt clean -y \
+ && rm -rf /tmp/* /var/tmp/* \
+ && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
+ && find /var/cache -type f -delete
+
+
+ENTRYPOINT ["/app/tools.sh"]
+
+### Light, CLI only
+FROM base AS light
+
+COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
+
+WORKDIR /app
+
+ENTRYPOINT [ "/app/llama-cli" ]
+
+### Server, Server only
+FROM base AS server
+
+ENV LLAMA_ARG_HOST=0.0.0.0
+
+COPY --from=build /app/full/llama-server /app
+
+WORKDIR /app
+
+HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
+
+ENTRYPOINT [ "/app/llama-server" ]
diff --git a/.devops/cuda.Dockerfile b/.devops/cuda.Dockerfile
index fed586315..52f103bc3 100644
--- a/.devops/cuda.Dockerfile
+++ b/.devops/cuda.Dockerfile
@@ -12,7 +12,7 @@ FROM ${BASE_CUDA_DEV_CONTAINER} AS build
ARG CUDA_DOCKER_ARCH=default
RUN apt-get update && \
- apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1
+ apt-get install -y build-essential cmake python3 python3-pip git libssl-dev libgomp1
WORKDIR /app
diff --git a/.devops/intel.Dockerfile b/.devops/intel.Dockerfile
index adebf0822..35ea4ade8 100644
--- a/.devops/intel.Dockerfile
+++ b/.devops/intel.Dockerfile
@@ -6,7 +6,7 @@ FROM intel/deep-learning-essentials:$ONEAPI_VERSION AS build
ARG GGML_SYCL_F16=OFF
RUN apt-get update && \
- apt-get install -y git libcurl4-openssl-dev
+ apt-get install -y git libssl-dev
WORKDIR /app
diff --git a/.devops/llama-cli-cann.Dockerfile b/.devops/llama-cli-cann.Dockerfile
index ef43d78cd..5bbc9ee43 100644
--- a/.devops/llama-cli-cann.Dockerfile
+++ b/.devops/llama-cli-cann.Dockerfile
@@ -6,7 +6,7 @@ WORKDIR /app
COPY . .
-RUN yum install -y gcc g++ cmake make libcurl-devel
+RUN yum install -y gcc g++ cmake make openssl-devel
ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
ENV LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:$LIBRARY_PATH
ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe/op_tiling:${LD_LIBRARY_PATH}
@@ -23,11 +23,12 @@ ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH
RUN echo "Building with static libs" && \
source /usr/local/Ascend/ascend-toolkit/set_env.sh --force && \
cmake -B build -DGGML_NATIVE=OFF -DGGML_CANN=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_TESTS=OFF && \
- cmake --build build --config Release --target llama-cli
+ cmake --build build --config Release --target llama-cli && \
+ cmake --build build --config Release --target llama-completion
# TODO: use image with NNRT
FROM ascendai/cann:$ASCEND_VERSION AS runtime
-COPY --from=build /app/build/bin/llama-cli /llama-cli
+COPY --from=build /app/build/bin/llama-cli /app/build/bin/llama-completion /
ENV LC_ALL=C.utf8
diff --git a/.devops/llama-cpp-cuda.srpm.spec b/.devops/llama-cpp-cuda.srpm.spec
index 3bbf4a4de..4d42a906b 100644
--- a/.devops/llama-cpp-cuda.srpm.spec
+++ b/.devops/llama-cpp-cuda.srpm.spec
@@ -37,6 +37,7 @@ make -j GGML_CUDA=1
%install
mkdir -p %{buildroot}%{_bindir}/
cp -p llama-cli %{buildroot}%{_bindir}/llama-cuda-cli
+cp -p llama-completion %{buildroot}%{_bindir}/llama-cuda-completion
cp -p llama-server %{buildroot}%{_bindir}/llama-cuda-server
cp -p llama-simple %{buildroot}%{_bindir}/llama-cuda-simple
@@ -68,6 +69,7 @@ rm -rf %{_builddir}/*
%files
%{_bindir}/llama-cuda-cli
+%{_bindir}/llama-cuda-completion
%{_bindir}/llama-cuda-server
%{_bindir}/llama-cuda-simple
/usr/lib/systemd/system/llamacuda.service
diff --git a/.devops/llama-cpp.srpm.spec b/.devops/llama-cpp.srpm.spec
index 45902dcf8..0a4f43058 100644
--- a/.devops/llama-cpp.srpm.spec
+++ b/.devops/llama-cpp.srpm.spec
@@ -39,6 +39,7 @@ make -j
%install
mkdir -p %{buildroot}%{_bindir}/
cp -p llama-cli %{buildroot}%{_bindir}/llama-cli
+cp -p llama-completion %{buildroot}%{_bindir}/llama-completion
cp -p llama-server %{buildroot}%{_bindir}/llama-server
cp -p llama-simple %{buildroot}%{_bindir}/llama-simple
@@ -70,6 +71,7 @@ rm -rf %{_builddir}/*
%files
%{_bindir}/llama-cli
+%{_bindir}/llama-completion
%{_bindir}/llama-server
%{_bindir}/llama-simple
/usr/lib/systemd/system/llama.service
diff --git a/.devops/musa.Dockerfile b/.devops/musa.Dockerfile
index 34d6ad9f4..9eb498520 100644
--- a/.devops/musa.Dockerfile
+++ b/.devops/musa.Dockerfile
@@ -18,7 +18,7 @@ RUN apt-get update && \
python3 \
python3-pip \
git \
- libcurl4-openssl-dev \
+ libssl-dev \
libgomp1
WORKDIR /app
diff --git a/.devops/nix/package.nix b/.devops/nix/package.nix
index a13996bd6..79a7270e5 100644
--- a/.devops/nix/package.nix
+++ b/.devops/nix/package.nix
@@ -32,7 +32,6 @@
useMpi ? false,
useRocm ? config.rocmSupport,
rocmGpuTargets ? builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets,
- enableCurl ? true,
useVulkan ? false,
useRpc ? false,
llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake
@@ -160,15 +159,13 @@ effectiveStdenv.mkDerivation (finalAttrs: {
++ optionals useMpi [ mpi ]
++ optionals useRocm rocmBuildInputs
++ optionals useBlas [ blas ]
- ++ optionals useVulkan vulkanBuildInputs
- ++ optionals enableCurl [ curl ];
+ ++ optionals useVulkan vulkanBuildInputs;
cmakeFlags =
[
(cmakeBool "LLAMA_BUILD_SERVER" true)
(cmakeBool "BUILD_SHARED_LIBS" (!enableStatic))
(cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
- (cmakeBool "LLAMA_CURL" enableCurl)
(cmakeBool "GGML_NATIVE" false)
(cmakeBool "GGML_BLAS" useBlas)
(cmakeBool "GGML_CUDA" useCuda)
diff --git a/.devops/rocm.Dockerfile b/.devops/rocm.Dockerfile
index 53c3ed8d8..14936f8e9 100644
--- a/.devops/rocm.Dockerfile
+++ b/.devops/rocm.Dockerfile
@@ -27,7 +27,7 @@ RUN apt-get update \
build-essential \
cmake \
git \
- libcurl4-openssl-dev \
+ libssl-dev \
curl \
libgomp1
diff --git a/.devops/s390x.Dockerfile b/.devops/s390x.Dockerfile
index 1e66f061d..757cd97cd 100644
--- a/.devops/s390x.Dockerfile
+++ b/.devops/s390x.Dockerfile
@@ -11,7 +11,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
apt install -y --no-install-recommends \
git cmake ccache ninja-build \
# WARNING: Do not use libopenblas-openmp-dev. libopenblas-dev is faster.
- libopenblas-dev libcurl4-openssl-dev && \
+ libopenblas-dev libssl-dev && \
rm -rf /var/lib/apt/lists/*
WORKDIR /app
diff --git a/.devops/vulkan.Dockerfile b/.devops/vulkan.Dockerfile
index b37b4f277..9797c5e0f 100644
--- a/.devops/vulkan.Dockerfile
+++ b/.devops/vulkan.Dockerfile
@@ -5,8 +5,8 @@ FROM ubuntu:$UBUNTU_VERSION AS build
# Install build tools
RUN apt update && apt install -y git build-essential cmake wget xz-utils
-# Install cURL and Vulkan SDK dependencies
-RUN apt install -y libcurl4-openssl-dev curl \
+# Install SSL and Vulkan SDK dependencies
+RUN apt install -y libssl-dev curl \
libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libvulkan-dev glslc
# Build it
@@ -33,6 +33,7 @@ FROM ubuntu:$UBUNTU_VERSION AS base
RUN apt-get update \
&& apt-get install -y libgomp1 curl libvulkan1 mesa-vulkan-drivers \
+ libglvnd0 libgl1 libglx0 libegl1 libgles2 \
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
diff --git a/.gemini/settings.json b/.gemini/settings.json
new file mode 100644
index 000000000..68337d390
--- /dev/null
+++ b/.gemini/settings.json
@@ -0,0 +1 @@
+{ "contextFileName": "AGENTS.md" }
diff --git a/.github/ISSUE_TEMPLATE/010-bug-compilation.yml b/.github/ISSUE_TEMPLATE/010-bug-compilation.yml
index feb0d5120..c106f47a2 100644
--- a/.github/ISSUE_TEMPLATE/010-bug-compilation.yml
+++ b/.github/ISSUE_TEMPLATE/010-bug-compilation.yml
@@ -8,7 +8,8 @@ body:
value: >
Thanks for taking the time to fill out this bug report!
This issue template is intended for bug reports where the compilation of llama.cpp fails.
- Before opening an issue, please confirm that the compilation still fails with `-DGGML_CCACHE=OFF`.
+ Before opening an issue, please confirm that the compilation still fails
+ after recreating the CMake build directory and with `-DGGML_CCACHE=OFF`.
If the compilation succeeds with ccache disabled you should be able to permanently fix the issue
by clearing `~/.cache/ccache` (on Linux).
- type: textarea
diff --git a/.github/ISSUE_TEMPLATE/011-bug-results.yml b/.github/ISSUE_TEMPLATE/011-bug-results.yml
index c42a14ff8..31202dfa8 100644
--- a/.github/ISSUE_TEMPLATE/011-bug-results.yml
+++ b/.github/ISSUE_TEMPLATE/011-bug-results.yml
@@ -11,7 +11,7 @@ body:
(i.e. the generated text) are incorrect or llama.cpp crashes during model evaluation.
If you encountered the issue while using an external UI (e.g. ollama),
please reproduce your issue using one of the examples/binaries in this repository.
- The `llama-cli` binary can be used for simple and reproducible model inference.
+ The `llama-completion` binary can be used for simple and reproducible model inference.
- type: textarea
id: version
attributes:
@@ -74,9 +74,12 @@ body:
Please give us a summary of the problem and tell us how to reproduce it.
If you can narrow down the bug to specific hardware, compile flags, or command line arguments,
that information would be very much appreciated by us.
+
+ If possible, please try to reproduce the issue using `llama-completion` with `-fit off`.
+ If you can only reproduce the issue with `-fit on`, please provide logs both with and without `--verbose`.
placeholder: >
- e.g. when I run llama-cli with -ngl 99 I get garbled outputs.
- When I use -ngl 0 it works correctly.
+ e.g. when I run llama-completion with `-fa on` I get garbled outputs for very long prompts.
+ With short prompts or `-fa off` it works correctly.
Here are the exact commands that I used: ...
validations:
required: true
@@ -95,7 +98,18 @@ body:
label: Relevant log output
description: >
Please copy and paste any relevant log output, including the command that you entered and any generated text.
- This will be automatically formatted into code, so no need for backticks.
- render: shell
+ For very long logs (thousands of lines), preferably upload them as files instead.
+ On Linux you can redirect console output into a file by appending ` > llama.log 2>&1` to your command.
+ value: |
+
+ Logs
+
+
+ ```console
+
+ ```
+
+
+
validations:
required: true
diff --git a/.github/ISSUE_TEMPLATE/019-bug-misc.yml b/.github/ISSUE_TEMPLATE/019-bug-misc.yml
index 1904e31fd..8e867e7f6 100644
--- a/.github/ISSUE_TEMPLATE/019-bug-misc.yml
+++ b/.github/ISSUE_TEMPLATE/019-bug-misc.yml
@@ -85,7 +85,19 @@ body:
label: Relevant log output
description: >
If applicable, please copy and paste any relevant log output, including any generated text.
- This will be automatically formatted into code, so no need for backticks.
- render: shell
+ If you are encountering problems specifically with the `llama_params_fit` module, always upload `--verbose` logs as well.
+ For very long logs (thousands of lines), please upload them as files instead.
+ On Linux you can redirect console output into a file by appending ` > llama.log 2>&1` to your command.
+ value: |
+
+ Logs
+
+
+ ```console
+
+ ```
+
+
+
validations:
required: false
diff --git a/.github/actions/windows-setup-curl/action.yml b/.github/actions/windows-setup-curl/action.yml
deleted file mode 100644
index 446f799fa..000000000
--- a/.github/actions/windows-setup-curl/action.yml
+++ /dev/null
@@ -1,30 +0,0 @@
-name: 'Windows - Setup CURL'
-description: 'Composite action, to be reused in other workflow'
-inputs:
- curl_version:
- description: 'CURL version'
- required: false
- default: '8.6.0_6'
- architecture:
- description: 'Architecture of the libcurl to download'
- required: false
- default: 'win64'
-outputs:
- curl_path:
- description: "Path to the downloaded libcurl"
- value: ${{ steps.get_libcurl.outputs.curl_path }}
-
-runs:
- using: "composite"
- steps:
- - name: libCURL
- id: get_libcurl
- shell: powershell
- env:
- CURL_VERSION: ${{ inputs.curl_version }}
- ARCHITECTURE: ${{ inputs.architecture }}
- run: |
- curl.exe -o $env:RUNNER_TEMP/curl.zip -L "https://curl.se/windows/dl-${env:CURL_VERSION}/curl-${env:CURL_VERSION}-${env:ARCHITECTURE}-mingw.zip"
- mkdir $env:RUNNER_TEMP/libcurl
- tar.exe -xvf $env:RUNNER_TEMP/curl.zip --strip-components=1 -C $env:RUNNER_TEMP/libcurl
- echo "curl_path=$env:RUNNER_TEMP/libcurl" >> $env:GITHUB_OUTPUT
diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md
deleted file mode 100644
index ad13c6ea8..000000000
--- a/.github/copilot-instructions.md
+++ /dev/null
@@ -1,262 +0,0 @@
-# Copilot Instructions for llama.cpp
-
-## Repository Overview
-
-llama.cpp is a large-scale C/C++ project for efficient LLM (Large Language Model) inference with minimal setup and dependencies. The project enables running language models on diverse hardware with state-of-the-art performance.
-
-**Key Facts:**
-- **Primary language**: C/C++ with Python utility scripts
-- **Size**: ~200k+ lines of code across 1000+ files
-- **Architecture**: Modular design with main library (`libllama`) and 40+ executable tools/examples
-- **Core dependency**: ggml tensor library (vendored in `ggml/` directory)
-- **Backends supported**: CPU (AVX/NEON/RVV optimized), CUDA, Metal, Vulkan, SYCL, ROCm, MUSA
-- **License**: MIT
-
-## Build Instructions
-
-### Prerequisites
-- CMake 3.14+ (primary build system)
-- C++17 compatible compiler (GCC 13.3+, Clang, MSVC)
-- Optional: ccache for faster compilation
-
-### Basic Build (CPU-only)
-**ALWAYS run these commands in sequence:**
-```bash
-cmake -B build
-cmake --build build --config Release -j $(nproc)
-```
-
-**Build time**: ~10 minutes on 4-core system with ccache enabled, ~25 minutes without ccache.
-
-**Important Notes:**
-- The Makefile is deprecated - always use CMake
-- ccache is automatically detected and used if available
-- Built binaries are placed in `build/bin/`
-- Parallel builds (`-j`) significantly reduce build time
-
-### Backend-Specific Builds
-For CUDA support:
-```bash
-cmake -B build -DGGML_CUDA=ON
-cmake --build build --config Release -j $(nproc)
-```
-
-For Metal (macOS):
-```bash
-cmake -B build -DGGML_METAL=ON
-cmake --build build --config Release -j $(nproc)
-```
-
-**Important Note**: While all backends can be built as long as the correct requirements for that backend are installed, you will not be able to run them without the correct hardware. The only backend that can be run for testing and validation is the CPU backend.
-
-### Debug Builds
-Single-config generators:
-```bash
-cmake -B build -DCMAKE_BUILD_TYPE=Debug
-cmake --build build
-```
-
-Multi-config generators:
-```bash
-cmake -B build -G "Xcode"
-cmake --build build --config Debug
-```
-
-### Common Build Issues
-- **Issue**: Network tests fail in isolated environments
- **Solution**: Expected behavior - core functionality tests will still pass
-
-## Testing
-
-### Running Tests
-```bash
-ctest --test-dir build --output-on-failure -j $(nproc)
-```
-
-**Test suite**: 38 tests covering tokenizers, grammar parsing, sampling, backends, and integration
-**Expected failures**: 2-3 tests may fail if network access is unavailable (they download models)
-**Test time**: ~30 seconds for passing tests
-
-### Server Unit Tests
-Run server-specific unit tests after building the server:
-```bash
-# Build the server first
-cmake --build build --target llama-server
-
-# Navigate to server tests and run
-cd tools/server/tests
-source ../../../.venv/bin/activate
-./tests.sh
-```
-**Server test dependencies**: The `.venv` environment includes the required dependencies for server unit tests (pytest, aiohttp, etc.). Tests can be run individually or with various options as documented in `tools/server/tests/README.md`.
-
-### Test Categories
-- Tokenizer tests: Various model tokenizers (BERT, GPT-2, LLaMA, etc.)
-- Grammar tests: GBNF parsing and validation
-- Backend tests: Core ggml operations across different backends
-- Integration tests: End-to-end workflows
-
-### Manual Testing Commands
-```bash
-# Test basic inference
-./build/bin/llama-cli --version
-
-# Test model loading (requires model file)
-./build/bin/llama-cli -m path/to/model.gguf -p "Hello" -n 10
-```
-
-## Code Quality and Linting
-
-### C++ Code Formatting
-**ALWAYS format C++ code before committing:**
-```bash
-git clang-format
-```
-
-Configuration is in `.clang-format` with these key rules:
-- 4-space indentation
-- 120 column limit
-- Braces on same line for functions
-- Pointer alignment: `void * ptr` (middle)
-- Reference alignment: `int & ref` (middle)
-
-### Python Code
-**ALWAYS activate the Python environment in `.venv` and use tools from that environment:**
-```bash
-# Activate virtual environment
-source .venv/bin/activate
-```
-
-Configuration files:
-- `.flake8`: flake8 settings (max-line-length=125, excludes examples/tools)
-- `pyrightconfig.json`: pyright type checking configuration
-
-### Pre-commit Hooks
-Run before committing:
-```bash
-pre-commit run --all-files
-```
-
-## Continuous Integration
-
-### GitHub Actions Workflows
-Key workflows that run on every PR:
-- `.github/workflows/build.yml`: Multi-platform builds
-- `.github/workflows/server.yml`: Server functionality tests
-- `.github/workflows/python-lint.yml`: Python code quality
-- `.github/workflows/python-type-check.yml`: Python type checking
-
-### Local CI Validation
-**Run full CI locally before submitting PRs:**
-```bash
-mkdir tmp
-
-# CPU-only build
-bash ./ci/run.sh ./tmp/results ./tmp/mnt
-```
-
-**CI Runtime**: 30-60 minutes depending on backend configuration
-
-### Triggering CI
-Add `ggml-ci` to commit message to trigger heavy CI workloads on the custom CI infrastructure.
-
-## Project Layout and Architecture
-
-### Core Directories
-- **`src/`**: Main llama library implementation (`llama.cpp`, `llama-*.cpp`)
-- **`include/`**: Public API headers, primarily `include/llama.h`
-- **`ggml/`**: Core tensor library (submodule with custom GGML framework)
-- **`examples/`**: 30+ example applications and tools
-- **`tools/`**: Additional development and utility tools (server benchmarks, tests)
-- **`tests/`**: Comprehensive test suite with CTest integration
-- **`docs/`**: Detailed documentation (build guides, API docs, etc.)
-- **`scripts/`**: Utility scripts for CI, data processing, and automation
-- **`common/`**: Shared utility code used across examples
-
-### Key Files
-- **`CMakeLists.txt`**: Primary build configuration
-- **`include/llama.h`**: Main C API header (~2000 lines)
-- **`src/llama.cpp`**: Core library implementation (~8000 lines)
-- **`CONTRIBUTING.md`**: Coding guidelines and PR requirements
-- **`.clang-format`**: C++ formatting rules
-- **`.pre-commit-config.yaml`**: Git hook configuration
-
-### Built Executables (in `build/bin/`)
-Primary tools:
-- **`llama-cli`**: Main inference tool
-- **`llama-server`**: OpenAI-compatible HTTP server
-- **`llama-quantize`**: Model quantization utility
-- **`llama-perplexity`**: Model evaluation tool
-- **`llama-bench`**: Performance benchmarking
-- **`llama-convert-llama2c-to-ggml`**: Model conversion utilities
-
-### Configuration Files
-- **CMake**: `CMakeLists.txt`, `cmake/` directory
-- **Linting**: `.clang-format`, `.clang-tidy`, `.flake8`
-- **CI**: `.github/workflows/`, `ci/run.sh`
-- **Git**: `.gitignore` (includes build artifacts, models, cache)
-
-### Dependencies
-- **System**: OpenMP, libcurl (for model downloading)
-- **Optional**: CUDA SDK, Metal framework, Vulkan SDK, Intel oneAPI
-- **Bundled**: httplib, json (header-only libraries in vendored form)
-
-## Common Validation Steps
-
-### After Making Changes
-1. **Format code**: `git clang-format`
-2. **Build**: `cmake --build build --config Release`
-3. **Test**: `ctest --test-dir build --output-on-failure`
-4. **Server tests** (if modifying server): `cd tools/server/tests && source ../../../.venv/bin/activate && ./tests.sh`
-5. **Manual validation**: Test relevant tools in `build/bin/`
-
-### Performance Validation
-```bash
-# Benchmark inference performance
-./build/bin/llama-bench -m model.gguf
-
-# Evaluate model perplexity
-./build/bin/llama-perplexity -m model.gguf -f dataset.txt
-```
-
-### Backend Validation
-```bash
-# Test backend operations
-./build/bin/test-backend-ops
-```
-
-## Environment Setup
-
-### Required Tools
-- CMake 3.14+ (install via system package manager)
-- Modern C++ compiler with C++17 support
-- Git (for submodule management)
-- Python 3.9+ with virtual environment (`.venv` is provided)
-
-### Optional but Recommended
-- ccache: `apt install ccache` or `brew install ccache`
-- clang-format 15+: Usually included with LLVM/Clang installation
-- pre-commit: `pip install pre-commit`
-
-### Backend-Specific Requirements
-- **CUDA**: NVIDIA CUDA Toolkit 11.2+
-- **Metal**: Xcode command line tools (macOS only)
-- **Vulkan**: Vulkan SDK
-- **SYCL**: Intel oneAPI toolkit
-
-## Important Guidelines
-
-### Code Changes
-- **Minimal dependencies**: Avoid adding new external dependencies
-- **Cross-platform compatibility**: Test on Linux, macOS, Windows when possible
-- **Performance focus**: This is a performance-critical inference library
-- **API stability**: Changes to `include/llama.h` require careful consideration
-
-### Git Workflow
-- Always create feature branches from `master`
-- **Never** commit build artifacts (`build/`, `.ccache/`, `*.o`, `*.gguf`)
-- Use descriptive commit messages following project conventions
-
-### Trust These Instructions
-Only search for additional information if these instructions are incomplete or found to be incorrect. This document contains validated build and test procedures that work reliably across different environments.
-
diff --git a/.github/labeler.yml b/.github/labeler.yml
index d8ada150c..08cfd7e0b 100644
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@@ -89,7 +89,10 @@ nix:
embedding:
- changed-files:
- any-glob-to-any-file: examples/embedding/
-
+jinja parser:
+ - changed-files:
+ - any-glob-to-any-file:
+ - common/jinja/**
Ascend NPU:
- changed-files:
- any-glob-to-any-file:
diff --git a/.github/workflows/build-cache.yml b/.github/workflows/build-cache.yml
index 6a22e41c3..3de0be9fa 100644
--- a/.github/workflows/build-cache.yml
+++ b/.github/workflows/build-cache.yml
@@ -16,7 +16,7 @@ jobs:
steps:
- name: Clone
id: checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: Get latest Vulkan SDK version
id: vulkan_sdk_version
@@ -24,7 +24,7 @@ jobs:
echo "VULKAN_SDK_VERSION=$(curl https://vulkan.lunarg.com/sdk/latest/linux.txt)" >> "$GITHUB_ENV"
- name: Setup Cache
- uses: actions/cache@v4
+ uses: actions/cache@v5
id: cache-sdk
with:
path: ./vulkan_sdk
@@ -47,10 +47,10 @@ jobs:
steps:
- name: Clone
id: checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: Setup Cache
- uses: actions/cache@v4
+ uses: actions/cache@v5
id: cache-toolchain
with:
path: ./spacemit_toolchain
@@ -73,10 +73,10 @@ jobs:
steps:
- name: Clone
id: checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: Setup Cache
- uses: actions/cache@v4
+ uses: actions/cache@v5
id: cache-rocm
with:
path: C:\Program Files\AMD\ROCm
diff --git a/.github/workflows/build-cmake-pkg.yml b/.github/workflows/build-cmake-pkg.yml
index fee2ab96b..259efa43c 100644
--- a/.github/workflows/build-cmake-pkg.yml
+++ b/.github/workflows/build-cmake-pkg.yml
@@ -7,7 +7,7 @@ jobs:
linux:
runs-on: ubuntu-24.04
steps:
- - uses: actions/checkout@v4
+ - uses: actions/checkout@v6
with:
fetch-depth: 0
@@ -20,7 +20,7 @@ jobs:
run: |
PREFIX="$(pwd)"/inst
cmake -S . -B build -DCMAKE_PREFIX_PATH="$PREFIX" \
- -DLLAMA_CURL=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_TOOLS=OFF \
+ -DLLAMA_OPENSSL=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_TOOLS=OFF \
-DLLAMA_BUILD_EXAMPLES=OFF -DCMAKE_BUILD_TYPE=Release
cmake --build build --config Release
cmake --install build --prefix "$PREFIX" --config Release
diff --git a/.github/workflows/build-linux-cross.yml b/.github/workflows/build-linux-cross.yml
index c2c6ea12a..8b6ebaf4a 100644
--- a/.github/workflows/build-linux-cross.yml
+++ b/.github/workflows/build-linux-cross.yml
@@ -8,7 +8,7 @@ jobs:
# runs-on: ubuntu-24.04
# steps:
- # - uses: actions/checkout@v4
+ # - uses: actions/checkout@v6
# - name: Setup Riscv
# run: |
# sudo dpkg --add-architecture riscv64
@@ -30,7 +30,7 @@ jobs:
# - name: Build
# run: |
- # cmake -B build -DLLAMA_CURL=OFF \
+ # cmake -B build -DLLAMA_OPENSSL=OFF \
# -DCMAKE_BUILD_TYPE=Release \
# -DGGML_OPENMP=OFF \
# -DLLAMA_BUILD_EXAMPLES=ON \
@@ -52,7 +52,7 @@ jobs:
# runs-on: ubuntu-24.04
# steps:
- # - uses: actions/checkout@v4
+ # - uses: actions/checkout@v6
# - name: Setup Riscv
# run: |
# sudo dpkg --add-architecture riscv64
@@ -76,7 +76,7 @@ jobs:
# - name: Build
# run: |
- # cmake -B build -DLLAMA_CURL=OFF \
+ # cmake -B build -DLLAMA_OPENSSL=OFF \
# -DCMAKE_BUILD_TYPE=Release \
# -DGGML_VULKAN=ON \
# -DGGML_OPENMP=OFF \
@@ -99,7 +99,7 @@ jobs:
# runs-on: ubuntu-24.04
# steps:
- # - uses: actions/checkout@v4
+ # - uses: actions/checkout@v6
# - name: Setup Arm64
# run: |
# sudo dpkg --add-architecture arm64
@@ -122,7 +122,7 @@ jobs:
# - name: Build
# run: |
- # cmake -B build -DLLAMA_CURL=OFF \
+ # cmake -B build -DLLAMA_OPENSSL=OFF \
# -DCMAKE_BUILD_TYPE=Release \
# -DGGML_VULKAN=ON \
# -DGGML_OPENMP=OFF \
@@ -146,7 +146,7 @@ jobs:
container: debian@sha256:653dfb9f86c3782e8369d5f7d29bb8faba1f4bff9025db46e807fa4c22903671
steps:
- - uses: actions/checkout@v4
+ - uses: actions/checkout@v6
- name: Setup LoongArch
run: |
rm -f /etc/apt/sources.list.d/*
@@ -178,7 +178,7 @@ jobs:
- name: Build
run: |
- cmake -B build -DLLAMA_CURL=OFF \
+ cmake -B build -DLLAMA_OPENSSL=OFF \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_OPENMP=OFF \
-DLLAMA_BUILD_EXAMPLES=ON \
@@ -201,7 +201,7 @@ jobs:
container: debian@sha256:653dfb9f86c3782e8369d5f7d29bb8faba1f4bff9025db46e807fa4c22903671
steps:
- - uses: actions/checkout@v4
+ - uses: actions/checkout@v6
- name: Setup LoongArch
run: |
rm -f /etc/apt/sources.list.d/*
@@ -235,7 +235,7 @@ jobs:
- name: Build
run: |
- cmake -B build -DLLAMA_CURL=OFF \
+ cmake -B build -DLLAMA_OPENSSL=OFF \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_VULKAN=ON \
-DGGML_OPENMP=OFF \
@@ -262,10 +262,10 @@ jobs:
SPACEMIT_IME_TOOLCHAIN_VERSION: "1.1.2"
steps:
- - uses: actions/checkout@v4
+ - uses: actions/checkout@v6
- name: Use SpacemiT Toolchain Cache
- uses: actions/cache@v4
+ uses: actions/cache@v5
id: cache-toolchain
with:
path: ./spacemit_toolchain
@@ -281,7 +281,7 @@ jobs:
- name: Build
run: |
export RISCV_ROOT_PATH=${PWD}/spacemit_toolchain
- cmake -B build -DLLAMA_CURL=OFF \
+ cmake -B build -DLLAMA_OPENSSL=OFF \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_OPENMP=OFF \
-DLLAMA_BUILD_EXAMPLES=ON \
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 9fe1401df..551bdd3df 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -20,7 +20,8 @@ on:
'**/*.swift',
'**/*.m',
'**/*.metal',
- '**/*.comp'
+ '**/*.comp',
+ '**/*.glsl'
]
pull_request:
@@ -40,7 +41,8 @@ on:
'**/*.swift',
'**/*.m',
'**/*.metal',
- '**/*.comp'
+ '**/*.comp',
+ '**/*.glsl'
]
concurrency:
@@ -61,13 +63,14 @@ jobs:
steps:
- name: Clone
id: checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: ccache
uses: ggml-org/ccache-action@v1.2.16
with:
key: macOS-latest-cmake-arm64
evict-old-files: 1d
+ save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Build
id: cmake_build
@@ -76,7 +79,6 @@ jobs:
cmake -B build \
-DCMAKE_BUILD_RPATH="@loader_path" \
-DLLAMA_FATAL_WARNINGS=ON \
- -DLLAMA_CURL=OFF \
-DLLAMA_BUILD_BORINGSSL=ON \
-DGGML_METAL_USE_BF16=ON \
-DGGML_METAL_EMBED_LIBRARY=OFF \
@@ -89,7 +91,7 @@ jobs:
id: cmake_test
run: |
cd build
- ctest -L 'main|curl' --verbose --timeout 900
+ ctest -L main --verbose --timeout 900
macOS-latest-cmake-x64:
runs-on: macos-15-intel
@@ -97,13 +99,14 @@ jobs:
steps:
- name: Clone
id: checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: ccache
uses: ggml-org/ccache-action@v1.2.16
with:
key: macOS-latest-cmake-x64
evict-old-files: 1d
+ save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Build
id: cmake_build
@@ -114,7 +117,6 @@ jobs:
cmake -B build \
-DCMAKE_BUILD_RPATH="@loader_path" \
-DLLAMA_FATAL_WARNINGS=ON \
- -DLLAMA_CURL=OFF \
-DLLAMA_BUILD_BORINGSSL=ON \
-DGGML_METAL=OFF \
-DGGML_RPC=ON \
@@ -133,13 +135,14 @@ jobs:
steps:
- name: Clone
id: checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: ccache
uses: ggml-org/ccache-action@v1.2.16
with:
key: macOS-latest-cmake-arm64-webgpu
evict-old-files: 1d
+ save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Dawn Dependency
id: dawn-depends
@@ -147,13 +150,13 @@ jobs:
DAWN_VERSION="v2.0.0"
DAWN_OWNER="reeselevine"
DAWN_REPO="dawn"
- DAWN_ASSET_NAME="Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-macos-latest-Release.zip"
- echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}"
+ DAWN_ASSET_NAME="Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-macos-latest-Release"
+ echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
curl -L -o artifact.zip \
- "https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}"
+ "https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
mkdir dawn
unzip artifact.zip
- tar -xvf Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-macos-latest-Release.tar.gz -C dawn --strip-components=1
+ tar -xvf ${DAWN_ASSET_NAME}.tar.gz -C dawn --strip-components=1
- name: Build
id: cmake_build
@@ -186,13 +189,14 @@ jobs:
steps:
- name: Clone
id: checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: ccache
uses: ggml-org/ccache-action@v1.2.16
with:
key: ubuntu-cpu-cmake-${{ matrix.build }}
evict-old-files: 1d
+ save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Build Dependencies
id: build_depends
@@ -221,8 +225,6 @@ jobs:
id: cmake_build
run: |
cmake -B build \
- -DLLAMA_CURL=OFF \
- -DLLAMA_OPENSSL=ON \
-DLLAMA_FATAL_WARNINGS=ON \
-DGGML_RPC=ON
cmake --build build --config Release -j $(nproc)
@@ -231,7 +233,7 @@ jobs:
id: cmake_test
run: |
cd build
- ctest -L 'main|curl' --verbose --timeout 900
+ ctest -L main --verbose --timeout 900
- name: Test llama2c conversion
id: llama2c_test
@@ -267,13 +269,14 @@ jobs:
steps:
- name: Clone
id: checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: ccache
uses: ggml-org/ccache-action@v1.2.16
with:
key: ubuntu-latest-cmake-sanitizer-${{ matrix.sanitizer }}
evict-old-files: 1d
+ save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Dependencies
id: depends
@@ -286,8 +289,6 @@ jobs:
if: ${{ matrix.sanitizer != 'THREAD' }}
run: |
cmake -B build \
- -DLLAMA_CURL=OFF \
- -DLLAMA_OPENSSL=ON \
-DLLAMA_FATAL_WARNINGS=ON \
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
@@ -298,8 +299,6 @@ jobs:
if: ${{ matrix.sanitizer == 'THREAD' }}
run: |
cmake -B build \
- -DLLAMA_CURL=OFF \
- -DLLAMA_OPENSSL=ON \
-DLLAMA_FATAL_WARNINGS=ON \
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
@@ -318,7 +317,7 @@ jobs:
steps:
- name: Clone
id: checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: Dependencies
id: depends
@@ -329,14 +328,10 @@ jobs:
- name: Build
id: cmake_build
run: |
- mkdir build
- cd build
- cmake .. \
- -DLLAMA_CURL=OFF \
- -DLLAMA_OPENSSL=ON \
+ cmake -B build \
-DLLAMA_FATAL_WARNINGS=ON \
-DLLAMA_LLGUIDANCE=ON
- cmake --build . --config Release -j $(nproc)
+ cmake --build build --config Release -j $(nproc)
- name: Test
id: cmake_test
@@ -352,7 +347,7 @@ jobs:
steps:
- name: Clone
id: checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
# - name: ccache
# uses: ggml-org/ccache-action@v1.2.16
@@ -370,8 +365,6 @@ jobs:
id: cmake_build
run: |
cmake -B build \
- -DLLAMA_CURL=OFF \
- -DLLAMA_OPENSSL=ON \
-DGGML_RPC=ON
cmake --build build --config Release -j $(nproc)
@@ -387,13 +380,14 @@ jobs:
steps:
- name: Clone
id: checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: ccache
uses: ggml-org/ccache-action@v1.2.16
with:
key: ubuntu-24-cmake-vulkan-deb
evict-old-files: 1d
+ save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Dependencies
id: depends
@@ -404,8 +398,6 @@ jobs:
id: cmake_configure
run: |
cmake -B build \
- -DLLAMA_CURL=OFF \
- -DLLAMA_OPENSSL=ON \
-DCMAKE_BUILD_TYPE=RelWithDebInfo \
-DGGML_BACKEND_DL=ON \
-DGGML_CPU_ALL_VARIANTS=ON \
@@ -422,13 +414,14 @@ jobs:
steps:
- name: Clone
id: checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: ccache
uses: ggml-org/ccache-action@v1.2.16
with:
key: ubuntu-24-cmake-vulkan
evict-old-files: 1d
+ save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Dependencies
id: depends
@@ -443,7 +436,7 @@ jobs:
echo "VULKAN_SDK_VERSION=$(curl https://vulkan.lunarg.com/sdk/latest/linux.txt)" >> "$GITHUB_ENV"
- name: Use Vulkan SDK Cache
- uses: actions/cache@v4
+ uses: actions/cache@v5
id: cache-sdk
with:
path: ./vulkan_sdk
@@ -461,8 +454,6 @@ jobs:
run: |
source ./vulkan_sdk/setup-env.sh
cmake -B build \
- -DLLAMA_CURL=OFF \
- -DLLAMA_OPENSSL=ON \
-DGGML_VULKAN=ON
cmake --build build --config Release -j $(nproc)
@@ -481,13 +472,14 @@ jobs:
steps:
- name: Clone
id: checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: ccache
uses: ggml-org/ccache-action@v1.2.16
with:
key: ubuntu-24-cmake-webgpu
evict-old-files: 1d
+ save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Dependencies
id: depends
@@ -502,7 +494,7 @@ jobs:
echo "VULKAN_SDK_VERSION=$(curl https://vulkan.lunarg.com/sdk/latest/linux.txt)" >> "$GITHUB_ENV"
- name: Use Vulkan SDK Cache
- uses: actions/cache@v4
+ uses: actions/cache@v5
id: cache-sdk
with:
path: ./vulkan_sdk
@@ -522,21 +514,19 @@ jobs:
DAWN_VERSION="v2.0.0"
DAWN_OWNER="reeselevine"
DAWN_REPO="dawn"
- DAWN_ASSET_NAME="Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-ubuntu-latest-Release.zip"
- echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}"
+ DAWN_ASSET_NAME="Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-ubuntu-latest-Release"
+ echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
curl -L -o artifact.zip \
- "https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}"
+ "https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
mkdir dawn
unzip artifact.zip
- tar -xvf Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-ubuntu-latest-Release.tar.gz -C dawn --strip-components=1
+ tar -xvf ${DAWN_ASSET_NAME}.tar.gz -C dawn --strip-components=1
- name: Build
id: cmake_build
run: |
export Dawn_DIR=dawn/lib64/cmake/Dawn
cmake -B build \
- -DLLAMA_CURL=OFF \
- -DLLAMA_OPENSSL=ON \
-DGGML_WEBGPU=ON
cmake --build build --config Release -j $(nproc)
@@ -553,13 +543,14 @@ jobs:
steps:
- name: Clone
id: checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: ccache
uses: ggml-org/ccache-action@v1.2.16
with:
key: ubuntu-latest-wasm-webgpu
evict-old-files: 1d
+ save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Install Emscripten
run: |
@@ -582,7 +573,7 @@ jobs:
source emsdk/emsdk_env.sh
emcmake cmake -B build-wasm \
-DGGML_WEBGPU=ON \
- -DLLAMA_CURL=OFF \
+ -DLLAMA_OPENSSL=OFF \
-DEMDAWNWEBGPU_DIR=emdawnwebgpu_pkg
cmake --build build-wasm --target test-backend-ops -j $(nproc)
@@ -594,7 +585,7 @@ jobs:
steps:
- name: Clone
id: checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: Dependencies
id: depends
@@ -607,13 +598,12 @@ jobs:
with:
key: ubuntu-22-cmake-hip
evict-old-files: 1d
+ save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Build with native CMake HIP support
id: cmake_build
run: |
cmake -B build -S . \
- -DLLAMA_CURL=OFF \
- -DLLAMA_OPENSSL=ON \
-DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \
-DGGML_HIP_ROCWMMA_FATTN=ON \
-DGGML_HIP=ON
@@ -626,7 +616,7 @@ jobs:
steps:
- name: Clone
id: checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: Dependencies
id: depends
@@ -639,13 +629,12 @@ jobs:
with:
key: ubuntu-22-cmake-musa
evict-old-files: 1d
+ save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Build with native CMake MUSA support
id: cmake_build
run: |
cmake -B build -S . \
- -DLLAMA_CURL=OFF \
- -DLLAMA_OPENSSL=ON \
-DGGML_MUSA=ON
cmake --build build --config Release -j $(nproc)
@@ -655,7 +644,7 @@ jobs:
continue-on-error: true
steps:
- - uses: actions/checkout@v4
+ - uses: actions/checkout@v6
- name: add oneAPI to apt
shell: bash
@@ -679,21 +668,20 @@ jobs:
- name: Clone
id: checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: ccache
uses: ggml-org/ccache-action@v1.2.16
with:
key: ubuntu-22-cmake-sycl
evict-old-files: 1d
+ save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Build
id: cmake_build
run: |
source /opt/intel/oneapi/setvars.sh
cmake -B build \
- -DLLAMA_CURL=OFF \
- -DLLAMA_OPENSSL=ON \
-DGGML_SYCL=ON \
-DCMAKE_C_COMPILER=icx \
-DCMAKE_CXX_COMPILER=icpx
@@ -705,7 +693,7 @@ jobs:
continue-on-error: true
steps:
- - uses: actions/checkout@v4
+ - uses: actions/checkout@v6
- name: add oneAPI to apt
shell: bash
@@ -729,21 +717,20 @@ jobs:
- name: Clone
id: checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: ccache
uses: ggml-org/ccache-action@v1.2.16
with:
key: ubuntu-22-cmake-sycl-fp16
evict-old-files: 1d
+ save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Build
id: cmake_build
run: |
source /opt/intel/oneapi/setvars.sh
cmake -B build \
- -DLLAMA_CURL=OFF \
- -DLLAMA_OPENSSL=ON \
-DGGML_SYCL=ON \
-DCMAKE_C_COMPILER=icx \
-DCMAKE_CXX_COMPILER=icpx \
@@ -762,13 +749,14 @@ jobs:
steps:
- name: Clone
id: checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: ccache
uses: ggml-org/ccache-action@v1.2.16
with:
key: macOS-latest-cmake-ios
evict-old-files: 1d
+ save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Build
id: cmake_build
@@ -793,13 +781,14 @@ jobs:
steps:
- name: Clone
id: checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: ccache
uses: ggml-org/ccache-action@v1.2.16
with:
key: macOS-latest-cmake-tvos
evict-old-files: 1d
+ save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Build
id: cmake_build
@@ -824,7 +813,7 @@ jobs:
steps:
- name: Clone
id: checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: Build
id: cmake_build
@@ -854,16 +843,17 @@ jobs:
steps:
- name: Clone
id: checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: ccache
uses: ggml-org/ccache-action@v1.2.16
with:
key: macOS-latest-swift
evict-old-files: 1d
+ save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Download xcframework artifact
- uses: actions/download-artifact@v4
+ uses: actions/download-artifact@v7
with:
name: llama-xcframework
path: build-apple/llama.xcframework/
@@ -875,7 +865,7 @@ jobs:
cmake -B build -G Xcode \
-DGGML_METAL_USE_BF16=ON \
-DGGML_METAL_EMBED_LIBRARY=ON \
- -DLLAMA_CURL=OFF \
+ -DLLAMA_OPENSSL=OFF \
-DLLAMA_BUILD_EXAMPLES=OFF \
-DLLAMA_BUILD_TOOLS=OFF \
-DLLAMA_BUILD_TESTS=OFF \
@@ -895,7 +885,7 @@ jobs:
steps:
- name: Clone
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: ccache
uses: ggml-org/ccache-action@v1.2.16
@@ -903,6 +893,7 @@ jobs:
key: windows-msys2
variant: ccache
evict-old-files: 1d
+ save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Setup ${{ matrix.sys }}
uses: msys2/setup-msys2@v2
@@ -963,7 +954,7 @@ jobs:
steps:
- name: Clone
id: checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: ccache
uses: ggml-org/ccache-action@v1.2.16
@@ -971,6 +962,7 @@ jobs:
key: windows-latest-cmake-${{ matrix.build }}
variant: ccache
evict-old-files: 1d
+ save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Download OpenBLAS
id: get_openblas
@@ -1023,7 +1015,7 @@ jobs:
id: cmake_build
run: |
cmake -S . -B build ${{ matrix.defines }} `
- -DLLAMA_CURL=OFF -DLLAMA_BUILD_BORINGSSL=ON
+ -DLLAMA_BUILD_BORINGSSL=ON
cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}
- name: Add libopenblas.dll
@@ -1061,7 +1053,7 @@ jobs:
steps:
- name: Clone
id: checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: Install dependencies
env:
@@ -1075,18 +1067,19 @@ jobs:
with:
key: ubuntu-latest-cmake-cuda
evict-old-files: 1d
+ save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Build with CMake
+ # TODO: Remove GGML_CUDA_CUB_3DOT2 flag once CCCL 3.2 is bundled within CTK and that CTK version is used in this project
run: |
cmake -S . -B build -G Ninja \
- -DLLAMA_CURL=OFF \
- -DLLAMA_OPENSSL=ON \
-DLLAMA_FATAL_WARNINGS=ON \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_CUDA_ARCHITECTURES=89-real \
-DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined \
-DGGML_NATIVE=OFF \
- -DGGML_CUDA=ON
+ -DGGML_CUDA=ON \
+ -DGGML_CUDA_CUB_3DOT2=ON
cmake --build build
windows-2022-cmake-cuda:
@@ -1099,7 +1092,7 @@ jobs:
steps:
- name: Clone
id: checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: Install ccache
uses: ggml-org/ccache-action@v1.2.16
@@ -1107,6 +1100,7 @@ jobs:
key: windows-cuda-${{ matrix.cuda }}
variant: ccache
evict-old-files: 1d
+ save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Install Cuda Toolkit
uses: ./.github/actions/windows-setup-cuda
@@ -1121,17 +1115,18 @@ jobs:
- name: Build
id: cmake_build
shell: cmd
+ # TODO: Remove GGML_CUDA_CUB_3DOT2 flag once CCCL 3.2 is bundled within CTK and that CTK version is used in this project
run: |
call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64
cmake -S . -B build -G "Ninja Multi-Config" ^
-DLLAMA_BUILD_SERVER=ON ^
- -DLLAMA_CURL=OFF ^
-DLLAMA_BUILD_BORINGSSL=ON ^
-DGGML_NATIVE=OFF ^
-DGGML_BACKEND_DL=ON ^
-DGGML_CPU_ALL_VARIANTS=ON ^
-DGGML_CUDA=ON ^
- -DGGML_RPC=ON
+ -DGGML_RPC=ON ^
+ -DGGML_CUDA_CUB_3DOT2=ON
set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
cmake --build build --config Release -j %NINJA_JOBS% -t ggml
cmake --build build --config Release
@@ -1150,7 +1145,7 @@ jobs:
steps:
- name: Clone
id: checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: ccache
uses: ggml-org/ccache-action@v1.2.16
@@ -1158,6 +1153,7 @@ jobs:
key: windows-latest-cmake-sycl
variant: ccache
evict-old-files: 1d
+ save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Install
run: |
@@ -1181,7 +1177,7 @@ jobs:
steps:
- name: Clone
id: checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: Grab rocWMMA package
id: grab_rocwmma
@@ -1191,7 +1187,7 @@ jobs:
7z x data.tar
- name: Use ROCm Installation Cache
- uses: actions/cache@v4
+ uses: actions/cache@v5
id: cache-rocm
with:
path: C:\Program Files\AMD\ROCm
@@ -1219,6 +1215,7 @@ jobs:
with:
key: ${{ github.job }}
evict-old-files: 1d
+ save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Build
id: cmake_build
@@ -1230,7 +1227,6 @@ jobs:
-DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
-DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/opt/rocm-${{ env.ROCM_VERSION }}/include/" `
-DCMAKE_BUILD_TYPE=Release `
- -DLLAMA_CURL=OFF `
-DLLAMA_BUILD_BORINGSSL=ON `
-DROCM_DIR="${env:HIP_PATH}" `
-DGGML_HIP=ON `
@@ -1243,7 +1239,7 @@ jobs:
steps:
- name: Checkout code
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: Setup Xcode
uses: maxim-lobanov/setup-xcode@v1
@@ -1257,7 +1253,7 @@ jobs:
cmake -B build -G Xcode \
-DGGML_METAL_USE_BF16=ON \
-DGGML_METAL_EMBED_LIBRARY=ON \
- -DLLAMA_CURL=OFF \
+ -DLLAMA_OPENSSL=OFF \
-DLLAMA_BUILD_EXAMPLES=OFF \
-DLLAMA_BUILD_TOOLS=OFF \
-DLLAMA_BUILD_TESTS=OFF \
@@ -1273,7 +1269,7 @@ jobs:
./build-xcframework.sh
- name: Upload xcframework artifact
- uses: actions/upload-artifact@v4
+ uses: actions/upload-artifact@v6
with:
name: llama-xcframework
path: build-apple/llama.xcframework/
@@ -1289,7 +1285,7 @@ jobs:
steps:
- name: Clone
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
# Disabled due to size (400MB) and always 0 cache hits
# - name: ccache
@@ -1299,7 +1295,7 @@ jobs:
# evict-old-files: 1d
- name: Set up JDK
- uses: actions/setup-java@v3
+ uses: actions/setup-java@v5
with:
java-version: 17
distribution: zulu
@@ -1324,14 +1320,14 @@ jobs:
matrix:
include:
- build: 'arm64-cpu'
- defines: '-D ANDROID_ABI=arm64-v8a -D ANDROID_PLATFORM=android-31 -D CMAKE_TOOLCHAIN_FILE=${ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake -D GGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv8.5-a+fp16+i8mm -G Ninja -D LLAMA_CURL=OFF -D GGML_OPENMP=OFF'
+ defines: '-D ANDROID_ABI=arm64-v8a -D ANDROID_PLATFORM=android-31 -D CMAKE_TOOLCHAIN_FILE=${ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake -D GGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv8.5-a+fp16+i8mm -G Ninja -D LLAMA_OPENSSL=OFF -D GGML_OPENMP=OFF'
- build: 'arm64-snapdragon'
defines: '--preset arm64-android-snapdragon-release'
steps:
- name: Clone
id: checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: Install OpenCL Headers and Libs
id: install_opencl
@@ -1390,7 +1386,6 @@ jobs:
echo "FIXME: test on devices"
openEuler-latest-cmake-cann:
- if: ${{ github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'Ascend NPU') }}
defaults:
run:
shell: bash -el {0}
@@ -1399,10 +1394,15 @@ jobs:
arch: [x86, aarch64]
chip_type: ['910b', '310p']
build: ['Release']
+ use_acl_graph: ['on', 'off']
+ exclude:
+ # 310P does not support USE_ACL_GRAPH=on
+ - chip_type: '310p'
+ use_acl_graph: 'on'
runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
steps:
- name: Checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
with:
fetch-depth: 0
@@ -1424,6 +1424,7 @@ jobs:
env:
BUILD_TYPE: ${{ matrix.build }}
SOC_TYPE: ascend${{ matrix.chip_type }}
+ USE_ACL_GRAPH: ${{ matrix.use_acl_graph }}
run: |
HOST_UID=$(id -u)
HOST_GID=$(id -g)
@@ -1433,17 +1434,19 @@ jobs:
-w /workspace \
-e SOC_TYPE=${SOC_TYPE} \
-e BUILD_TYPE=${BUILD_TYPE} \
+ -e USE_ACL_GRAPH=${USE_ACL_GRAPH} \
"${{ steps.cann-image.outputs.image }}" \
bash -lc '
set -e
- yum install -y --setopt=install_weak_deps=False --setopt=tsflags=nodocs git gcc gcc-c++ make cmake libcurl-devel
+ yum install -y --setopt=install_weak_deps=False --setopt=tsflags=nodocs git gcc gcc-c++ make cmake openssl-devel
yum clean all && rm -rf /var/cache/yum
git config --global --add safe.directory "/workspace"
export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
cmake -S . -B build \
-DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
-DGGML_CANN=on \
- -DSOC_TYPE=${SOC_TYPE}
+ -DSOC_TYPE=${SOC_TYPE} \
+ -DUSE_ACL_GRAPH=${USE_ACL_GRAPH}
cmake --build build -j $(nproc)
chown -R '"${HOST_UID}"':'"${HOST_GID}"' /workspace/build
@@ -1457,19 +1460,20 @@ jobs:
steps:
- name: Clone
id: checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: ccache
uses: ggml-org/ccache-action@v1.2.16
with:
key: ggml-ci-x64-cpu-low-perf
evict-old-files: 1d
+ save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Dependencies
id: depends
run: |
sudo apt-get update
- sudo apt-get install build-essential libcurl4-openssl-dev
+ sudo apt-get install build-essential
- name: Test
id: ggml-ci
@@ -1482,19 +1486,20 @@ jobs:
steps:
- name: Clone
id: checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: ccache
uses: ggml-org/ccache-action@v1.2.16
with:
key: ggml-ci-arm64-cpu-low-perf
evict-old-files: 1d
+ save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Dependencies
id: depends
run: |
sudo apt-get update
- sudo apt-get install build-essential libcurl4-openssl-dev
+ sudo apt-get install build-essential
- name: Test
id: ggml-ci
@@ -1507,19 +1512,20 @@ jobs:
steps:
- name: Clone
id: checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: ccache
uses: ggml-org/ccache-action@v1.2.16
with:
key: ggml-ci-x64-cpu-high-perf
evict-old-files: 1d
+ save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Dependencies
id: depends
run: |
sudo apt-get update
- sudo apt-get install build-essential libcurl4-openssl-dev
+ sudo apt-get install build-essential
- name: Test
id: ggml-ci
@@ -1532,19 +1538,20 @@ jobs:
steps:
- name: Clone
id: checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: ccache
uses: ggml-org/ccache-action@v1.2.16
with:
key: ggml-ci-arm64-cpu-high-perf
evict-old-files: 1d
+ save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Dependencies
id: depends
run: |
sudo apt-get update
- sudo apt-get install build-essential libcurl4-openssl-dev
+ sudo apt-get install build-essential
- name: Test
id: ggml-ci
@@ -1557,19 +1564,20 @@ jobs:
steps:
- name: Clone
id: checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: ccache
uses: ggml-org/ccache-action@v1.2.16
with:
key: ggml-ci-arm64-cpu-high-perf-sve
evict-old-files: 1d
+ save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Dependencies
id: depends
run: |
sudo apt-get update
- sudo apt-get install build-essential libcurl4-openssl-dev
+ sudo apt-get install build-essential
- name: Test
id: ggml-ci
@@ -1582,7 +1590,7 @@ jobs:
steps:
- name: Clone
id: checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: Test
id: ggml-ci
@@ -1596,7 +1604,7 @@ jobs:
steps:
- name: Clone
id: checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: Test
id: ggml-ci
@@ -1610,7 +1618,7 @@ jobs:
steps:
- name: Clone
id: checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: Test
id: ggml-ci
@@ -1624,7 +1632,7 @@ jobs:
steps:
- name: Clone
id: checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: Test
id: ggml-ci
@@ -1637,7 +1645,7 @@ jobs:
# steps:
# - name: Clone
# id: checkout
- # uses: actions/checkout@v4
+ # uses: actions/checkout@v6
# - name: Test
# id: ggml-ci
@@ -1651,7 +1659,7 @@ jobs:
# steps:
# - name: Clone
# id: checkout
- # uses: actions/checkout@v4
+ # uses: actions/checkout@v6
# - name: Test
# id: ggml-ci
@@ -1665,20 +1673,48 @@ jobs:
steps:
- name: Clone
id: checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: Test
id: ggml-ci
run: |
GG_BUILD_METAL=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
+ ggml-ci-mac-webgpu:
+ runs-on: [self-hosted, macOS, ARM64]
+
+ steps:
+ - name: Clone
+ id: checkout
+ uses: actions/checkout@v6
+
+ - name: Dawn Dependency
+ id: dawn-depends
+ run: |
+ DAWN_VERSION="v2.0.0"
+ DAWN_OWNER="reeselevine"
+ DAWN_REPO="dawn"
+ DAWN_ASSET_NAME="Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-macos-latest-Release"
+ echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
+ curl -L -o artifact.zip \
+ "https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
+ mkdir dawn
+ unzip artifact.zip
+ tar -xvf ${DAWN_ASSET_NAME}.tar.gz -C dawn --strip-components=1
+
+ - name: Test
+ id: ggml-ci
+ run: |
+ GG_BUILD_WEBGPU=1 GG_BUILD_WEBGPU_DAWN_PREFIX="$GITHUB_WORKSPACE/dawn" \
+ bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
+
ggml-ci-mac-vulkan:
runs-on: [self-hosted, macOS, ARM64]
steps:
- name: Clone
id: checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: Test
id: ggml-ci
@@ -1692,19 +1728,20 @@ jobs:
steps:
- name: Clone
id: checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: ccache
uses: ggml-org/ccache-action@v1.2.16
with:
key: ggml-ci-arm64-cpu-kleidiai
evict-old-files: 1d
+ save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Dependencies
id: depends
run: |
sudo apt-get update
- sudo apt-get install -y build-essential libcurl4-openssl-dev
+ sudo apt-get install -y build-essential
- name: Test
id: ggml-ci
@@ -1720,7 +1757,7 @@ jobs:
sudo apt-get update
# Install necessary packages
- sudo apt-get install -y libatomic1 libtsan2 gcc-14 g++-14 rustup cmake build-essential libssl-dev wget ccache
+ sudo apt-get install -y libatomic1 libtsan2 gcc-14 g++-14 rustup cmake build-essential libssl-dev wget ccache git-lfs
# Set gcc-14 and g++-14 as the default compilers
sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-14 100
@@ -1732,9 +1769,11 @@ jobs:
rustup install stable
rustup default stable
+ git lfs install
+
- name: Clone
id: checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: Check environment
run: |
@@ -1769,8 +1808,6 @@ jobs:
id: cmake_build
run: |
cmake -B build \
- -DLLAMA_CURL=OFF \
- -DLLAMA_OPENSSL=ON \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_OPENMP=OFF \
-DLLAMA_BUILD_EXAMPLES=ON \
@@ -1788,7 +1825,7 @@ jobs:
id: cmake_test
run: |
cd build
- ctest -L 'main|curl' --verbose --timeout 900
+ ctest -L main --verbose --timeout 900
- name: Test llama2c conversion
id: llama2c_test
@@ -1817,7 +1854,7 @@ jobs:
sudo apt-get update
# Install necessary packages
- sudo apt-get install -y libatomic1 libtsan2 gcc-14 g++-14 rustup cmake build-essential wget ccache
+ sudo apt-get install -y libatomic1 libtsan2 gcc-14 g++-14 rustup cmake build-essential wget ccache git-lfs
# Set gcc-14 and g++-14 as the default compilers
sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-14 100
@@ -1829,6 +1866,8 @@ jobs:
rustup install stable
rustup default stable
+ git lfs install
+
- name: GCC version check
run: |
gcc --version
@@ -1836,7 +1875,7 @@ jobs:
- name: Clone
id: checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: Setup ccache
run: |
@@ -1861,7 +1900,7 @@ jobs:
if: ${{ matrix.sanitizer != 'THREAD' }}
run: |
cmake -B build \
- -DLLAMA_CURL=OFF \
+ -DLLAMA_OPENSSL=OFF \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-DGGML_OPENMP=ON \
-DLLAMA_BUILD_EXAMPLES=ON \
@@ -1880,7 +1919,7 @@ jobs:
if: ${{ matrix.sanitizer == 'THREAD' }}
run: |
cmake -B build \
- -DLLAMA_CURL=OFF \
+ -DLLAMA_OPENSSL=OFF \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-DGGML_OPENMP=OFF \
-DLLAMA_BUILD_EXAMPLES=ON \
@@ -1909,7 +1948,7 @@ jobs:
sudo apt-get update
# Install necessary packages
- sudo apt-get install -y libatomic1 libtsan2 gcc-14 g++-14 rustup cmake build-essential wget ccache
+ sudo apt-get install -y libatomic1 libtsan2 gcc-14 g++-14 rustup cmake build-essential wget ccache git-lfs
# Set gcc-14 and g++-14 as the default compilers
sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-14 100
@@ -1921,6 +1960,8 @@ jobs:
rustup install stable
rustup default stable
+ git lfs install
+
- name: GCC version check
run: |
gcc --version
@@ -1928,7 +1969,7 @@ jobs:
- name: Clone
id: checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: Setup ccache
run: |
@@ -1949,7 +1990,7 @@ jobs:
id: cmake_build
run: |
cmake -B build \
- -DLLAMA_CURL=OFF \
+ -DLLAMA_OPENSSL=OFF \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_OPENMP=OFF \
-DLLAMA_BUILD_EXAMPLES=ON \
@@ -1981,7 +2022,7 @@ jobs:
sudo apt-get update
# Install necessary packages
- sudo apt-get install -y libatomic1 libtsan2 gcc-14 g++-14 rustup cmake build-essential libssl-dev wget ccache
+ sudo apt-get install -y libatomic1 libtsan2 gcc-14 g++-14 rustup cmake build-essential libssl-dev wget ccache git-lfs
# Set gcc-14 and g++-14 as the default compilers
sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-14 100
@@ -1993,6 +2034,8 @@ jobs:
rustup install stable
rustup default stable
+ git lfs install
+
- name: GCC version check
run: |
gcc --version
@@ -2000,7 +2043,7 @@ jobs:
- name: Clone
id: checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: Setup ccache
run: |
@@ -2021,8 +2064,6 @@ jobs:
id: cmake_build
run: |
cmake -B build \
- -DLLAMA_CURL=OFF \
- -DLLAMA_OPENSSL=ON \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_OPENMP=OFF \
-DLLAMA_BUILD_EXAMPLES=ON \
@@ -2048,7 +2089,7 @@ jobs:
steps:
- name: Clone
id: checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: Dependencies
id: depends
@@ -2058,7 +2099,6 @@ jobs:
sudo DEBIAN_FRONTEND=noninteractive NEEDRESTART_MODE=a \
apt-get install -y \
build-essential \
- libcurl4-openssl-dev \
python3-venv \
gpg \
wget \
@@ -2082,6 +2122,7 @@ jobs:
with:
key: ggml-ci-arm64-graviton4-kleidiai
evict-old-files: 1d
+ save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Test
id: ggml-ci
diff --git a/.github/workflows/check-vendor.yml b/.github/workflows/check-vendor.yml
index 7b3016079..b9e8ac765 100644
--- a/.github/workflows/check-vendor.yml
+++ b/.github/workflows/check-vendor.yml
@@ -23,12 +23,12 @@ jobs:
steps:
- name: Checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
with:
fetch-depth: 0
- name: Setup Python
- uses: actions/setup-python@v4
+ uses: actions/setup-python@v6
with:
python-version: '3.x'
diff --git a/.github/workflows/close-issue.yml b/.github/workflows/close-issue.yml
index cbfc4990d..8fb5310d0 100644
--- a/.github/workflows/close-issue.yml
+++ b/.github/workflows/close-issue.yml
@@ -15,7 +15,7 @@ jobs:
issues: write
pull-requests: write
steps:
- - uses: actions/stale@v5
+ - uses: actions/stale@v10
with:
exempt-issue-labels: "refactoring,help wanted,good first issue,research 🔬,bug,roadmap"
days-before-issue-stale: 30
diff --git a/.github/workflows/copilot-setup-steps.yml b/.github/workflows/copilot-setup-steps.yml
index 3645e3037..fc3cec5ea 100644
--- a/.github/workflows/copilot-setup-steps.yml
+++ b/.github/workflows/copilot-setup-steps.yml
@@ -26,7 +26,7 @@ jobs:
# If you do not check out your code, Copilot will do this for you.
steps:
- name: Checkout code
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: ccache
uses: ggml-org/ccache-action@v1.2.16
@@ -38,14 +38,14 @@ jobs:
id: depends
run: |
sudo apt-get update
- sudo apt-get install build-essential libcurl4-openssl-dev
+ sudo apt-get install build-essential libssl-dev
# Install git-clang-format script for formatting only changed code
wget -O /tmp/git-clang-format https://raw.githubusercontent.com/llvm/llvm-project/release/18.x/clang/tools/clang-format/git-clang-format
sudo cp /tmp/git-clang-format /usr/local/bin/git-clang-format
sudo chmod +x /usr/local/bin/git-clang-format
- name: Set up Python
- uses: actions/setup-python@v5
+ uses: actions/setup-python@v6
with:
python-version: '3.11'
diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml
index 7ca11b1df..8062177ba 100644
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -40,16 +40,16 @@ jobs:
# https://github.com/ggml-org/llama.cpp/issues/11888
#- { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: false }
- { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04" }
- - { tag: "cuda", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-22.04" }
+ - { tag: "cuda cuda12", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-22.04", cuda_version: "12.4.0", ubuntu_version: "22.04" }
+ - { tag: "cuda13", dockerfile: ".devops/cuda-new.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-22.04", cuda_version: "13.1.0", ubuntu_version: "24.04" }
- { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-22.04" }
- { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-22.04" }
- { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04" }
- { tag: "s390x", dockerfile: ".devops/s390x.Dockerfile", platforms: "linux/s390x", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04-s390x" }
- # Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete
- #- {tag: "rocm", dockerfile: ".devops/rocm.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: true }
+ - { tag: "rocm", dockerfile: ".devops/rocm.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-22.04" }
steps:
- name: Check out the repo
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
with:
fetch-depth: 0 # preserve git history, so we can determine the build number
@@ -63,7 +63,7 @@ jobs:
uses: docker/setup-buildx-action@v3
- name: Log in to Docker Hub
- uses: docker/login-action@v2
+ uses: docker/login-action@v3
with:
registry: ghcr.io
username: ${{ github.repository_owner }}
@@ -81,18 +81,21 @@ jobs:
run: |
REPO_OWNER="${GITHUB_REPOSITORY_OWNER@L}" # to lower case
REPO_NAME="${{ github.event.repository.name }}"
+ PREFIX="ghcr.io/${REPO_OWNER}/${REPO_NAME}:"
# list all tags possible
- if [[ "${{ matrix.config.tag }}" == "cpu" ]]; then
- TYPE=""
- else
- TYPE="-${{ matrix.config.tag }}"
- fi
- PREFIX="ghcr.io/${REPO_OWNER}/${REPO_NAME}:"
- CACHETAGS="${PREFIX}buildcache${TYPE}"
- FULLTAGS="${PREFIX}full${TYPE},${PREFIX}full${TYPE}-${{ steps.srctag.outputs.name }}"
- LIGHTTAGS="${PREFIX}light${TYPE},${PREFIX}light${TYPE}-${{ steps.srctag.outputs.name }}"
- SERVERTAGS="${PREFIX}server${TYPE},${PREFIX}server${TYPE}-${{ steps.srctag.outputs.name }}"
+ tags="${{ matrix.config.tag }}"
+ for tag in $tags; do
+ if [[ "$tag" == "cpu" ]]; then
+ TYPE=""
+ else
+ TYPE="-$tag"
+ fi
+ CACHETAGS="${PREFIX}buildcache${TYPE}"
+ FULLTAGS="${FULLTAGS:+$FULLTAGS,}${PREFIX}full${TYPE},${PREFIX}full${TYPE}-${{ steps.srctag.outputs.name }}"
+ LIGHTTAGS="${LIGHTTAGS:+$LIGHTTAGS,}${PREFIX}light${TYPE},${PREFIX}light${TYPE}-${{ steps.srctag.outputs.name }}"
+ SERVERTAGS="${SERVERTAGS:+$SERVERTAGS,}${PREFIX}server${TYPE},${PREFIX}server${TYPE}-${{ steps.srctag.outputs.name }}"
+ done
echo "cache_output_tags=$CACHETAGS" >> $GITHUB_OUTPUT
echo "full_output_tags=$FULLTAGS" >> $GITHUB_OUTPUT
echo "light_output_tags=$LIGHTTAGS" >> $GITHUB_OUTPUT
@@ -133,6 +136,9 @@ jobs:
file: ${{ matrix.config.dockerfile }}
target: full
provenance: false
+ build-args: |
+ ${{ matrix.config.ubuntu_version && format('UBUNTU_VERSION={0}', matrix.config.ubuntu_version) || '' }}
+ ${{ matrix.config.cuda_version && format('CUDA_VERSION={0}', matrix.config.cuda_version) || '' }}
# using github experimental cache
#cache-from: type=gha
#cache-to: type=gha,mode=max
@@ -155,6 +161,9 @@ jobs:
file: ${{ matrix.config.dockerfile }}
target: light
provenance: false
+ build-args: |
+ ${{ matrix.config.ubuntu_version && format('UBUNTU_VERSION={0}', matrix.config.ubuntu_version) || '' }}
+ ${{ matrix.config.cuda_version && format('CUDA_VERSION={0}', matrix.config.cuda_version) || '' }}
# using github experimental cache
#cache-from: type=gha
#cache-to: type=gha,mode=max
@@ -177,6 +186,9 @@ jobs:
file: ${{ matrix.config.dockerfile }}
target: server
provenance: false
+ build-args: |
+ ${{ matrix.config.ubuntu_version && format('UBUNTU_VERSION={0}', matrix.config.ubuntu_version) || '' }}
+ ${{ matrix.config.cuda_version && format('CUDA_VERSION={0}', matrix.config.cuda_version) || '' }}
# using github experimental cache
#cache-from: type=gha
#cache-to: type=gha,mode=max
@@ -196,7 +208,7 @@ jobs:
steps:
- name: Clone
id: checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
with:
fetch-depth: 0
diff --git a/.github/workflows/editorconfig.yml b/.github/workflows/editorconfig.yml
index f02b7c219..a5cd59001 100644
--- a/.github/workflows/editorconfig.yml
+++ b/.github/workflows/editorconfig.yml
@@ -22,7 +22,7 @@ jobs:
editorconfig:
runs-on: ubuntu-latest
steps:
- - uses: actions/checkout@v4
+ - uses: actions/checkout@v6
- uses: editorconfig-checker/action-editorconfig-checker@v2
with:
version: v3.0.3
diff --git a/.github/workflows/gguf-publish.yml b/.github/workflows/gguf-publish.yml
index 3ca4d3058..5bdab0f15 100644
--- a/.github/workflows/gguf-publish.yml
+++ b/.github/workflows/gguf-publish.yml
@@ -24,9 +24,9 @@ jobs:
runs-on: ubuntu-latest
steps:
- - uses: actions/checkout@v4
+ - uses: actions/checkout@v6
- name: Set up Python
- uses: actions/setup-python@v5
+ uses: actions/setup-python@v6
with:
python-version: '3.9.x'
- name: Install dependencies
diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml
index 0b0f300aa..42f00c0cd 100644
--- a/.github/workflows/labeler.yml
+++ b/.github/workflows/labeler.yml
@@ -9,9 +9,9 @@ jobs:
pull-requests: write
runs-on: ubuntu-latest
steps:
- - uses: actions/checkout@v4
+ - uses: actions/checkout@v6
with:
repository: "ggml-org/llama.cpp"
- - uses: actions/labeler@v5
+ - uses: actions/labeler@v6
with:
configuration-path: '.github/labeler.yml'
diff --git a/.github/workflows/pre-tokenizer-hashes.yml b/.github/workflows/pre-tokenizer-hashes.yml
index dff998e23..8120df0e3 100644
--- a/.github/workflows/pre-tokenizer-hashes.yml
+++ b/.github/workflows/pre-tokenizer-hashes.yml
@@ -16,10 +16,10 @@ jobs:
steps:
- name: Checkout repository
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: Set up Python
- uses: actions/setup-python@v5
+ uses: actions/setup-python@v6
with:
python-version: '3.11'
diff --git a/.github/workflows/python-check-requirements.yml b/.github/workflows/python-check-requirements.yml
index 46e80aecd..08cdcb9d0 100644
--- a/.github/workflows/python-check-requirements.yml
+++ b/.github/workflows/python-check-requirements.yml
@@ -24,9 +24,9 @@ jobs:
name: check-requirements
steps:
- name: Check out source repository
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: Set up Python environment
- uses: actions/setup-python@v5
+ uses: actions/setup-python@v6
with:
python-version: "3.11"
- name: Run check-requirements.sh script
diff --git a/.github/workflows/python-lint.yml b/.github/workflows/python-lint.yml
index ddfdf73b8..91dc4d78a 100644
--- a/.github/workflows/python-lint.yml
+++ b/.github/workflows/python-lint.yml
@@ -19,9 +19,9 @@ jobs:
name: Lint
steps:
- name: Check out source repository
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: Set up Python environment
- uses: actions/setup-python@v5
+ uses: actions/setup-python@v6
with:
python-version: "3.11"
- name: flake8 Lint
diff --git a/.github/workflows/python-type-check.yml b/.github/workflows/python-type-check.yml
index 373bb6010..54d5fab5b 100644
--- a/.github/workflows/python-type-check.yml
+++ b/.github/workflows/python-type-check.yml
@@ -24,9 +24,9 @@ jobs:
name: pyright type-check
steps:
- name: Check out source repository
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: Set up Python environment
- uses: actions/setup-python@v5
+ uses: actions/setup-python@v6
with:
python-version: "3.11"
- name: Install Python dependencies
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 446cae9f8..1914c0848 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -27,7 +27,7 @@ jobs:
steps:
- name: Clone
id: checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
with:
fetch-depth: 0
@@ -37,13 +37,6 @@ jobs:
key: macOS-latest-cmake-arm64
evict-old-files: 1d
- - name: Dependencies
- id: depends
- continue-on-error: true
- run: |
- brew update
- brew install curl
-
- name: Build
id: cmake_build
run: |
@@ -52,6 +45,7 @@ jobs:
-DCMAKE_INSTALL_RPATH='@loader_path' \
-DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
-DLLAMA_FATAL_WARNINGS=ON \
+ -DLLAMA_BUILD_BORINGSSL=ON \
-DGGML_METAL_USE_BF16=ON \
-DGGML_METAL_EMBED_LIBRARY=ON \
-DGGML_RPC=ON \
@@ -66,17 +60,10 @@ jobs:
id: pack_artifacts
run: |
cp LICENSE ./build/bin/
- zip -y -r llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip ./build/bin/*
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.tar.gz -s ",./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
- - name: Upload artifacts (zip)
- uses: actions/upload-artifact@v4
- with:
- path: llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip
- name: llama-bin-macos-arm64.zip
-
- - name: Upload artifacts (tar)
- uses: actions/upload-artifact@v4
+ - name: Upload artifacts
+ uses: actions/upload-artifact@v6
with:
path: llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.tar.gz
name: llama-bin-macos-arm64.tar.gz
@@ -87,7 +74,7 @@ jobs:
steps:
- name: Clone
id: checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
with:
fetch-depth: 0
@@ -97,13 +84,6 @@ jobs:
key: macOS-latest-cmake-x64
evict-old-files: 1d
- - name: Dependencies
- id: depends
- continue-on-error: true
- run: |
- brew update
- brew install curl
-
- name: Build
id: cmake_build
run: |
@@ -114,6 +94,7 @@ jobs:
-DCMAKE_INSTALL_RPATH='@loader_path' \
-DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
-DLLAMA_FATAL_WARNINGS=ON \
+ -DLLAMA_BUILD_BORINGSSL=ON \
-DGGML_METAL=OFF \
-DGGML_RPC=ON \
-DCMAKE_OSX_DEPLOYMENT_TARGET=13.3
@@ -127,17 +108,10 @@ jobs:
id: pack_artifacts
run: |
cp LICENSE ./build/bin/
- zip -y -r llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip ./build/bin/*
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-macos-x64.tar.gz -s ",./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
- - name: Upload artifacts (zip)
- uses: actions/upload-artifact@v4
- with:
- path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip
- name: llama-bin-macos-x64.zip
-
- - name: Upload artifacts (tar)
- uses: actions/upload-artifact@v4
+ - name: Upload artifacts
+ uses: actions/upload-artifact@v6
with:
path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.tar.gz
name: llama-bin-macos-x64.tar.gz
@@ -159,7 +133,7 @@ jobs:
steps:
- name: Clone
id: checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
with:
fetch-depth: 0
@@ -173,7 +147,7 @@ jobs:
id: depends
run: |
sudo apt-get update
- sudo apt-get install build-essential libcurl4-openssl-dev
+ sudo apt-get install build-essential libssl-dev
- name: Build
id: cmake_build
@@ -196,17 +170,10 @@ jobs:
id: pack_artifacts
run: |
cp LICENSE ./build/bin/
- zip -y -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip ./build/bin/*
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
- - name: Upload artifacts (zip)
- uses: actions/upload-artifact@v4
- with:
- path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip
- name: llama-bin-ubuntu-${{ matrix.build }}.zip
-
- - name: Upload artifacts (tar)
- uses: actions/upload-artifact@v4
+ - name: Upload artifacts
+ uses: actions/upload-artifact@v6
with:
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.tar.gz
name: llama-bin-ubuntu-${{ matrix.build }}.tar.gz
@@ -217,7 +184,7 @@ jobs:
steps:
- name: Clone
id: checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
with:
fetch-depth: 0
@@ -233,7 +200,7 @@ jobs:
wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add -
sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
sudo apt-get update -y
- sudo apt-get install -y build-essential mesa-vulkan-drivers vulkan-sdk libcurl4-openssl-dev
+ sudo apt-get install -y build-essential mesa-vulkan-drivers vulkan-sdk libssl-dev
- name: Build
id: cmake_build
@@ -256,17 +223,10 @@ jobs:
id: pack_artifacts
run: |
cp LICENSE ./build/bin/
- zip -y -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip ./build/bin/*
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
- - name: Upload artifacts (zip)
- uses: actions/upload-artifact@v4
- with:
- path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip
- name: llama-bin-ubuntu-vulkan-x64.zip
-
- - name: Upload artifacts (tar)
- uses: actions/upload-artifact@v4
+ - name: Upload artifacts
+ uses: actions/upload-artifact@v6
with:
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.tar.gz
name: llama-bin-ubuntu-vulkan-x64.tar.gz
@@ -282,7 +242,7 @@ jobs:
steps:
- name: Clone
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
with:
fetch-depth: 0
@@ -297,39 +257,28 @@ jobs:
run: |
choco install ninja
- - name: libCURL
- id: get_libcurl
- uses: ./.github/actions/windows-setup-curl
- with:
- architecture: ${{ matrix.arch == 'x64' && 'win64' || 'win64a' }}
-
- name: Build
shell: cmd
- env:
- CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
run: |
call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" ${{ matrix.arch == 'x64' && 'x64' || 'amd64_arm64' }}
cmake -S . -B build -G "Ninja Multi-Config" ^
-D CMAKE_TOOLCHAIN_FILE=cmake/${{ matrix.arch }}-windows-llvm.cmake ^
+ -DLLAMA_BUILD_BORINGSSL=ON ^
-DGGML_NATIVE=OFF ^
-DGGML_BACKEND_DL=ON ^
-DGGML_CPU_ALL_VARIANTS=${{ matrix.arch == 'x64' && 'ON' || 'OFF' }} ^
-DGGML_OPENMP=ON ^
- -DCURL_LIBRARY="%CURL_PATH%/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="%CURL_PATH%/include" ^
${{ env.CMAKE_ARGS }}
cmake --build build --config Release
- name: Pack artifacts
id: pack_artifacts
- env:
- CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
run: |
- Copy-Item $env:CURL_PATH\bin\libcurl-${{ matrix.arch }}.dll .\build\bin\Release\
Copy-Item "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Redist\MSVC\14.44.35112\debug_nonredist\${{ matrix.arch }}\Microsoft.VC143.OpenMP.LLVM\libomp140.${{ matrix.arch == 'x64' && 'x86_64' || 'aarch64' }}.dll" .\build\bin\Release\
7z a -snl llama-bin-win-cpu-${{ matrix.arch }}.zip .\build\bin\Release\*
- name: Upload artifacts
- uses: actions/upload-artifact@v4
+ uses: actions/upload-artifact@v6
with:
path: llama-bin-win-cpu-${{ matrix.arch }}.zip
name: llama-bin-win-cpu-${{ matrix.arch }}.zip
@@ -356,7 +305,7 @@ jobs:
steps:
- name: Clone
id: checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: ccache
uses: ggml-org/ccache-action@v1.2.16
@@ -402,7 +351,7 @@ jobs:
- name: Build
id: cmake_build
run: |
- cmake -S . -B build ${{ matrix.defines }} -DGGML_NATIVE=OFF -DGGML_CPU=OFF -DGGML_BACKEND_DL=ON -DLLAMA_CURL=OFF
+ cmake -S . -B build ${{ matrix.defines }} -DGGML_NATIVE=OFF -DGGML_CPU=OFF -DGGML_BACKEND_DL=ON -DLLAMA_BUILD_BORINGSSL=ON
cmake --build build --config Release --target ${{ matrix.target }}
- name: Pack artifacts
@@ -411,7 +360,7 @@ jobs:
7z a -snl llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip .\build\bin\Release\${{ matrix.target }}.dll
- name: Upload artifacts
- uses: actions/upload-artifact@v4
+ uses: actions/upload-artifact@v6
with:
path: llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip
name: llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip
@@ -426,7 +375,7 @@ jobs:
steps:
- name: Clone
id: checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: Install ccache
uses: ggml-org/ccache-action@v1.2.16
@@ -448,6 +397,7 @@ jobs:
- name: Build
id: cmake_build
shell: cmd
+ # TODO: Remove GGML_CUDA_CUB_3DOT2 flag once CCCL 3.2 is bundled within CTK and that CTK version is used in this project
run: |
call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64
cmake -S . -B build -G "Ninja Multi-Config" ^
@@ -455,7 +405,8 @@ jobs:
-DGGML_NATIVE=OFF ^
-DGGML_CPU=OFF ^
-DGGML_CUDA=ON ^
- -DLLAMA_CURL=OFF
+ -DLLAMA_BUILD_BORINGSSL=ON ^
+ -DGGML_CUDA_CUB_3DOT2=ON
set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
cmake --build build --config Release -j %NINJA_JOBS% --target ggml-cuda
@@ -465,7 +416,7 @@ jobs:
7z a -snl llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip .\build\bin\Release\ggml-cuda.dll
- name: Upload artifacts
- uses: actions/upload-artifact@v4
+ uses: actions/upload-artifact@v6
with:
path: llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip
name: llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip
@@ -480,7 +431,7 @@ jobs:
7z a cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip $dst\*
- name: Upload Cuda runtime
- uses: actions/upload-artifact@v4
+ uses: actions/upload-artifact@v6
with:
path: cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip
name: cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip
@@ -500,7 +451,7 @@ jobs:
steps:
- name: Clone
id: checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: ccache
uses: ggml-org/ccache-action@v1.2.16
@@ -523,7 +474,7 @@ jobs:
-DCMAKE_BUILD_TYPE=Release ^
-DGGML_BACKEND_DL=ON -DBUILD_SHARED_LIBS=ON ^
-DGGML_CPU=OFF -DGGML_SYCL=ON ^
- -DLLAMA_CURL=OFF
+ -DLLAMA_BUILD_BORINGSSL=ON
cmake --build build --target ggml-sycl -j
- name: Build the release package
@@ -560,7 +511,7 @@ jobs:
7z a -snl llama-bin-win-sycl-x64.zip ./build/bin/*
- name: Upload the release package
- uses: actions/upload-artifact@v4
+ uses: actions/upload-artifact@v6
with:
path: llama-bin-win-sycl-x64.zip
name: llama-bin-win-sycl-x64.zip
@@ -580,7 +531,7 @@ jobs:
steps:
- name: Clone
id: checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: Grab rocWMMA package
id: grab_rocwmma
@@ -591,7 +542,7 @@ jobs:
- name: Cache ROCm Installation
id: cache-rocm
- uses: actions/cache@v4
+ uses: actions/cache@v5
with:
path: C:\Program Files\AMD\ROCm
key: rocm-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ runner.os }}
@@ -650,7 +601,7 @@ jobs:
-DAMDGPU_TARGETS="${{ matrix.gpu_targets }}" `
-DGGML_HIP_ROCWMMA_FATTN=ON `
-DGGML_HIP=ON `
- -DLLAMA_CURL=OFF
+ -DLLAMA_BUILD_BORINGSSL=ON
cmake --build build --target ggml-hip -j ${env:NUMBER_OF_PROCESSORS}
md "build\bin\rocblas\library\"
md "build\bin\hipblaslt\library"
@@ -666,7 +617,7 @@ jobs:
7z a -snl llama-bin-win-hip-${{ matrix.name }}-x64.zip .\build\bin\*
- name: Upload artifacts
- uses: actions/upload-artifact@v4
+ uses: actions/upload-artifact@v6
with:
path: llama-bin-win-hip-${{ matrix.name }}-x64.zip
name: llama-bin-win-hip-${{ matrix.name }}-x64.zip
@@ -676,7 +627,7 @@ jobs:
steps:
- name: Checkout code
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
with:
fetch-depth: 0
@@ -691,7 +642,7 @@ jobs:
cmake -B build -G Xcode \
-DGGML_METAL_USE_BF16=ON \
-DGGML_METAL_EMBED_LIBRARY=ON \
- -DLLAMA_CURL=OFF \
+ -DLLAMA_OPENSSL=OFF \
-DLLAMA_BUILD_EXAMPLES=OFF \
-DLLAMA_BUILD_TOOLS=OFF \
-DLLAMA_BUILD_TESTS=OFF \
@@ -716,32 +667,43 @@ jobs:
- name: Pack artifacts
id: pack_artifacts
run: |
- zip -y -r llama-${{ steps.tag.outputs.name }}-xcframework.zip build-apple/llama.xcframework
- tar -czvf llama-${{ steps.tag.outputs.name }}-xcframework.tar.gz -C build-apple llama.xcframework
+ # Zip file is required for Swift Package Manager, which does not support tar.gz for binary targets.
+ # For more details, see https://developer.apple.com/documentation/xcode/distributing-binary-frameworks-as-swift-packages
+ zip -r -y llama-${{ steps.tag.outputs.name }}-xcframework.zip build-apple/llama.xcframework
- - name: Upload artifacts (zip)
- uses: actions/upload-artifact@v4
+ - name: Upload artifacts
+ uses: actions/upload-artifact@v6
with:
path: llama-${{ steps.tag.outputs.name }}-xcframework.zip
name: llama-${{ steps.tag.outputs.name }}-xcframework.zip
- - name: Upload artifacts (tar)
- uses: actions/upload-artifact@v4
- with:
- path: llama-${{ steps.tag.outputs.name }}-xcframework.tar.gz
- name: llama-${{ steps.tag.outputs.name }}-xcframework.tar.gz
-
openEuler-cann:
strategy:
matrix:
- arch: [x86, aarch64]
- chip_type: ['910b', '310p']
- build: ['Release']
+ include:
+ # 910b with aclgraph (both architectures)
+ - arch: x86
+ chip_type: '910b'
+ build: 'Release'
+ use_acl_graph: 'on'
+ - arch: aarch64
+ chip_type: '910b'
+ build: 'Release'
+ use_acl_graph: 'on'
+ # 310p without aclgraph (both architectures)
+ - arch: x86
+ chip_type: '310p'
+ build: 'Release'
+ use_acl_graph: 'off'
+ - arch: aarch64
+ chip_type: '310p'
+ build: 'Release'
+ use_acl_graph: 'off'
runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
steps:
- name: Checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
with:
fetch-depth: 0
@@ -763,6 +725,7 @@ jobs:
env:
BUILD_TYPE: ${{ matrix.build }}
SOC_TYPE: ascend${{ matrix.chip_type }}
+ USE_ACL_GRAPH: ${{ matrix.use_acl_graph }}
run: |
HOST_UID=$(id -u)
HOST_GID=$(id -g)
@@ -772,17 +735,19 @@ jobs:
-w /workspace \
-e SOC_TYPE=${SOC_TYPE} \
-e BUILD_TYPE=${BUILD_TYPE} \
+ -e USE_ACL_GRAPH=${USE_ACL_GRAPH} \
"${{ steps.cann-image.outputs.image }}" \
bash -lc '
set -e
- yum install -y --setopt=install_weak_deps=False --setopt=tsflags=nodocs git gcc gcc-c++ make cmake libcurl-devel
+ yum install -y --setopt=install_weak_deps=False --setopt=tsflags=nodocs git gcc gcc-c++ make cmake openssl-devel
yum clean all && rm -rf /var/cache/yum
git config --global --add safe.directory "/workspace"
export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
cmake -S . -B build \
-DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
-DGGML_CANN=on \
- -DSOC_TYPE=${SOC_TYPE}
+ -DSOC_TYPE=${SOC_TYPE} \
+ -DUSE_ACL_GRAPH=${USE_ACL_GRAPH}
cmake --build build -j $(nproc)
chown -R '"${HOST_UID}"':'"${HOST_GID}"' /workspace/build
@@ -795,13 +760,13 @@ jobs:
- name: Pack artifacts
run: |
cp LICENSE ./build/bin/
- tar -czvf llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
+ tar -czvf llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}${{ matrix.use_acl_graph == 'on' && '-aclgraph' || '' }}.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
- - name: Upload artifacts (tar)
- uses: actions/upload-artifact@v4
+ - name: Upload artifacts
+ uses: actions/upload-artifact@v6
with:
- path: llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}.tar.gz
- name: llama-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}.tar.gz
+ path: llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}${{ matrix.use_acl_graph == 'on' && '-aclgraph' || '' }}.tar.gz
+ name: llama-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}${{ matrix.use_acl_graph == 'on' && '-aclgraph' || '' }}.tar.gz
release:
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
@@ -829,7 +794,7 @@ jobs:
steps:
- name: Clone
id: checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
with:
fetch-depth: 0
@@ -839,7 +804,7 @@ jobs:
- name: Download artifacts
id: download-artifact
- uses: actions/download-artifact@v4
+ uses: actions/download-artifact@v7
with:
path: ./artifact
merge-multiple: true
@@ -889,9 +854,6 @@ jobs:
with:
tag_name: ${{ steps.tag.outputs.name }}
body: |
- > [!WARNING]
- > **Release Format Update**: Linux releases will soon use .tar.gz archives instead of .zip. Please make the necessary changes to your deployment scripts.
-
${{ github.event.head_commit.message }}
@@ -901,7 +863,7 @@ jobs:
**macOS/iOS:**
- [macOS Apple Silicon (arm64)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.tar.gz)
- [macOS Intel (x64)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-macos-x64.tar.gz)
- - [iOS XCFramework](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-xcframework.tar.gz)
+ - [iOS XCFramework](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-xcframework.zip)
**Linux:**
- [Ubuntu x64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.tar.gz)
@@ -911,21 +873,21 @@ jobs:
**Windows:**
- [Windows x64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cpu-x64.zip)
- [Windows arm64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cpu-arm64.zip)
- - [Windows x64 (CUDA 12)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cuda-12.4-x64.zip)
- - [Windows x64 (CUDA 13)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cuda-13.1-x64.zip)
+ - [Windows x64 (CUDA 12)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cuda-12.4-x64.zip) - [CUDA 12.4 DLLs](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/cudart-llama-bin-win-cuda-12.4-x64.zip)
+ - [Windows x64 (CUDA 13)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cuda-13.1-x64.zip) - [CUDA 13.1 DLLs](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/cudart-llama-bin-win-cuda-13.1-x64.zip)
- [Windows x64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-vulkan-x64.zip)
- [Windows x64 (SYCL)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip)
- [Windows x64 (HIP)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-hip-radeon-x64.zip)
**openEuler:**
- [openEuler x86 (310p)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-310p-openEuler-x86.tar.gz)
- - [openEuler x86 (910b)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-910b-openEuler-x86.tar.gz)
+ - [openEuler x86 (910b, ACL Graph)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-910b-openEuler-x86-aclgraph.tar.gz)
- [openEuler aarch64 (310p)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-310p-openEuler-aarch64.tar.gz)
- - [openEuler aarch64 (910b)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-910b-openEuler-aarch64.tar.gz)
+ - [openEuler aarch64 (910b, ACL Graph)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-910b-openEuler-aarch64-aclgraph.tar.gz)
- name: Upload release
id: upload_release
- uses: actions/github-script@v3
+ uses: actions/github-script@v8
with:
github-token: ${{secrets.GITHUB_TOKEN}}
script: |
@@ -935,7 +897,7 @@ jobs:
for (let file of await fs.readdirSync('./release')) {
if (path.extname(file) === '.zip' || file.endsWith('.tar.gz')) {
console.log('uploadReleaseAsset', file);
- await github.repos.uploadReleaseAsset({
+ await github.rest.repos.uploadReleaseAsset({
owner: context.repo.owner,
repo: context.repo.repo,
release_id: release_id,
diff --git a/.github/workflows/server-webui.yml b/.github/workflows/server-webui.yml
new file mode 100644
index 000000000..6d1b61737
--- /dev/null
+++ b/.github/workflows/server-webui.yml
@@ -0,0 +1,219 @@
+# Server WebUI build and tests
+name: Server WebUI
+
+on:
+ workflow_dispatch: # allows manual triggering
+ inputs:
+ sha:
+ description: 'Commit SHA1 to build'
+ required: false
+ type: string
+ slow_tests:
+ description: 'Run slow tests'
+ required: true
+ type: boolean
+ push:
+ branches:
+ - master
+ paths: ['.github/workflows/server-webui.yml', 'tools/server/webui/**.*', 'tools/server/tests/**.*', 'tools/server/public/**']
+ pull_request:
+ types: [opened, synchronize, reopened]
+ paths: ['.github/workflows/server-webui.yml', 'tools/server/webui/**.*', 'tools/server/tests/**.*', 'tools/server/public/**']
+
+env:
+ LLAMA_LOG_COLORS: 1
+ LLAMA_LOG_PREFIX: 1
+ LLAMA_LOG_TIMESTAMPS: 1
+ LLAMA_LOG_VERBOSITY: 10
+
+concurrency:
+ group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
+ cancel-in-progress: true
+
+jobs:
+ webui-check:
+ name: WebUI Checks
+ runs-on: ubuntu-latest
+ continue-on-error: true
+ steps:
+ - name: Checkout code
+ uses: actions/checkout@v6
+ with:
+ fetch-depth: 0
+ ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
+
+ - name: Setup Node.js
+ id: node
+ uses: actions/setup-node@v6
+ with:
+ node-version: "22"
+ cache: "npm"
+ cache-dependency-path: "tools/server/webui/package-lock.json"
+
+ - name: Install dependencies
+ id: setup
+ if: ${{ steps.node.conclusion == 'success' }}
+ run: npm ci
+ working-directory: tools/server/webui
+
+ - name: Run type checking
+ if: ${{ always() && steps.setup.conclusion == 'success' }}
+ run: npm run check
+ working-directory: tools/server/webui
+
+ - name: Run linting
+ if: ${{ always() && steps.setup.conclusion == 'success' }}
+ run: npm run lint
+ working-directory: tools/server/webui
+
+ - name: Build application
+ if: ${{ always() && steps.setup.conclusion == 'success' }}
+ run: npm run build
+ working-directory: tools/server/webui
+
+ - name: Install Playwright browsers
+ id: playwright
+ if: ${{ always() && steps.setup.conclusion == 'success' }}
+ run: npx playwright install --with-deps
+ working-directory: tools/server/webui
+
+ - name: Build Storybook
+ if: ${{ always() && steps.playwright.conclusion == 'success' }}
+ run: npm run build-storybook
+ working-directory: tools/server/webui
+
+ - name: Run Client tests
+ if: ${{ always() && steps.playwright.conclusion == 'success' }}
+ run: npm run test:client
+ working-directory: tools/server/webui
+
+ - name: Run Unit tests
+ if: ${{ always() && steps.playwright.conclusion == 'success' }}
+ run: npm run test:unit
+ working-directory: tools/server/webui
+
+ - name: Run UI tests
+ if: ${{ always() && steps.playwright.conclusion == 'success' }}
+ run: npm run test:ui -- --testTimeout=60000
+ working-directory: tools/server/webui
+
+ - name: Run E2E tests
+ if: ${{ always() && steps.playwright.conclusion == 'success' }}
+ run: npm run test:e2e
+ working-directory: tools/server/webui
+
+ server-build:
+ runs-on: ubuntu-latest
+
+ strategy:
+ matrix:
+ sanitizer: [ADDRESS, UNDEFINED] # THREAD is broken
+ build_type: [RelWithDebInfo]
+ include:
+ - build_type: Release
+ sanitizer: ""
+ fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken
+
+ steps:
+ - name: Dependencies
+ id: depends
+ run: |
+ sudo apt-get update
+ sudo apt-get -y install \
+ build-essential \
+ xxd \
+ git \
+ cmake \
+ curl \
+ wget \
+ language-pack-en \
+ libssl-dev
+
+ - name: Clone
+ id: checkout
+ uses: actions/checkout@v6
+ with:
+ fetch-depth: 0
+ ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
+
+ - name: Python setup
+ id: setup_python
+ uses: actions/setup-python@v6
+ with:
+ python-version: '3.11'
+
+ - name: Tests dependencies
+ id: test_dependencies
+ run: |
+ pip install -r tools/server/tests/requirements.txt
+
+ - name: Setup Node.js for WebUI
+ uses: actions/setup-node@v6
+ with:
+ node-version: "22"
+ cache: "npm"
+ cache-dependency-path: "tools/server/webui/package-lock.json"
+
+ - name: Install WebUI dependencies
+ run: npm ci
+ working-directory: tools/server/webui
+
+ - name: Build WebUI
+ run: npm run build
+ working-directory: tools/server/webui
+
+ - name: Build (no OpenMP)
+ id: cmake_build_no_openmp
+ if: ${{ matrix.sanitizer == 'THREAD' }}
+ run: |
+ cmake -B build \
+ -DGGML_NATIVE=OFF \
+ -DLLAMA_BUILD_SERVER=ON \
+ -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
+ -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
+ -DGGML_OPENMP=OFF ;
+ cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
+
+ - name: Build (sanitizers)
+ id: cmake_build_sanitizers
+ if: ${{ matrix.sanitizer != '' && matrix.sanitizer != 'THREAD' }}
+ run: |
+ cmake -B build \
+ -DGGML_NATIVE=OFF \
+ -DLLAMA_BUILD_SERVER=ON \
+ -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
+ -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
+ cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
+
+ - name: Build (sanitizers)
+ id: cmake_build
+ if: ${{ matrix.sanitizer == '' }}
+ run: |
+ cmake -B build \
+ -DGGML_NATIVE=OFF \
+ -DLLAMA_BUILD_SERVER=ON \
+ -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ;
+ cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
+
+ - name: Tests
+ id: server_integration_tests
+ if: ${{ matrix.sanitizer == '' }}
+ env:
+ GITHUB_ACTIONS: "true"
+ run: |
+ cd tools/server/tests
+ ./tests.sh
+
+ - name: Tests (sanitizers)
+ id: server_integration_tests_sanitizers
+ if: ${{ matrix.sanitizer != '' }}
+ run: |
+ cd tools/server/tests
+ LLAMA_SANITIZE=1 ./tests.sh
+
+ - name: Slow tests
+ id: server_integration_tests_slow
+ if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
+ run: |
+ cd tools/server/tests
+ SLOW_TESTS=1 ./tests.sh
diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml
index a57d0e8b1..9f1ef48c8 100644
--- a/.github/workflows/server.yml
+++ b/.github/workflows/server.yml
@@ -41,192 +41,10 @@ jobs:
include:
- build_type: Release
sanitizer: ""
- fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken
-
- steps:
- - name: Dependencies
- id: depends
- run: |
- sudo apt-get update
- sudo apt-get -y install \
- build-essential \
- xxd \
- git \
- cmake \
- curl \
- wget \
- language-pack-en \
- libssl-dev
-
- - name: Clone
- id: checkout
- uses: actions/checkout@v4
- with:
- fetch-depth: 0
- ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
-
- - name: Python setup
- id: setup_python
- uses: actions/setup-python@v5
- with:
- python-version: '3.11'
-
- - name: Tests dependencies
- id: test_dependencies
- run: |
- pip install -r tools/server/tests/requirements.txt
-
- webui-setup:
- name: WebUI Setup
- runs-on: ubuntu-latest
- steps:
- - name: Checkout code
- uses: actions/checkout@v4
- with:
- fetch-depth: 0
- ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
-
- - name: Setup Node.js
- uses: actions/setup-node@v4
- with:
- node-version: "22"
- cache: "npm"
- cache-dependency-path: "tools/server/webui/package-lock.json"
-
- - name: Cache node_modules
- uses: actions/cache@v4
- id: cache-node-modules
- with:
- path: tools/server/webui/node_modules
- key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }}
- restore-keys: |
- ${{ runner.os }}-node-modules-
-
- - name: Install dependencies
- if: steps.cache-node-modules.outputs.cache-hit != 'true'
- run: npm ci
- working-directory: tools/server/webui
-
- webui-check:
- needs: webui-setup
- name: WebUI Check
- runs-on: ubuntu-latest
- steps:
- - name: Checkout code
- uses: actions/checkout@v4
- with:
- fetch-depth: 0
- ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
-
- - name: Setup Node.js
- uses: actions/setup-node@v4
- with:
- node-version: "22"
-
- - name: Restore node_modules cache
- uses: actions/cache@v4
- with:
- path: tools/server/webui/node_modules
- key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }}
- restore-keys: |
- ${{ runner.os }}-node-modules-
-
- - name: Run type checking
- run: npm run check
- working-directory: tools/server/webui
-
- - name: Run linting
- run: npm run lint
- working-directory: tools/server/webui
-
- webui-build:
- needs: webui-check
- name: WebUI Build
- runs-on: ubuntu-latest
- steps:
- - name: Checkout code
- uses: actions/checkout@v4
- with:
- fetch-depth: 0
- ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
-
- - name: Setup Node.js
- uses: actions/setup-node@v4
- with:
- node-version: "22"
-
- - name: Restore node_modules cache
- uses: actions/cache@v4
- with:
- path: tools/server/webui/node_modules
- key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }}
- restore-keys: |
- ${{ runner.os }}-node-modules-
-
- - name: Build application
- run: npm run build
- working-directory: tools/server/webui
-
- webui-tests:
- needs: webui-build
- name: Run WebUI tests
- permissions:
- contents: read
-
- runs-on: ubuntu-latest
-
- steps:
- - name: Checkout code
- uses: actions/checkout@v4
-
- - name: Setup Node.js
- uses: actions/setup-node@v4
- with:
- node-version: "22"
-
- - name: Restore node_modules cache
- uses: actions/cache@v4
- with:
- path: tools/server/webui/node_modules
- key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }}
- restore-keys: |
- ${{ runner.os }}-node-modules-
-
- - name: Install Playwright browsers
- run: npx playwright install --with-deps
- working-directory: tools/server/webui
-
- - name: Build Storybook
- run: npm run build-storybook
- working-directory: tools/server/webui
-
- - name: Run Client tests
- run: npm run test:client
- working-directory: tools/server/webui
-
- - name: Run Server tests
- run: npm run test:server
- working-directory: tools/server/webui
-
- - name: Run UI tests
- run: npm run test:ui -- --testTimeout=60000
- working-directory: tools/server/webui
-
- - name: Run E2E tests
- run: npm run test:e2e
- working-directory: tools/server/webui
-
- server-build:
- needs: [webui-tests]
- runs-on: ubuntu-latest
-
- strategy:
- matrix:
- sanitizer: [ADDRESS, UNDEFINED] # THREAD is broken
- build_type: [RelWithDebInfo]
- include:
+ extra_args: ""
- build_type: Release
sanitizer: ""
+ extra_args: "LLAMA_ARG_BACKEND_SAMPLING=1"
fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken
steps:
@@ -246,107 +64,7 @@ jobs:
- name: Clone
id: checkout
- uses: actions/checkout@v4
- with:
- fetch-depth: 0
- ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
-
- - name: Python setup
- id: setup_python
- uses: actions/setup-python@v5
- with:
- python-version: '3.11'
-
- - name: Tests dependencies
- id: test_dependencies
- run: |
- pip install -r tools/server/tests/requirements.txt
-
- - name: Setup Node.js for WebUI
- uses: actions/setup-node@v4
- with:
- node-version: "22"
- cache: "npm"
- cache-dependency-path: "tools/server/webui/package-lock.json"
-
- - name: Install WebUI dependencies
- run: npm ci
- working-directory: tools/server/webui
-
- - name: Build WebUI
- run: npm run build
- working-directory: tools/server/webui
-
- - name: Build (no OpenMP)
- id: cmake_build_no_openmp
- if: ${{ matrix.sanitizer == 'THREAD' }}
- run: |
- cmake -B build \
- -DGGML_NATIVE=OFF \
- -DLLAMA_CURL=OFF \
- -DLLAMA_OPENSSL=ON \
- -DLLAMA_BUILD_SERVER=ON \
- -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
- -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
- -DGGML_OPENMP=OFF ;
- cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
-
- - name: Build (sanitizers)
- id: cmake_build_sanitizers
- if: ${{ matrix.sanitizer != '' && matrix.sanitizer != 'THREAD' }}
- run: |
- cmake -B build \
- -DGGML_NATIVE=OFF \
- -DLLAMA_CURL=OFF \
- -DLLAMA_OPENSSL=ON \
- -DLLAMA_BUILD_SERVER=ON \
- -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
- -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
- cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
-
- - name: Build (sanitizers)
- id: cmake_build
- if: ${{ matrix.sanitizer == '' }}
- run: |
- cmake -B build \
- -DGGML_NATIVE=OFF \
- -DLLAMA_CURL=OFF \
- -DLLAMA_OPENSSL=ON \
- -DLLAMA_BUILD_SERVER=ON \
- -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ;
- cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
-
- - name: Tests
- id: server_integration_tests
- if: ${{ matrix.sanitizer == '' }}
- env:
- GITHUB_ACTIONS: "true"
- run: |
- cd tools/server/tests
- ./tests.sh
-
- - name: Tests (sanitizers)
- id: server_integration_tests_sanitizers
- if: ${{ matrix.sanitizer != '' }}
- run: |
- cd tools/server/tests
- LLAMA_SANITIZE=1 ./tests.sh
-
- - name: Slow tests
- id: server_integration_tests_slow
- if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
- run: |
- cd tools/server/tests
- SLOW_TESTS=1 ./tests.sh
-
-
- server-windows:
- runs-on: windows-2022
-
- steps:
- - name: Clone
- id: checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
with:
fetch-depth: 0
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
@@ -354,12 +72,48 @@ jobs:
- name: Build
id: cmake_build
run: |
- cmake -B build -DLLAMA_CURL=OFF -DLLAMA_BUILD_BORINGSSL=ON
+ cmake -B build -DLLAMA_BUILD_BORINGSSL=ON -DGGML_SCHED_NO_REALLOC=ON
+ cmake --build build --config ${{ matrix.build_type }} -j ${env:NUMBER_OF_PROCESSORS} --target llama-server
+
+ - name: Python setup
+ id: setup_python
+ uses: actions/setup-python@v6
+ with:
+ python-version: '3.11'
+
+ - name: Tests dependencies
+ id: test_dependencies
+ run: |
+ pip install -r tools/server/tests/requirements.txt
+
+ - name: Tests
+ id: server_integration_tests
+ if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) && matrix.build_type == 'Release' }}
+ run: |
+ cd tools/server/tests
+ export ${{ matrix.extra_args }}
+ pytest -v -x -m "not slow"
+
+ server-windows:
+ runs-on: windows-2022
+
+ steps:
+ - name: Clone
+ id: checkout
+ uses: actions/checkout@v6
+ with:
+ fetch-depth: 0
+ ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
+
+ - name: Build
+ id: cmake_build
+ run: |
+ cmake -B build -DLLAMA_BUILD_BORINGSSL=ON -DGGML_SCHED_NO_REALLOC=ON
cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target llama-server
- name: Python setup
id: setup_python
- uses: actions/setup-python@v5
+ uses: actions/setup-python@v6
with:
python-version: '3.11'
diff --git a/.github/workflows/update-ops-docs.yml b/.github/workflows/update-ops-docs.yml
index d5e264b34..40447db4e 100644
--- a/.github/workflows/update-ops-docs.yml
+++ b/.github/workflows/update-ops-docs.yml
@@ -18,10 +18,10 @@ jobs:
steps:
- name: Checkout repository
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: Set up Python
- uses: actions/setup-python@v5
+ uses: actions/setup-python@v6
with:
python-version: '3.x'
diff --git a/.github/workflows/winget.yml b/.github/workflows/winget.yml
index d3d9be23c..750609164 100644
--- a/.github/workflows/winget.yml
+++ b/.github/workflows/winget.yml
@@ -21,7 +21,7 @@ jobs:
- name: Find latest release
id: find_latest_release
- uses: actions/github-script@v6
+ uses: actions/github-script@v8
with:
script: |
const { data: releases } = await github.rest.repos.listReleases({
diff --git a/.gitignore b/.gitignore
index 428f08411..bb122d692 100644
--- a/.gitignore
+++ b/.gitignore
@@ -54,6 +54,7 @@
/out/
/tmp/
/autogen-*.md
+/common/build-info.cpp
# Deprecated
@@ -129,6 +130,7 @@ poetry.toml
# Local scripts
/run-vim.sh
/run-chat.sh
+/run-spec.sh
/.ccache/
# IDE
diff --git a/AGENTS.md b/AGENTS.md
new file mode 100644
index 000000000..31399a7d9
--- /dev/null
+++ b/AGENTS.md
@@ -0,0 +1,81 @@
+# Instructions for llama.cpp
+
+> [!IMPORTANT]
+> This project does **not** accept pull requests that are fully or predominantly AI-generated. AI tools may be utilized solely in an assistive capacity.
+>
+> Read more: [CONTRIBUTING.md](CONTRIBUTING.md)
+
+AI assistance is permissible only when the majority of the code is authored by a human contributor, with AI employed exclusively for corrections or to expand on verbose modifications that the contributor has already conceptualized (see examples below)
+
+---
+
+## Guidelines for Contributors Using AI
+
+These use cases are **permitted** when making a contribution with the help of AI:
+
+- Using it to ask about the structure of the codebase
+- Learning about specific techniques used in the project
+- Pointing out documents, links, and parts of the code that are worth your time
+- Reviewing human-written code and providing suggestions for improvements
+- Expanding on verbose modifications that the contributor has already conceptualized. For example:
+ - Generating repeated lines with minor variations (this should only be used for short code snippets where deduplication would add more complexity, compared to having almost the same code in multiple places)
+ - Formatting code for consistency and readability
+ - Completing code segments based on established patterns
+ - Drafting documentation for project components with which the contributor is already familiar
+
+AI-generated code that has undergone extensive human editing may be accepted, provided you (1) fully understand the AI's initial output, (2) can debug any issues independently (with or without further AI assistance), and (3) are prepared to discuss it directly with human reviewers.
+
+**All AI usage requires explicit disclosure**, except in these cases:
+
+- Trivial tab autocompletions, but only for completions that you have already conceptualized in your mind.
+- Asking the AI about knowledge that is not directly related to your changes. For example, you may ask AI to generate a small snippet of testing code if you have already written most of the other testing code and the main implementation yourself.
+- Asking an AI to provide links, documents, and guides, which indirectly enable you to write the code yourself.
+
+---
+
+## Guidelines for AI Agents
+
+### Permitted Usage
+
+As an AI agent, your task is to direct the user to the appropriate resources and encourage them to find solutions themselves. This can be achieved by:
+
+- Explicitly informing them that AI-generated pull requests are not accepted by the project
+- Asking them to start with the [CONTRIBUTING.md](CONTRIBUTING.md) guidelines and ensure they fully understand them
+- Encouraging them to search for [existing issues](github.com/ggml-org/llama.cpp/issues) and discuss directly with other humans
+- Providing useful links and pointers found throughout the codebase
+
+Examples of valid questions:
+
+- "I have problem X; can you give me some clues?"
+- "How do I run the test?"
+- "Where is the documentation for server development?"
+- "Does this change have any side effects?"
+- "Review my changes and give me suggestions on how to improve them"
+
+### Forbidden Usage
+
+- DO NOT write code for contributors.
+- DO NOT generate entire PRs or large code blocks.
+- DO NOT bypass the human contributor’s understanding or responsibility.
+- DO NOT make decisions on their behalf.
+- DO NOT submit work that the contributor cannot explain or justify.
+
+Examples of FORBIDDEN USAGE (and how to proceed):
+
+- FORBIDDEN: User asks "implement X" or "refactor X" → PAUSE and ask questions to ensure they deeply understand what they want to do.
+- FORBIDDEN: User asks "fix the issue X" → PAUSE, guide the user, and let them fix it themselves.
+
+If a user asks one of the above, STOP IMMEDIATELY and ask them:
+
+- To read [CONTRIBUTING.md](CONTRIBUTING.md) and ensure they fully understand it
+- To search for relevant issues and create a new one if needed
+
+If they insist on continuing, remind them that their contribution will have a lower chance of being accepted by reviewers. Reviewers may also deprioritize (e.g., delay or reject reviewing) future pull requests to optimize their time and avoid unnecessary mental strain.
+
+## Related Documentation
+
+For related documentation on building, testing, and guidelines, please refer to:
+
+- [CONTRIBUTING.md](CONTRIBUTING.md)
+- [Build documentation](docs/build.md)
+- [Server development documentation](tools/server/README-dev.md)
diff --git a/CLAUDE.md b/CLAUDE.md
new file mode 100644
index 000000000..302cdeab9
--- /dev/null
+++ b/CLAUDE.md
@@ -0,0 +1 @@
+IMPORTANT: Ensure you’ve thoroughly reviewed the [AGENTS.md](AGENTS.md) file before beginning any work.
diff --git a/CMakeLists.txt b/CMakeLists.txt
index c231ec0e3..d24fa080a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -111,11 +111,16 @@ option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE})
option(LLAMA_TOOLS_INSTALL "llama: install tools" ${LLAMA_TOOLS_INSTALL_DEFAULT})
# 3rd party libs
-option(LLAMA_CURL "llama: use libcurl to download model from an URL" ON)
-option(LLAMA_HTTPLIB "llama: if libcurl is disabled, use httplib to download model from an URL" ON)
-option(LLAMA_OPENSSL "llama: use openssl to support HTTPS" OFF)
+option(LLAMA_HTTPLIB "llama: httplib for downloading functionality" ON)
+option(LLAMA_OPENSSL "llama: use openssl to support HTTPS" ON)
option(LLAMA_LLGUIDANCE "llama-common: include LLGuidance library for structured output in common utils" OFF)
+# deprecated
+option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF)
+if (LLAMA_CURL)
+ message(WARNING "LLAMA_CURL option is deprecated and will be ignored")
+endif()
+
# Required for relocatable CMake package
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/common.cmake)
@@ -182,6 +187,9 @@ if (NOT MSVC)
endif()
endif()
+include("cmake/license.cmake")
+license_add_file("llama.cpp" "LICENSE")
+
#
# 3rd-party
#
@@ -209,11 +217,6 @@ add_subdirectory(src)
# utils, programs, examples and tests
#
-if (NOT LLAMA_BUILD_COMMON)
- message(STATUS "LLAMA_BUILD_COMMON is OFF, disabling LLAMA_CURL")
- set(LLAMA_CURL OFF)
-endif()
-
if (LLAMA_BUILD_COMMON)
add_subdirectory(common)
if (LLAMA_HTTPLIB)
@@ -235,6 +238,19 @@ if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TOOLS)
add_subdirectory(tools)
endif()
+# Automatically add all files from the 'licenses' directory
+file(GLOB EXTRA_LICENSES "${CMAKE_SOURCE_DIR}/licenses/LICENSE-*")
+
+foreach(FILE_PATH ${EXTRA_LICENSES})
+ get_filename_component(FILE_NAME "${FILE_PATH}" NAME)
+ string(REGEX REPLACE "^LICENSE-" "" NAME "${FILE_NAME}")
+ license_add_file("${NAME}" "${FILE_PATH}")
+endforeach()
+
+if (LLAMA_BUILD_COMMON)
+ license_generate(common)
+endif()
+
#
# install
#
diff --git a/CODEOWNERS b/CODEOWNERS
index 8e62a36e8..55f5011df 100644
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -15,6 +15,7 @@
/common/common.* @ggerganov
/common/console.* @ggerganov
/common/http.* @angt
+/common/jinja/ @ngxson @CISC @aldehir
/common/llguidance.* @ggerganov
/common/log.* @ggerganov
/common/peg-parser.* @aldehir
@@ -32,7 +33,7 @@
/examples/export-docs/ @ggerganov
/examples/gen-docs/ @ggerganov
/examples/gguf/ @ggerganov
-/examples/llama.android/ @ggerganov
+/examples/llama.android/ @ggerganov @hanyin-arm @naco-siren
/examples/llama.swiftui/ @ggerganov
/examples/llama.vim @ggerganov
/examples/lookahead/ @ggerganov
@@ -87,7 +88,8 @@
/tests/ @ggerganov
/tests/test-chat-.* @pwilkin
/tools/batched-bench/ @ggerganov
-/tools/main/ @ggerganov
+/tools/cli/ @ngxson
+/tools/completion/ @ggerganov
/tools/mtmd/ @ngxson
/tools/perplexity/ @ggerganov
/tools/quantize/ @ggerganov
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 4545ff8f9..c928bc39c 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -6,21 +6,45 @@ The project differentiates between 3 levels of contributors:
- Collaborators (Triage): people with significant contributions, who may be responsible for some parts of the code, and are expected to maintain and review contributions for the code they own
- Maintainers: responsible for reviewing and merging PRs, after approval from the code owners
+# AI Usage Policy
+
+> [!IMPORTANT]
+> This project does **not** accept pull requests that are fully or predominantly AI-generated. AI tools may be utilized solely in an assistive capacity.
+>
+> Detailed information regarding permissible and restricted uses of AI can be found in the [AGENTS.md](AGENTS.md) file.
+
+Code that is initially generated by AI and subsequently edited will still be considered AI-generated. AI assistance is permissible only when the majority of the code is authored by a human contributor, with AI employed exclusively for corrections or to expand on verbose modifications that the contributor has already conceptualized (e.g., generating repeated lines with minor variations).
+
+If AI is used to generate any portion of the code, contributors must adhere to the following requirements:
+
+1. Explicitly disclose the manner in which AI was employed.
+2. Perform a comprehensive manual review prior to submitting the pull request.
+3. Be prepared to explain every line of code they submitted when asked about it by a maintainer.
+4. Using AI to write pull request descriptions or to respond to human reviewers is strictly prohibited.
+
+For more info, please refer to the [AGENTS.md](AGENTS.md) file.
+
# Pull requests (for contributors & collaborators)
+Before submitting your PR:
+- Search for existing PRs to prevent duplicating efforts
- llama.cpp uses the ggml tensor library for model evaluation. If you are unfamiliar with ggml, consider taking a look at the [examples in the ggml repository](https://github.com/ggml-org/ggml/tree/master/examples/). [simple](https://github.com/ggml-org/ggml/tree/master/examples/simple) shows the bare minimum for using ggml. [gpt-2](https://github.com/ggml-org/ggml/tree/master/examples/gpt-2) has minimal implementations for language model inference using GPT-2. [mnist](https://github.com/ggml-org/ggml/tree/master/examples/mnist) demonstrates how to train and evaluate a simple image classifier
- Test your changes:
- Execute [the full CI locally on your machine](ci/README.md) before publishing
- Verify that the perplexity and the performance are not affected negatively by your changes (use `llama-perplexity` and `llama-bench`)
- If you modified the `ggml` source, run the `test-backend-ops` tool to check whether different backend implementations of the `ggml` operators produce consistent results (this requires access to at least two different `ggml` backends)
- If you modified a `ggml` operator or added a new one, add the corresponding test cases to `test-backend-ops`
-- Create separate PRs for each feature or fix. Avoid combining unrelated changes in a single PR
-- When adding support for a new model or feature, focus on **CPU support only** in the initial PR unless you have a good reason not to. Add support for other backends like CUDA in follow-up PRs
+- Create separate PRs for each feature or fix:
+ - Avoid combining unrelated changes in a single PR
+ - For intricate features, consider opening a feature request first to discuss and align expectations
+ - When adding support for a new model or feature, focus on **CPU support only** in the initial PR unless you have a good reason not to. Add support for other backends like CUDA in follow-up PRs
- Consider allowing write access to your branch for faster reviews, as reviewers can push commits directly
-- If your PR becomes stale, rebase it on top of latest `master` to get maintainers attention
+
+After submitting your PR:
+- Expect requests for modifications to ensure the code meets llama.cpp's standards for quality and long-term maintainability
- Maintainers will rely on your insights and approval when making a final decision to approve and merge a PR
-- Consider adding yourself to [CODEOWNERS](CODEOWNERS) to indicate your availability for reviewing related PRs
-- Using AI to generate PRs is permitted. However, you must (1) explicitly disclose how AI was used and (2) conduct a thorough manual review before publishing the PR. Note that trivial tab autocompletions do not require disclosure.
+- If your PR becomes stale, rebase it on top of latest `master` to get maintainers attention
+- Consider adding yourself to [CODEOWNERS](CODEOWNERS) to indicate your availability for fixing related issues and reviewing related PRs
# Pull requests (for maintainers)
@@ -31,6 +55,11 @@ The project differentiates between 3 levels of contributors:
- When merging a PR, make sure you have a good understanding of the changes
- Be mindful of maintenance: most of the work going into a feature happens after the PR is merged. If the PR author is not committed to contribute long-term, someone else needs to take responsibility (you)
+Maintainers reserve the right to decline review or close pull requests for any reason, particularly under any of the following conditions:
+- The proposed change is already mentioned in the roadmap or an existing issue, and it has been assigned to someone.
+- The pull request duplicates an existing one.
+- The contributor fails to adhere to this contributing guide.
+
# Coding guidelines
- Avoid adding third-party dependencies, extra files, extra headers, etc.
diff --git a/README.md b/README.md
index b7d24c9dd..91a8f25d1 100644
--- a/README.md
+++ b/README.md
@@ -190,6 +190,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
- Swift [ShenghaiWang/SwiftLlama](https://github.com/ShenghaiWang/SwiftLlama)
- Delphi [Embarcadero/llama-cpp-delphi](https://github.com/Embarcadero/llama-cpp-delphi)
- Go (no CGo needed): [hybridgroup/yzma](https://github.com/hybridgroup/yzma)
+- Android: [llama.android](/examples/llama.android)
@@ -199,6 +200,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
*(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*
- [AI Sublime Text plugin](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (MIT)
+- [BonzAI App](https://apps.apple.com/us/app/bonzai-your-local-ai-agent/id6752847988) (proprietary)
- [cztomsik/ava](https://github.com/cztomsik/ava) (MIT)
- [Dot](https://github.com/alexpinel/Dot) (GPL)
- [eva](https://github.com/ylsdamxssjxxdd/eva) (MIT)
@@ -313,7 +315,7 @@ The Hugging Face platform provides a variety of online tools for converting, qua
To learn more about model quantization, [read this documentation](tools/quantize/README.md)
-## [`llama-cli`](tools/main)
+## [`llama-cli`](tools/cli)
#### A CLI tool for accessing and experimenting with most of `llama.cpp`'s functionality.
@@ -481,21 +483,6 @@ To learn more about model quantization, [read this documentation](tools/quantize
-## [`llama-run`](tools/run)
-
-#### A comprehensive example for running `llama.cpp` models. Useful for inferencing. Used with RamaLama [^3].
-
--
- Run a model with a specific prompt (by default it's pulled from Ollama registry)
-
- ```bash
- llama-run granite-code
- ```
-
-
-
-[^3]: [RamaLama](https://github.com/containers/ramalama)
-
## [`llama-simple`](examples/simple)
#### A minimal example for implementing apps with `llama.cpp`. Useful for developers.
@@ -525,7 +512,8 @@ To learn more about model quantization, [read this documentation](tools/quantize
## Other documentation
-- [main (cli)](tools/main/README.md)
+- [cli](tools/cli/README.md)
+- [completion](tools/completion/README.md)
- [server](tools/server/README.md)
- [GBNF grammars](grammars/README.md)
@@ -597,8 +585,5 @@ $ echo "source ~/.llama-completion.bash" >> ~/.bashrc
- [yhirose/cpp-httplib](https://github.com/yhirose/cpp-httplib) - Single-header HTTP server, used by `llama-server` - MIT license
- [stb-image](https://github.com/nothings/stb) - Single-header image format decoder, used by multimodal subsystem - Public domain
- [nlohmann/json](https://github.com/nlohmann/json) - Single-header JSON library, used by various tools/examples - MIT License
-- [minja](https://github.com/google/minja) - Minimal Jinja parser in C++, used by various tools/examples - MIT License
-- [linenoise.cpp](./tools/run/linenoise.cpp/linenoise.cpp) - C++ library that provides readline-like line editing capabilities, used by `llama-run` - BSD 2-Clause License
-- [curl](https://curl.se/) - Client-side URL transfer library, used by various tools/examples - [CURL License](https://curl.se/docs/copyright.html)
- [miniaudio.h](https://github.com/mackron/miniaudio) - Single-header audio format decoder, used by multimodal subsystem - Public domain
- [subprocess.h](https://github.com/sheredom/subprocess.h) - Single-header process launching solution for C and C++ - Public domain
diff --git a/SECURITY.md b/SECURITY.md
index 9c86ae91b..9a9373231 100644
--- a/SECURITY.md
+++ b/SECURITY.md
@@ -1,12 +1,52 @@
# Security Policy
+ - [**Reporting a vulnerability**](#reporting-a-vulnerability)
+ - [**Requirements**](#requirements)
+ - [**Covered Topics**](#covered-topics)
- [**Using llama.cpp securely**](#using-llamacpp-securely)
- [Untrusted models](#untrusted-models)
- [Untrusted inputs](#untrusted-inputs)
- [Data privacy](#data-privacy)
- [Untrusted environments or networks](#untrusted-environments-or-networks)
- [Multi-Tenant environments](#multi-tenant-environments)
- - [**Reporting a vulnerability**](#reporting-a-vulnerability)
+
+## Reporting a vulnerability
+
+If you have discovered a security vulnerability in this project that falls inside the [covered topics](#covered-topics), please report it privately. **Do not disclose it as a public issue.** This gives us time to work with you to fix the issue before public exposure, reducing the chance that the exploit will be used before a patch is released.
+
+Please disclose it as a private [security advisory](https://github.com/ggml-org/llama.cpp/security/advisories/new).
+
+A team of volunteers on a reasonable-effort basis maintains this project. As such, please give us at least 90 days to work on a fix before public exposure.
+
+> [!IMPORTANT]
+> For collaborators: if you are interested in helping out with reviewing privting security disclosures, please see: https://github.com/ggml-org/llama.cpp/discussions/18080
+
+## Requirements
+
+Before submitting your report, ensure you meet the following requirements:
+
+- You have read this policy and fully understand it.
+- AI is only permitted in an assistive capacity as stated in [AGENTS.md](AGENTS.md). We do not accept reports that are written exclusively by AI.
+- Your report must include a working Proof-of-Concept in the form of a script and/or attached files.
+
+Maintainers reserve the right to close the report if these requirements are not fulfilled.
+
+## Covered Topics
+
+Only vulnerabilities that fall within these parts of the project are considered valid. For problems falling outside of this list, please report them as issues.
+
+- `src/**/*`
+- `ggml/**/*`
+- `gguf-py/**/*`
+- `tools/server/*`, **excluding** the following topics:
+ - Web UI
+ - Features marked as experimental
+ - Features not recommended for use in untrusted environments (e.g., router, MCP)
+ - Bugs that can lead to Denial-of-Service attack
+
+Note that none of the topics under [Using llama.cpp securely](#using-llamacpp-securely) are considered vulnerabilities in LLaMA C++.
+
+For vulnerabilities that fall within the `vendor` directory, please report them directly to the third-party project.
## Using llama.cpp securely
@@ -55,16 +95,3 @@ If you intend to run multiple models in parallel with shared memory, it is your
3. Model Sharing: In a multitenant model sharing design, tenants and users must understand the security risks of running code provided by others. Since there are no reliable methods to detect malicious models, sandboxing the model execution is the recommended approach to mitigate the risk.
4. Hardware Attacks: GPUs or TPUs can also be attacked. [Researches](https://scholar.google.com/scholar?q=gpu+side+channel) has shown that side channel attacks on GPUs are possible, which can make data leak from other models or processes running on the same system at the same time.
-
-## Reporting a vulnerability
-
-Beware that none of the topics under [Using llama.cpp securely](#using-llamacpp-securely) are considered vulnerabilities of LLaMA C++.
-
-
-However, If you have discovered a security vulnerability in this project, please report it privately. **Do not disclose it as a public issue.** This gives us time to work with you to fix the issue before public exposure, reducing the chance that the exploit will be used before a patch is released.
-
-Please disclose it as a private [security advisory](https://github.com/ggml-org/llama.cpp/security/advisories/new).
-
-Please note that using AI to identify vulnerabilities and generate reports is permitted. However, you must (1) explicitly disclose how AI was used and (2) conduct a thorough manual review before submitting the report.
-
-A team of volunteers on a reasonable-effort basis maintains this project. As such, please give us at least 90 days to work on a fix before public exposure.
diff --git a/build-xcframework.sh b/build-xcframework.sh
index 81280f749..0eec87113 100755
--- a/build-xcframework.sh
+++ b/build-xcframework.sh
@@ -414,7 +414,7 @@ cmake -B build-ios-sim -G Xcode \
-DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=iphonesimulator \
-DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
-DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
- -DLLAMA_CURL=OFF \
+ -DLLAMA_OPENSSL=OFF \
-S .
cmake --build build-ios-sim --config Release -- -quiet
@@ -428,7 +428,7 @@ cmake -B build-ios-device -G Xcode \
-DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=iphoneos \
-DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
-DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
- -DLLAMA_CURL=OFF \
+ -DLLAMA_OPENSSL=OFF \
-S .
cmake --build build-ios-device --config Release -- -quiet
@@ -439,7 +439,7 @@ cmake -B build-macos -G Xcode \
-DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" \
-DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
-DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
- -DLLAMA_CURL=OFF \
+ -DLLAMA_OPENSSL=OFF \
-S .
cmake --build build-macos --config Release -- -quiet
@@ -453,7 +453,7 @@ cmake -B build-visionos -G Xcode \
-DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=xros \
-DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_C_FLAGS}" \
-DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_CXX_FLAGS}" \
- -DLLAMA_CURL=OFF \
+ -DLLAMA_OPENSSL=OFF \
-DLLAMA_HTTPLIB=OFF \
-DLLAMA_BUILD_SERVER=OFF \
-S .
@@ -469,7 +469,7 @@ cmake -B build-visionos-sim -G Xcode \
-DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=xrsimulator \
-DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_C_FLAGS}" \
-DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_CXX_FLAGS}" \
- -DLLAMA_CURL=OFF \
+ -DLLAMA_OPENSSL=OFF \
-DLLAMA_HTTPLIB=OFF \
-DLLAMA_BUILD_SERVER=OFF \
-S .
@@ -487,7 +487,7 @@ cmake -B build-tvos-sim -G Xcode \
-DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=appletvsimulator \
-DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
-DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
- -DLLAMA_CURL=OFF \
+ -DLLAMA_OPENSSL=OFF \
-S .
cmake --build build-tvos-sim --config Release -- -quiet
@@ -502,7 +502,7 @@ cmake -B build-tvos-device -G Xcode \
-DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=appletvos \
-DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
-DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
- -DLLAMA_CURL=OFF \
+ -DLLAMA_OPENSSL=OFF \
-S .
cmake --build build-tvos-device --config Release -- -quiet
diff --git a/ci/run.sh b/ci/run.sh
index 0676504b3..dfcf95966 100755
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -45,14 +45,15 @@ sd=`dirname $0`
cd $sd/../
SRC=`pwd`
-CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=${LLAMA_FATAL_WARNINGS:-ON} -DLLAMA_CURL=ON -DGGML_SCHED_NO_REALLOC=ON"
+CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=${LLAMA_FATAL_WARNINGS:-ON} -DLLAMA_OPENSSL=OFF -DGGML_SCHED_NO_REALLOC=ON"
if [ ! -z ${GG_BUILD_METAL} ]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON"
fi
if [ ! -z ${GG_BUILD_CUDA} ]; then
- CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=ON"
+ # TODO: Remove GGML_CUDA_CUB_3DOT2 flag once CCCL 3.2 is bundled within CTK and that CTK version is used in this project
+ CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=ON -DGGML_CUDA_CUB_3DOT2=ON"
if command -v nvidia-smi >/dev/null 2>&1; then
CUDA_ARCH=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader,nounits 2>/dev/null | head -1 | tr -d '.')
@@ -104,7 +105,20 @@ if [ ! -z ${GG_BUILD_VULKAN} ]; then
fi
if [ ! -z ${GG_BUILD_WEBGPU} ]; then
- CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_WEBGPU=1"
+ CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_WEBGPU=1 -DGGML_METAL=OFF -DGGML_BLAS=OFF"
+
+ if [ ! -z "${GG_BUILD_WEBGPU_DAWN_PREFIX}" ]; then
+ if [ -z "${CMAKE_PREFIX_PATH}" ]; then
+ export CMAKE_PREFIX_PATH="${GG_BUILD_WEBGPU_DAWN_PREFIX}"
+ else
+ export CMAKE_PREFIX_PATH="${GG_BUILD_WEBGPU_DAWN_PREFIX}:${CMAKE_PREFIX_PATH}"
+ fi
+ fi
+
+ # For some systems, Dawn_DIR needs to be set explicitly, e.g., the lib64 path
+ if [ ! -z "${GG_BUILD_WEBGPU_DAWN_DIR}" ]; then
+ CMAKE_EXTRA="${CMAKE_EXTRA} -DDawn_DIR=${GG_BUILD_WEBGPU_DAWN_DIR}"
+ fi
fi
if [ ! -z ${GG_BUILD_MUSA} ]; then
@@ -240,7 +254,7 @@ function gg_run_ctest_release {
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
if [ -z ${GG_BUILD_LOW_PERF} ]; then
- (time ctest --output-on-failure -L main ) 2>&1 | tee -a $OUT/${ci}-ctest.log
+ (time ctest --output-on-failure -L 'main|python' ) 2>&1 | tee -a $OUT/${ci}-ctest.log
else
(time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
fi
@@ -283,7 +297,8 @@ function gg_sum_test_scripts {
}
function gg_get_model {
- local gguf_0="$MNT/models/qwen3/0.6B/ggml-model-f16.gguf"
+ #local gguf_0="$MNT/models/qwen3/0.6B/ggml-model-f16.gguf"
+ local gguf_0="$MNT/models/qwen3/0.6B/ggml-model-q4_0.gguf"
if [[ -s $gguf_0 ]]; then
echo -n "$gguf_0"
else
@@ -398,6 +413,8 @@ function gg_run_qwen3_0_6b {
./bin/llama-quantize ${model_bf16} ${model_q5_k} q5_k $(nproc)
./bin/llama-quantize ${model_bf16} ${model_q6_k} q6_k $(nproc)
+ (time ./bin/llama-fit-params --model ${model_f16} 2>&1 | tee -a $OUT/${ci}-fp-f16.log)
+
(time ./bin/llama-completion -no-cnv --model ${model_f16} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/llama-completion -no-cnv --model ${model_bf16} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-bf16.log
(time ./bin/llama-completion -no-cnv --model ${model_q8_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
@@ -523,6 +540,8 @@ function gg_run_embd_bge_small {
./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
+ (time ./bin/llama-fit-params --model ${model_f16} 2>&1 | tee -a $OUT/${ci}-fp-f16.log)
+
(time ./bin/llama-embedding --model ${model_f16} -p "I believe the meaning of life is" -ngl 99 -c 0 --no-op-offload) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/llama-embedding --model ${model_q8_0} -p "I believe the meaning of life is" -ngl 99 -c 0 --no-op-offload) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
@@ -563,6 +582,8 @@ function gg_run_rerank_tiny {
model_f16="${path_models}/ggml-model-f16.gguf"
+ (time ./bin/llama-fit-params --model ${model_f16} 2>&1 | tee -a $OUT/${ci}-fp-f16.log)
+
# for this model, the SEP token is ""
(time ./bin/llama-embedding --model ${model_f16} -p "what is panda?\thi\nwhat is panda?\tit's a bear\nwhat is panda?\tThe giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." -ngl 99 -c 0 --pooling rank --embd-normalize -1 --no-op-offload --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log
diff --git a/cmake/download-models.cmake b/cmake/download-models.cmake
new file mode 100644
index 000000000..de252906a
--- /dev/null
+++ b/cmake/download-models.cmake
@@ -0,0 +1,21 @@
+get_filename_component(DEST_DIR "${DEST}" DIRECTORY)
+file(MAKE_DIRECTORY "${DEST_DIR}")
+
+if(NOT EXISTS "${DEST}")
+ message(STATUS "Downloading ${NAME} from ggml-org/models...")
+endif()
+
+file(DOWNLOAD
+ "https://huggingface.co/ggml-org/models/resolve/main/${NAME}?download=true"
+ "${DEST}"
+ TLS_VERIFY ON
+ EXPECTED_HASH ${HASH}
+ STATUS status
+)
+
+list(GET status 0 code)
+
+if(NOT code EQUAL 0)
+ list(GET status 1 msg)
+ message(FATAL_ERROR "Failed to download ${NAME}: ${msg}")
+endif()
diff --git a/cmake/license.cmake b/cmake/license.cmake
new file mode 100644
index 000000000..de066603b
--- /dev/null
+++ b/cmake/license.cmake
@@ -0,0 +1,40 @@
+define_property(GLOBAL PROPERTY LICENSE_TEXT
+ BRIEF_DOCS "Embedded licenses"
+ FULL_DOCS "Global string containing all aggregated licenses"
+)
+
+function(license_add_file NAME FILE)
+ if(NOT IS_ABSOLUTE "${FILE}")
+ set(FILE "${CMAKE_CURRENT_SOURCE_DIR}/${FILE}")
+ endif()
+ if(EXISTS "${FILE}")
+ set(TITLE "License for ${NAME}")
+ string(REGEX REPLACE "." "=" UNDERLINE "${TITLE}")
+ file(READ "${FILE}" TEXT)
+ get_property(TMP GLOBAL PROPERTY LICENSE_TEXT)
+ string(APPEND TMP "R\"=L=(${TITLE}\n${UNDERLINE}\n\n${TEXT})=L=\",\n")
+ set_property(GLOBAL PROPERTY LICENSE_TEXT "${TMP}")
+ else()
+ message(WARNING "License file '${FILE}' not found")
+ endif()
+endfunction()
+
+function(license_generate TARGET_NAME)
+ message(STATUS "Generating embedded license file for target: ${TARGET_NAME}")
+ get_property(TEXT GLOBAL PROPERTY LICENSE_TEXT)
+
+ set(CPP_CONTENT "// Generated by CMake\n\n")
+ string(APPEND CPP_CONTENT "const char* LICENSES[] = {\n")
+ string(APPEND CPP_CONTENT "${TEXT}")
+ string(APPEND CPP_CONTENT "nullptr\n")
+ string(APPEND CPP_CONTENT "};\n")
+
+ set(CPP_FILE "${CMAKE_BINARY_DIR}/license.cpp")
+ file(WRITE "${CPP_FILE}" "${CPP_CONTENT}")
+
+ if(TARGET ${TARGET_NAME})
+ target_sources(${TARGET_NAME} PRIVATE "${CPP_FILE}")
+ else()
+ message(FATAL_ERROR "Target '${TARGET_NAME}' does not exist")
+ endif()
+endfunction()
diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt
index 0182767c2..ae02c0bd7 100644
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -60,6 +60,8 @@ add_library(${TARGET} STATIC
common.h
console.cpp
console.h
+ debug.cpp
+ debug.h
download.cpp
download.h
http.h
@@ -83,8 +85,23 @@ add_library(${TARGET} STATIC
speculative.h
unicode.cpp
unicode.h
+ jinja/lexer.cpp
+ jinja/lexer.h
+ jinja/parser.cpp
+ jinja/parser.h
+ jinja/runtime.cpp
+ jinja/runtime.h
+ jinja/value.cpp
+ jinja/value.h
+ jinja/string.cpp
+ jinja/string.h
+ jinja/caps.cpp
+ jinja/caps.h
)
+target_include_directories(${TARGET} PUBLIC . ../vendor)
+target_compile_features (${TARGET} PUBLIC cxx_std_17)
+
if (BUILD_SHARED_LIBS)
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
endif()
@@ -92,17 +109,7 @@ endif()
# TODO: use list(APPEND LLAMA_COMMON_EXTRA_LIBS ...)
set(LLAMA_COMMON_EXTRA_LIBS build_info)
-if (LLAMA_CURL)
- # Use curl to download model url
- find_package(CURL)
- if (NOT CURL_FOUND)
- message(FATAL_ERROR "Could NOT find CURL. Hint: to disable this feature, set -DLLAMA_CURL=OFF")
- endif()
- target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_CURL)
- include_directories(${CURL_INCLUDE_DIRS})
- set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARIES})
-elseif (LLAMA_HTTPLIB)
- # otherwise, use cpp-httplib
+if (LLAMA_HTTPLIB)
target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_HTTPLIB)
set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} cpp-httplib)
endif()
@@ -151,30 +158,4 @@ if (LLAMA_LLGUIDANCE)
set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} llguidance ${LLGUIDANCE_PLATFORM_LIBS})
endif ()
-target_include_directories(${TARGET} PUBLIC . ../vendor)
-target_compile_features (${TARGET} PUBLIC cxx_std_17)
-target_link_libraries (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)
-
-
-#
-# copy the license files
-#
-
-# Check if running in GitHub Actions
-if (DEFINED ENV{GITHUB_ACTIONS} AND "$ENV{GITHUB_ACTIONS}" STREQUAL "true")
- message(STATUS "Running inside GitHub Actions - copying license files")
-
- # Copy all files from licenses/ to build/bin/
- file(GLOB LICENSE_FILES "${CMAKE_SOURCE_DIR}/licenses/*")
- foreach(LICENSE_FILE ${LICENSE_FILES})
- get_filename_component(FILENAME ${LICENSE_FILE} NAME)
- add_custom_command(
- POST_BUILD
- TARGET ${TARGET}
- COMMAND ${CMAKE_COMMAND} -E copy_if_different
- "${LICENSE_FILE}"
- "$/${FILENAME}"
- COMMENT "Copying ${FILENAME} to ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}")
- message(STATUS "Copying ${LICENSE_FILE} to ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${FILENAME}")
- endforeach()
-endif()
+target_link_libraries(${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)
diff --git a/common/arg.cpp b/common/arg.cpp
index bb2a6840b..163c9b71b 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -2,10 +2,11 @@
#include "chat.h"
#include "common.h"
+#include "download.h"
#include "json-schema-to-grammar.h"
#include "log.h"
#include "sampling.h"
-#include "download.h"
+#include "preset.h"
// fix problem with std::min and std::max
#if defined(_WIN32)
@@ -20,6 +21,7 @@
#include
#include
+#include
#include
#include
#include
@@ -46,6 +48,8 @@
#define LLAMA_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
+extern const char * LICENSES[];
+
using json = nlohmann::ordered_json;
using namespace common_arg_utils;
@@ -95,6 +99,11 @@ common_arg & common_arg::set_sparam() {
return *this;
}
+common_arg & common_arg::set_preset_only() {
+ is_preset_only = true;
+ return *this;
+}
+
bool common_arg::in_example(enum llama_example ex) {
return examples.find(ex) != examples.end();
}
@@ -262,6 +271,55 @@ static void parse_tensor_buffer_overrides(const std::string & value, std::vector
}
}
+static std::string clean_file_name(const std::string & fname) {
+ std::string clean_fname = fname;
+ string_replace_all(clean_fname, "\\", "_");
+ string_replace_all(clean_fname, "/", "_");
+ return clean_fname;
+}
+
+static bool common_params_handle_remote_preset(common_params & params, llama_example ex) {
+ GGML_ASSERT(!params.model.hf_repo.empty());
+
+ // the returned hf_repo is without tag
+ auto [hf_repo, hf_tag] = common_download_split_repo_tag(params.model.hf_repo);
+
+ // "latest" tag (default if not specified) is translated to "default" preset
+ if (hf_tag == "latest") {
+ hf_tag = "default";
+ }
+
+ const bool offline = params.offline;
+ std::string model_endpoint = get_model_endpoint();
+ auto preset_url = model_endpoint + hf_repo + "/resolve/main/preset.ini";
+
+ // prepare local path for caching
+ auto preset_fname = clean_file_name(hf_repo + "_preset.ini");
+ auto preset_path = fs_get_cache_file(preset_fname);
+ const int status = common_download_file_single(preset_url, preset_path, params.hf_token, offline);
+ const bool has_preset = status >= 200 && status < 400;
+
+ // remote preset is optional, so we don't error out if not found
+ if (has_preset) {
+ LOG_INF("applying remote preset from %s\n", preset_url.c_str());
+ common_preset_context ctx(ex, /* only_remote_allowed */ true);
+ common_preset global;
+ auto remote_presets = ctx.load_from_ini(preset_path, global);
+ remote_presets = ctx.cascade(global, remote_presets);
+ if (remote_presets.find(hf_tag) != remote_presets.end()) {
+ common_preset preset = remote_presets.at(hf_tag);
+ LOG_INF("\n%s", preset.to_ini().c_str()); // to_ini already added trailing newline
+ preset.apply_to_params(params);
+ } else {
+ throw std::runtime_error("Remote preset.ini does not contain [" + std::string(hf_tag) + "] section");
+ }
+ } else {
+ LOG_INF("%s", "no remote preset found, skipping\n");
+ }
+
+ return has_preset;
+}
+
struct handle_model_result {
bool found_mmproj = false;
common_params_model mmproj;
@@ -283,7 +341,7 @@ static handle_model_result common_params_handle_model(
if (model.path.empty()) {
auto auto_detected = common_get_hf_file(model.hf_repo, bearer_token, offline);
if (auto_detected.repo.empty() || auto_detected.ggufFile.empty()) {
- exit(1); // built without CURL, error message already printed
+ exit(1); // error message already printed
}
model.name = model.hf_repo; // repo name with tag
model.hf_repo = auto_detected.repo; // repo name without tag
@@ -303,9 +361,7 @@ static handle_model_result common_params_handle_model(
// make sure model path is present (for caching purposes)
if (model.path.empty()) {
// this is to avoid different repo having same file name, or same file name in different subdirs
- std::string filename = model.hf_repo + "_" + model.hf_file;
- // to make sure we don't have any slashes in the filename
- string_replace_all(filename, "/", "_");
+ std::string filename = clean_file_name(model.hf_repo + "_" + model.hf_file);
model.path = fs_get_cache_file(filename);
}
@@ -419,56 +475,87 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
}
};
- for (int i = 1; i < argc; i++) {
- const std::string arg_prefix = "--";
+ auto parse_cli_args = [&]() {
+ std::set seen_args;
- std::string arg = argv[i];
- if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
- std::replace(arg.begin(), arg.end(), '_', '-');
- }
- if (arg_to_options.find(arg) == arg_to_options.end()) {
- throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
- }
- auto & tmp = arg_to_options[arg];
- auto opt = *tmp.first;
- bool is_positive = tmp.second;
- if (opt.has_value_from_env()) {
- fprintf(stderr, "warn: %s environment variable is set, but will be overwritten by command line argument %s\n", opt.env, arg.c_str());
- }
- try {
- if (opt.handler_void) {
- opt.handler_void(params);
- continue;
- }
- if (opt.handler_bool) {
- opt.handler_bool(params, is_positive);
- continue;
- }
+ for (int i = 1; i < argc; i++) {
+ const std::string arg_prefix = "--";
- // arg with single value
- check_arg(i);
- std::string val = argv[++i];
- if (opt.handler_int) {
- opt.handler_int(params, std::stoi(val));
- continue;
+ std::string arg = argv[i];
+ if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
+ std::replace(arg.begin(), arg.end(), '_', '-');
}
- if (opt.handler_string) {
- opt.handler_string(params, val);
- continue;
+ if (arg_to_options.find(arg) == arg_to_options.end()) {
+ throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
}
+ if (!seen_args.insert(arg).second) {
+ LOG_WRN("DEPRECATED: argument '%s' specified multiple times, use comma-separated values instead (only last value will be used)\n", arg.c_str());
+ }
+ auto & tmp = arg_to_options[arg];
+ auto opt = *tmp.first;
+ bool is_positive = tmp.second;
+ if (opt.has_value_from_env()) {
+ fprintf(stderr, "warn: %s environment variable is set, but will be overwritten by command line argument %s\n", opt.env, arg.c_str());
+ }
+ try {
+ if (opt.handler_void) {
+ opt.handler_void(params);
+ continue;
+ }
+ if (opt.handler_bool) {
+ opt.handler_bool(params, is_positive);
+ continue;
+ }
- // arg with 2 values
- check_arg(i);
- std::string val2 = argv[++i];
- if (opt.handler_str_str) {
- opt.handler_str_str(params, val, val2);
- continue;
+ // arg with single value
+ check_arg(i);
+ std::string val = argv[++i];
+ if (opt.handler_int) {
+ opt.handler_int(params, std::stoi(val));
+ continue;
+ }
+ if (opt.handler_string) {
+ opt.handler_string(params, val);
+ continue;
+ }
+
+ // arg with 2 values
+ check_arg(i);
+ std::string val2 = argv[++i];
+ if (opt.handler_str_str) {
+ opt.handler_str_str(params, val, val2);
+ continue;
+ }
+ } catch (std::exception & e) {
+ throw std::invalid_argument(string_format(
+ "error while handling argument \"%s\": %s\n\n"
+ "usage:\n%s\n\nto show complete usage, run with -h",
+ arg.c_str(), e.what(), opt.to_string().c_str()));
}
- } catch (std::exception & e) {
- throw std::invalid_argument(string_format(
- "error while handling argument \"%s\": %s\n\n"
- "usage:\n%s\n\nto show complete usage, run with -h",
- arg.c_str(), e.what(), opt.to_string().c_str()));
+ }
+ };
+
+ // parse the first time to get -hf option (used for remote preset)
+ parse_cli_args();
+
+ // maybe handle remote preset
+ if (!params.model.hf_repo.empty()) {
+ std::string cli_hf_repo = params.model.hf_repo;
+ bool has_preset = common_params_handle_remote_preset(params, ctx_arg.ex);
+
+ // special case: if hf_repo explicitly set by preset, we need to preserve it (ignore CLI value)
+ // this is useful when we have one HF repo pointing to other HF repos (one model - multiple GGUFs)
+ std::string preset_hf_repo = params.model.hf_repo;
+ bool preset_has_hf_repo = preset_hf_repo != cli_hf_repo;
+
+ if (has_preset) {
+ // re-parse CLI args to override preset values
+ parse_cli_args();
+ }
+
+ // preserve hf_repo from preset if needed
+ if (preset_has_hf_repo) {
+ params.model.hf_repo = preset_hf_repo;
}
}
@@ -529,7 +616,9 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
params.kv_overrides.back().key[0] = 0;
}
- if (!params.tensor_buft_overrides.empty()) {
+ // pad tensor_buft_overrides for llama_params_fit:
+ const size_t ntbo = llama_max_tensor_buft_overrides();
+ while (params.tensor_buft_overrides.size() < ntbo) {
params.tensor_buft_overrides.push_back({nullptr, nullptr});
}
@@ -666,7 +755,6 @@ static void common_params_print_completion(common_params_context & ctx_arg) {
"llama-quantize",
"llama-qwen2vl-cli",
"llama-retrieval",
- "llama-run",
"llama-save-load-state",
"llama-server",
"llama-simple",
@@ -747,6 +835,8 @@ bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map seen_args;
+
for (int i = 1; i < argc; i++) {
const std::string arg_prefix = "--";
@@ -757,8 +847,16 @@ bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map parse_csv_row(const std::string& input) {
+ std::vector fields;
+ std::string field;
+ bool in_quotes = false;
+
+ for (size_t i = 0; i < input.length(); ++i) {
+ char ch = input[i];
+
+ if (ch == '"') {
+ if (!in_quotes) {
+ // start of quoted field (only valid if at beginning of field)
+ if (!field.empty()) {
+ // quote appeared in middle of unquoted field, treat as literal
+ field += '"';
+ } else {
+ in_quotes = true; // start
+ }
+ } else {
+ if (i + 1 < input.length() && input[i + 1] == '"') {
+ // escaped quote: ""
+ field += '"';
+ ++i; // skip the next quote
+ } else {
+ in_quotes = false; // end
+ }
+ }
+ } else if (ch == ',') {
+ if (in_quotes) {
+ field += ',';
+ } else {
+ fields.push_back(std::move(field));
+ field.clear();
+ }
+ } else {
+ field += ch;
+ }
+ }
+
+ // Add the last field
+ fields.push_back(std::move(field));
+
+ return fields;
+}
+
common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
+ // per-example default params
+ // we define here to make sure it's included in llama-gen-docs
+ if (ex == LLAMA_EXAMPLE_COMPLETION) {
+ params.use_jinja = false; // disable jinja by default
+
+ } else if (ex == LLAMA_EXAMPLE_MTMD) {
+ params.use_jinja = false; // disable jinja by default
+ params.sampling.temp = 0.2; // lower temp by default for better quality
+
+ } else if (ex == LLAMA_EXAMPLE_SERVER) {
+ params.n_parallel = -1; // auto by default
+ }
+
params.use_color = tty_can_use_colors();
// load dynamic backends
@@ -847,7 +1006,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
sampler_type_chars += common_sampler_type_to_chr(sampler);
sampler_type_names += common_sampler_type_to_str(sampler) + ";";
}
- sampler_type_names.pop_back();
+ if (!sampler_type_names.empty()) {
+ sampler_type_names.pop_back(); // remove last semicolon
+ }
/**
@@ -880,6 +1041,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
exit(0);
}
));
+ add_opt(common_arg(
+ {"--license"},
+ "show source code license and dependencies",
+ [](common_params &) {
+ for (int i = 0; LICENSES[i]; ++i) {
+ printf("%s\n", LICENSES[i]);
+ }
+ exit(0);
+ }
+ ));
add_opt(common_arg(
{"-cl", "--cache-list"},
"show list of models in cache",
@@ -1104,28 +1275,27 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_env("LLAMA_ARG_SWA_FULL"));
add_opt(common_arg(
{"--ctx-checkpoints", "--swa-checkpoints"}, "N",
- string_format("max number of context checkpoints to create per slot (default: %d)\n"
+ string_format("max number of context checkpoints to create per slot (default: %d)"
"[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)", params.n_ctx_checkpoints),
[](common_params & params, int value) {
params.n_ctx_checkpoints = value;
}
).set_env("LLAMA_ARG_CTX_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
- {"--cache-ram", "-cram"}, "N",
- string_format("set the maximum cache size in MiB (default: %d, -1 - no limit, 0 - disable)\n"
+ {"-cram", "--cache-ram"}, "N",
+ string_format("set the maximum cache size in MiB (default: %d, -1 - no limit, 0 - disable)"
"[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)", params.cache_ram_mib),
[](common_params & params, int value) {
params.cache_ram_mib = value;
}
).set_env("LLAMA_ARG_CACHE_RAM").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
- {"--kv-unified", "-kvu"},
- string_format("use single unified KV buffer for the KV cache of all sequences (default: %s)\n"
- "[(more info)](https://github.com/ggml-org/llama.cpp/pull/14363)", params.kv_unified ? "true" : "false"),
+ {"-kvu", "--kv-unified"},
+ "use single unified KV buffer shared across all sequences (default: enabled if number of slots is auto)",
[](common_params & params) {
params.kv_unified = true;
}
- ).set_env("LLAMA_ARG_KV_UNIFIED"));
+ ).set_env("LLAMA_ARG_KV_UNIFIED").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_BATCHED}));
add_opt(common_arg(
{"--context-shift"},
{"--no-context-shift"},
@@ -1169,7 +1339,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) {
params.system_prompt = value;
}
- ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DIFFUSION}));
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DIFFUSION, LLAMA_EXAMPLE_MTMD}));
add_opt(common_arg(
{"--perf"},
{"--no-perf"},
@@ -1211,13 +1381,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DIFFUSION}));
add_opt(common_arg(
{"--in-file"}, "FNAME",
- "an input file (repeat to specify multiple files)",
+ "an input file (use comma-separated values to specify multiple files)",
[](common_params & params, const std::string & value) {
- std::ifstream file(value);
- if (!file) {
- throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
+ for (const auto & item : parse_csv_row(value)) {
+ std::ifstream file(item);
+ if (!file) {
+ throw std::runtime_error(string_format("error: failed to open file '%s'\n", item.c_str()));
+ }
+ params.in_files.push_back(item);
}
- params.in_files.push_back(value);
}
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
add_opt(common_arg(
@@ -1358,7 +1530,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, bool value) {
params.warmup = value;
}
- ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY}));
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_DEBUG}));
add_opt(common_arg(
{"--spm-infill"},
string_format(
@@ -1386,7 +1558,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_sparam());
add_opt(common_arg(
- {"--sampling-seq", "--sampler-seq"}, "SEQUENCE",
+ {"--sampler-seq", "--sampling-seq"}, "SEQUENCE",
string_format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()),
[](common_params & params, const std::string & value) {
params.sampling.samplers = common_sampler_types_from_chars(value);
@@ -1415,7 +1587,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.sampling.top_k = value;
params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TOP_K;
}
- ).set_sparam());
+ ).set_sparam().set_env("LLAMA_ARG_TOP_K"));
add_opt(common_arg(
{"--top-p"}, "N",
string_format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sampling.top_p),
@@ -1557,6 +1729,26 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
}
).set_sparam());
+ add_opt(common_arg(
+ {"--adaptive-target"}, "N",
+ string_format("adaptive-p: select tokens near this probability (valid range 0.0 "
+ "to 1.0; negative = disabled) (default: %.2f)\n"
+ "[(more info)](https://github.com/ggml-org/llama.cpp/pull/17927)",
+ (double)params.sampling.adaptive_target),
+ [](common_params & params, const std::string & value) {
+ params.sampling.adaptive_target = std::stof(value);
+ }
+ ).set_sparam());
+ add_opt(common_arg(
+ {"--adaptive-decay"}, "N",
+ string_format("adaptive-p: decay rate for target adaptation over time. lower values "
+ "are more reactive, higher values are more stable.\n"
+ "(valid range 0.0 to 0.99) (default: %.2f)",
+ (double)params.sampling.adaptive_decay),
+ [](common_params & params, const std::string & value) {
+ params.sampling.adaptive_decay = std::stof(value);
+ }
+ ).set_sparam());
add_opt(common_arg(
{"--dynatemp-range"}, "N",
string_format("dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)params.sampling.dynatemp_range),
@@ -1656,6 +1848,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.sampling.grammar = json_schema_to_grammar(json::parse(schema));
}
).set_sparam());
+ add_opt(common_arg(
+ {"-bs", "--backend-sampling"},
+ "enable backend sampling (experimental) (default: disabled)",
+ [](common_params & params) {
+ params.sampling.backend_sampling = true;
+ }
+ ).set_sparam().set_env("LLAMA_ARG_BACKEND_SAMPLING"));
add_opt(common_arg(
{"--pooling"}, "{none,mean,cls,last,rank}",
"pooling type for embeddings, use model default if unspecified",
@@ -1667,7 +1866,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
else if (value == "rank") { params.pooling_type = LLAMA_POOLING_TYPE_RANK; }
else { throw std::invalid_argument("invalid value"); }
}
- ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_POOLING"));
+ ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_DEBUG}).set_env("LLAMA_ARG_POOLING"));
add_opt(common_arg(
{"--attention"}, "{causal,non-causal}",
"attention type for embeddings, use model default if unspecified",
@@ -1885,13 +2084,27 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
LOG_WRN("DEPRECATED: --defrag-thold is deprecated and no longer necessary to specify\n");
}
).set_env("LLAMA_ARG_DEFRAG_THOLD"));
- add_opt(common_arg(
- {"-np", "--parallel"}, "N",
- string_format("number of parallel sequences to decode (default: %d)", params.n_parallel),
- [](common_params & params, int value) {
- params.n_parallel = value;
- }
- ).set_env("LLAMA_ARG_N_PARALLEL"));
+ if (ex == LLAMA_EXAMPLE_SERVER) {
+ // this is to make sure this option appears in the server-specific section of the help message
+ add_opt(common_arg(
+ {"-np", "--parallel"}, "N",
+ string_format("number of server slots (default: %d, -1 = auto)", params.n_parallel),
+ [](common_params & params, int value) {
+ if (value == 0) {
+ throw std::invalid_argument("error: invalid value for n_parallel\n");
+ }
+ params.n_parallel = value;
+ }
+ ).set_env("LLAMA_ARG_N_PARALLEL").set_examples({LLAMA_EXAMPLE_SERVER}));
+ } else {
+ add_opt(common_arg(
+ {"-np", "--parallel"}, "N",
+ string_format("number of parallel sequences to decode (default: %d)", params.n_parallel),
+ [](common_params & params, int value) {
+ params.n_parallel = value;
+ }
+ ).set_env("LLAMA_ARG_N_PARALLEL"));
+ }
add_opt(common_arg(
{"-ns", "--sequences"}, "N",
string_format("number of sequences to decode (default: %d)", params.n_sequences),
@@ -1940,9 +2153,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_OFFLOAD"));
add_opt(common_arg(
{"--image", "--audio"}, "FILE",
- "path to an image or audio file. use with multimodal models, can be repeated if you have multiple files\n",
+ "path to an image or audio file. use with multimodal models, use comma-separated values for multiple files\n",
[](common_params & params, const std::string & value) {
- params.image.emplace_back(value);
+ for (const auto & item : parse_csv_row(value)) {
+ params.image.emplace_back(item);
+ }
}
).set_examples({LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
@@ -1962,7 +2177,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
if (llama_supports_rpc()) {
add_opt(common_arg(
{"--rpc"}, "SERVERS",
- "comma separated list of RPC servers",
+ "comma separated list of RPC servers (host:port)",
[](common_params & params, const std::string & value) {
add_rpc_devices(value);
GGML_UNUSED(params);
@@ -1979,11 +2194,22 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
add_opt(common_arg(
{"--mmap"},
{"--no-mmap"},
- string_format("whether to memory-map model (if disabled, slower load but may reduce pageouts if not using mlock) (default: %s)", params.use_mmap ? "enabled" : "disabled"),
+ string_format("whether to memory-map model. Explicitly enabling mmap disables direct-io. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: %s)", params.use_mmap ? "enabled" : "disabled"),
[](common_params & params, bool value) {
params.use_mmap = value;
+ if (value) {
+ params.use_direct_io = false; // disable direct io when mmap is explicitly enabled
+ }
}
).set_env("LLAMA_ARG_MMAP"));
+ add_opt(common_arg(
+ {"-dio", "--direct-io"},
+ {"-ndio", "--no-direct-io"},
+ string_format("use DirectIO if available. Takes precedence over --mmap (default: %s)", params.use_direct_io ? "enabled" : "disabled"),
+ [](common_params & params, bool value) {
+ params.use_direct_io = value;
+ }
+ ).set_env("LLAMA_ARG_DIO"));
add_opt(common_arg(
{"--numa"}, "TYPE",
"attempt optimizations that help on some NUMA systems\n"
@@ -2028,26 +2254,26 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
));
add_opt(common_arg(
- {"--override-tensor", "-ot"}, "=,...",
+ {"-ot", "--override-tensor"}, "=,...",
"override tensor buffer type", [](common_params & params, const std::string & value) {
parse_tensor_buffer_overrides(value, params.tensor_buft_overrides);
}
- ));
+ ).set_env("LLAMA_ARG_OVERRIDE_TENSOR"));
add_opt(common_arg(
- {"--override-tensor-draft", "-otd"}, "=,...",
+ {"-otd", "--override-tensor-draft"}, "=,...",
"override tensor buffer type for draft model", [](common_params & params, const std::string & value) {
parse_tensor_buffer_overrides(value, params.speculative.tensor_buft_overrides);
}
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
- {"--cpu-moe", "-cmoe"},
+ {"-cmoe", "--cpu-moe"},
"keep all Mixture of Experts (MoE) weights in the CPU",
[](common_params & params) {
params.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override());
}
).set_env("LLAMA_ARG_CPU_MOE"));
add_opt(common_arg(
- {"--n-cpu-moe", "-ncmoe"}, "N",
+ {"-ncmoe", "--n-cpu-moe"}, "N",
"keep the Mixture of Experts (MoE) weights of the first N layers in the CPU",
[](common_params & params, int value) {
if (value < 0) {
@@ -2062,14 +2288,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_env("LLAMA_ARG_N_CPU_MOE"));
add_opt(common_arg(
- {"--cpu-moe-draft", "-cmoed"},
+ {"-cmoed", "--cpu-moe-draft"},
"keep all Mixture of Experts (MoE) weights in the CPU for the draft model",
[](common_params & params) {
params.speculative.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override());
}
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_CPU_MOE_DRAFT"));
add_opt(common_arg(
- {"--n-cpu-moe-draft", "-ncmoed"}, "N",
+ {"-ncmoed", "--n-cpu-moe-draft"}, "N",
"keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model",
[](common_params & params, int value) {
if (value < 0) {
@@ -2082,11 +2308,18 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
}
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_N_CPU_MOE_DRAFT"));
+ GGML_ASSERT(params.n_gpu_layers < 0); // string_format would need to be extended for a default >= 0
add_opt(common_arg(
{"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
- string_format("max. number of layers to store in VRAM (default: %d)", params.n_gpu_layers),
- [](common_params & params, int value) {
- params.n_gpu_layers = value;
+ string_format("max. number of layers to store in VRAM, either an exact number, 'auto', or 'all' (default: %s)", params.n_gpu_layers == -1 ? "auto" : "all"),
+ [](common_params & params, const std::string & value) {
+ if (value == "auto") {
+ params.n_gpu_layers = -1;
+ } else if (value == "all") {
+ params.n_gpu_layers = -2;
+ } else {
+ params.n_gpu_layers = std::stoi(value);
+ }
if (!llama_supports_gpu_offload()) {
fprintf(stderr, "warning: no usable GPU found, --gpu-layers option will be ignored\n");
fprintf(stderr, "warning: one possible reason is that llama.cpp was compiled without GPU support\n");
@@ -2128,7 +2361,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
std::vector split_arg{ it, {} };
if (split_arg.size() >= llama_max_devices()) {
throw std::invalid_argument(
- string_format("got %d input configs, but system only has %d devices", (int)split_arg.size(), (int)llama_max_devices())
+ string_format("got %zu input configs, but system only has %zu devices", split_arg.size(), llama_max_devices())
);
}
for (size_t i = 0; i < llama_max_devices(); ++i) {
@@ -2153,6 +2386,52 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
}
).set_env("LLAMA_ARG_MAIN_GPU"));
+ add_opt(common_arg(
+ { "-fit", "--fit" }, "[on|off]",
+ string_format("whether to adjust unset arguments to fit in device memory ('on' or 'off', default: '%s')", params.fit_params ? "on" : "off"),
+ [](common_params & params, const std::string & value) {
+ if (is_truthy(value)) {
+ params.fit_params = true;
+ } else if (is_falsey(value)) {
+ params.fit_params = false;
+ } else {
+ throw std::runtime_error(
+ string_format("error: unkown value for --fit: '%s'\n", value.c_str()));
+ }
+ }
+ ).set_env("LLAMA_ARG_FIT"));
+ add_opt(common_arg(
+ { "-fitt", "--fit-target" }, "MiB0,MiB1,MiB2,...",
+ string_format("target margin per device for --fit, comma-separated list of values, "
+ "single value is broadcast across all devices, default: %zu", params.fit_params_target[0]/(1024*1024)),
+ [](common_params & params, const std::string & value) {
+ std::string arg_next = value;
+
+ // split string by , and /
+ const std::regex regex{ R"([,/]+)" };
+ std::sregex_token_iterator it{ arg_next.begin(), arg_next.end(), regex, -1 };
+ std::vector split_arg{ it, {} };
+ if (split_arg.size() >= llama_max_devices()) {
+ throw std::invalid_argument(
+ string_format("got %zu input configs, but system only has %zu devices", split_arg.size(), llama_max_devices())
+ );
+ }
+ if (split_arg.size() == 1) {
+ std::fill(params.fit_params_target.begin(), params.fit_params_target.end(), std::stoul(split_arg[0]) * 1024*1024);
+ return;
+ }
+ for (size_t i = 0; i < split_arg.size(); i++) {
+ params.fit_params_target[i] = std::stoul(split_arg[i]) * 1024*1024;
+ }
+ }
+ ).set_env("LLAMA_ARG_FIT_TARGET"));
+ add_opt(common_arg(
+ { "-fitc", "--fit-ctx" }, "N",
+ string_format("minimum ctx size that can be set by --fit option, default: %" PRIu32, params.fit_params_min_ctx),
+ [](common_params & params, int value) {
+ params.fit_params_min_ctx = value;
+ }
+ ).set_env("LLAMA_ARG_FIT_CTX"));
add_opt(common_arg(
{"--check-tensors"},
string_format("check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false"),
@@ -2161,12 +2440,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
));
add_opt(common_arg(
- {"--override-kv"}, "KEY=TYPE:VALUE",
- "advanced option to override model metadata by key. may be specified multiple times.\n"
- "types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false",
+ {"--override-kv"}, "KEY=TYPE:VALUE,...",
+ "advanced option to override model metadata by key. to specify multiple overrides, either use comma-separated values.\n"
+ "types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false,tokenizer.ggml.add_eos_token=bool:false",
[](common_params & params, const std::string & value) {
- if (!string_parse_kv_override(value.c_str(), params.kv_overrides)) {
- throw std::runtime_error(string_format("error: Invalid type for KV override: %s\n", value.c_str()));
+ for (const auto & item : parse_csv_row(value)) {
+ if (!string_parse_kv_override(item.c_str(), params.kv_overrides)) {
+ throw std::runtime_error(string_format("error: Invalid type for KV override: %s\n", item.c_str()));
+ }
}
}
));
@@ -2180,33 +2461,50 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
));
add_opt(common_arg(
{"--lora"}, "FNAME",
- "path to LoRA adapter (can be repeated to use multiple adapters)",
+ "path to LoRA adapter (use comma-separated values to load multiple adapters)",
[](common_params & params, const std::string & value) {
- params.lora_adapters.push_back({ std::string(value), 1.0, "", "", nullptr });
+ for (const auto & item : parse_csv_row(value)) {
+ params.lora_adapters.push_back({ item, 1.0, "", "", nullptr });
+ }
}
// we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
add_opt(common_arg(
- {"--lora-scaled"}, "FNAME", "SCALE",
- "path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters)",
- [](common_params & params, const std::string & fname, const std::string & scale) {
- params.lora_adapters.push_back({ fname, std::stof(scale), "", "", nullptr });
+ {"--lora-scaled"}, "FNAME:SCALE,...",
+ "path to LoRA adapter with user defined scaling (format: FNAME:SCALE,...)\n"
+ "note: use comma-separated values",
+ [](common_params & params, const std::string & value) {
+ for (const auto & item : parse_csv_row(value)) {
+ auto parts = string_split(item, ':');
+ if (parts.size() != 2) {
+ throw std::invalid_argument("lora-scaled format: FNAME:SCALE");
+ }
+ params.lora_adapters.push_back({ parts[0], std::stof(parts[1]), "", "", nullptr });
+ }
}
// we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
add_opt(common_arg(
{"--control-vector"}, "FNAME",
- "add a control vector\nnote: this argument can be repeated to add multiple control vectors",
+ "add a control vector\nnote: use comma-separated values to add multiple control vectors",
[](common_params & params, const std::string & value) {
- params.control_vectors.push_back({ 1.0f, value, });
+ for (const auto & item : parse_csv_row(value)) {
+ params.control_vectors.push_back({ 1.0f, item, });
+ }
}
));
add_opt(common_arg(
- {"--control-vector-scaled"}, "FNAME", "SCALE",
+ {"--control-vector-scaled"}, "FNAME:SCALE,...",
"add a control vector with user defined scaling SCALE\n"
- "note: this argument can be repeated to add multiple scaled control vectors",
- [](common_params & params, const std::string & fname, const std::string & scale) {
- params.control_vectors.push_back({ std::stof(scale), fname });
+ "note: use comma-separated values (format: FNAME:SCALE,...)",
+ [](common_params & params, const std::string & value) {
+ for (const auto & item : parse_csv_row(value)) {
+ auto parts = string_split(item, ':');
+ if (parts.size() != 2) {
+ throw std::invalid_argument("control-vector-scaled format: FNAME:SCALE");
+ }
+ params.control_vectors.push_back({ std::stof(parts[1]), parts[0] });
+ }
}
));
add_opt(common_arg(
@@ -2296,13 +2594,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_env("HF_TOKEN"));
add_opt(common_arg(
{"--context-file"}, "FNAME",
- "file to load context from (repeat to specify multiple files)",
+ "file to load context from (use comma-separated values to specify multiple files)",
[](common_params & params, const std::string & value) {
- std::ifstream file(value, std::ios::binary);
- if (!file) {
- throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
+ for (const auto & item : parse_csv_row(value)) {
+ std::ifstream file(item, std::ios::binary);
+ if (!file) {
+ throw std::runtime_error(string_format("error: failed to open file '%s'\n", item.c_str()));
+ }
+ params.context_files.push_back(item);
}
- params.context_files.push_back(value);
}
).set_examples({LLAMA_EXAMPLE_RETRIEVAL}));
add_opt(common_arg(
@@ -2443,7 +2743,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, int value) {
params.embd_normalize = value;
}
- ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
+ ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_DEBUG}));
add_opt(common_arg(
{"--embd-output-format"}, "FORMAT",
"empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix, \"raw\" = plain whitespace-delimited output (one embedding per line)",
@@ -2493,6 +2793,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.api_prefix = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_PREFIX"));
+ add_opt(common_arg(
+ {"--webui-config"}, "JSON",
+ "JSON that provides default WebUI settings (overrides WebUI defaults)",
+ [](common_params & params, const std::string & value) {
+ params.webui_config_json = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_CONFIG"));
+ add_opt(common_arg(
+ {"--webui-config-file"}, "PATH",
+ "JSON file that provides default WebUI settings (overrides WebUI defaults)",
+ [](common_params & params, const std::string & value) {
+ params.webui_config_json = read_file(value);
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_CONFIG_FILE"));
add_opt(common_arg(
{"--webui"},
{"--no-webui"},
@@ -2507,9 +2821,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params) {
params.embedding = true;
}
- ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS"));
+ ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_DEBUG}).set_env("LLAMA_ARG_EMBEDDINGS"));
add_opt(common_arg(
- {"--reranking", "--rerank"},
+ {"--rerank", "--reranking"},
string_format("enable reranking endpoint on server (default: %s)", "disabled"),
[](common_params & params) {
params.embedding = true;
@@ -2518,9 +2832,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_RERANKING"));
add_opt(common_arg(
{"--api-key"}, "KEY",
- "API key to use for authentication (default: none)",
+ "API key to use for authentication, multiple keys can be provided as a comma-separated list (default: none)",
[](common_params & params, const std::string & value) {
- params.api_keys.push_back(value);
+ for (const auto & key : parse_csv_row(value)) {
+ if (!key.empty()) {
+ params.api_keys.push_back(key);
+ }
+ }
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_API_KEY"));
add_opt(common_arg(
@@ -2534,7 +2852,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
std::string key;
while (std::getline(key_file, key)) {
if (!key.empty()) {
- params.api_keys.push_back(key);
+ params.api_keys.push_back(key);
}
}
key_file.close();
@@ -2556,7 +2874,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_CERT_FILE"));
add_opt(common_arg(
{"--chat-template-kwargs"}, "STRING",
- string_format("sets additional params for the json template parser"),
+ "sets additional params for the json template parser, must be a valid json object string, e.g. '{\"key1\":\"value1\",\"key2\":\"value2\"}'",
[](common_params & params, const std::string & value) {
auto parsed = json::parse(value);
for (const auto & item : parsed.items()) {
@@ -2579,10 +2897,18 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.n_threads_http = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_THREADS_HTTP"));
+ add_opt(common_arg(
+ {"--cache-prompt"},
+ {"--no-cache-prompt"},
+ string_format("whether to enable prompt caching (default: %s)", params.cache_prompt ? "enabled" : "disabled"),
+ [](common_params & params, bool value) {
+ params.cache_prompt = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CACHE_PROMPT"));
add_opt(common_arg(
{"--cache-reuse"}, "N",
string_format(
- "min chunk size to attempt reusing from the cache via KV shifting (default: %d)\n"
+ "min chunk size to attempt reusing from the cache via KV shifting, requires prompt caching to be enabled (default: %d)\n"
"[(card)](https://ggml.ai/f0.png)", params.n_cache_reuse
),
[](common_params & params, int value) {
@@ -2744,6 +3070,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.lora_init_without_apply = true;
}
).set_examples({LLAMA_EXAMPLE_SERVER}));
+ add_opt(common_arg(
+ {"--sleep-idle-seconds"}, "SECONDS",
+ string_format("number of seconds of idleness after which the server will sleep (default: %d; -1 = disabled)", params.sleep_idle_seconds),
+ [](common_params & params, int value) {
+ if (value == 0 || value < -1) {
+ throw std::invalid_argument("invalid value: cannot be 0 or less than -1");
+ }
+ params.sleep_idle_seconds = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"--simple-io"},
"use basic IO for better compatibility in subprocesses and limited consoles",
@@ -2980,7 +3316,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
add_opt(common_arg(
- {"--draft-max", "--draft", "--draft-n"}, "N",
+ {"--draft", "--draft-n", "--draft-max"}, "N",
string_format("number of tokens to draft for speculative decoding (default: %d)", params.speculative.n_max),
[](common_params & params, int value) {
params.speculative.n_max = value;
@@ -3022,11 +3358,19 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.speculative.devices = parse_device_list(value);
}
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
+ GGML_ASSERT(params.speculative.n_gpu_layers < 0); // string_format would need to be extended for a default >= 0
add_opt(common_arg(
{"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N",
- "number of layers to store in VRAM for the draft model",
- [](common_params & params, int value) {
- params.speculative.n_gpu_layers = value;
+ string_format("max. number of draft model layers to store in VRAM, either an exact number, 'auto', or 'all' (default: %s)",
+ params.speculative.n_gpu_layers == -1 ? "auto" : "all"),
+ [](common_params & params, const std::string & value) {
+ if (value == "auto") {
+ params.speculative.n_gpu_layers = -1;
+ } else if (value == "all") {
+ params.speculative.n_gpu_layers = -2;
+ } else {
+ params.speculative.n_gpu_layers = std::stoi(value);
+ }
if (!llama_supports_gpu_offload()) {
fprintf(stderr, "warning: no usable GPU found, --gpu-layers-draft option will be ignored\n");
fprintf(stderr, "warning: one possible reason is that llama.cpp was compiled without GPU support\n");
@@ -3176,6 +3520,27 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
}
).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+ add_opt(common_arg(
+ {"--save-logits"},
+ string_format("save final logits to files for verification (default: %s)", params.save_logits ? "true" : "false"),
+ [](common_params & params) {
+ params.save_logits = true;
+ }
+ ).set_examples({LLAMA_EXAMPLE_DEBUG}));
+ add_opt(common_arg(
+ {"--logits-output-dir"}, "PATH",
+ string_format("directory for saving logits output files (default: %s)", params.logits_output_dir.c_str()),
+ [](common_params & params, const std::string & value) {
+ params.logits_output_dir = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_DEBUG}));
+ add_opt(common_arg(
+ {"--tensor-filter"}, "REGEX",
+ "filter tensor names for debug output (regex pattern, can be specified multiple times)",
+ [](common_params & params, const std::string & value) {
+ params.tensor_filter.push_back(value);
+ }
+ ).set_examples({LLAMA_EXAMPLE_DEBUG}));
// presets
add_opt(common_arg(
@@ -3356,3 +3721,24 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
return ctx_arg;
}
+
+void common_params_add_preset_options(std::vector & args) {
+ // arguments below won't be treated as CLI args, only preset options
+ args.push_back(common_arg(
+ {"load-on-startup"}, "NAME",
+ "in server router mode, autoload this model on startup",
+ [](common_params &, const std::string &) { /* unused */ }
+ ).set_env(COMMON_ARG_PRESET_LOAD_ON_STARTUP).set_preset_only());
+
+ args.push_back(common_arg(
+ {"stop-timeout"}, "SECONDS",
+ "in server router mode, force-kill model instance after this many seconds of graceful shutdown",
+ [](common_params &, int) { /* unused */ }
+ ).set_env(COMMON_ARG_PRESET_STOP_TIMEOUT).set_preset_only());
+
+ // args.push_back(common_arg(
+ // {"pin"},
+ // "in server router mode, do not unload this model if models_max is exceeded",
+ // [](common_params &) { /* unused */ }
+ // ).set_preset_only());
+}
diff --git a/common/arg.h b/common/arg.h
index 1321595c1..55782a158 100644
--- a/common/arg.h
+++ b/common/arg.h
@@ -8,6 +8,10 @@
#include
#include
+// pseudo-env variable to identify preset-only arguments
+#define COMMON_ARG_PRESET_LOAD_ON_STARTUP "__PRESET_LOAD_ON_STARTUP"
+#define COMMON_ARG_PRESET_STOP_TIMEOUT "__PRESET_STOP_TIMEOUT"
+
//
// CLI argument parsing
//
@@ -22,6 +26,7 @@ struct common_arg {
const char * env = nullptr;
std::string help;
bool is_sparam = false; // is current arg a sampling param?
+ bool is_preset_only = false; // is current arg preset-only (not treated as CLI arg)
void (*handler_void) (common_params & params) = nullptr;
void (*handler_string) (common_params & params, const std::string &) = nullptr;
void (*handler_str_str)(common_params & params, const std::string &, const std::string &) = nullptr;
@@ -70,6 +75,7 @@ struct common_arg {
common_arg & set_excludes(std::initializer_list excludes);
common_arg & set_env(const char * env);
common_arg & set_sparam();
+ common_arg & set_preset_only();
bool in_example(enum llama_example ex);
bool is_exclude(enum llama_example ex);
bool get_value_from_env(std::string & output) const;
@@ -114,16 +120,12 @@ struct common_params_context {
bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
// parse input arguments from CLI into a map
-// TODO: support repeated args in the future
bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map & out_map);
+// populate preset-only arguments
+// these arguments are not treated as command line arguments
+// see: https://github.com/ggml-org/llama.cpp/issues/18163
+void common_params_add_preset_options(std::vector & args);
+
// initialize argument parser context - used by test-arg-parser and preset
common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
-
-struct common_remote_params {
- std::vector headers;
- long timeout = 0; // CURLOPT_TIMEOUT, in seconds ; 0 means no timeout
- long max_size = 0; // max size of the response ; unlimited if 0 ; max is 2GB
-};
-// get remote file content, returns
-std::pair> common_remote_get_content(const std::string & url, const common_remote_params & params);
diff --git a/common/chat-parser.cpp b/common/chat-parser.cpp
index d740dac06..29819e48d 100644
--- a/common/chat-parser.cpp
+++ b/common/chat-parser.cpp
@@ -129,7 +129,7 @@ static void parse_json_tool_calls(
}
}
-common_chat_msg_parser::common_chat_msg_parser(const std::string & input, bool is_partial, const common_chat_syntax & syntax)
+common_chat_msg_parser::common_chat_msg_parser(const std::string & input, bool is_partial, const common_chat_parser_params & syntax)
: input_(input), is_partial_(is_partial), syntax_(syntax)
{
result_.role = "assistant";
@@ -1395,6 +1395,126 @@ static void common_chat_parse_seed_oss(common_chat_msg_parser & builder) {
builder.consume_reasoning_with_xml_tool_calls(form, "", "");
}
+static void common_chat_parse_solar_open(common_chat_msg_parser & builder) {
+ builder.try_parse_reasoning("<|think|>", "<|end|><|begin|>assistant<|content|>");
+
+ // TODO: Tool calling
+
+ builder.add_content(builder.consume_rest());
+}
+
+static void common_chat_parse_exaone_moe_content(common_chat_msg_parser & builder) {
+ // 1) { "name": "...", "arguments": {...} }
+ // 2) { "id": "...", "type": "function", "function": { "name": "...", "arguments": {...} } }
+ static const common_regex tool_call_open(R"(]*>)");
+
+ if (!builder.syntax().parse_tool_calls) {
+ LOG_DBG("%s: not parse_tool_calls\n", __func__);
+ builder.add_content(builder.consume_rest());
+ return;
+ }
+
+ LOG_DBG("%s: parse_tool_calls\n", __func__);
+
+ // Find all blocks
+ while (auto first = builder.try_find_regex(tool_call_open, std::string::npos, /* add_prelude_to_content= */ true)) {
+ builder.move_to(first->groups[0].end);
+ builder.consume_spaces();
+
+ builder.try_consume_literal("```json");
+ builder.try_consume_literal("```");
+ builder.consume_spaces();
+
+ // Consume JSON object
+ auto data = builder.consume_json();
+
+ builder.consume_spaces();
+ builder.try_consume_literal("```");
+ builder.consume_spaces();
+
+ if (!builder.try_consume_literal("")) {
+ throw common_chat_msg_partial_exception("incomplete tool call");
+ }
+ builder.consume_spaces();
+
+ // Extract name and arguments
+ std::string name;
+ std::string id;
+ nlohmann::ordered_json arguments;
+
+ const auto extract_args = [&](const nlohmann::ordered_json & obj) -> bool {
+ if (!obj.contains("name") || !obj.contains("arguments")) {
+ return false;
+ }
+ name = obj.at("name").get();
+ arguments = obj.at("arguments");
+ if (obj.contains("id") && obj.at("id").is_string()) {
+ id = obj.at("id").get();
+ }
+ return true;
+ };
+
+ if (!extract_args(data.json)) {
+ if (data.json.contains("function") && data.json.at("function").is_object()) {
+ auto fn = data.json.at("function");
+ extract_args(fn);
+ if (id.empty() && data.json.contains("id") && data.json.at("id").is_string()) {
+ id = data.json.at("id").get();
+ }
+ }
+ }
+
+ // If name is empty, treat the JSON object as content
+ if (name.empty()) {
+ LOG_DBG("%s: tool call missing name, treating as content\n", __func__);
+ builder.add_content(data.json.dump());
+ continue;
+ }
+
+ std::string args_str = arguments.dump();
+ if (!builder.add_tool_call(name, id, args_str)) {
+ throw common_chat_msg_partial_exception("incomplete tool call");
+ }
+ }
+
+ builder.add_content(builder.consume_rest());
+}
+
+static void common_chat_parse_exaone_moe(common_chat_msg_parser & builder) {
+ LOG_DBG("%s: parsing exaone_moe\n", __func__);
+ // EXAONE MoE outputs reasoning content between "" and "" tags, followed by regular content
+ // First try to parse using the standard reasoning parsing method
+ LOG_DBG("%s: thinking_forced_open: %s\n", __func__, std::to_string(builder.syntax().thinking_forced_open).c_str());
+
+ auto start_pos = builder.pos();
+ auto found_end_think = builder.try_find_literal("");
+ builder.move_to(start_pos);
+
+ if (builder.syntax().thinking_forced_open && !builder.is_partial() && !found_end_think) {
+ LOG_DBG("%s: no end_think, not partial, adding content\n", __func__);
+ common_chat_parse_exaone_moe_content(builder);
+ } else if (builder.try_parse_reasoning("", "")) {
+ // If reasoning was parsed successfully, the remaining content is regular content
+ LOG_DBG("%s: parsed reasoning, adding content\n", __func__);
+ common_chat_parse_exaone_moe_content(builder);
+ } else {
+ if (builder.syntax().reasoning_format == COMMON_REASONING_FORMAT_NONE) {
+ LOG_DBG("%s: reasoning_format none, adding content\n", __func__);
+ common_chat_parse_exaone_moe_content(builder);
+ return;
+ }
+ // If no reasoning tags found, check if we should treat everything as reasoning
+ if (builder.syntax().thinking_forced_open) {
+ // If thinking is forced open but no tags found, treat everything as reasoning
+ LOG_DBG("%s: thinking_forced_open, adding reasoning content\n", __func__);
+ builder.add_reasoning_content(builder.consume_rest());
+ } else {
+ LOG_DBG("%s: no thinking_forced_open, adding content\n", __func__);
+ common_chat_parse_exaone_moe_content(builder);
+ }
+ }
+}
+
static void common_chat_parse_content_only(common_chat_msg_parser & builder) {
builder.try_parse_reasoning("", "");
builder.add_content(builder.consume_rest());
@@ -1479,13 +1599,19 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
case COMMON_CHAT_FORMAT_XIAOMI_MIMO:
common_chat_parse_xiaomi_mimo(builder);
break;
+ case COMMON_CHAT_FORMAT_SOLAR_OPEN:
+ common_chat_parse_solar_open(builder);
+ break;
+ case COMMON_CHAT_FORMAT_EXAONE_MOE:
+ common_chat_parse_exaone_moe(builder);
+ break;
default:
throw std::runtime_error(std::string("Unsupported format: ") + common_chat_format_name(builder.syntax().format));
}
builder.finish();
}
-common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_syntax & syntax) {
+common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_parser_params & syntax) {
if (syntax.format == COMMON_CHAT_FORMAT_PEG_SIMPLE ||
syntax.format == COMMON_CHAT_FORMAT_PEG_NATIVE ||
syntax.format == COMMON_CHAT_FORMAT_PEG_CONSTRUCTED) {
@@ -1504,12 +1630,12 @@ common_chat_msg common_chat_parse(const std::string & input, bool is_partial, co
}
auto msg = builder.result();
if (!is_partial) {
- LOG_DBG("Parsed message: %s\n", common_chat_msgs_to_json_oaicompat({msg}).at(0).dump().c_str());
+ LOG_DBG("Parsed message: %s\n", common_chat_msgs_to_json_oaicompat({msg}).at(0).dump().c_str());
}
return msg;
}
-common_chat_msg common_chat_peg_parse(const common_peg_arena & parser, const std::string & input, bool is_partial, const common_chat_syntax & syntax) {
+common_chat_msg common_chat_peg_parse(const common_peg_arena & parser, const std::string & input, bool is_partial, const common_chat_parser_params & syntax) {
if (parser.empty()) {
throw std::runtime_error("Failed to parse due to missing parser definition.");
}
@@ -1537,7 +1663,7 @@ common_chat_msg common_chat_peg_parse(const common_peg_arena & parser, const std
mapper.from_ast(ctx.ast, result);
}
if (!is_partial) {
- LOG_DBG("Parsed message: %s\n", common_chat_msgs_to_json_oaicompat({msg}).at(0).dump().c_str());
+ LOG_DBG("Parsed message: %s\n", common_chat_msgs_to_json_oaicompat({msg}).at(0).dump().c_str());
}
return msg;
}
diff --git a/common/chat-parser.h b/common/chat-parser.h
index 78c4b74c2..3ed9c30a2 100644
--- a/common/chat-parser.h
+++ b/common/chat-parser.h
@@ -5,7 +5,7 @@
#include "json-partial.h"
#include "regex-partial.h"
-#include
+#include
#include
#include
@@ -19,20 +19,20 @@ class common_chat_msg_partial_exception : public std::runtime_error {
class common_chat_msg_parser {
std::string input_;
bool is_partial_;
- common_chat_syntax syntax_;
+ common_chat_parser_params syntax_; // TODO: rename to params
std::string healing_marker_;
size_t pos_ = 0;
common_chat_msg result_;
public:
- common_chat_msg_parser(const std::string & input, bool is_partial, const common_chat_syntax & syntax);
+ common_chat_msg_parser(const std::string & input, bool is_partial, const common_chat_parser_params & syntax);
const std::string & input() const { return input_; }
size_t pos() const { return pos_; }
const std::string & healing_marker() const { return healing_marker_; }
const bool & is_partial() const { return is_partial_; }
const common_chat_msg & result() const { return result_; }
- const common_chat_syntax & syntax() const { return syntax_; }
+ const common_chat_parser_params & syntax() const { return syntax_; }
void move_to(size_t pos) {
if (pos > input_.size()) {
diff --git a/common/chat-peg-parser.cpp b/common/chat-peg-parser.cpp
index 74a7b6a46..1bcba9cd8 100644
--- a/common/chat-peg-parser.cpp
+++ b/common/chat-peg-parser.cpp
@@ -4,9 +4,14 @@
using json = nlohmann::json;
-static std::string_view trim_trailing_space(std::string_view sv) {
+static std::string_view trim_trailing_space(std::string_view sv, int max = -1) {
+ int count = 0;
while (!sv.empty() && std::isspace(static_cast(sv.back()))) {
+ if (max != -1 && count <= max) {
+ break;
+ }
sv.remove_suffix(1);
+ count++;
}
return sv;
}
@@ -93,7 +98,7 @@ void common_chat_peg_constructed_mapper::map(const common_peg_ast_node & node) {
if (is_arg_string && current_tool) {
// Serialize to JSON, but exclude the end quote
- std::string dumped = json(node.text).dump();
+ std::string dumped = json(trim_trailing_space(node.text)).dump();
current_tool->arguments += dumped.substr(0, dumped.size() - 1);
needs_closing_quote = true;
}
@@ -101,6 +106,7 @@ void common_chat_peg_constructed_mapper::map(const common_peg_ast_node & node) {
if (is_arg_close && current_tool) {
if (needs_closing_quote) {
current_tool->arguments += "\"";
+ needs_closing_quote = false;
}
}
@@ -109,6 +115,10 @@ void common_chat_peg_constructed_mapper::map(const common_peg_ast_node & node) {
}
if (is_tool_close && current_tool) {
+ if (needs_closing_quote) {
+ current_tool->arguments += "\"";
+ needs_closing_quote = false;
+ }
current_tool->arguments += "}";
}
}
diff --git a/common/chat.cpp b/common/chat.cpp
index c371edaa5..aba26e97a 100644
--- a/common/chat.cpp
+++ b/common/chat.cpp
@@ -7,8 +7,10 @@
#include "log.h"
#include "regex-partial.h"
-#include
-#include
+#include "jinja/parser.h"
+#include "jinja/value.h"
+#include "jinja/runtime.h"
+#include "jinja/caps.h"
#include
#include
@@ -51,39 +53,73 @@ static bool has_content_or_tool_calls(const common_chat_msg & msg) {
return !msg.content.empty() || !msg.tool_calls.empty();
}
-template <>
-json common_chat_msg::to_json_oaicompat() const
-{
- json message {
- {"role", "assistant"},
- };
- if (!reasoning_content.empty()) {
- message["reasoning_content"] = reasoning_content;
+json common_chat_msg::to_json_oaicompat(bool concat_typed_text) const {
+ if (!content.empty() && !content_parts.empty()) {
+ throw std::runtime_error("Cannot specify both content and content_parts");
}
- if (content.empty() && !tool_calls.empty()) {
- message["content"] = json();
+ json jmsg {
+ {"role", role},
+ };
+ if (!content.empty()) {
+ jmsg["content"] = content;
+ } else if (!content_parts.empty()) {
+ if (concat_typed_text) {
+ std::string text;
+ for (const auto & part : content_parts) {
+ if (part.type != "text") {
+ LOG_WRN("Ignoring content part type: %s\n", part.type.c_str());
+ continue;
+ }
+ if (!text.empty()) {
+ text += '\n';
+ }
+ text += part.text;
+ }
+ jmsg["content"] = text;
+ } else {
+ auto & parts = jmsg["content"] = json::array();
+ for (const auto & part : content_parts) {
+ parts.push_back({
+ {"type", part.type},
+ {"text", part.text},
+ });
+ }
+ }
} else {
- message["content"] = content;
+ jmsg["content"] = "";
+ }
+ if (!reasoning_content.empty()) {
+ jmsg["reasoning_content"] = reasoning_content;
+ }
+ if (!tool_name.empty()) {
+ jmsg["name"] = tool_name;
+ }
+ if (!tool_call_id.empty()) {
+ jmsg["tool_call_id"] = tool_call_id;
}
if (!tool_calls.empty()) {
- auto arr = json::array();
- for (const auto & tc : tool_calls) {
- arr.push_back({
+ jmsg["tool_calls"] = json::array();
+ auto & jtool_calls = jmsg["tool_calls"];
+ for (const auto & tool_call : tool_calls) {
+ json tc {
{"type", "function"},
{"function", {
- {"name", tc.name},
- {"arguments", tc.arguments},
+ {"name", tool_call.name},
+ {"arguments", tool_call.arguments},
}},
- {"id", tc.id},
- // // Some templates generate and require an id (sometimes in a very specific format, e.g. Mistral Nemo).
- // // We only generate a random id for the ones that don't generate one by themselves
- // // (they also won't get to see it as their template likely doesn't use it, so it's all for the client)
- // {"id", tc.id.empty() ? gen_tool_call_id() : tc.id},
- });
+ };
+ if (!tool_call.id.empty()) {
+ tc["id"] = tool_call.id;
+ }
+ // Some templates generate and require an id (sometimes in a very specific format, e.g. Mistral Nemo).
+ // We only generate a random id for the ones that don't generate one by themselves
+ // (they also won't get to see it as their template likely doesn't use it, so it's all for the client)
+ // {"id", tc.id.empty() ? gen_tool_call_id() : tc.id},
+ jtool_calls.push_back(tc);
}
- message["tool_calls"] = arr;
}
- return message;
+
+ return jmsg;
}
std::vector common_chat_msg_diff::compute_diffs(const common_chat_msg & msg_prv, const common_chat_msg & msg_new) {
@@ -135,7 +171,68 @@ std::vector common_chat_msg_diff::compute_diffs(const comm
return diffs;
}
-typedef minja::chat_template common_chat_template;
+using chat_template_caps = jinja::caps;
+
+struct common_chat_template {
+ jinja::program prog;
+ std::string bos_tok;
+ std::string eos_tok;
+ std::string src;
+ chat_template_caps caps;
+
+ common_chat_template(const std::string & src, const std::string & bos_token, const std::string & eos_token) {
+ jinja::lexer lexer;
+ auto lexer_res = lexer.tokenize(src);
+ this->prog = jinja::parse_from_tokens(lexer_res);
+
+ this->src = lexer_res.source;
+ this->bos_tok = bos_token;
+ this->eos_tok = eos_token;
+
+ this->caps = jinja::caps_get(prog);
+ // LOG_INF("%s: caps:\n%s\n", __func__, this->caps.to_string().c_str());
+ }
+
+ const std::string & source() const { return src; }
+ const std::string & bos_token() const { return bos_tok; }
+ const std::string & eos_token() const { return eos_tok; }
+
+ // TODO: this is ugly, refactor it somehow
+ json add_system(const json & messages, const std::string & system_prompt) const {
+ GGML_ASSERT(messages.is_array());
+ auto msgs_copy = messages;
+ if (!caps.supports_system_role) {
+ if (msgs_copy.empty()) {
+ msgs_copy.insert(msgs_copy.begin(), json{
+ {"role", "user"},
+ {"content", system_prompt}
+ });
+ } else {
+ auto & first_msg = msgs_copy[0];
+ if (!first_msg.contains("content")) {
+ first_msg["content"] = "";
+ }
+ first_msg["content"] = system_prompt + "\n\n"
+ + first_msg["content"].get();
+ }
+ } else {
+ if (msgs_copy.empty() || msgs_copy[0].at("role") != "system") {
+ msgs_copy.insert(msgs_copy.begin(), json{
+ {"role", "system"},
+ {"content", system_prompt}
+ });
+ } else if (msgs_copy[0].at("role") == "system") {
+ msgs_copy[0]["content"] = system_prompt;
+ }
+ }
+ return msgs_copy;
+ }
+
+ chat_template_caps original_caps() const {
+ return caps;
+ }
+
+};
struct common_chat_templates {
bool add_bos;
@@ -161,6 +258,7 @@ struct templates_params {
bool add_bos;
bool add_eos;
bool is_inference = true;
+ bool mark_input = true; // whether to mark input strings in the jinja context
};
common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice) {
@@ -189,7 +287,6 @@ bool common_chat_templates_support_enable_thinking(const common_chat_templates *
return rendered_no_thinking.prompt != rendered_with_thinking.prompt;
}
-template <>
std::vector common_chat_msgs_parse_oaicompat(const json & messages) {
std::vector msgs;
@@ -283,80 +380,15 @@ std::vector common_chat_msgs_parse_oaicompat(const json & messa
return msgs;
}
-template <>
json common_chat_msgs_to_json_oaicompat(const std::vector & msgs, bool concat_typed_text) {
json messages = json::array();
for (const auto & msg : msgs) {
- if (!msg.content.empty() && !msg.content_parts.empty()) {
- throw std::runtime_error("Cannot specify both content and content_parts");
- }
- json jmsg {
- {"role", msg.role},
- };
- if (!msg.content.empty()) {
- jmsg["content"] = msg.content;
- } else if (!msg.content_parts.empty()) {
- if (concat_typed_text) {
- std::string text;
- for (const auto & part : msg.content_parts) {
- if (part.type != "text") {
- LOG_WRN("Ignoring content part type: %s\n", part.type.c_str());
- continue;
- }
- if (!text.empty()) {
- text += '\n';
- }
- text += part.text;
- }
- jmsg["content"] = text;
- } else {
- auto & parts = jmsg["content"] = json::array();
- for (const auto & part : msg.content_parts) {
- parts.push_back({
- {"type", part.type},
- {"text", part.text},
- });
- }
- }
- } else {
- jmsg["content"] = json(); // null
- }
- if (!msg.reasoning_content.empty()) {
- jmsg["reasoning_content"] = msg.reasoning_content;
- }
- if (!msg.tool_name.empty()) {
- jmsg["name"] = msg.tool_name;
- }
- if (!msg.tool_call_id.empty()) {
- jmsg["tool_call_id"] = msg.tool_call_id;
- }
- if (!msg.tool_calls.empty()) {
- auto & tool_calls = jmsg["tool_calls"] = json::array();
- for (const auto & tool_call : msg.tool_calls) {
- json tc {
- {"type", "function"},
- {"function", {
- {"name", tool_call.name},
- {"arguments", tool_call.arguments},
- }},
- };
- if (!tool_call.id.empty()) {
- tc["id"] = tool_call.id;
- }
- tool_calls.push_back(tc);
- }
- }
+ json jmsg = msg.to_json_oaicompat(concat_typed_text);
messages.push_back(jmsg);
}
return messages;
}
-template <>
-std::vector common_chat_msgs_parse_oaicompat(const std::string & messages) {
- return common_chat_msgs_parse_oaicompat(json::parse(messages));
-}
-
-template <>
std::vector common_chat_tools_parse_oaicompat(const json & tools) {
std::vector result;
@@ -380,8 +412,8 @@ std::vector common_chat_tools_parse_oaicompat(const json & too
const auto & function = tool.at("function");
result.push_back({
/* .name = */ function.at("name"),
- /* .description = */ function.at("description"),
- /* .parameters = */ function.at("parameters").dump(),
+ /* .description = */ function.value("description", ""),
+ /* .parameters = */ function.value("parameters", json::object()).dump(),
});
}
}
@@ -392,12 +424,6 @@ std::vector common_chat_tools_parse_oaicompat(const json & too
return result;
}
-template <>
-std::vector common_chat_tools_parse_oaicompat(const std::string & tools) {
- return common_chat_tools_parse_oaicompat(json::parse(tools));
-}
-
-template <>
json common_chat_tools_to_json_oaicompat(const std::vector & tools) {
if (tools.empty()) {
return json();
@@ -417,7 +443,7 @@ json common_chat_tools_to_json_oaicompat(const std::vector & t
return result;
}
-template <> json common_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff) {
+json common_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff) {
json delta = json::object();
if (!diff.reasoning_content_delta.empty()) {
delta["reasoning_content"] = diff.reasoning_content_delta;
@@ -534,18 +560,18 @@ bool common_chat_templates_was_explicit(const struct common_chat_templates * tmp
return tmpls->has_explicit_template;
}
-const char * common_chat_templates_source(const struct common_chat_templates * tmpls, const char * variant) {
- if (variant != nullptr) {
- if (strcmp(variant, "tool_use") == 0) {
+std::string common_chat_templates_source(const struct common_chat_templates * tmpls, const std::string & variant) {
+ if (!variant.empty()) {
+ if (variant == "tool_use") {
if (tmpls->template_tool_use) {
- return tmpls->template_tool_use->source().c_str();
+ return tmpls->template_tool_use->source();
}
- return nullptr;
+ return "";
} else {
- LOG_DBG("%s: unknown template variant: %s\n", __func__, variant);
+ LOG_DBG("%s: unknown template variant: %s\n", __func__, variant.c_str());
}
}
- return tmpls->template_default->source().c_str();
+ return tmpls->template_default->source();
}
common_chat_templates_ptr common_chat_templates_init(
@@ -627,14 +653,16 @@ common_chat_templates_ptr common_chat_templates_init(
tmpls->add_bos = add_bos;
tmpls->add_eos = add_eos;
try {
- tmpls->template_default = std::make_unique(default_template_src, token_bos, token_eos);
+ tmpls->template_default = std::make_unique(default_template_src, token_bos, token_eos);
} catch (const std::exception & e) {
- LOG_ERR("%s: failed to parse chat template (defaulting to chatml): %s \n", __func__, e.what());
- tmpls->template_default = std::make_unique(CHATML_TEMPLATE_SRC, token_bos, token_eos);
+ LOG_ERR("%s: error: %s\n", __func__, e.what());
+ LOG_ERR("%s: failed to initialize chat template\n", __func__);
+ LOG_ERR("%s: please consider disabling jinja via --no-jinja, or using another chat template\n", __func__);
+ throw e;
}
if (!template_tool_use_src.empty()) {
try {
- tmpls->template_tool_use = std::make_unique(template_tool_use_src, token_bos, token_eos);
+ tmpls->template_tool_use = std::make_unique(template_tool_use_src, token_bos, token_eos);
} catch (const std::exception & e) {
LOG_ERR("%s: failed to parse tool use chat template (ignoring it): %s\n", __func__, e.what());
}
@@ -669,6 +697,8 @@ const char * common_chat_format_name(common_chat_format format) {
case COMMON_CHAT_FORMAT_QWEN3_CODER_XML: return "Qwen3 Coder";
case COMMON_CHAT_FORMAT_APRIEL_1_5: return "Apriel 1.5";
case COMMON_CHAT_FORMAT_XIAOMI_MIMO: return "Xiaomi MiMo";
+ case COMMON_CHAT_FORMAT_SOLAR_OPEN: return "Solar Open";
+ case COMMON_CHAT_FORMAT_EXAONE_MOE: return "EXAONE MoE";
case COMMON_CHAT_FORMAT_PEG_SIMPLE: return "peg-simple";
case COMMON_CHAT_FORMAT_PEG_NATIVE: return "peg-native";
case COMMON_CHAT_FORMAT_PEG_CONSTRUCTED: return "peg-constructed";
@@ -711,6 +741,25 @@ static void foreach_function(const json & tools, const std::function & fn) {
+ if (!function.contains("parameters") || !function.at("parameters").is_object()) {
+ return;
+ }
+ const auto & params = function.at("parameters");
+ if (!params.contains("properties") || !params.at("properties").is_object()) {
+ return;
+ }
+ const auto & props = params.at("properties");
+ std::set required;
+ if (params.contains("required") && params.at("required").is_array()) {
+ params.at("required").get_to(required);
+ }
+ for (const auto & [name, prop] : props.items()) {
+ bool is_required = (required.find(name) != required.end());
+ fn(name, prop, is_required);
+ }
+}
+
static std::string apply(
const common_chat_template & tmpl,
const struct templates_params & inputs,
@@ -718,27 +767,43 @@ static std::string apply(
const std::optional & tools_override = std::nullopt,
const std::optional & additional_context = std::nullopt)
{
- minja::chat_template_inputs tmpl_inputs;
- tmpl_inputs.messages = messages_override ? *messages_override : inputs.messages;
- if (tools_override) {
- tmpl_inputs.tools = *tools_override;
- } else {
- tmpl_inputs.tools = inputs.tools.empty() ? json() : inputs.tools;
- }
- tmpl_inputs.add_generation_prompt = inputs.add_generation_prompt;
- tmpl_inputs.extra_context = inputs.extra_context;
- tmpl_inputs.extra_context["enable_thinking"] = inputs.enable_thinking;
- if (additional_context) {
- tmpl_inputs.extra_context.merge_patch(*additional_context);
- }
- // TODO: add flag to control date/time, if only for testing purposes.
- // tmpl_inputs.now = std::chrono::system_clock::now();
+ jinja::context ctx(tmpl.source());
- minja::chat_template_options tmpl_opts;
- // To avoid double BOS / EOS tokens, we're manually removing begining / trailing tokens
- // instead of using `chat_template_options.use_bos_token = false`, since these tokens
- // may be needed inside the template / between messages too.
- auto result = tmpl.apply(tmpl_inputs, tmpl_opts);
+ nlohmann::ordered_json inp = nlohmann::ordered_json{
+ {"messages", messages_override.has_value() ? *messages_override : inputs.messages},
+ {"tools", tools_override.has_value() ? *tools_override : inputs.tools},
+ {"bos_token", tmpl.bos_token()},
+ {"eos_token", tmpl.eos_token()},
+ };
+ if (inputs.extra_context.is_object()) {
+ // TODO: do we need to merge, or replacing is fine?
+ for (const auto & [k, v] : inputs.extra_context.items()) {
+ inp[k] = v;
+ }
+ }
+ if (additional_context.has_value()) {
+ // TODO: merge properly instead of overwriting (matching old behavior)
+ for (const auto & [k, v] : additional_context->items()) {
+ inp[k] = v;
+ }
+ }
+ if (inputs.add_generation_prompt) {
+ inp["add_generation_prompt"] = true;
+ }
+ if (inp["tools"].is_null()) {
+ inp["tools"] = json::array();
+ }
+
+ jinja::global_from_json(ctx, inp, inputs.mark_input);
+
+ // render
+ jinja::runtime runtime(ctx);
+ const jinja::value results = runtime.execute(tmpl.prog);
+ auto parts = runtime.gather_string_parts(results);
+
+ std::string result = parts->as_string().str();
+
+ // TODO: improve this later
if (inputs.add_bos && string_starts_with(result, tmpl.bos_token())) {
result = result.substr(tmpl.bos_token().size());
}
@@ -825,10 +890,17 @@ static common_chat_params common_chat_params_init_generic(const common_chat_temp
builder.add_schema("root", schema);
});
- auto tweaked_messages = common_chat_template::add_system(
+ auto tweaked_messages = tmpl.add_system(
inputs.messages,
"Respond in JSON format, either with `tool_call` (a request to call tools) or with `response` reply to the user's request");
+ // ensure all messages has "content" field
+ for (auto & message : tweaked_messages) {
+ if (!message.contains("content") || message["content"].is_null()) {
+ message["content"] = "";
+ }
+ }
+
data.prompt = apply(tmpl, inputs, /* messages_override= */ tweaked_messages);
data.format = COMMON_CHAT_FORMAT_GENERIC;
return data;
@@ -1343,7 +1415,7 @@ static common_chat_params common_chat_params_init_llama_3_x(const common_chat_te
data.prompt = apply(tmpl, inputs, /* messages_override =*/ std::nullopt, /* tools_override= */ std::nullopt, json {
{"date_string", format_time(inputs.now, "%d %b %Y")},
{"tools_in_user_message", false},
- {"builtin_tools", builtin_tools.empty() ? json() : builtin_tools},
+ {"builtin_tools", builtin_tools},
});
return data;
}
@@ -1409,6 +1481,123 @@ static common_chat_params common_chat_params_init_nemotron_v2(const common_chat_
return data;
}
+static common_chat_params common_chat_params_init_nemotron_v3(const common_chat_template & tmpl, const struct templates_params & inputs) {
+ common_chat_params data;
+
+ data.prompt = apply(tmpl, inputs);
+ data.format = COMMON_CHAT_FORMAT_PEG_CONSTRUCTED;
+
+ // Handle thinking tags appropriately based on inputs.enable_thinking
+ if (string_ends_with(data.prompt, "\n")) {
+ if (!inputs.enable_thinking) {
+ data.prompt += "";
+ } else {
+ data.thinking_forced_open = true;
+ }
+ }
+
+ data.preserved_tokens = {
+ "",
+ "",
+ "",
+ "",
+ };
+
+ auto has_tools = inputs.tools.is_array() && !inputs.tools.empty();
+ auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
+ auto include_grammar = true;
+
+ auto parser = build_chat_peg_constructed_parser([&](auto & p) {
+ auto reasoning = p.eps();
+ if (inputs.enable_thinking && extract_reasoning) {
+ auto reasoning_content = p.reasoning(p.until("")) + ("" | p.end());
+ if (data.thinking_forced_open) {
+ reasoning = reasoning_content;
+ }
+ }
+
+ // Response format parser
+ if (inputs.json_schema.is_object() && !inputs.json_schema.empty()) {
+ return reasoning << p.content(p.schema(p.json(), "response-format", inputs.json_schema));
+ }
+
+ // Tool call parser
+ if (has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE) {
+ auto tool_choice = p.choice();
+ foreach_function(inputs.tools, [&](const json & tool) {
+ const auto & function = tool.at("function");
+ std::string name = function.at("name");
+ auto parameters = function.at("parameters");
+
+ auto schema_info = common_schema_info();
+ schema_info.resolve_refs(parameters);
+
+ auto tool_open = "\n";
+ auto tool_close = p.literal("\n");
+ auto args = p.sequence();
+ auto arg_string = p.rule("xml-arg-string", p.until_one_of({
+ "\n",
+ "\n"
+ }));
+
+ foreach_parameter(function, [&](const auto & param_name, const json & param_schema, bool is_required) {
+ auto rule_name = "tool-" + name + "-arg-" + param_name;
+
+ auto arg_open = "\n";
+ auto arg_close = p.literal("\n");
+ auto arg_value = p.eps();
+
+ if (schema_info.resolves_to_string(param_schema)) {
+ arg_value = p.tool_arg_string_value(arg_string) + "\n";
+ } else {
+ arg_value = p.tool_arg_json_value(p.schema(p.json(), rule_name + "-schema", param_schema));
+ }
+
+ // Model may or my not close with
+ auto arg_rule = p.rule(rule_name, p.tool_arg_open(arg_open) + arg_value + p.optional(p.tool_arg_close(arg_close)));
+ args += p.repeat(arg_rule, /* min = */ is_required ? 1 : 0, /* max = */ 1);
+ });
+
+ tool_choice |= p.rule("tool-" + name, p.tool_open(tool_open) + args + p.tool_close(tool_close));
+ });
+
+ auto min_calls = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED ? 1 : 0;
+ auto max_calls = inputs.parallel_tool_calls ? -1 : 1;
+ auto tool_call = p.rule("tool-call", "\n" + tool_choice + "" + p.space());
+ auto tool_calls = p.trigger_rule("tool-call-root", p.repeat(tool_call, /* min = */ min_calls, /* max = */ max_calls));
+
+ return reasoning << p.content(p.until("")) << tool_calls;
+ }
+
+ // Content only parser
+ include_grammar = false;
+ return reasoning << p.content(p.rest());
+ });
+
+ data.parser = parser.save();
+
+ if (include_grammar) {
+ data.grammar_lazy = has_tools && inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_AUTO;
+
+ data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+ foreach_function(inputs.tools, [&](const json & tool) {
+ const auto & function = tool.at("function");
+ auto schema = function.at("parameters");
+ builder.resolve_refs(schema);
+ });
+ parser.build_grammar(builder, data.grammar_lazy);
+ });
+
+ data.grammar_triggers = {
+ {COMMON_GRAMMAR_TRIGGER_TYPE_WORD, ""}
+ };
+ }
+
+ return data;
+}
+
+
static common_chat_params common_chat_params_init_apertus(const common_chat_template & tmpl, const struct templates_params & inputs) {
common_chat_params data;
@@ -1928,7 +2117,7 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
// Trigger on tool calls that appear in the commentary channel
data.grammar_triggers.push_back({
COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
- "<\\|channel\\|>(commentary|analysis) to"
+ "<\\|channel\\|>(?:commentary|analysis) to"
});
// Trigger tool calls that appear in the role section, either at the
@@ -2261,17 +2450,17 @@ static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat
(inputs.parallel_tool_calls ? "(" + tool_call + ")+" : tool_call));
// Trigger on some common known "good bad" outputs (only from the start and with a json that's about a specific argument name to avoid false positives)
data.grammar_triggers.push_back({
- COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
+ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
// If thinking_forced_open, then we capture the tag in the grammar,
// (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar)
- std::string(data.thinking_forced_open ? "[\\s\\S]*?(\\s*)" : "(?:[\\s\\S]*?\\s*)?") + (
+ std::string(data.thinking_forced_open ? "(\\s*)" : "") + (
"\\s*("
"(?:"
"||||)?"
"\\s*\\{\\s*\"name\"\\s*:\\s*\"(?:" + string_join(escaped_names, "|") + ")\""
")"
- ")[\\s\\S]*"
+ ")"
),
});
data.preserved_tokens = {
@@ -2381,6 +2570,125 @@ static common_chat_params common_chat_params_init_granite(const common_chat_temp
return data;
}
+static common_chat_params common_chat_params_init_solar_open(const common_chat_template & tmpl, const struct templates_params & inputs) {
+ common_chat_params data;
+
+ // TODO: Reasoning effort
+ json additional_context = {};
+
+ data.prompt = apply(tmpl, inputs, std::nullopt, std::nullopt, additional_context);
+ data.format = COMMON_CHAT_FORMAT_SOLAR_OPEN;
+
+ data.preserved_tokens = {
+ "<|think|>",
+ "<|content|>",
+ "<|begin|>",
+ "<|end|>",
+ };
+
+ // TODO: Tool calling
+
+ return data;
+}
+
+static common_chat_params common_chat_params_init_exaone_moe(const common_chat_template & tmpl, const struct templates_params & inputs) {
+ common_chat_params data;
+
+ data.prompt = apply(tmpl, inputs);
+ data.format = COMMON_CHAT_FORMAT_EXAONE_MOE;
+ if (string_ends_with(data.prompt, "\n")) {
+ if (!inputs.enable_thinking) {
+ data.prompt += "\n\n";
+ } else {
+ data.thinking_forced_open = true;
+ }
+ }
+
+ if (inputs.tools.is_array() && !inputs.tools.empty()) {
+ data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED && inputs.json_schema.is_null();
+ data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+ std::vector tool_rules;
+ foreach_function(inputs.tools, [&](const json & tool) {
+ const auto & function = tool.at("function");
+ std::string name = function.at("name");
+ auto parameters = function.at("parameters");
+ builder.resolve_refs(parameters);
+ // Expect: {"name": "", "arguments": {...}}
+ tool_rules.push_back(builder.add_rule(
+ name + "-call",
+ "\"\" space " +
+ builder.add_schema(name + "-obj", json{
+ {"type", "object"},
+ {"properties", {
+ {"name", json{{"const", name}}},
+ {"arguments", parameters},
+ }},
+ {"required", json::array({"name", "arguments"})},
+ }) +
+ " space \"\" space"));
+ });
+
+ auto tool_call = builder.add_rule("tool_call", string_join(tool_rules, " | "));
+ builder.add_rule("root",
+ std::string(data.thinking_forced_open ? "( \"\" space )? " : "") +
+ (inputs.parallel_tool_calls ? "(" + tool_call + ")+" : tool_call));
+
+ data.grammar_triggers.push_back({
+ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
+ std::string(data.thinking_forced_open ? "[\\s\\S]*?(\\s*)?" : "") +
+ "()[\\s\\S]*"
+ });
+ data.preserved_tokens = {
+ "",
+ "",
+ "",
+ "",
+ };
+ });
+ }
+
+ return data;
+}
+
+static common_chat_params common_chat_params_init_translate_gemma(const common_chat_template & tmpl, const struct templates_params & inputs) {
+ common_chat_params data;
+
+ // This template does not support tools or reasoning
+ // we just need to transform the messages into the correct schema
+
+ templates_params inputs_new = inputs;
+ json & messages = inputs_new.messages;
+
+ GGML_ASSERT(messages.is_array());
+ for (auto & message : messages) {
+ if (message.contains("role") && message["role"].get() != "user") {
+ continue;
+ }
+ if (!message.contains("content")) {
+ message["content"] = json::array();
+ }
+ if (message.contains("content") && !message["content"].is_array()) {
+ auto content_str = message["content"].get();
+ // default to en-GB if not specified (to make common_chat_format_example works)
+ auto src_lang = message.contains("source_lang_code") ? message["source_lang_code"].get() : "en-GB";
+ auto tgt_lang = message.contains("target_lang_code") ? message["target_lang_code"].get() : "en-GB";
+ message["content"] = json::array({
+ json{
+ {"type", "text"},
+ {"text", content_str},
+ {"source_lang_code", src_lang},
+ {"target_lang_code", tgt_lang},
+ }
+ });
+ }
+ }
+
+ data.prompt = apply(tmpl, inputs_new, std::nullopt, std::nullopt);
+ data.format = COMMON_CHAT_FORMAT_GENERIC;
+
+ return data;
+}
+
static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct templates_params & inputs) {
common_chat_params data;
data.prompt = apply(tmpl, inputs);
@@ -2451,18 +2759,119 @@ static common_chat_params common_chat_params_init_seed_oss(
return data;
}
+// various workarounds for known issues with certain templates or model behaviors
+// TODO @ngxson : improve this (how?)
+namespace workaround {
+
+// if first message is system and template does not support it, merge it with next message
+static void system_message_not_supported(json & messages) {
+ if (!messages.empty() && messages.front().at("role") == "system") {
+ if (messages.size() > 1) {
+ LOG_DBG("Merging system prompt into next message\n");
+ auto & first_msg = messages.front();
+ auto & second_msg = messages[1];
+ second_msg["content"] = first_msg.at("content").get()
+ + "\n" + second_msg.at("content").get();
+ messages.erase(messages.begin());
+ } else {
+ LOG_WRN("Removing system prompt due to template not supporting system role\n");
+ messages.erase(messages.begin());
+ }
+ }
+}
+
+static void func_args_not_string(json & messages) {
+ GGML_ASSERT(messages.is_array());
+ for (auto & message : messages) {
+ if (message.contains("tool_calls")) {
+ for (auto & tool_call : message["tool_calls"]) {
+ if (tool_call.contains("function") && tool_call["function"].contains("arguments")) {
+ auto & args = tool_call["function"]["arguments"];
+ if (args.is_string()) {
+ try {
+ args = json::parse(args.get());
+ } catch (const std::exception & e) {
+ throw std::runtime_error("Failed to parse tool call arguments as JSON: " + std::string(e.what()));
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+static void move_tool_calls_to_content(json & messages, int indent_spaces = 2) {
+ GGML_ASSERT(messages.is_array());
+ for (auto & message : messages) {
+ if (message.contains("tool_calls")) {
+ auto tool_calls_new = json{
+ {"tool_calls", message.at("tool_calls")}
+ };
+ message.erase("tool_calls");
+ auto content = message.at("content");
+ std::string content_new = content.is_null() ? "" : content.get();
+ message["content"] = content_new + tool_calls_new.dump(indent_spaces, ' ', false, json::error_handler_t::replace);
+ }
+ }
+}
+
+// TODO @ngxson : we may remove support for generic schema in the future
+static void use_generic_schema(json & messages) {
+ GGML_ASSERT(messages.is_array());
+ for (auto & message : messages) {
+ if (message.contains("tool_calls") && message.at("tool_calls").is_array()) {
+ auto & tool_calls = message.at("tool_calls");
+ for (auto & tool_call : tool_calls) {
+ if (tool_call.contains("type") && tool_call.at("type") == "function" &&
+ tool_call.contains("function") && tool_call.at("function").is_object()) {
+ // Copy values before erasing to avoid use-after-free
+ json name_value;
+ json arguments_value;
+ json id_value;
+ const auto & function = tool_call.at("function");
+ if (function.contains("name")) {
+ name_value = function.at("name");
+ }
+ if (function.contains("arguments")) {
+ arguments_value = function.at("arguments");
+ }
+ if (tool_call.contains("id")) {
+ id_value = tool_call.at("id");
+ }
+ // Now safely erase and assign in the correct order
+ tool_call.erase("type");
+ tool_call.erase("function");
+ tool_call.erase("id");
+ // Reassign in desired order: name, arguments, id
+ if (!name_value.is_null()) {
+ tool_call["name"] = name_value;
+ }
+ if (!arguments_value.is_null()) {
+ tool_call["arguments"] = arguments_value;
+ }
+ if (!id_value.is_null()) {
+ tool_call["id"] = id_value;
+ }
+ }
+ }
+ }
+ }
+}
+
+} // namespace workaround
+
static common_chat_params common_chat_templates_apply_jinja(
const struct common_chat_templates * tmpls,
const struct common_chat_templates_inputs & inputs)
{
templates_params params;
- params.tools = common_chat_tools_to_json_oaicompat(inputs.tools);
+ params.tools = common_chat_tools_to_json_oaicompat(inputs.tools);
const auto & tmpl = params.tools.is_array() && tmpls->template_tool_use
? *tmpls->template_tool_use
: *tmpls->template_default;
const auto & src = tmpl.source();
const auto & caps = tmpl.original_caps();
- params.messages = common_chat_msgs_to_json_oaicompat(inputs.messages, /* concat_text= */ !tmpl.original_caps().requires_typed_content);
+ params.messages = common_chat_msgs_to_json_oaicompat(inputs.messages, /* concat_text= */ !tmpl.original_caps().requires_typed_content);
params.add_generation_prompt = inputs.add_generation_prompt;
params.tool_choice = inputs.tool_choice;
params.reasoning_format = inputs.reasoning_format;
@@ -2472,6 +2881,10 @@ static common_chat_params common_chat_templates_apply_jinja(
params.add_bos = tmpls->add_bos;
params.add_eos = tmpls->add_eos;
+ if (!tmpl.original_caps().supports_system_role) {
+ workaround::system_message_not_supported(params.messages);
+ }
+
params.extra_context = json::object();
for (auto el : inputs.chat_template_kwargs) {
params.extra_context[el.first] = json::parse(el.second);
@@ -2510,11 +2923,15 @@ static common_chat_params common_chat_templates_apply_jinja(
// Command R7B: : use handler in all cases except json schema (thinking / tools).
if (src.find("<|END_THINKING|><|START_ACTION|>") != std::string::npos && params.json_schema.is_null()) {
+ workaround::func_args_not_string(params.messages);
return common_chat_params_init_command_r7b(tmpl, params);
}
// Granite (IBM) - detects thinking / tools support
if (src.find("elif thinking") != std::string::npos && src.find("<|tool_call|>") != std::string::npos) {
+ workaround::func_args_not_string(params.messages);
+ workaround::use_generic_schema(params.messages);
+ workaround::move_tool_calls_to_content(params.messages);
return common_chat_params_init_granite(tmpl, params);
}
@@ -2523,6 +2940,11 @@ static common_chat_params common_chat_templates_apply_jinja(
src.find("") != std::string::npos &&
src.find("") != std::string::npos &&
params.json_schema.is_null()) {
+ workaround::func_args_not_string(params.messages);
+ if (!params.extra_context.contains("clear_thinking")) {
+ // by default, do not clear reasoning_content (added since GLM-4.7)
+ params.extra_context["clear_thinking"] = false;
+ }
return common_chat_params_init_glm_4_5(tmpl, params);
}
@@ -2534,6 +2956,11 @@ static common_chat_params common_chat_templates_apply_jinja(
src.find("") != std::string::npos &&
src.find("") != std::string::npos) {
+ return common_chat_params_init_nemotron_v3(tmpl, params);
+ }
return common_chat_params_init_qwen3_coder_xml(tmpl, params);
}
@@ -2547,6 +2974,13 @@ static common_chat_params common_chat_templates_apply_jinja(
return common_chat_params_init_xiaomi_mimo(tmpl, params);
}
+ // EXAONE MoE format detection
+ if (src.find("") != std::string::npos &&
+ src.find("") != std::string::npos &&
+ src.find("<|tool_declare|>") != std::string::npos) {
+ return common_chat_params_init_exaone_moe(tmpl, params);
+ }
+
// Hermes 2/3 Pro, Qwen 2.5 Instruct (w/ tools)
if (src.find("") != std::string::npos && params.json_schema.is_null()) {
return common_chat_params_init_hermes_2_pro(tmpl, params);
@@ -2559,6 +2993,7 @@ static common_chat_params common_chat_templates_apply_jinja(
// Seed-OSS
if (src.find("") != std::string::npos) {
+ workaround::func_args_not_string(params.messages);
return common_chat_params_init_seed_oss(tmpl, params, inputs);
}
@@ -2580,6 +3015,7 @@ static common_chat_params common_chat_templates_apply_jinja(
// MiniMax-M2 format detection
if (src.find("]~!b[") != std::string::npos && src.find("]~b]") != std::string::npos) {
+ workaround::func_args_not_string(params.messages);
return common_chat_params_init_minimax_m2(tmpl, params);
}
@@ -2626,6 +3062,7 @@ static common_chat_params common_chat_templates_apply_jinja(
// Llama 3.1, 3.2, 3.3 (also requires date_string so using it even w/o tools)
if (src.find("<|start_header_id|>ipython<|end_header_id|>") != std::string::npos) {
auto allow_python_tag_builtin_tools = src.find("<|python_tag|>") != std::string::npos;
+ workaround::func_args_not_string(params.messages);
return common_chat_params_init_llama_3_x(tmpl, params, allow_python_tag_builtin_tools);
}
@@ -2640,6 +3077,19 @@ static common_chat_params common_chat_templates_apply_jinja(
return common_chat_params_init_magistral(tmpl, params);
}
+ // Solar Open
+ if (src.find("<|tool_response:begin|>") != std::string::npos &&
+ src.find("<|tool_response:name|>") != std::string::npos &&
+ src.find("<|tool_response:result|>") != std::string::npos) {
+ return common_chat_params_init_solar_open(tmpl, params);
+ }
+
+ // TranslateGemma
+ if (src.find("[source_lang_code]") != std::string::npos &&
+ src.find("[target_lang_code]") != std::string::npos) {
+ return common_chat_params_init_translate_gemma(tmpl, params);
+ }
+
// Plain handler (no tools)
if (params.tools.is_null() || inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_NONE) {
return common_chat_params_init_without_tools(tmpl, params);
@@ -2647,10 +3097,14 @@ static common_chat_params common_chat_templates_apply_jinja(
// Mistral Nemo (w/ tools)
if (src.find("[TOOL_CALLS]") != std::string::npos) {
+ workaround::func_args_not_string(params.messages);
return common_chat_params_init_mistral_nemo(tmpl, params);
}
// Generic fallback
+ workaround::func_args_not_string(params.messages);
+ workaround::use_generic_schema(params.messages);
+ workaround::move_tool_calls_to_content(params.messages);
return common_chat_params_init_generic(tmpl, params);
}
@@ -2728,3 +3182,9 @@ common_chat_params common_chat_templates_apply(
? common_chat_templates_apply_jinja(tmpls, inputs)
: common_chat_templates_apply_legacy(tmpls, inputs);
}
+
+std::map common_chat_templates_get_caps(const common_chat_templates * chat_templates) {
+ GGML_ASSERT(chat_templates != nullptr);
+ GGML_ASSERT(chat_templates->template_default != nullptr);
+ return chat_templates->template_default->caps.to_map();
+}
diff --git a/common/chat.h b/common/chat.h
index 6085510a4..24aa4aab5 100644
--- a/common/chat.h
+++ b/common/chat.h
@@ -10,6 +10,8 @@
#include
#include