diff --git a/.clang-format b/.clang-format
index 45232b80e..47d96b6b4 100644
--- a/.clang-format
+++ b/.clang-format
@@ -22,8 +22,8 @@ AllowShortIfStatementsOnASingleLine: Never
AllowShortLambdasOnASingleLine: Inline
AllowShortLoopsOnASingleLine: false
AlwaysBreakBeforeMultilineStrings: true
-BinPackArguments: true
-BinPackParameters: true # OnePerLine
+BinPackArguments: false
+BinPackParameters: false # OnePerLine
BitFieldColonSpacing: Both
BreakBeforeBraces: Custom # Attach
BraceWrapping:
@@ -70,15 +70,18 @@ ExperimentalAutoDetectBinPacking: false
FixNamespaceComments: true
IncludeBlocks: Regroup
IncludeCategories:
- - Regex: '^<.*\.h>'
+ - Regex: '".*"'
Priority: 1
SortPriority: 0
- - Regex: '^<.*'
+ - Regex: '^<.*\.h>'
Priority: 2
SortPriority: 0
- - Regex: '.*'
+ - Regex: '^<.*'
Priority: 3
SortPriority: 0
+ - Regex: '.*'
+ Priority: 4
+ SortPriority: 0
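+# Example of the resulting grouping (illustrative):
+#   #include "llama.h"   (priority 1: quoted headers)
+#   #include <ggml.h>    (priority 2: angle-bracket .h headers)
+#   #include <vector>    (priority 3: other angle-bracket headers)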
IncludeIsMainRegex: '([-_](test|unittest))?$'
IncludeIsMainSourceRegex: ''
IndentAccessModifiers: false
diff --git a/.devops/cann.Dockerfile b/.devops/cann.Dockerfile
new file mode 100644
index 000000000..02f3e03b5
--- /dev/null
+++ b/.devops/cann.Dockerfile
@@ -0,0 +1,130 @@
+# ==============================================================================
+# ARGUMENTS
+# ==============================================================================
+
+# Define the CANN base image for easier version updates later
+ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.1.rc1-910b-openeuler22.03-py3.10
+
+# ==============================================================================
+# BUILD STAGE
+# Compile all binary files and libraries
+# ==============================================================================
+FROM ${CANN_BASE_IMAGE} AS build
+
+# Define the Ascend chip model for compilation. Default is Ascend910B3
+ARG ASCEND_SOC_TYPE=Ascend910B3
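+
+# Example build (values are illustrative; use the SOC type that matches your
+# hardware, e.g. as reported by `npu-smi info`):
+#   docker build -f .devops/cann.Dockerfile --build-arg ASCEND_SOC_TYPE=Ascend910B3 --target server -t llama.cpp:cann-server .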
+
+# -- Install build dependencies --
+RUN yum install -y gcc g++ cmake make git libcurl-devel python3 python3-pip && \
+ yum clean all && \
+ rm -rf /var/cache/yum
+
+# -- Set the working directory --
+WORKDIR /app
+
+# -- Copy project files --
+COPY . .
+
+# -- Set CANN environment variables (required for compilation) --
+# Using ENV instead of `source` makes these variables persist across all subsequent layers and in the final image
+ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
+ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${LD_LIBRARY_PATH}
+ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${PATH}
+ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
+ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH
+# Additional environment variables from the toolkit's set_env.sh can be appended here as needed;
+# only the core variables are listed for brevity.
+
+# -- Build llama.cpp --
+# Use the passed ASCEND_SOC_TYPE argument and add general build options
+RUN source /usr/local/Ascend/ascend-toolkit/set_env.sh --force \
+ && \
+ cmake -B build \
+ -DGGML_CANN=ON \
+ -DCMAKE_BUILD_TYPE=Release \
+ -DSOC_TYPE=${ASCEND_SOC_TYPE} \
+ . && \
+ cmake --build build --config Release -j$(nproc)
+
+# -- Organize build artifacts for copying in later stages --
+# Create a lib directory to store all .so files
+RUN mkdir -p /app/lib && \
+ find build -name "*.so" -exec cp {} /app/lib \;
+
+# Create a full directory to store all executables and Python scripts
+RUN mkdir -p /app/full && \
+ cp build/bin/* /app/full/ && \
+ cp *.py /app/full/ && \
+ cp -r gguf-py /app/full/ && \
+ cp -r requirements /app/full/ && \
+ cp requirements.txt /app/full/
+ # If you have a tools.sh script, make sure it is copied here
+ # cp .devops/tools.sh /app/full/tools.sh
+
+# ==============================================================================
+# BASE STAGE
+# Create a minimal base image with CANN runtime and common libraries
+# ==============================================================================
+FROM ${CANN_BASE_IMAGE} AS base
+
+# -- Install runtime dependencies --
+RUN yum install -y libgomp curl && \
+ yum clean all && \
+ rm -rf /var/cache/yum
+
+# -- Set CANN environment variables (required for runtime) --
+ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
+ENV LD_LIBRARY_PATH=/app:${ASCEND_TOOLKIT_HOME}/lib64:${LD_LIBRARY_PATH}
+ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${PATH}
+ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
+# Additional runtime environment variables from set_env.sh can be appended here as needed
+
+WORKDIR /app
+
+# Copy compiled .so files from the build stage
+COPY --from=build /app/lib/ /app
+
+# ==============================================================================
+# FINAL STAGES (TARGETS)
+# ==============================================================================
+
+### Target: full
+# Complete image with all tools, Python bindings, and dependencies
+# ==============================================================================
+FROM base AS full
+
+COPY --from=build /app/full /app
+
+# Install Python dependencies
+RUN yum install -y git python3 python3-pip && \
+ pip3 install --no-cache-dir --upgrade pip setuptools wheel && \
+ pip3 install --no-cache-dir -r requirements.txt && \
+ yum clean all && \
+ rm -rf /var/cache/yum
+
+# A tools.sh script must be copied into the image (see the build stage) for this entrypoint to work
+ENTRYPOINT ["/app/tools.sh"]
+# If no tools.sh is available, start the server by default instead:
+# ENTRYPOINT ["/app/llama-server"]
+
+### Target: light
+# Lightweight image containing only llama-cli
+# ==============================================================================
+FROM base AS light
+
+COPY --from=build /app/full/llama-cli /app
+
+ENTRYPOINT [ "/app/llama-cli" ]
+
+### Target: server
+# Dedicated server image containing only llama-server
+# ==============================================================================
+FROM base AS server
+
+ENV LLAMA_ARG_HOST=0.0.0.0
+
+COPY --from=build /app/full/llama-server /app
+
+HEALTHCHECK --interval=5m CMD [ "curl", "-f", "http://localhost:8080/health" ]
+
+ENTRYPOINT [ "/app/llama-server" ]
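+
+# Example run for the server target (flags are illustrative; Ascend containers
+# typically also need the davinci devices and the host driver mounted):
+#   docker run --rm -p 8080:8080 --device /dev/davinci0 --device /dev/davinci_manager \
+#       -v /usr/local/Ascend/driver:/usr/local/Ascend/driver \
+#       llama.cpp:cann-server -m /models/model.gguf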
diff --git a/.devops/musa.Dockerfile b/.devops/musa.Dockerfile
index 87ce2393f..b0c86dccd 100644
--- a/.devops/musa.Dockerfile
+++ b/.devops/musa.Dockerfile
@@ -1,10 +1,10 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
-ARG MUSA_VERSION=rc4.0.1
+ARG MUSA_VERSION=rc4.2.0
# Target the MUSA build image
-ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-mudnn-devel-ubuntu${UBUNTU_VERSION}
+ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}-amd64
-ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-mudnn-runtime-ubuntu${UBUNTU_VERSION}
+ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}-amd64
FROM ${BASE_MUSA_DEV_CONTAINER} AS build
diff --git a/.devops/nix/package.nix b/.devops/nix/package.nix
index 6e8050a49..651a54db4 100644
--- a/.devops/nix/package.nix
+++ b/.devops/nix/package.nix
@@ -47,6 +47,7 @@ let
inherit (lib)
cmakeBool
cmakeFeature
+ optionalAttrs
optionals
strings
;
@@ -197,7 +198,7 @@ effectiveStdenv.mkDerivation (finalAttrs: {
];
# Environment variables needed for ROCm
- env = optionals useRocm {
+ env = optionalAttrs useRocm {
ROCM_PATH = "${rocmPackages.clr}";
HIP_DEVICE_LIB_PATH = "${rocmPackages.rocm-device-libs}/amdgcn/bitcode";
};
diff --git a/.devops/rocm.Dockerfile b/.devops/rocm.Dockerfile
index 1c00f1b9c..cf19e6e02 100644
--- a/.devops/rocm.Dockerfile
+++ b/.devops/rocm.Dockerfile
@@ -1,8 +1,8 @@
ARG UBUNTU_VERSION=24.04
# This needs to generally match the container host's environment.
-ARG ROCM_VERSION=6.3
-ARG AMDGPU_VERSION=6.3
+ARG ROCM_VERSION=6.4
+ARG AMDGPU_VERSION=6.4
# Target the CUDA build image
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
diff --git a/.devops/tools.sh b/.devops/tools.sh
index 41a6b1e55..8a3a69340 100755
--- a/.devops/tools.sh
+++ b/.devops/tools.sh
@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/usr/bin/env bash
set -e
# Read the first argument into a variable
diff --git a/.github/ISSUE_TEMPLATE/010-bug-compilation.yml b/.github/ISSUE_TEMPLATE/010-bug-compilation.yml
index b85bf5741..95a0b5cc7 100644
--- a/.github/ISSUE_TEMPLATE/010-bug-compilation.yml
+++ b/.github/ISSUE_TEMPLATE/010-bug-compilation.yml
@@ -40,7 +40,7 @@ body:
attributes:
label: GGML backends
description: Which GGML backends do you know to be affected?
- options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan]
+ options: [AMX, BLAS, CPU, CUDA, HIP, Metal, Musa, RPC, SYCL, Vulkan, OpenCL]
multiple: true
validations:
required: true
diff --git a/.github/ISSUE_TEMPLATE/011-bug-results.yml b/.github/ISSUE_TEMPLATE/011-bug-results.yml
index 1ccef0793..d1034bbb6 100644
--- a/.github/ISSUE_TEMPLATE/011-bug-results.yml
+++ b/.github/ISSUE_TEMPLATE/011-bug-results.yml
@@ -42,7 +42,7 @@ body:
attributes:
label: GGML backends
description: Which GGML backends do you know to be affected?
- options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan]
+ options: [AMX, BLAS, CPU, CUDA, HIP, Metal, Musa, RPC, SYCL, Vulkan, OpenCL]
multiple: true
validations:
required: true
diff --git a/.github/labeler.yml b/.github/labeler.yml
index 3c2f67707..df6a7a40e 100644
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@@ -1,10 +1,4 @@
# https://github.com/actions/labeler
-Kompute:
- - changed-files:
- - any-glob-to-any-file:
- - ggml/include/ggml-kompute.h
- - ggml/src/ggml-kompute/**
- - README-kompute.md
Apple Metal:
- changed-files:
- any-glob-to-any-file:
@@ -93,3 +87,8 @@ Ascend NPU:
- ggml/include/ggml-cann.h
- ggml/src/ggml-cann/**
- docs/backend/CANN.md
+OpenCL:
+ - changed-files:
+ - any-glob-to-any-file:
+ - ggml/include/ggml-opencl.h
+ - ggml/src/ggml-opencl/**
diff --git a/.github/workflows/build-linux-cross.yml b/.github/workflows/build-linux-cross.yml
index 7cfc82ba4..04ad187d3 100644
--- a/.github/workflows/build-linux-cross.yml
+++ b/.github/workflows/build-linux-cross.yml
@@ -48,98 +48,98 @@ jobs:
cmake --build build --config Release -j $(nproc)
- ubuntu-24-riscv64-vulkan-cross:
- runs-on: ubuntu-24.04
+ # ubuntu-24-riscv64-vulkan-cross:
+ # runs-on: ubuntu-24.04
- steps:
- - uses: actions/checkout@v4
- - name: Setup Riscv
- run: |
- sudo dpkg --add-architecture riscv64
+ # steps:
+ # - uses: actions/checkout@v4
+ # - name: Setup Riscv
+ # run: |
+ # sudo dpkg --add-architecture riscv64
- # Add arch-specific repositories for non-amd64 architectures
- cat << EOF | sudo tee /etc/apt/sources.list.d/riscv64-ports.list
- deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
- deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
- deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
- deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
- EOF
+ # # Add arch-specific repositories for non-amd64 architectures
+ # cat << EOF | sudo tee /etc/apt/sources.list.d/riscv64-ports.list
+ # deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
+ # deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
+ # deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
+ # deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
+ # EOF
- sudo apt-get update || true ;# Prevent failure due to missing URLs.
+ # sudo apt-get update || true ;# Prevent failure due to missing URLs.
- sudo apt-get install -y --no-install-recommends \
- build-essential \
- glslc \
- gcc-14-riscv64-linux-gnu \
- g++-14-riscv64-linux-gnu \
- libvulkan-dev:riscv64
+ # sudo apt-get install -y --no-install-recommends \
+ # build-essential \
+ # glslc \
+ # gcc-14-riscv64-linux-gnu \
+ # g++-14-riscv64-linux-gnu \
+ # libvulkan-dev:riscv64
- - name: Build
- run: |
- cmake -B build -DLLAMA_CURL=OFF \
- -DCMAKE_BUILD_TYPE=Release \
- -DGGML_VULKAN=ON \
- -DGGML_OPENMP=OFF \
- -DLLAMA_BUILD_EXAMPLES=ON \
- -DLLAMA_BUILD_TOOLS=ON \
- -DLLAMA_BUILD_TESTS=OFF \
- -DCMAKE_SYSTEM_NAME=Linux \
- -DCMAKE_SYSTEM_PROCESSOR=riscv64 \
- -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
- -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \
- -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
- -DCMAKE_FIND_ROOT_PATH=/usr/lib/riscv64-linux-gnu \
- -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
- -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
- -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
+ # - name: Build
+ # run: |
+ # cmake -B build -DLLAMA_CURL=OFF \
+ # -DCMAKE_BUILD_TYPE=Release \
+ # -DGGML_VULKAN=ON \
+ # -DGGML_OPENMP=OFF \
+ # -DLLAMA_BUILD_EXAMPLES=ON \
+ # -DLLAMA_BUILD_TOOLS=ON \
+ # -DLLAMA_BUILD_TESTS=OFF \
+ # -DCMAKE_SYSTEM_NAME=Linux \
+ # -DCMAKE_SYSTEM_PROCESSOR=riscv64 \
+ # -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
+ # -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \
+ # -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
+ # -DCMAKE_FIND_ROOT_PATH=/usr/lib/riscv64-linux-gnu \
+ # -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
+ # -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
+ # -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
- cmake --build build --config Release -j $(nproc)
+ # cmake --build build --config Release -j $(nproc)
- ubuntu-24-arm64-vulkan-cross:
- runs-on: ubuntu-24.04
+ # ubuntu-24-arm64-vulkan-cross:
+ # runs-on: ubuntu-24.04
- steps:
- - uses: actions/checkout@v4
- - name: Setup Arm64
- run: |
- sudo dpkg --add-architecture arm64
+ # steps:
+ # - uses: actions/checkout@v4
+ # - name: Setup Arm64
+ # run: |
+ # sudo dpkg --add-architecture arm64
- # Add arch-specific repositories for non-amd64 architectures
- cat << EOF | sudo tee /etc/apt/sources.list.d/arm64-ports.list
- deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
- deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
- deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
- deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
- EOF
+ # # Add arch-specific repositories for non-amd64 architectures
+ # cat << EOF | sudo tee /etc/apt/sources.list.d/arm64-ports.list
+ # deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
+ # deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
+ # deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
+ # deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
+ # EOF
- sudo apt-get update || true ;# Prevent failure due to missing URLs.
+ # sudo apt-get update || true ;# Prevent failure due to missing URLs.
- sudo apt-get install -y --no-install-recommends \
- build-essential \
- glslc \
- crossbuild-essential-arm64 \
- libvulkan-dev:arm64
+ # sudo apt-get install -y --no-install-recommends \
+ # build-essential \
+ # glslc \
+ # crossbuild-essential-arm64 \
+ # libvulkan-dev:arm64
- - name: Build
- run: |
- cmake -B build -DLLAMA_CURL=OFF \
- -DCMAKE_BUILD_TYPE=Release \
- -DGGML_VULKAN=ON \
- -DGGML_OPENMP=OFF \
- -DLLAMA_BUILD_EXAMPLES=ON \
- -DLLAMA_BUILD_TOOLS=ON \
- -DLLAMA_BUILD_TESTS=OFF \
- -DCMAKE_SYSTEM_NAME=Linux \
- -DCMAKE_SYSTEM_PROCESSOR=aarch64 \
- -DCMAKE_C_COMPILER=aarch64-linux-gnu-gcc \
- -DCMAKE_CXX_COMPILER=aarch64-linux-gnu-g++ \
- -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
- -DCMAKE_FIND_ROOT_PATH=/usr/lib/aarch64-linux-gnu \
- -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
- -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
- -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
+ # - name: Build
+ # run: |
+ # cmake -B build -DLLAMA_CURL=OFF \
+ # -DCMAKE_BUILD_TYPE=Release \
+ # -DGGML_VULKAN=ON \
+ # -DGGML_OPENMP=OFF \
+ # -DLLAMA_BUILD_EXAMPLES=ON \
+ # -DLLAMA_BUILD_TOOLS=ON \
+ # -DLLAMA_BUILD_TESTS=OFF \
+ # -DCMAKE_SYSTEM_NAME=Linux \
+ # -DCMAKE_SYSTEM_PROCESSOR=aarch64 \
+ # -DCMAKE_C_COMPILER=aarch64-linux-gnu-gcc \
+ # -DCMAKE_CXX_COMPILER=aarch64-linux-gnu-g++ \
+ # -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
+ # -DCMAKE_FIND_ROOT_PATH=/usr/lib/aarch64-linux-gnu \
+ # -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
+ # -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
+ # -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
- cmake --build build --config Release -j $(nproc)
+ # cmake --build build --config Release -j $(nproc)
ubuntu-24-ppc64el-cpu-cross:
runs-on: ubuntu-24.04
@@ -185,52 +185,52 @@ jobs:
cmake --build build --config Release -j $(nproc)
- ubuntu-24-ppc64el-vulkan-cross:
- runs-on: ubuntu-24.04
+ # ubuntu-24-ppc64el-vulkan-cross:
+ # runs-on: ubuntu-24.04
- steps:
- - uses: actions/checkout@v4
- - name: Setup PowerPC64le
- run: |
- sudo dpkg --add-architecture ppc64el
+ # steps:
+ # - uses: actions/checkout@v4
+ # - name: Setup PowerPC64le
+ # run: |
+ # sudo dpkg --add-architecture ppc64el
- # Add arch-specific repositories for non-amd64 architectures
- cat << EOF | sudo tee /etc/apt/sources.list.d/ppc64el-ports.list
- deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
- deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
- deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
- deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
- EOF
+ # # Add arch-specific repositories for non-amd64 architectures
+ # cat << EOF | sudo tee /etc/apt/sources.list.d/ppc64el-ports.list
+ # deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
+ # deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
+ # deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
+ # deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
+ # EOF
- sudo apt-get update || true ;# Prevent failure due to missing URLs.
+ # sudo apt-get update || true ;# Prevent failure due to missing URLs.
- sudo apt-get install -y --no-install-recommends \
- build-essential \
- glslc \
- gcc-14-powerpc64le-linux-gnu \
- g++-14-powerpc64le-linux-gnu \
- libvulkan-dev:ppc64el
+ # sudo apt-get install -y --no-install-recommends \
+ # build-essential \
+ # glslc \
+ # gcc-14-powerpc64le-linux-gnu \
+ # g++-14-powerpc64le-linux-gnu \
+ # libvulkan-dev:ppc64el
- - name: Build
- run: |
- cmake -B build -DLLAMA_CURL=OFF \
- -DCMAKE_BUILD_TYPE=Release \
- -DGGML_VULKAN=ON \
- -DGGML_OPENMP=OFF \
- -DLLAMA_BUILD_EXAMPLES=ON \
- -DLLAMA_BUILD_TOOLS=ON \
- -DLLAMA_BUILD_TESTS=OFF \
- -DCMAKE_SYSTEM_NAME=Linux \
- -DCMAKE_SYSTEM_PROCESSOR=ppc64 \
- -DCMAKE_C_COMPILER=powerpc64le-linux-gnu-gcc-14 \
- -DCMAKE_CXX_COMPILER=powerpc64le-linux-gnu-g++-14 \
- -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
- -DCMAKE_FIND_ROOT_PATH=/usr/lib/powerpc64le-linux-gnu \
- -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
- -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
- -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
+ # - name: Build
+ # run: |
+ # cmake -B build -DLLAMA_CURL=OFF \
+ # -DCMAKE_BUILD_TYPE=Release \
+ # -DGGML_VULKAN=ON \
+ # -DGGML_OPENMP=OFF \
+ # -DLLAMA_BUILD_EXAMPLES=ON \
+ # -DLLAMA_BUILD_TOOLS=ON \
+ # -DLLAMA_BUILD_TESTS=OFF \
+ # -DCMAKE_SYSTEM_NAME=Linux \
+ # -DCMAKE_SYSTEM_PROCESSOR=ppc64 \
+ # -DCMAKE_C_COMPILER=powerpc64le-linux-gnu-gcc-14 \
+ # -DCMAKE_CXX_COMPILER=powerpc64le-linux-gnu-g++-14 \
+ # -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
+ # -DCMAKE_FIND_ROOT_PATH=/usr/lib/powerpc64le-linux-gnu \
+ # -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
+ # -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
+ # -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
- cmake --build build --config Release -j $(nproc)
+ # cmake --build build --config Release -j $(nproc)
debian-13-loongarch64-cpu-cross:
runs-on: ubuntu-24.04
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 4feccf21e..3d4f837e2 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -84,7 +84,8 @@ jobs:
-DCMAKE_BUILD_RPATH="@loader_path" \
-DLLAMA_FATAL_WARNINGS=ON \
-DGGML_METAL_USE_BF16=ON \
- -DGGML_METAL_EMBED_LIBRARY=ON \
+ -DGGML_METAL_EMBED_LIBRARY=OFF \
+ -DGGML_METAL_SHADER_DEBUG=ON \
-DGGML_RPC=ON
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
@@ -134,6 +135,53 @@ jobs:
cd build
ctest -L main --verbose --timeout 900
+ macOS-latest-cmake-arm64-webgpu:
+ runs-on: macos-14
+
+ steps:
+ - name: Clone
+ id: checkout
+ uses: actions/checkout@v4
+
+ - name: ccache
+ uses: hendrikmuhs/ccache-action@v1.2.16
+ with:
+ key: macOS-latest-cmake-arm64-webgpu
+ evict-old-files: 1d
+
+ - name: Dependencies
+ id: depends
+ continue-on-error: true
+ run: |
+ brew update
+ brew install curl
+
+ - name: Dawn Dependency
+ id: dawn-depends
+ run: |
+ DAWN_VERSION="v1.0.0"
+ DAWN_OWNER="reeselevine"
+ DAWN_REPO="dawn"
+ DAWN_ASSET_NAME="Dawn-a1a6b45cced25a3b7f4fb491e0ae70796cc7f22b-macos-latest-Release.tar.gz"
+ echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}"
+ curl -L -o artifact.tar.gz \
+ "https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}"
+ mkdir dawn
+ tar -xvf artifact.tar.gz -C dawn --strip-components=1
+
+ - name: Build
+ id: cmake_build
+ run: |
+ export CMAKE_PREFIX_PATH=dawn
+ cmake -B build -DGGML_WEBGPU=ON -DGGML_METAL=OFF -DGGML_BLAS=OFF
+ cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
+
+ - name: Test
+ id: cmake_test
+ run: |
+ cd build
+ ctest -L main --verbose --timeout 900
+
ubuntu-cpu-cmake:
strategy:
matrix:
@@ -341,6 +389,56 @@ jobs:
cd build
export GGML_VK_VISIBLE_DEVICES=0
# This is using llvmpipe and runs slower than other backends
+ ctest -L main --verbose --timeout 4200
+
+ ubuntu-22-cmake-webgpu:
+ runs-on: ubuntu-22.04
+
+ steps:
+ - name: Clone
+ id: checkout
+ uses: actions/checkout@v4
+
+ - name: ccache
+ uses: hendrikmuhs/ccache-action@v1.2.16
+ with:
+ key: ubuntu-22-cmake-webgpu
+ evict-old-files: 1d
+
+ - name: Vulkan SDK Dependencies
+ id: vulkan-depends
+ run: |
+ wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add -
+ sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
+ sudo apt-get update -y
+ sudo apt-get install -y build-essential mesa-vulkan-drivers vulkan-sdk libcurl4-openssl-dev
+
+ - name: Dawn Dependency
+ id: dawn-depends
+ run: |
+ sudo apt-get install -y libxrandr-dev libxinerama-dev libxcursor-dev mesa-common-dev libx11-xcb-dev libxi-dev
+ DAWN_VERSION="v1.0.0"
+ DAWN_OWNER="reeselevine"
+ DAWN_REPO="dawn"
+ DAWN_ASSET_NAME="Dawn-a1a6b45cced25a3b7f4fb491e0ae70796cc7f22b-ubuntu-latest-Release.tar.gz"
+ echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}"
+ curl -L -o artifact.tar.gz \
+ "https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}"
+ mkdir dawn
+ tar -xvf artifact.tar.gz -C dawn --strip-components=1
+
+ - name: Build
+ id: cmake_build
+ run: |
+ export Dawn_DIR=dawn/lib64/cmake/Dawn
+ cmake -B build -DGGML_WEBGPU=ON
+ cmake --build build --config Release -j $(nproc)
+
+ - name: Test
+ id: cmake_test
+ run: |
+ cd build
+ # This is using llvmpipe and runs slower than other backends
ctest -L main --verbose --timeout 3600
ubuntu-22-cmake-hip:
@@ -385,7 +483,7 @@ jobs:
ubuntu-22-cmake-musa:
runs-on: ubuntu-22.04
- container: mthreads/musa:rc4.0.1-mudnn-devel-ubuntu22.04
+ container: mthreads/musa:rc4.2.0-devel-ubuntu22.04-amd64
steps:
- name: Clone
@@ -664,7 +762,7 @@ jobs:
./build-xcframework.sh
windows-msys2:
- runs-on: windows-latest
+ runs-on: windows-2025
strategy:
fail-fast: false
@@ -714,7 +812,7 @@ jobs:
cmake --build build --config ${{ matrix.build }} -j $(nproc)
windows-latest-cmake:
- runs-on: windows-latest
+ runs-on: windows-2025
env:
OPENBLAS_VERSION: 0.3.23
@@ -725,17 +823,20 @@ jobs:
matrix:
include:
- build: 'cpu-x64 (static)'
+ arch: 'x64'
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF'
- build: 'openblas-x64'
+ arch: 'x64'
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
- build: 'vulkan-x64'
+ arch: 'x64'
defines: '-DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_VULKAN=ON'
- build: 'llvm-arm64'
+ arch: 'arm64'
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON'
- build: 'llvm-arm64-opencl-adreno'
+ arch: 'arm64'
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON'
- # - build: 'kompute-x64'
- # defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON'
steps:
- name: Clone
@@ -749,12 +850,6 @@ jobs:
variant: ccache
evict-old-files: 1d
- - name: Clone Kompute submodule
- id: clone_kompute
- if: ${{ matrix.build == 'kompute-x64' }}
- run: |
- git submodule update --init ggml/src/ggml-kompute/kompute
-
- name: Download OpenBLAS
id: get_openblas
if: ${{ matrix.build == 'openblas-x64' }}
@@ -770,7 +865,7 @@ jobs:
- name: Install Vulkan SDK
id: get_vulkan
- if: ${{ matrix.build == 'kompute-x64' || matrix.build == 'vulkan-x64' }}
+ if: ${{ matrix.build == 'vulkan-x64' }}
run: |
curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/vulkansdk-windows-X64-${env:VULKAN_VERSION}.exe"
& "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
@@ -805,6 +900,8 @@ jobs:
- name: libCURL
id: get_libcurl
uses: ./.github/actions/windows-setup-curl
+ with:
+ architecture: ${{ matrix.arch == 'x64' && 'win64' || 'win64a' }}
- name: Build
id: cmake_build
@@ -825,7 +922,7 @@ jobs:
- name: Test
id: cmake_test
- if: ${{ matrix.build != 'llvm-arm64' && matrix.build != 'llvm-arm64-opencl-adreno' }}
+ if: ${{ matrix.arch == 'x64' }}
run: |
cd build
ctest -L main -C Release --verbose --timeout 900
@@ -930,7 +1027,7 @@ jobs:
cmake --build build --config Release
windows-latest-cmake-sycl:
- runs-on: windows-latest
+ runs-on: windows-2022
defaults:
run:
@@ -964,7 +1061,7 @@ jobs:
windows-latest-cmake-hip:
if: ${{ github.event.inputs.create_release != 'true' }}
- runs-on: windows-latest
+ runs-on: windows-2022
steps:
- name: Clone
diff --git a/.github/workflows/close-issue.yml b/.github/workflows/close-issue.yml
index 276a217d4..19e785474 100644
--- a/.github/workflows/close-issue.yml
+++ b/.github/workflows/close-issue.yml
@@ -17,7 +17,7 @@ jobs:
steps:
- uses: actions/stale@v5
with:
- exempt-issue-labels: "refactor,help wanted,good first issue,research,bug,roadmap"
+ exempt-issue-labels: "refactoring,help wanted,good first issue,research,bug,roadmap"
days-before-issue-stale: 30
days-before-issue-close: 14
stale-issue-label: "stale"
diff --git a/.github/workflows/pre-tokenizer-hashes.yml b/.github/workflows/pre-tokenizer-hashes.yml
new file mode 100644
index 000000000..dff998e23
--- /dev/null
+++ b/.github/workflows/pre-tokenizer-hashes.yml
@@ -0,0 +1,45 @@
+name: Check Pre-Tokenizer Hashes
+
+on:
+ push:
+ paths:
+ - 'convert_hf_to_gguf.py'
+ - 'convert_hf_to_gguf_update.py'
+ pull_request:
+ paths:
+ - 'convert_hf_to_gguf.py'
+ - 'convert_hf_to_gguf_update.py'
+
+jobs:
+ pre-tokenizer-hashes:
+ runs-on: ubuntu-latest
+
+ steps:
+ - name: Checkout repository
+ uses: actions/checkout@v4
+
+ - name: Set up Python
+ uses: actions/setup-python@v5
+ with:
+ python-version: '3.11'
+
+ - name: Install Python dependencies
+ run: |
+ python3 -m venv .venv
+ .venv/bin/pip install -r requirements/requirements-convert_hf_to_gguf_update.txt
+
+ - name: Update pre-tokenizer hashes
+ run: |
+ cp convert_hf_to_gguf.py /tmp
+ .venv/bin/python convert_hf_to_gguf_update.py --check-missing
+
+      - name: Check if committed pre-tokenizer hashes match the generated version
+ run: |
+ if ! diff -q convert_hf_to_gguf.py /tmp/convert_hf_to_gguf.py; then
+ echo "Model pre-tokenizer hashes (in convert_hf_to_gguf.py) do not match generated hashes (from convert_hf_to_gguf_update.py)."
+ echo "To fix: run ./convert_hf_to_gguf_update.py and commit the updated convert_hf_to_gguf.py along with your changes"
+ echo "Differences found:"
+ diff convert_hf_to_gguf.py /tmp/convert_hf_to_gguf.py || true
+ exit 1
+ fi
+ echo "Model pre-tokenizer hashes are up to date."
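+
+# Note: this check works because convert_hf_to_gguf_update.py regenerates the
+# hash table in convert_hf_to_gguf.py in place; the pristine copy saved to /tmp
+# above is the baseline for the comparison.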
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 64fff175e..4ed6126f4 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -49,7 +49,8 @@ jobs:
run: |
sysctl -a
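+          # CMAKE_INSTALL_RPATH together with CMAKE_BUILD_WITH_INSTALL_RPATH=ON bakes
+          # @loader_path into the binaries at build time, so the packaged artifacts
+          # can locate their dylibs without an install step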
cmake -B build \
- -DCMAKE_BUILD_RPATH="@loader_path" \
+ -DCMAKE_INSTALL_RPATH='@loader_path' \
+ -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
-DLLAMA_FATAL_WARNINGS=ON \
-DGGML_METAL_USE_BF16=ON \
-DGGML_METAL_EMBED_LIBRARY=ON \
@@ -103,7 +104,8 @@ jobs:
# Metal is disabled due to intermittent failures with Github runners not having a GPU:
# https://github.com/ggml-org/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
cmake -B build \
- -DCMAKE_BUILD_RPATH="@loader_path" \
+ -DCMAKE_INSTALL_RPATH='@loader_path' \
+ -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
-DLLAMA_FATAL_WARNINGS=ON \
-DGGML_METAL=OFF \
-DGGML_RPC=ON
@@ -160,6 +162,8 @@ jobs:
id: cmake_build
run: |
cmake -B build \
+ -DCMAKE_INSTALL_RPATH='$ORIGIN' \
+ -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
-DGGML_BACKEND_DL=ON \
-DGGML_NATIVE=OFF \
-DGGML_CPU_ALL_VARIANTS=ON \
@@ -211,6 +215,8 @@ jobs:
id: cmake_build
run: |
cmake -B build \
+ -DCMAKE_INSTALL_RPATH='$ORIGIN' \
+ -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
-DGGML_BACKEND_DL=ON \
-DGGML_NATIVE=OFF \
-DGGML_CPU_ALL_VARIANTS=ON \
@@ -235,7 +241,7 @@ jobs:
name: llama-bin-ubuntu-vulkan-x64.zip
windows-cpu:
- runs-on: windows-latest
+ runs-on: windows-2025
strategy:
matrix:
@@ -271,7 +277,7 @@ jobs:
env:
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
run: |
- call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" ${{ matrix.arch }}
+ call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" ${{ matrix.arch == 'x64' && 'x64' || 'amd64_arm64' }}
cmake -S . -B build -G "Ninja Multi-Config" ^
-D CMAKE_TOOLCHAIN_FILE=cmake/${{ matrix.arch }}-windows-llvm.cmake ^
-DGGML_NATIVE=OFF ^
@@ -288,7 +294,7 @@ jobs:
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
run: |
Copy-Item $env:CURL_PATH\bin\libcurl-${{ matrix.arch }}.dll .\build\bin\Release\
- Copy-Item "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Redist\MSVC\14.42.34433\debug_nonredist\${{ matrix.arch }}\Microsoft.VC143.OpenMP.LLVM\libomp140.${{ matrix.arch == 'x64' && 'x86_64' || 'aarch64' }}.dll" .\build\bin\Release\
+ Copy-Item "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Redist\MSVC\14.44.35112\debug_nonredist\${{ matrix.arch }}\Microsoft.VC143.OpenMP.LLVM\libomp140.${{ matrix.arch == 'x64' && 'x86_64' || 'aarch64' }}.dll" .\build\bin\Release\
7z a llama-bin-win-cpu-${{ matrix.arch }}.zip .\build\bin\Release\*
- name: Upload artifacts
@@ -298,7 +304,7 @@ jobs:
name: llama-bin-win-cpu-${{ matrix.arch }}.zip
windows:
- runs-on: windows-latest
+ runs-on: windows-2025
env:
OPENBLAS_VERSION: 0.3.23
@@ -448,7 +454,7 @@ jobs:
name: cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip
windows-sycl:
- runs-on: windows-latest
+ runs-on: windows-2022
defaults:
run:
@@ -520,7 +526,7 @@ jobs:
name: llama-bin-win-sycl-x64.zip
windows-hip:
- runs-on: windows-latest
+ runs-on: windows-2022
strategy:
matrix:
diff --git a/.github/workflows/update-ops-docs.yml b/.github/workflows/update-ops-docs.yml
new file mode 100644
index 000000000..c0218fa74
--- /dev/null
+++ b/.github/workflows/update-ops-docs.yml
@@ -0,0 +1,40 @@
+name: Update Operations Documentation
+
+on:
+ push:
+ paths:
+ - 'docs/ops/**'
+ - 'scripts/create_ops_docs.py'
+ pull_request:
+ paths:
+ - 'docs/ops/**'
+ - 'scripts/create_ops_docs.py'
+
+jobs:
+ update-ops-docs:
+ runs-on: ubuntu-latest
+
+ steps:
+ - name: Checkout repository
+ uses: actions/checkout@v4
+
+ - name: Set up Python
+ uses: actions/setup-python@v5
+ with:
+ python-version: '3.x'
+
+ - name: Generate operations documentation to temporary file
+ run: |
+ mkdir -p /tmp/ops_check
+ ./scripts/create_ops_docs.py /tmp/ops_check/ops.md
+
+ - name: Check if docs/ops.md matches generated version
+ run: |
+ if ! diff -q docs/ops.md /tmp/ops_check/ops.md; then
+ echo "Operations documentation (docs/ops.md) is not up to date with the backend CSV files."
+ echo "To fix: run ./scripts/create_ops_docs.py and commit the updated docs/ops.md along with your changes"
+ echo "Differences found:"
+ diff docs/ops.md /tmp/ops_check/ops.md || true
+ exit 1
+ fi
+ echo "Operations documentation is up to date."
diff --git a/.gitignore b/.gitignore
index f8ceb1560..f48ce4cac 100644
--- a/.gitignore
+++ b/.gitignore
@@ -82,6 +82,7 @@ models/*
models-mnt
!models/.editorconfig
!models/ggml-vocab-*.gguf*
+!models/templates
# Zig
zig-out/
diff --git a/.gitmodules b/.gitmodules
index 23ce5ff05..e69de29bb 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +0,0 @@
-[submodule "kompute"]
- path = ggml/src/ggml-kompute/kompute
- url = https://github.com/nomic-ai/kompute.git
diff --git a/CMakeLists.txt b/CMakeLists.txt
index d2becb04c..c79ccd09e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -120,7 +120,6 @@ endfunction()
llama_option_depr(FATAL_ERROR LLAMA_CUBLAS GGML_CUDA)
llama_option_depr(WARNING LLAMA_CUDA GGML_CUDA)
-llama_option_depr(WARNING LLAMA_KOMPUTE GGML_KOMPUTE)
llama_option_depr(WARNING LLAMA_METAL GGML_METAL)
llama_option_depr(WARNING LLAMA_METAL_EMBED_LIBRARY GGML_METAL_EMBED_LIBRARY)
llama_option_depr(WARNING LLAMA_NATIVE GGML_NATIVE)
diff --git a/CMakePresets.json b/CMakePresets.json
index e98447013..b5afeb3c0 100644
--- a/CMakePresets.json
+++ b/CMakePresets.json
@@ -55,6 +55,17 @@
"CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-apple-clang.cmake"
}
},
+ {
+ "name": "x64-linux-gcc", "hidden": true,
+ "cacheVariables": {
+ "CMAKE_C_COMPILER": "gcc",
+ "CMAKE_CXX_COMPILER": "g++"
+ }
+ },
+ { "name": "x64-linux-gcc-debug", "inherits": [ "base", "x64-linux-gcc", "debug" ] },
+ { "name": "x64-linux-gcc-release", "inherits": [ "base", "x64-linux-gcc", "release" ] },
+ { "name": "x64-linux-gcc-reldbg", "inherits": [ "base", "x64-linux-gcc", "reldbg" ] },
+ { "name": "x64-linux-gcc+static-release", "inherits": [ "base", "x64-linux-gcc", "release", "static" ] },
{ "name": "arm64-windows-llvm-debug", "inherits": [ "base", "arm64-windows-llvm", "debug" ] },
{ "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg" ] },
diff --git a/CODEOWNERS b/CODEOWNERS
index 3186f8eb1..4c0dd4b72 100644
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -9,3 +9,4 @@
/ggml/src/ggml-cuda/mmvq.* @JohannesGaessler
/ggml/src/ggml-opt.cpp @JohannesGaessler
/ggml/src/gguf.cpp @JohannesGaessler
+/ggml/src/ggml-vulkan/ @0cc4m
diff --git a/README.md b/README.md
index 90c7364df..954fff83d 100644
--- a/README.md
+++ b/README.md
@@ -6,9 +6,9 @@
[](https://github.com/ggml-org/llama.cpp/releases)
[](https://github.com/ggml-org/llama.cpp/actions/workflows/server.yml)
-[Roadmap](https://github.com/users/ggerganov/projects/7) / [Manifesto](https://github.com/ggml-org/llama.cpp/discussions/205) / [ggml](https://github.com/ggml-org/ggml)
+[Manifesto](https://github.com/ggml-org/llama.cpp/discussions/205) / [ggml](https://github.com/ggml-org/ggml) / [ops](https://github.com/ggml-org/llama.cpp/blob/master/docs/ops.md)
-Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++
+LLM inference in C/C++
## Recent API changes
@@ -17,10 +17,10 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
## Hot topics
-- 🔥 Multimodal support arrived in `llama-server`: [#12898](https://github.com/ggml-org/llama.cpp/pull/12898) | [documentation](./docs/multimodal.md)
-- A new binary `llama-mtmd-cli` is introduced to replace `llava-cli`, `minicpmv-cli`, `gemma3-cli` ([#13012](https://github.com/ggml-org/llama.cpp/pull/13012)) and `qwen2vl-cli` ([#13141](https://github.com/ggml-org/llama.cpp/pull/13141)), `libllava` will be deprecated
+- Support for the `gpt-oss` model with native MXFP4 format has been added | [PR](https://github.com/ggml-org/llama.cpp/pull/15091) | [Collaboration with NVIDIA](https://blogs.nvidia.com/blog/rtx-ai-garage-openai-oss) | [Comment](https://github.com/ggml-org/llama.cpp/discussions/15095)
+- Hot PRs: [All](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Apr+label%3Ahot+) | [Open](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Apr+label%3Ahot+is%3Aopen)
+- Multimodal support arrived in `llama-server`: [#12898](https://github.com/ggml-org/llama.cpp/pull/12898) | [documentation](./docs/multimodal.md)
- VS Code extension for FIM completions: https://github.com/ggml-org/llama.vscode
-- Universal [tool call support](./docs/function-calling.md) in `llama-server` https://github.com/ggml-org/llama.cpp/pull/9639
- Vim/Neovim plugin for FIM completions: https://github.com/ggml-org/llama.vim
- Introducing GGUF-my-LoRA https://github.com/ggml-org/llama.cpp/discussions/10123
- Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggml-org/llama.cpp/discussions/9669
@@ -134,6 +134,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
- [x] [GigaChat-20B-A3B](https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct)
- [X] [Trillion-7B-preview](https://huggingface.co/trillionlabs/Trillion-7B-preview)
- [x] [Ling models](https://huggingface.co/collections/inclusionAI/ling-67c51c85b34a7ea0aba94c32)
+- [x] [LFM2 models](https://huggingface.co/collections/LiquidAI/lfm2-686d721927015b2ad73eaa38)
#### Multimodal
@@ -269,6 +270,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
| [Vulkan](docs/build.md#vulkan) | GPU |
| [CANN](docs/build.md#cann) | Ascend NPU |
| [OpenCL](docs/backend/OPENCL.md) | Adreno GPU |
+| [WebGPU [In Progress]](docs/build.md#webgpu) | All |
| [RPC](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) | All |
## Obtaining and quantizing models
@@ -434,7 +436,7 @@ To learn more about model quantization, [read this documentation](tools/quantize
## [`llama-perplexity`](tools/perplexity)
-#### A tool for measuring the perplexity [^1][^2] (and other quality metrics) of a model over a given text.
+#### A tool for measuring the [perplexity](tools/perplexity/README.md) [^1] (and other quality metrics) of a model over a given text.
-
Measure the perplexity over a text file
@@ -457,8 +459,7 @@ To learn more about model quantization, [read this documentation](tools/quantize
-[^1]: [tools/perplexity/README.md](./tools/perplexity/README.md)
-[^2]: [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity)
+[^1]: [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity)
## [`llama-bench`](tools/llama-bench)
diff --git a/build-xcframework.sh b/build-xcframework.sh
index a08419a80..f813984db 100755
--- a/build-xcframework.sh
+++ b/build-xcframework.sh
@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/usr/bin/env bash
#
# Options
IOS_MIN_OS_VERSION=16.4
diff --git a/ci/README.md b/ci/README.md
index 6e297f1a8..8eebe988d 100644
--- a/ci/README.md
+++ b/ci/README.md
@@ -54,7 +54,7 @@ docker run --privileged -it \
-v $HOME/llama.cpp/ci-cache:/ci-cache \
-v $HOME/llama.cpp/ci-results:/ci-results \
-v $PWD:/ws -w /ws \
- mthreads/musa:rc4.0.1-mudnn-devel-ubuntu22.04
+ mthreads/musa:rc4.2.0-devel-ubuntu22.04-amd64
```
Inside the container, execute the following commands:
diff --git a/ci/run.sh b/ci/run.sh
index e1b777c30..4d3abf923 100755
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/usr/bin/env bash
#
# sample usage:
#
@@ -16,6 +16,9 @@
# # with VULKAN support
# GG_BUILD_VULKAN=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
#
+# # with WebGPU support
+# GG_BUILD_WEBGPU=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
+#
# # with MUSA support
# GG_BUILD_MUSA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
#
@@ -81,6 +84,10 @@ if [ ! -z ${GG_BUILD_VULKAN} ]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_VULKAN=1"
fi
+if [ ! -z ${GG_BUILD_WEBGPU} ]; then
+ CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_WEBGPU=1"
+fi
+
if [ ! -z ${GG_BUILD_MUSA} ]; then
# Use qy1 by default (MTT S80)
MUSA_ARCH=${MUSA_ARCH:-21}
diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt
index f43a630c9..0ae4d698f 100644
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -86,8 +86,7 @@ if (LLAMA_CURL)
endif()
target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_CURL)
include_directories(${CURL_INCLUDE_DIRS})
- find_library(CURL_LIBRARY curl REQUIRED)
- set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARY})
+ set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARIES})
endif ()
if (LLAMA_LLGUIDANCE)
@@ -112,13 +111,13 @@ if (LLAMA_LLGUIDANCE)
ExternalProject_Add(llguidance_ext
GIT_REPOSITORY https://github.com/guidance-ai/llguidance
- # v0.7.20 (+ fix to build on GCC 15):
- GIT_TAG b5b8b64dba11c4e4ee6b1d1450d3a3ae279891e8
+ # v1.0.1:
+ GIT_TAG d795912fedc7d393de740177ea9ea761e7905774
PREFIX ${CMAKE_BINARY_DIR}/llguidance
SOURCE_DIR ${LLGUIDANCE_SRC}
BUILD_IN_SOURCE TRUE
CONFIGURE_COMMAND ""
- BUILD_COMMAND cargo build --release
+ BUILD_COMMAND cargo build --release --package llguidance
INSTALL_COMMAND ""
BUILD_BYPRODUCTS ${LLGUIDANCE_PATH}/${LLGUIDANCE_LIB_NAME} ${LLGUIDANCE_PATH}/llguidance.h
UPDATE_COMMAND ""
diff --git a/common/arg.cpp b/common/arg.cpp
index c4ad85c47..0f01bb314 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -24,6 +24,7 @@
#include
#include
#include
+#include <list>
#include
#include
#include
@@ -977,6 +978,10 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
for (auto & seq_breaker : params.sampling.dry_sequence_breakers) {
string_process_escapes(seq_breaker);
}
+ for (auto & pair : params.speculative.replacements) {
+ string_process_escapes(pair.first);
+ string_process_escapes(pair.second);
+ }
}
if (!params.kv_overrides.empty()) {
@@ -1464,6 +1469,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.swa_full = true;
}
).set_env("LLAMA_ARG_SWA_FULL"));
+ add_opt(common_arg(
+ {"--kv-unified", "-kvu"},
+ string_format("use single unified KV buffer for the KV cache of all sequences (default: %s)\n"
+ "[(more info)](https://github.com/ggml-org/llama.cpp/pull/14363)", params.kv_unified ? "true" : "false"),
+ [](common_params & params) {
+ params.kv_unified = true;
+ }
+ ).set_env("LLAMA_ARG_KV_SPLIT"));
add_opt(common_arg(
{"--no-context-shift"},
string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
@@ -1604,7 +1617,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) {
params.antiprompt.emplace_back(value);
}
- ).set_examples({LLAMA_EXAMPLE_MAIN}));
+ ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"-sp", "--special"},
string_format("special tokens output enabled (default: %s)", params.special ? "true" : "false"),
@@ -2083,6 +2096,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.no_kv_offload = true;
}
).set_env("LLAMA_ARG_NO_KV_OFFLOAD"));
+ add_opt(common_arg(
+ {"-nr", "--no-repack"},
+ "disable weight repacking",
+ [](common_params & params) {
+ params.no_extra_bufts = true;
+ }
+ ).set_env("LLAMA_ARG_NO_REPACK"));
add_opt(common_arg(
{"-ctk", "--cache-type-k"}, "TYPE",
string_format(
@@ -2356,11 +2376,35 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
throw std::invalid_argument("unknown buffer type");
}
- // FIXME: this leaks memory
- params.tensor_buft_overrides.push_back({strdup(tensor_name.c_str()), buft_list.at(buffer_type)});
+                // keep strings alive and avoid leaking memory by storing them in a static list
+                static std::list<std::string> buft_overrides;
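+                // (unlike std::vector, std::list never relocates its elements, so the c_str() pointers stored below remain valid)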
+ buft_overrides.push_back(tensor_name);
+ params.tensor_buft_overrides.push_back({buft_overrides.back().c_str(), buft_list.at(buffer_type)});
}
}
));
+ add_opt(common_arg(
+ {"--cpu-moe", "-cmoe"},
+ "keep all Mixture of Experts (MoE) weights in the CPU",
+ [](common_params & params) {
+ params.tensor_buft_overrides.push_back({"\\.ffn_(up|down|gate)_exps", ggml_backend_cpu_buffer_type()});
+ }
+ ).set_env("LLAMA_ARG_CPU_MOE"));
+ add_opt(common_arg(
+ {"--n-cpu-moe", "-ncmoe"}, "N",
+ "keep the Mixture of Experts (MoE) weights of the first N layers in the CPU",
+ [](common_params & params, int value) {
+ if (value < 0) {
+ throw std::invalid_argument("invalid value");
+ }
+ for (int i = 0; i < value; ++i) {
+                // keep strings alive and avoid leaking memory by storing them in a static list
+                static std::list<std::string> buft_overrides;
+ buft_overrides.push_back(string_format("blk\\.%d\\.ffn_(up|down|gate)_exps", i));
+ params.tensor_buft_overrides.push_back({buft_overrides.back().c_str(), ggml_backend_cpu_buffer_type()});
+ }
+ }
+ ).set_env("LLAMA_ARG_N_CPU_MOE"));
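+    // usage sketch (illustrative): `-cmoe` pins every MoE expert tensor to the CPU buffer type,
+    // while `-ncmoe 10` pins only the experts of layers 0-9 and lets the remaining layers offload normally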
add_opt(common_arg(
{"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
"number of layers to store in VRAM",
@@ -2619,6 +2663,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.n_out_freq = value;
}
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
+ add_opt(common_arg(
+ {"--output-format"}, "{gguf,dat}",
+ string_format("output format for imatrix file (default: %s)", params.imat_dat > 0 ? "dat" : "gguf"),
+ [](common_params & params, const std::string & value) {
+ /**/ if (value == "gguf") { params.imat_dat = -1; }
+ else if (value == "dat") { params.imat_dat = 1; }
+ else { throw std::invalid_argument("invalid output format"); }
+ }
+ ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
add_opt(common_arg(
{"--save-frequency"}, "N",
string_format("save an imatrix copy every N iterations (default: %d)", params.n_save_freq),
@@ -2647,6 +2700,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.i_chunk = value;
}
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
+ add_opt(common_arg(
+ {"--show-statistics"},
+ string_format("show imatrix statistics and then exit (default: %s)", params.show_statistics ? "true" : "false"),
+ [](common_params & params) {
+ params.show_statistics = true;
+ }
+ ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
add_opt(common_arg(
{"--parse-special"},
string_format("prase special tokens (chat, tool, etc) (default: %s)", params.parse_special ? "true" : "false"),
@@ -2734,6 +2794,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.public_path = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STATIC_PATH"));
+ add_opt(common_arg(
+ {"--api-prefix"}, "PREFIX",
+ string_format("prefix path the server serves from, without the trailing slash (default: %s)", params.api_prefix.c_str()),
+ [](common_params & params, const std::string & value) {
+ params.api_prefix = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_PREFIX"));
add_opt(common_arg(
{"--no-webui"},
string_format("Disable the Web UI (default: %s)", params.webui ? "enabled" : "disabled"),
@@ -2794,6 +2861,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.ssl_file_cert = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_CERT_FILE"));
+ add_opt(common_arg(
+ {"--chat-template-kwargs"}, "STRING",
+ string_format("sets additional params for the json template parser"),
+ [](common_params & params, const std::string & value) {
+ auto parsed = json::parse(value);
+ for (const auto & item : parsed.items()) {
+ params.default_template_kwargs[item.key()] = item.value().dump();
+ }
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_CHAT_TEMPLATE_KWARGS"));
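+    // example (key names are template-specific; "enable_thinking" is only an illustration):
+    //   --chat-template-kwargs '{"enable_thinking": false}'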
add_opt(common_arg(
{"-to", "--timeout"}, "N",
string_format("server read/write timeout in seconds (default: %d)", params.timeout_read),
@@ -2870,11 +2947,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
"controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
"- none: leaves thoughts unparsed in `message.content`\n"
"- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)\n"
- "(default: deepseek)",
+ "(default: auto)",
[](common_params & params, const std::string & value) {
/**/ if (value == "deepseek") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; }
else if (value == "deepseek-legacy") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY; }
else if (value == "none") { params.reasoning_format = COMMON_REASONING_FORMAT_NONE; }
+ else if (value == "auto") { params.reasoning_format = COMMON_REASONING_FORMAT_AUTO; }
else { throw std::invalid_argument("invalid value"); }
}
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK"));
@@ -3217,6 +3295,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.speculative.model.path = value;
}
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));
+ add_opt(common_arg(
+ {"--spec-replace"}, "TARGET", "DRAFT",
+ "translate the string in TARGET into DRAFT if the draft model and main model are not compatible",
+ [](common_params & params, const std::string & tgt, const std::string & dft) {
+ params.speculative.replacements.push_back({ tgt, dft });
+ }
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"-ctkd", "--cache-type-k-draft"}, "TYPE",
string_format(
@@ -3406,5 +3491,51 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_examples({LLAMA_EXAMPLE_SERVER}));
+ add_opt(common_arg(
+ { "--diffusion-steps" }, "N",
+ string_format("number of diffusion steps (default: %d)", params.diffusion.steps),
+ [](common_params & params, int value) { params.diffusion.steps = value; }
+ ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+ add_opt(common_arg(
+ { "--diffusion-visual" },
+ string_format("enable visual diffusion mode (show progressive generation) (default: %s)",
+ params.diffusion.visual_mode ? "true" : "false"),
+ [](common_params & params) { params.diffusion.visual_mode = true; }
+ ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+
+ add_opt(common_arg(
+ { "--diffusion-eps" }, "F",
+ string_format("epsilon for timesteps (default: %.6f)", (double) params.diffusion.eps),
+ [](common_params & params, const std::string & value) { params.diffusion.eps = std::stof(value); }
+ ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+ add_opt(common_arg(
+ { "--diffusion-algorithm" }, "N",
+ string_format("diffusion algorithm: 0=ORIGIN, 1=ENTROPY_BASED, 2=MARGIN_BASED, 3=RANDOM, 4=LOW_CONFIDENCE (default: %d)",
+ params.diffusion.algorithm),
+ [](common_params & params, int value) { params.diffusion.algorithm = value; }
+ ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+ add_opt(common_arg(
+ { "--diffusion-alg-temp" }, "F",
+ string_format("dream algorithm temperature (default: %.3f)", (double) params.diffusion.alg_temp),
+ [](common_params & params, const std::string & value) { params.diffusion.alg_temp = std::stof(value); }
+ ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+
+ add_opt(common_arg(
+ { "--diffusion-block-length" }, "N",
+ string_format("llada block length for generation (default: %d)", params.diffusion.block_length),
+ [](common_params & params, int value) { params.diffusion.block_length = value; }
+ ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+ add_opt(common_arg(
+ { "--diffusion-cfg-scale" }, "F",
+ string_format("llada classifier-free guidance scale (default: %.3f)", (double) params.diffusion.cfg_scale),
+ [](common_params & params, const std::string & value) { params.diffusion.cfg_scale = std::stof(value); }
+ ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+ add_opt(common_arg(
+ { "--diffusion-add-gumbel-noise" }, "F",
+ string_format("add gumbel noise to the logits if temp > 0.0 (default: %s)", params.diffusion.add_gumbel_noise ? "true" : "false"),
+ [](common_params & params, const std::string & value) { params.diffusion.add_gumbel_noise = std::stof(value); }
+ ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+
+
return ctx_arg;
}
diff --git a/common/chat-parser.cpp b/common/chat-parser.cpp
index 18a30e49a..96ba8f533 100644
--- a/common/chat-parser.cpp
+++ b/common/chat-parser.cpp
@@ -55,7 +55,15 @@ bool common_chat_msg_parser::add_tool_call(const std::string & name, const std::
bool common_chat_msg_parser::add_tool_call(const json & tool_call) {
std::string name = tool_call.contains("name") ? tool_call.at("name") : "";
std::string id = tool_call.contains("id") ? tool_call.at("id") : "";
- std::string arguments = tool_call.contains("arguments") ? tool_call.at("arguments") : "";
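+    // "arguments" may arrive either as a JSON object or as a pre-serialized string; normalize to a string in both cases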
+ std::string arguments = "";
+ if (tool_call.contains("arguments")) {
+ if (tool_call.at("arguments").is_object()) {
+ arguments = tool_call.at("arguments").dump();
+ } else {
+ arguments = tool_call.at("arguments");
+ }
+ }
+
return add_tool_call(name, id, arguments);
}
diff --git a/common/chat.cpp b/common/chat.cpp
index 7d9aaeb12..316bd2417 100644
--- a/common/chat.cpp
+++ b/common/chat.cpp
@@ -17,6 +17,8 @@
#include
#include
+using json = nlohmann::ordered_json;
+
static std::string format_time(const std::chrono::system_clock::time_point & now, const std::string & format) {
auto time = std::chrono::system_clock::to_time_t(now);
auto local_time = *std::localtime(&time);
@@ -124,6 +126,8 @@ std::vector<common_chat_msg_diff> common_chat_msg_diff::compute_diffs(const comm
typedef minja::chat_template common_chat_template;
struct common_chat_templates {
+ bool add_bos;
+ bool add_eos;
    bool has_explicit_template; // Model had builtin template or template override was specified.
    std::unique_ptr<common_chat_template> template_default; // always set (defaults to chatml)
    std::unique_ptr<common_chat_template> template_tool_use;
@@ -140,6 +144,9 @@ struct templates_params {
bool add_generation_prompt = true;
bool enable_thinking = true;
std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
+ json extra_context;
+ bool add_bos;
+ bool add_eos;
};
common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice) {
@@ -442,6 +449,8 @@ std::string common_chat_format_single(
common_chat_templates_inputs inputs;
inputs.use_jinja = use_jinja;
+ inputs.add_bos = tmpls->add_bos;
+ inputs.add_eos = tmpls->add_eos;
std::string fmt_past_msg;
if (!past_msg.empty()) {
@@ -466,6 +475,8 @@ std::string common_chat_format_single(
std::string common_chat_format_example(const struct common_chat_templates * tmpls, bool use_jinja) {
common_chat_templates_inputs inputs;
inputs.use_jinja = use_jinja;
+ inputs.add_bos = tmpls->add_bos;
+ inputs.add_eos = tmpls->add_eos;
auto add_simple_msg = [&](auto role, auto content) {
common_chat_msg msg;
msg.role = role;
@@ -543,6 +554,8 @@ common_chat_templates_ptr common_chat_templates_init(
}
std::string token_bos = bos_token_override;
std::string token_eos = eos_token_override;
+ bool add_bos = false;
+ bool add_eos = false;
if (model) {
const auto * vocab = llama_model_get_vocab(model);
const auto get_token = [&](llama_token token, const char * name, const char * jinja_variable_name) {
@@ -557,9 +570,13 @@ common_chat_templates_ptr common_chat_templates_init(
};
token_bos = get_token(llama_vocab_bos(vocab), "BOS", "bos_token");
token_eos = get_token(llama_vocab_eos(vocab), "EOS", "eos_token");
+ add_bos = llama_vocab_get_add_bos(vocab);
+ add_eos = llama_vocab_get_add_eos(vocab);
}
common_chat_templates_ptr tmpls(new common_chat_templates());
tmpls->has_explicit_template = has_explicit_template;
+ tmpls->add_bos = add_bos;
+ tmpls->add_eos = add_eos;
try {
        tmpls->template_default = std::make_unique<minja::chat_template>(default_template_src, token_bos, token_eos);
} catch (const std::exception & e) {
@@ -589,6 +606,8 @@ const char * common_chat_format_name(common_chat_format format) {
case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1: return "Functionary v3.1 Llama 3.1";
case COMMON_CHAT_FORMAT_HERMES_2_PRO: return "Hermes 2 Pro";
case COMMON_CHAT_FORMAT_COMMAND_R7B: return "Command R7B";
+ case COMMON_CHAT_FORMAT_GRANITE: return "Granite";
+ case COMMON_CHAT_FORMAT_GPT_OSS: return "GPT-OSS";
default:
throw std::runtime_error("Unknown chat format");
}
@@ -597,8 +616,10 @@ const char * common_chat_format_name(common_chat_format format) {
const char * common_reasoning_format_name(common_reasoning_format format) {
switch (format) {
case COMMON_REASONING_FORMAT_NONE: return "none";
+ case COMMON_REASONING_FORMAT_AUTO: return "auto";
case COMMON_REASONING_FORMAT_DEEPSEEK: return "deepseek";
case COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY: return "deepseek-legacy";
+ case COMMON_REASONING_FORMAT_GRANITE: return "granite";
default:
throw std::runtime_error("Unknown reasoning format");
}
@@ -720,16 +741,23 @@ static void foreach_function(const json & tools, const std::function<void(const json &)> & fn) {
static std::string apply(
const common_chat_template & tmpl,
- const json & messages,
- const json & tools,
- bool add_generation_prompt,
- const json & extra_context = json())
+ const struct templates_params & inputs,
+ const std::optional<json> & messages_override = std::nullopt,
+ const std::optional<json> & tools_override = std::nullopt,
+ const std::optional<json> & additional_context = std::nullopt)
{
minja::chat_template_inputs tmpl_inputs;
- tmpl_inputs.messages = messages;
- tmpl_inputs.tools = tools;
- tmpl_inputs.add_generation_prompt = add_generation_prompt;
- tmpl_inputs.extra_context = extra_context;
+ tmpl_inputs.messages = messages_override ? *messages_override : inputs.messages;
+ if (tools_override) {
+ tmpl_inputs.tools = *tools_override;
+ } else {
+ tmpl_inputs.tools = inputs.tools.empty() ? json() : inputs.tools;
+ }
+ tmpl_inputs.add_generation_prompt = inputs.add_generation_prompt;
+ tmpl_inputs.extra_context = inputs.extra_context;
+ if (additional_context) {
+ tmpl_inputs.extra_context.merge_patch(*additional_context);
+ }
// TODO: add flag to control date/time, if only for testing purposes.
// tmpl_inputs.now = std::chrono::system_clock::now();
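
The layering here means a handler's additional_context wins over the user-supplied extra_context on key collisions: nlohmann's merge_patch implements RFC 7386 (object keys overwrite, null deletes). A minimal demonstration with made-up values:

    #include <iostream>
    #include <nlohmann/json.hpp>

    using json = nlohmann::ordered_json;

    int main() {
        json extra_context = {{"enable_thinking", true}, {"custom_flag", 1}}; // e.g. from --chat-template-kwargs
        json additional_context = {{"enable_thinking", false}};               // handler-supplied
        extra_context.merge_patch(additional_context); // RFC 7386: key-wise overwrite, null deletes
        std::cout << extra_context.dump() << "\n"; // {"enable_thinking":false,"custom_flag":1}
    }
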
@@ -738,10 +766,10 @@ static std::string apply(
// instead of using `chat_template_options.use_bos_token = false`, since these tokens
// may be needed inside the template / between messages too.
auto result = tmpl.apply(tmpl_inputs, tmpl_opts);
- if (string_starts_with(result, tmpl.bos_token())) {
+ if (inputs.add_bos && string_starts_with(result, tmpl.bos_token())) {
result = result.substr(tmpl.bos_token().size());
}
- if (string_ends_with(result, tmpl.eos_token())) {
+ if (inputs.add_eos && string_ends_with(result, tmpl.eos_token())) {
result = result.substr(0, result.size() - tmpl.eos_token().size());
}
return result;
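
The new add_bos/add_eos guards make the strip conditional: a leading BOS is only removed when the tokenizer reports it will prepend one itself, otherwise stripping would lose a token the model expects. A minimal sketch of the rule, with a made-up BOS string:

    #include <iostream>
    #include <string>

    int main() {
        const std::string bos = "<s>";                 // made-up BOS token
        std::string rendered = "<s>[INST] hi [/INST]"; // template output
        bool add_bos = true; // as reported by llama_vocab_get_add_bos()
        if (add_bos && rendered.compare(0, bos.size(), bos) == 0) {
            rendered = rendered.substr(bos.size()); // avoid a double BOS at tokenization
        }
        std::cout << rendered << "\n"; // [INST] hi [/INST]
    }
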
@@ -828,7 +856,7 @@ static common_chat_params common_chat_params_init_generic(const common_chat_temp
inputs.messages,
"Respond in JSON format, either with `tool_call` (a request to call tools) or with `response` reply to the user's request");
- data.prompt = apply(tmpl, tweaked_messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
+ data.prompt = apply(tmpl, inputs, /* messages_override= */ tweaked_messages);
data.format = COMMON_CHAT_FORMAT_GENERIC;
return data;
}
@@ -904,7 +932,7 @@ static common_chat_params common_chat_params_init_mistral_nemo(const common_chat
data.preserved_tokens = {
"[TOOL_CALLS]",
};
- data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
+ data.prompt = apply(tmpl, inputs);
data.format = COMMON_CHAT_FORMAT_MISTRAL_NEMO;
return data;
}
@@ -934,7 +962,7 @@ static common_chat_params common_chat_params_init_command_r7b(const common_chat_
adjusted_messages.push_back(msg);
}
}
- data.prompt = apply(tmpl, adjusted_messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, {});
+ data.prompt = apply(tmpl, inputs, /* messages_override= */ adjusted_messages);
data.format = COMMON_CHAT_FORMAT_COMMAND_R7B;
if (string_ends_with(data.prompt, "<|START_THINKING|>")) {
if (!inputs.enable_thinking) {
@@ -1122,7 +1150,7 @@ static common_chat_params common_chat_params_init_llama_3_x(const common_chat_te
} else {
data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
}
- data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, {
+ data.prompt = apply(tmpl, inputs, /* messages_override= */ std::nullopt, /* tools_override= */ std::nullopt, json {
{"date_string", format_time(inputs.now, "%d %b %Y")},
{"tools_in_user_message", false},
{"builtin_tools", builtin_tools.empty() ? json() : builtin_tools},
@@ -1187,7 +1215,7 @@ static void common_chat_parse_llama_3_1(common_chat_msg_parser & builder, bool w
static common_chat_params common_chat_params_init_deepseek_r1(const common_chat_template & tmpl, const struct templates_params & inputs) {
common_chat_params data;
- auto prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
+ auto prompt = apply(tmpl, inputs);
// Hacks to fix the official (broken) prompt.
// It is advisable to use --chat-template-file models/templates/llama-cpp-deepseek-r1.jinja instead,
@@ -1279,10 +1307,30 @@ static void common_chat_parse_deepseek_r1(common_chat_msg_parser & builder) {
tool_calls_end);
}
+static common_chat_params common_chat_params_init_gpt_oss(const common_chat_template & tmpl, const struct templates_params & inputs) {
+ common_chat_params data;
+ auto prompt = apply(tmpl, inputs);
+
+ data.prompt = prompt;
+ data.format = COMMON_CHAT_FORMAT_GPT_OSS;
+
+ // TODO: support tool calls in GPT-OSS?
+
+ return data;
+}
+static void common_chat_parse_gpt_oss(common_chat_msg_parser & builder) {
+ // TODO @ngxson : this won't work with --special enabled, we should fix that
+ builder.try_parse_reasoning("<|channel|>analysis<|message|>", "<|start|>assistant<|channel|>final<|message|>");
+ if (!builder.syntax().parse_tool_calls) {
+ builder.add_content(builder.consume_rest());
+ return;
+ }
+}
+
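
For reference, GPT-OSS (Harmony) completions put reasoning on the `analysis` channel and the visible reply on the `final` channel. A standalone sketch of the split that `try_parse_reasoning` is asked to perform here (sample completion made up):

    #include <iostream>
    #include <string>

    int main() {
        const std::string start = "<|channel|>analysis<|message|>";
        const std::string end = "<|start|>assistant<|channel|>final<|message|>";
        std::string out = start + "let me think..." + end + "Hello!";

        std::string reasoning, content = out;
        auto s = out.find(start);
        if (s != std::string::npos) {
            auto e = out.find(end, s + start.size());
            if (e != std::string::npos) {
                reasoning = out.substr(s + start.size(), e - s - start.size());
                content = out.substr(e + end.size());
            }
        }
        std::cout << "reasoning: " << reasoning << "\n"; // let me think...
        std::cout << "content: " << content << "\n";     // Hello!
    }
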
static common_chat_params common_chat_params_init_firefunction_v2(const common_chat_template & tmpl, const struct templates_params & inputs) {
LOG_DBG("%s\n", __func__);
common_chat_params data;
- data.prompt = apply(tmpl, inputs.messages, /* tools= */ nullptr, inputs.add_generation_prompt, {
+ data.prompt = apply(tmpl, inputs, /* messages_override= */ std::nullopt, /* tools_override= */ json(), json {
{"datetime", format_time(inputs.now, "%b %d %Y %H:%M:%S GMT")},
{"functions", json(inputs.tools.empty() ? "" : inputs.tools.dump(2))},
});
@@ -1338,7 +1386,7 @@ static common_chat_params common_chat_params_init_functionary_v3_2(const common_
// Using ">>>f1\n", ">>>f2\n"... as trigger words for the grammar
// If the function is python, we also allow raw python code (if the line after `python\n` doesn't start w/ opening `{`), which the model seems to prefer for multiline code.
common_chat_params data;
- data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
+ data.prompt = apply(tmpl, inputs);
data.format = COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2;
if (inputs.tools.is_array() && !inputs.tools.empty()) {
data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
@@ -1465,7 +1513,7 @@ static common_chat_params common_chat_params_init_functionary_v3_1_llama_3_1(con
data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
}
- data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
+ data.prompt = apply(tmpl, inputs);
// TODO: if (has_raw_python)
return data;
}
@@ -1498,14 +1546,15 @@ static void common_chat_parse_functionary_v3_1_llama_3_1(common_chat_msg_parser
static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat_template & tmpl, const struct templates_params & inputs) {
common_chat_params data;
- json additional_context = {
+ json extra_context = json {
{"enable_thinking", inputs.enable_thinking},
};
+ extra_context.update(inputs.extra_context);
- data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, additional_context);
+ data.prompt = apply(tmpl, inputs, /* messages_override= */ std::nullopt, /* tools_override= */ std::nullopt, extra_context);
data.format = COMMON_CHAT_FORMAT_HERMES_2_PRO;
if (string_ends_with(data.prompt, "<think>\n")) {
- if (!inputs.enable_thinking) {
+ if (!extra_context["enable_thinking"]) {
data.prompt += "</think>";
} else {
data.thinking_forced_open = true;
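
Unlike merge_patch above, `update()` copies the user's kwargs over the defaults, so a `--chat-template-kwargs '{"enable_thinking": false}'` overrides the request's setting, and the prompt-suffix check now reads the merged value instead of inputs.enable_thinking. A small demonstration:

    #include <iostream>
    #include <nlohmann/json.hpp>

    using json = nlohmann::ordered_json;

    int main() {
        json extra_context = json {{"enable_thinking", true}};
        json user_kwargs = {{"enable_thinking", false}}; // parsed from --chat-template-kwargs
        extra_context.update(user_kwargs); // shallow key-wise overwrite: user setting wins
        std::cout << std::boolalpha << extra_context["enable_thinking"].get<bool>() << "\n"; // false
    }
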
@@ -1635,7 +1684,7 @@ static void common_chat_parse_hermes_2_pro(common_chat_msg_parser & builder) {
"|" // match 5 (function name again)
);
- if (auto res = builder.try_find_regex(open_regex)) {
+ while (auto res = builder.try_find_regex(open_regex)) {
const auto & block_start = res->groups[1];
std::string block_end = block_start.empty() ? "" : "```";
@@ -1657,7 +1706,6 @@ static void common_chat_parse_hermes_2_pro(common_chat_msg_parser & builder) {
builder.consume_literal(block_end);
builder.consume_spaces();
}
- builder.add_content(builder.consume_rest());
} else {
throw common_chat_msg_partial_exception("failed to parse tool call");
}
@@ -1682,7 +1730,124 @@ static void common_chat_parse_hermes_2_pro(common_chat_msg_parser & builder) {
builder.consume_spaces();
}
}
- builder.add_content(builder.consume_rest());
+ }
+ }
+
+ builder.add_content(builder.consume_rest());
+}
+
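
The if -> while change is what allows several consecutive tool-call blocks in one message; the single `consume_rest()` after the loop replaces the per-branch calls deleted above. A simplified standalone analogue (plain std::regex, made-up sample text):

    #include <iostream>
    #include <regex>
    #include <string>

    int main() {
        std::string s = "<tool_call>{\"a\":1}</tool_call><tool_call>{\"b\":2}</tool_call> bye";
        const std::string close_tag = "</tool_call>";
        const std::regex open("<tool_call>");
        std::smatch m;
        while (std::regex_search(s, m, open)) { // drain every block, not just the first
            std::string rest = m.suffix().str();
            auto e = rest.find(close_tag);
            std::cout << "tool call: " << rest.substr(0, e) << "\n";
            s = rest.substr(e + close_tag.size());
        }
        std::cout << "content:" << s << "\n";   // the hoisted consume_rest()
    }
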
+static common_chat_params common_chat_params_init_granite(const common_chat_template & tmpl, const struct templates_params & inputs) {
+ common_chat_params data;
+
+ // Pass thinking context for Granite template
+ json additional_context = {
+ {"thinking", inputs.enable_thinking},
+ };
+
+ data.prompt = apply(tmpl, inputs, /* messages_override= */ std::nullopt, /* tools_override= */ std::nullopt, additional_context);
+ data.format = COMMON_CHAT_FORMAT_GRANITE;
+
+ if (string_ends_with(data.prompt, "<think>\n") || string_ends_with(data.prompt, "<think>")) {
+ if (!inputs.enable_thinking) {
+ data.prompt += "</think>";
+ } else {
+ data.thinking_forced_open = true;
+ }
+ }
+
+ if (!inputs.tools.is_null()) {
+ // Granite uses <|tool_call|> followed by JSON list
+ data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
+ data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+ std::vector<std::string> tool_rules;
+ foreach_function(inputs.tools, [&](const json & tool) {
+ const auto & function = tool.at("function");
+ std::string name = function.at("name");
+ auto parameters = function.at("parameters");
+ builder.resolve_refs(parameters);
+ tool_rules.push_back(builder.add_rule(name + "-call", builder.add_schema(name + "-args", {
+ {"type", "object"},
+ {"properties", {
+ {"name", {{"const", name}}},
+ {"arguments", parameters},
+ }},
+ {"required", json::array({"name", "arguments"})},
+ })));
+ });
+
+ auto tool_call = builder.add_rule("tool_call", string_join(tool_rules, " | "));
+ auto tool_list = builder.add_rule("tool_list", "\"[\" space " + tool_call + " (\",\" space " + tool_call + ")* space \"]\"");
+
+ if (data.thinking_forced_open) {
+ builder.add_rule("root", "\"\" space \"\" space [^<]* \"\" space \"<|tool_call|>\" space " + tool_list);
+ } else {
+ builder.add_rule("root", "\"<|tool_call|>\" space " + tool_list);
+ }
+
+ data.grammar_triggers.push_back({
+ COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
+ "<|tool_call|>"
+ });
+
+ data.preserved_tokens = {
+ "",
+ "",
+ "",
+ "",
+ "<|tool_call|>",
+ };
+ });
+ } else {
+ // Handle thinking tags for non-tool responses
+ if (data.thinking_forced_open && inputs.enable_thinking) {
+ data.grammar_lazy = false;
+ data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+ builder.add_rule("root", "\"\" space \"\" space .* \"\" space");
+ });
+ data.preserved_tokens = {
+ "",
+ "",
+ "",
+ "",
+ };
+ }
+ }
+
+ return data;
+}
+
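
Under the grammar above, a tool-call turn with thinking forced open must close the reasoning and response sections before the `<|tool_call|>` JSON list. The snippet below just prints one conforming completion (tool name and arguments are hypothetical):

    #include <iostream>

    int main() {
        // root ::= "</think>" space "<response>" space [^<]* "</response>" space "<|tool_call|>" space tool-list
        std::cout << "</think> <response>Looking that up.</response> "
                     "<|tool_call|>[{\"name\": \"get_weather\", \"arguments\": {\"city\": \"Paris\"}}]\n";
    }
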
+static void common_chat_parse_granite(common_chat_msg_parser & builder) {
+ // Parse thinking tags
+ builder.try_parse_reasoning("<think>", "</think>");
+
+ // Parse response tags using regex
+ static const common_regex response_regex("<response>([\\s\\S]*?)</response>");
+ if (auto res = builder.try_find_regex(response_regex)) {
+ // Extract the content between the tags (capture group 1)
+ auto content = builder.str(res->groups[1]);
+ builder.add_content(content);
+ builder.move_to(res->groups[0].end);
+ }
+
+ if (!builder.syntax().parse_tool_calls) {
+ builder.add_content(builder.consume_rest());
+ return;
+ }
+
+ // Look for tool calls
+ static const common_regex tool_call_regex(regex_escape("<|tool_call|>"));
+ if (auto res = builder.try_find_regex(tool_call_regex)) {
+ builder.move_to(res->groups[0].end);
+
+ // Expect JSON array of tool calls
+ auto tool_calls_data = builder.consume_json();
+ if (tool_calls_data.json.is_array()) {
+ if (!builder.add_tool_calls(tool_calls_data.json)) {
+ builder.add_content("<|tool_call|>" + tool_calls_data.json.dump());
+ }
+ } else {
+ builder.add_content("<|tool_call|>" + tool_calls_data.json.dump());
+ }
+ } else {
+ builder.add_content(builder.consume_rest());
+ }
+}
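
End to end, the parser walks a Granite completion in that order: reasoning, the `<response>` body, then the `<|tool_call|>` JSON array. A self-contained sketch of the same pass with std::regex and nlohmann/json (sample output made up):

    #include <iostream>
    #include <regex>
    #include <string>
    #include <nlohmann/json.hpp>

    using json = nlohmann::ordered_json;

    int main() {
        const std::string out =
            "<think>plan</think><response>done</response>"
            "<|tool_call|>[{\"name\": \"get_weather\", \"arguments\": {\"city\": \"Paris\"}}]";

        std::smatch m;
        if (std::regex_search(out, m, std::regex("<response>([\\s\\S]*?)</response>"))) {
            std::cout << "content: " << m[1] << "\n"; // done
        }
        const std::string marker = "<|tool_call|>";
        auto pos = out.find(marker);
        if (pos != std::string::npos) {
            json calls = json::parse(out.substr(pos + marker.size()));
            for (const auto & c : calls) {
                std::cout << "call: " << c.at("name") << " " << c.at("arguments").dump() << "\n";
            }
        }
    }
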
@@ -1691,7 +1856,7 @@ static void common_chat_parse_hermes_2_pro(common_chat_msg_parser & builder) {
static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct templates_params & inputs) {
common_chat_params data;
- data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
+ data.prompt = apply(tmpl, inputs);
data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
data.grammar_lazy = false;
if (!inputs.json_schema.is_null()) {
@@ -1722,6 +1887,14 @@ static common_chat_params common_chat_templates_apply_jinja(
params.enable_thinking = inputs.enable_thinking;
params.grammar = inputs.grammar;
params.now = inputs.now;
+ params.add_bos = inputs.add_bos;
+ params.add_eos = inputs.add_eos;
+
+ params.extra_context = json::object();
+ for (auto el : inputs.chat_template_kwargs) {
+ params.extra_context[el.first] = json::parse(el.second);
+ }
+
if (!inputs.json_schema.empty()) {
params.json_schema = json::parse(inputs.json_schema);
}
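
Each kwarg value is parsed as JSON rather than taken verbatim, so "false" becomes a boolean and string values need their own quotes. A small demonstration:

    #include <iostream>
    #include <map>
    #include <string>
    #include <nlohmann/json.hpp>

    using json = nlohmann::ordered_json;

    int main() {
        std::map<std::string, std::string> chat_template_kwargs = {
            {"enable_thinking", "false"}, // -> boolean false
            {"custom_tag", "\"v2\""},     // -> string "v2" (note the inner quotes)
        };
        json extra_context = json::object();
        for (const auto & el : chat_template_kwargs) {
            extra_context[el.first] = json::parse(el.second);
        }
        std::cout << extra_context.dump() << "\n"; // {"custom_tag":"v2","enable_thinking":false}
    }
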
@@ -1752,11 +1925,21 @@ static common_chat_params common_chat_templates_apply_jinja(
return common_chat_params_init_command_r7b(tmpl, params);
}
+ // Granite (IBM) - detects thinking / tools support
+ if (src.find("elif thinking") != std::string::npos && src.find("<|tool_call|>") != std::string::npos) {
+ return common_chat_params_init_granite(tmpl, params);
+ }
+
// Hermes 2/3 Pro, Qwen 2.5 Instruct (w/ tools)
if (src.find("") != std::string::npos && params.json_schema.is_null()) {
return common_chat_params_init_hermes_2_pro(tmpl, params);
}
+ // GPT-OSS
+ if (src.find("<|channel|>") != std::string::npos && params.json_schema.is_null()) {
+ return common_chat_params_init_gpt_oss(tmpl, params);
+ }
+
// Use generic handler when mixing tools + JSON schema.
// TODO: support that mix in handlers below.
if ((params.tools.is_array() && params.json_schema.is_object())) {
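
Template routing is a source sniff: each handler is keyed on markers that only occur in that family's Jinja source, and the Granite check runs first and requires both markers so templates that merely mention `<|tool_call|>` are not captured. A condensed sketch of the dispatch order, ignoring the json_schema guards:

    #include <iostream>
    #include <string>

    static const char * detect(const std::string & src) {
        if (src.find("elif thinking") != std::string::npos &&
            src.find("<|tool_call|>") != std::string::npos) return "granite";
        if (src.find("<tool_call>") != std::string::npos) return "hermes-2-pro";
        if (src.find("<|channel|>") != std::string::npos) return "gpt-oss";
        return "fallthrough";
    }

    int main() {
        std::cout << detect("{{ '<|channel|>final<|message|>' }}") << "\n"; // gpt-oss
    }
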
@@ -1807,6 +1990,7 @@ static common_chat_params common_chat_templates_apply_legacy(
int alloc_size = 0;
std::vector<llama_chat_message> chat;
std::vector<std::string> contents;
+
for (const auto & msg : inputs.messages) {
auto content = msg.content;
for (const auto & part : msg.content_parts) {
@@ -1908,6 +2092,12 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
case COMMON_CHAT_FORMAT_COMMAND_R7B:
common_chat_parse_command_r7b(builder);
break;
+ case COMMON_CHAT_FORMAT_GRANITE:
+ common_chat_parse_granite(builder);
+ break;
+ case COMMON_CHAT_FORMAT_GPT_OSS:
+ common_chat_parse_gpt_oss(builder);
+ break;
default:
throw std::runtime_error(std::string("Unsupported format: ") + common_chat_format_name(builder.syntax().format));
}
@@ -1927,6 +2117,8 @@ common_chat_msg common_chat_parse(const std::string & input, bool is_partial, co
}
}
auto msg = builder.result();
- LOG_DBG("Parsed message: %s\n", common_chat_msgs_to_json_oaicompat({msg}).at(0).dump().c_str());
+ if (!is_partial) {
+ LOG_DBG("Parsed message: %s\n", common_chat_msgs_to_json_oaicompat({msg}).at(0).dump().c_str());
+ }
return msg;
}
diff --git a/common/chat.h b/common/chat.h
index 9f59e6b08..eb628d8bc 100644
--- a/common/chat.h
+++ b/common/chat.h
@@ -7,6 +7,7 @@
#include <chrono>
#include <string>
#include <vector>
+#include <map>