diff --git a/docs/backend/ZenDNN.md b/docs/backend/ZenDNN.md index b57fd97b6..3b1f8242f 100644 --- a/docs/backend/ZenDNN.md +++ b/docs/backend/ZenDNN.md @@ -22,7 +22,7 @@ **Llama.cpp + ZenDNN** -The llama.cpp ZenDNN backend leverages AMD's optimized matrix multiplication primitives to accelerate inference on AMD CPUs. It utilizes ZenDNN's **LowOHA (Low Overhead Hardware Accelerated)** MatMul operator for efficient GEMM operations with minimal execution overhead, built-in weight caching, and direct access to backend libraries (AOCL BLIS, LibXSMM, OneDNN). +The llama.cpp ZenDNN backend leverages AMD's optimized matrix multiplication primitives to accelerate inference on AMD CPUs. It utilizes ZenDNN's **LowOHA (Low Overhead Hardware Accelerated)** MatMul operator for efficient GEMM operations with minimal execution overhead, built-in weight caching, and direct access to backend libraries (AOCL DLP, LibXSMM, OneDNN). For more information about ZenDNN, visit: https://www.amd.com/en/developer/zendnn.html @@ -32,7 +32,7 @@ For more information about ZenDNN, visit: https://www.amd.com/en/developer/zendn |:-------:|:-------:|:----------------------------------------------:| | Linux | Support | Ubuntu 20.04, 22.04, 24.04 | -For the latest list of supported operating systems, see the [ZenDNN Supported OS](https://github.com/amd/ZenDNN/blob/zendnnl/README.md#15-supported-os). +For the latest list of supported operating systems, see the [ZenDNN Supported OS](https://github.com/amd/ZenDNN/blob/a18adf8c605fb5f5e52cefd7eda08a7b18febbaf/README.md#15-supported-os). ## Hardware @@ -44,9 +44,9 @@ ZenDNN is optimized for AMD EPYC™ processors and AMD Ryzen™ processors based | CPU Family | Status | Notes | |:-----------------------------:|:-------:|:----------------------------------:| -| AMD EPYC™ 9005 Series (Turin)| Support | 5th Gen - Zen 5 architecture | -| AMD EPYC™ 9004 Series (Genoa)| Support | 4th Gen - Zen 4 architecture | -| AMD EPYC™ 7003 Series (Milan)| Support | 3rd Gen - Zen 3 architecture | +| AMD EPYC™ 9005 Series (Turin) | Support | 5th Gen - Zen 5 architecture | +| AMD EPYC™ 9004 Series (Genoa) | Support | 4th Gen - Zen 4 architecture | +| AMD EPYC™ 7003 Series (Milan) | Support | 3rd Gen - Zen 3 architecture | | AMD Ryzen™ AI MAX (Strix Halo)| Support | High-performance mobile processors | *Notes:* @@ -61,7 +61,7 @@ The ZenDNN backend currently accelerates **matrix multiplication (MUL_MAT)** ope | Operation | Status | Notes | |:-------------|:-------:|:----------------------------------------------:| -| MUL_MAT | ✓ | Accelerated via ZenDNN LowOHA MatMul | +| MUL_MAT | Support | Accelerated via ZenDNN LowOHA MatMul | *Note:* Since only MUL_MAT is accelerated, models will benefit most from ZenDNN when matrix multiplications dominate the computational workload (which is typical for transformer-based LLMs). @@ -104,7 +104,6 @@ If you want to build ZenDNN yourself or use a specific version: # Clone ZenDNN repository git clone https://github.com/amd/ZenDNN.git cd ZenDNN -git checkout zendnnl # Build and install (requires CMake >= 3.25) mkdir build && cd build @@ -114,7 +113,7 @@ cmake --build . --target all Default installation path: `ZenDNN/build/install` -**For detailed build instructions**, refer to the [ZenDNN README](https://github.com/amd/ZenDNN/blob/zendnnl/README.md). +**For detailed build instructions**, refer to the [ZenDNN README](https://github.com/amd/ZenDNN/blob/a18adf8c605fb5f5e52cefd7eda08a7b18febbaf/README.md). **Step 2: Build llama.cpp with custom ZenDNN path** @@ -146,8 +145,7 @@ Run llama.cpp server with ZenDNN acceleration: ```sh # Set optimal configuration -export OMP_NUM_THREADS=64 # Adjust to your CPU core count -export ZENDNNL_MATMUL_ALGO=2 # Blocked AOCL BLIS for best performance +export ZENDNNL_MATMUL_ALGO=1 # Blocked AOCL DLP algo for best performance # Start server ./build/bin/llama-server \ @@ -160,62 +158,26 @@ export ZENDNNL_MATMUL_ALGO=2 # Blocked AOCL BLIS for best performance Access the server at `http://localhost:8080`. **Performance tips**: -- Set `OMP_NUM_THREADS` to match your physical core count -- Use `ZENDNNL_MATMUL_ALGO=2` for optimal performance +- Use `ZENDNNL_MATMUL_ALGO=1` for optimal performance - For NUMA systems: `numactl --cpunodebind=0 --membind=0 ./build/bin/llama-server ...` ## Environment Variable -### Build Time +For environment variables related to ZenDNN, refer to the [ZenDNN Environment Variables Documentation](https://github.com/amd/ZenDNN/blob/a18adf8c605fb5f5e52cefd7eda08a7b18febbaf/docs/runtime_env.md). -| Name | Value | Function | -|--------------------|---------------------------------------|---------------------------------------------| -| GGML_ZENDNN | ON/OFF | Enable ZenDNN backend support | -| ZENDNN_ROOT | Path to ZenDNN installation | Set ZenDNN installation directory | -| GGML_OPENMP | ON/OFF (recommended: ON) | Enable OpenMP for multi-threading | +### Performance Optimization -### Runtime - -| Name | Value | Function | -|-------------------------|--------------------------|-------------------------------------------------------------------| -| OMP_NUM_THREADS | Number (e.g., 64) | Set number of OpenMP threads (recommended: physical core count) | -| ZENDNNL_MATMUL_ALGO | 0-5 | Select MatMul backend algorithm (see Performance Optimization) | -| ZENDNNL_PROFILE_LOG_LEVEL | 0-4 | Profiling log level (0=disabled, 4=verbose) | -| ZENDNNL_ENABLE_PROFILER | 0 or 1 | Enable detailed profiling (1=enabled) | -| ZENDNNL_API_LOG_LEVEL | 0-4 | API log level (0=disabled, 4=verbose) | - -**Example**: +ZenDNN's LowOHA MatMul supports multiple backend algorithms. For **best performance**, use the **Blocked AOCL DLP** algorithm: ```sh -export OMP_NUM_THREADS=64 -export ZENDNNL_MATMUL_ALGO=2 # Use Blocked AOCL BLIS for best performance -./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Test" -n 100 +export ZENDNNL_MATMUL_ALGO=1 # Blocked AOCL DLP algo (recommended) ``` -## Performance Optimization - -### MatMul Algorithm Selection - -ZenDNN's LowOHA MatMul supports multiple backend algorithms. For **best performance**, use the **Blocked AOCL BLIS** algorithm: - -```sh -export ZENDNNL_MATMUL_ALGO=2 # Blocked AOCL BLIS (recommended) -``` - -**Available algorithms**: - -| Value | Algorithm | Description | -|:-----:|:-----------------------|:----------------------------------------------| -| 0 | Dynamic Dispatch | Automatic backend selection (default) | -| 1 | AOCL BLIS | AOCL BLIS backend | -| 2 | AOCL BLIS Blocked | **Blocked AOCL BLIS (recommended)** | -| 3 | OneDNN | OneDNN backend | -| 4 | OneDNN Blocked | Blocked OneDNN | -| 5 | LibXSMM | LibXSMM backend | +For more details on available algorithms, see the [ZenDNN MatMul Algorithm Documentation](https://github.com/amd/ZenDNN/blob/a18adf8c605fb5f5e52cefd7eda08a7b18febbaf/docs/runtime_env.md#algorithm-details). ### Profiling and Debugging -For detailed profiling and logging options, refer to the [ZenDNN Logging Documentation](https://github.com/amd/ZenDNN/blob/zendnnl/docs/logging.md). +For detailed profiling and logging options, refer to the [ZenDNN Logging Documentation](https://github.com/amd/ZenDNN/blob/a18adf8c605fb5f5e52cefd7eda08a7b18febbaf/docs/logging.md). ## Known Issues @@ -245,10 +207,9 @@ A: Currently, ZenDNN primarily supports FP32 and BF16 data types. Quantized mode A: Ensure: 1. You're using an AMD EPYC or Ryzen processor (Zen 2 or newer) -2. `OMP_NUM_THREADS` is set appropriately (physical core count) -3. `ZENDNNL_MATMUL_ALGO=2` is set for best performance (Blocked AOCL BLIS) -4. You're using a sufficiently large model (small models may not benefit as much) -5. Enable profiling to verify ZenDNN MatMul is being called +2. `ZENDNNL_MATMUL_ALGO=1` is set for best performance (Blocked AOCL DLP) +3. You're using a sufficiently large model (small models may not benefit as much) +4. Enable profiling to verify ZenDNN MatMul is being called ### **GitHub Contribution**: Please add the **[ZenDNN]** prefix/tag in issues/PRs titles to help the ZenDNN-team check/address them without delay. diff --git a/ggml/src/ggml-zendnn/CMakeLists.txt b/ggml/src/ggml-zendnn/CMakeLists.txt index f5cf6eedd..9bdb4e836 100644 --- a/ggml/src/ggml-zendnn/CMakeLists.txt +++ b/ggml/src/ggml-zendnn/CMakeLists.txt @@ -1,12 +1,19 @@ ggml_add_backend_library(ggml-zendnn ggml-zendnn.cpp) -# Get ZenDNN path if (NOT DEFINED ZENDNN_ROOT OR ZENDNN_ROOT STREQUAL "") set(ZENDNN_ROOT "$ENV{ZENDNN_ROOT}") endif() -# Check if path is still empty or OFF +if (BUILD_SHARED_LIBS) + set(ZENDNN_SHARED_LIB ON) + set(ZENDNN_ARCHIVE_LIB OFF) +else() + set(ZENDNN_SHARED_LIB OFF) + set(ZENDNN_ARCHIVE_LIB ON) +endif() + +# Download and build ZenDNN if not provided if (NOT ZENDNN_ROOT OR ZENDNN_ROOT STREQUAL "" OR ZENDNN_ROOT STREQUAL "OFF") message(STATUS "ZENDNN_ROOT not set. Automatically downloading and building ZenDNN...") message(STATUS "This will take several minutes on first build...") @@ -21,7 +28,7 @@ if (NOT ZENDNN_ROOT OR ZENDNN_ROOT STREQUAL "" OR ZENDNN_ROOT STREQUAL "OFF") ExternalProject_Add( zendnn GIT_REPOSITORY https://github.com/amd/ZenDNN.git - GIT_TAG 21ce8f7879c86bf3637f707fae6f29e0951db5fe + GIT_TAG a18adf8c605fb5f5e52cefd7eda08a7b18febbaf # ZenDNN-2026-WW08 PREFIX ${ZENDNN_PREFIX} SOURCE_DIR ${ZENDNN_SOURCE_DIR} BINARY_DIR ${ZENDNN_BUILD_DIR} @@ -32,7 +39,9 @@ if (NOT ZENDNN_ROOT OR ZENDNN_ROOT STREQUAL "" OR ZENDNN_ROOT STREQUAL "OFF") -DZENDNNL_BUILD_DOXYGEN=OFF -DZENDNNL_BUILD_GTEST=OFF -DZENDNNL_BUILD_BENCHDNN=OFF - # Enable ALL matmul algorithm backends + -DZENDNNL_DEPENDS_FBGEMM=OFF + -DZENDNNL_LIB_BUILD_ARCHIVE=${ZENDNN_ARCHIVE_LIB} + -DZENDNNL_LIB_BUILD_SHARED=${ZENDNN_SHARED_LIB} -DZENDNNL_DEPENDS_AOCLDLP=ON -DZENDNNL_DEPENDS_ONEDNN=ON -DZENDNNL_DEPENDS_LIBXSMM=ON @@ -45,47 +54,37 @@ if (NOT ZENDNN_ROOT OR ZENDNN_ROOT STREQUAL "" OR ZENDNN_ROOT STREQUAL "OFF") LOG_INSTALL ON ) - # Add dependency so ZenDNN builds before our library add_dependencies(ggml-zendnn zendnn) - - # Set ZENDNN_ROOT to the installation directory set(ZENDNN_ROOT ${ZENDNN_INSTALL_DIR}) - message(STATUS "ZenDNN will be built to: ${ZENDNN_ROOT}") else() message(STATUS "Using custom ZenDNN installation at: ${ZENDNN_ROOT}") endif() -# ZenDNN headers + libs target_include_directories(ggml-zendnn PRIVATE ${ZENDNN_ROOT}/zendnnl/include - ${ZENDNN_ROOT}/deps/aocldlp/include - ${ZENDNN_ROOT}/deps/aoclutils/include ${ZENDNN_ROOT}/deps/json/include - ${ZENDNN_ROOT}/deps/libxsmm/include + ${ZENDNN_ROOT}/deps/aoclutils/include + ${ZENDNN_ROOT}/deps/aocldlp/include ${ZENDNN_ROOT}/deps/onednn/include -) + ${ZENDNN_ROOT}/deps/libxsmm/include) -target_link_directories(ggml-zendnn PRIVATE - ${ZENDNN_ROOT}/zendnnl/lib - ${ZENDNN_ROOT}/deps/aocldlp/lib - ${ZENDNN_ROOT}/deps/aoclutils/lib - ${ZENDNN_ROOT}/deps/libxsmm/lib - ${ZENDNN_ROOT}/deps/onednn/lib -) +if (ZENDNN_SHARED_LIB) + target_link_directories(ggml-zendnn PRIVATE ${ZENDNN_ROOT}/zendnnl/lib) + target_link_libraries(ggml-zendnn PRIVATE zendnnl) +elseif (ZENDNN_ARCHIVE_LIB) + target_link_libraries(ggml-zendnn PRIVATE + ${ZENDNN_ROOT}/zendnnl/lib/libzendnnl_archive.a + ${ZENDNN_ROOT}/deps/aoclutils/${CMAKE_INSTALL_LIBDIR}/libaoclutils.a + ${ZENDNN_ROOT}/deps/aoclutils/${CMAKE_INSTALL_LIBDIR}/libau_cpuid.a + ${ZENDNN_ROOT}/deps/aocldlp/lib/libaocl-dlp.a + ${ZENDNN_ROOT}/deps/onednn/${CMAKE_INSTALL_LIBDIR}/libdnnl.a + ${ZENDNN_ROOT}/deps/libxsmm/lib/libxsmm.a + ${ZENDNN_ROOT}/deps/libxsmm/lib/libxsmmext.a + ${ZENDNN_ROOT}/deps/libxsmm/lib/libxsmmnoblas.a) +endif() -target_link_libraries(ggml-zendnn PRIVATE - zendnnl_archive # ZenDNN main - aocl-dlp # AOCL libraries - aoclutils - au_cpuid - dnnl # OneDNN - xsmm # libxsmm small matrix math - xsmmext - xsmmnoblas - m - pthread -) +target_link_libraries(ggml-zendnn PRIVATE m pthread) if (GGML_OPENMP) target_link_libraries(ggml-zendnn PRIVATE OpenMP::OpenMP_CXX) diff --git a/ggml/src/ggml-zendnn/ggml-zendnn.cpp b/ggml/src/ggml-zendnn/ggml-zendnn.cpp index 551c15bb4..c87603040 100644 --- a/ggml/src/ggml-zendnn/ggml-zendnn.cpp +++ b/ggml/src/ggml-zendnn/ggml-zendnn.cpp @@ -41,13 +41,13 @@ static bool ggml_zendnn_matmul(ggml_backend_zendnn_context * ctx, int64_t m, int const TA * A, int64_t lda, const TB * B, int64_t ldb, TC * C, int64_t ldc) { - zendnnl::lowoha::lowoha_params params; + zendnnl::lowoha::matmul::matmul_params params; params.dtypes.src = ggml_to_zendnn_type(); params.dtypes.wei = ggml_to_zendnn_type(); params.dtypes.dst = ggml_to_zendnn_type(); params.num_threads = ctx->n_threads; - zendnnl::lowoha::status_t status = zendnnl::lowoha::matmul_direct( + zendnnl::error_handling::status_t status = zendnnl::lowoha::matmul::matmul_direct( 'r', false, true, // row-major, don't transpose B, transpose A (because it's column-major) n, // M: rows of B and C m, // N: cols of A^T and C @@ -63,7 +63,7 @@ static bool ggml_zendnn_matmul(ggml_backend_zendnn_context * ctx, int64_t m, int params // params ); - if (status != zendnnl::lowoha::status_t::success) { + if (status != zendnnl::error_handling::status_t::success) { GGML_LOG_ERROR("%s, ZenDNN matmul failed: status=%d\n", __func__, static_cast(status)); return false; }