From fadfd1897380a682fb4759d898060c7d40591a97 Mon Sep 17 00:00:00 2001 From: Vitaly Chikunov Date: Mon, 10 Mar 2025 03:32:02 +0300 Subject: [PATCH] 1:4855-alt1 - Update to b4855 (2025-03-07). - Enable CUDA backend (for NVIDIA GPU) in llama.cpp-cuda package. - Disable BLAS backend (issues/12282). - Install bash-completions. --- .gear/llama.cpp.spec | 160 ++++++++++++++++++++++++++++++------------- 1 file changed, 113 insertions(+), 47 deletions(-) diff --git a/.gear/llama.cpp.spec b/.gear/llama.cpp.spec index eaaaca41d..c26c8a825 100644 --- a/.gear/llama.cpp.spec +++ b/.gear/llama.cpp.spec @@ -3,34 +3,41 @@ %define _stripped_files_terminate_build 1 %set_verify_elf_method strict +%ifarch x86_64 +%def_with cuda +%else +%def_without cuda +%endif + Name: llama.cpp -Version: 3441 +Version: 4855 Release: alt1 Epoch: 1 Summary: LLM inference in C/C++ License: MIT Group: Sciences/Computer science Url: https://github.com/ggerganov/llama.cpp -Requires: libllama = %EVR +ExcludeArch: %ix86 +Requires: %name-cpu = %EVR +%if_with cuda +Requires: %name-cuda = %EVR +%filter_from_requires /(libcudart\.so\.12)/d +%filter_from_requires /debug64(libcuda\.so\.1)/d +%endif -ExclusiveArch: aarch64 x86_64 Source: %name-%version.tar Source1: kompute-0.tar -AutoReqProv: nopython3 -Requires: python3 -Requires: python3(argparse) -Requires: python3(glob) -Requires: python3(os) -Requires: python3(pip) -Requires: python3(struct) -%add_findreq_skiplist %_datadir/%name/examples/* - BuildRequires(pre): rpm-macros-cmake BuildRequires: cmake BuildRequires: gcc-c++ BuildRequires: libcurl-devel -BuildRequires: libopenblas-devel +BuildRequires: libgomp-devel +BuildRequires: libstdc++-devel-static +%if_with cuda +BuildRequires: gcc12-c++ +BuildRequires: nvidia-cuda-devel-static +%endif %{?!_without_check:%{?!_disable_check: BuildRequires: ctest BuildRequires: tinyllamas-gguf @@ -38,10 +45,10 @@ BuildRequires: tinyllamas-gguf %description Plain C/C++ implementation (of inference of many LLM models) without 
-dependencies. AVX, AVX2 and AVX512 support for x86 architectures. +dependencies. AVX, AVX2, AVX512, and AMX support for x86 architectures. Mixed F16/F32 precision. 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use. -Runs on the CPU. +Supports CPU, GPU, and hybrid CPU+GPU inference. Supported models: @@ -51,24 +58,24 @@ Supported models: Persimmon 8B, MPT, Bloom, Yi models, StableLM models, Deepseek models, Qwen models, PLaMo-13B, Phi models, GPT-2, Orion 14B, InternLM2, CodeShell, Gemma, Mamba, Grok-1, Xverse, Command-R models, SEA-LION, - GritLM-7B + GritLM-8x7B, OLMo, GPT-NeoX + Pythia + GritLM-7B + GritLM-8x7B, OLMo, GPT-NeoX + Pythia, Snowflake-Arctic + MoE, Smaug, Poro 34B, Bitnet b1.58 models, Flan T5, Open Elm models, + ChatGLM3-6b + ChatGLM4-9b + GLMEdge-1.5b + GLMEdge-4b, SmolLM, + EXAONE-3.0-7.8B-Instruct, FalconMamba Models, Jais, Bielik-11B-v2.3, + RWKV-6, QRWKV-6, GigaChat-20B-A3B Multimodal models: LLaVA 1.5 models, BakLLaVA, Obsidian, ShareGPT4V, MobileVLM 1.7B/3B - models, Yi-VL, Mini CPM, Moondream, Bunny + models, Yi-VL, Mini CPM, Moondream, Bunny, GLM-EDGE, Qwen2-VL -NOTE 1: You will need to: +NOTE 1: For data format conversion script to work you will need to: pip3 install -r /usr/share/llama.cpp/requirements.txt -for data format conversion scripts to work. - NOTE 2: - MODELS ARE NOT PROVIDED. You need to download them from original - sites and place them into "./models" directory. - - For example, LLaMA downloaded via public torrent link is 220 GB. + MODELS ARE NOT PROVIDED. You'll need to download them from the original + sites (or Hugging Face Hub). Overall this is all raw and EXPERIMENTAL, no warranty, no support. @@ -87,34 +94,67 @@ Requires: libllama = %EVR %description -n libllama-devel %summary. 
+%package cpu +Summary: %name tools including backend for CPU +Group: Sciences/Computer science +Requires: libllama = %EVR +AutoReqProv: nopython3 +Requires: python3 +Requires: python3(argparse) +Requires: python3(glob) +Requires: python3(os) +Requires: python3(pip) +Requires: python3(struct) +%add_findreq_skiplist %_datadir/%name/examples/* + +%description cpu +%summary. + +%package cuda +Summary: %name backend for NVIDIA GPU +Group: Sciences/Computer science +Requires: libnvidia-ptxjitcompiler +Requires: %name-cpu = %EVR + +%description cuda +%summary. + %prep %setup tar xf %SOURCE1 -C ggml/src/ggml-kompute cat <<-EOF >> cmake/build-info.cmake set(BUILD_NUMBER %version) + set(GGML_BUILD_NUMBER %version) set(BUILD_COMMIT "%release") EOF -sed -i '/lib\/pkgconfig/s/lib/${CMAKE_INSTALL_LIBDIR}/' CMakeLists.txt sed -i '/POSITION_INDEPENDENT_CODE/s/PROPERTIES/& SOVERSION 0.0.%version/' ggml/src/CMakeLists.txt src/CMakeLists.txt -sed -i 's/@PROJECT_VERSION@/0.0.%version/' cmake/llama.pc.in +sed -i 's/POSITION_INDEPENDENT_CODE/SOVERSION 0.0.%version &/' ggml/cmake/ggml-config.cmake.in %build # Unless -DCMAKE_SKIP_BUILD_RPATH=yes CMake fails to strip build time RPATH # from (installed) binaries. +export NVCC_PREPEND_FLAGS=-ccbin=g++-12 %cmake \ -DCMAKE_SKIP_BUILD_RPATH=yes \ - -DGGML_BLAS=ON \ - -DGGML_BLAS_VENDOR=OpenBLAS \ + -DLLAMA_BUILD_TESTS=ON \ -DLLAMA_CURL=ON \ - -DLLAMA_BUILD_TESTS=OFF \ + -DGGML_BACKEND_DL=ON \ + -DGGML_CPU=ON \ +%ifarch x86_64 + -DGGML_CPU_ALL_VARIANTS=ON \ +%endif +%if_with cuda + -DGGML_CUDA=ON \ + -DCMAKE_CUDA_ARCHITECTURES='52-virtual;80-virtual' \ +%endif %nil grep -E 'LLAMA|GGML' %_cmake__builddir/CMakeCache.txt | sort | tee build-options.txt %cmake_build find -name '*.py' | xargs sed -i '1s|#!/usr/bin/env python3|#!%__python3|' +LD_LIBRARY_PATH=%_cmake__builddir/bin %_cmake__builddir/bin/llama-cli --completion-bash > llama.bash %install %cmake_install - # Python requirements files. 
install -Dpm644 requirements.txt -t %buildroot%_datadir/%name cp -a requirements -t %buildroot%_datadir/%name @@ -124,40 +164,66 @@ cp -rp grammars -t %buildroot%_datadir/%name # Not all examples. install -Dp examples/*.sh -t %buildroot%_datadir/%name/examples install -Dp examples/*.py -t %buildroot%_datadir/%name/examples - -# llava belongs to examples which we don't install. -rm %buildroot%_libdir/libllava_shared.so +# We need to run the tests, not install them. +rm %buildroot%_bindir/test-* +# Completions. +install -Dpm644 llama.bash %buildroot%_datadir/bash-completion/completions/llama-cli +printf '%%s\n' llama-server llama-simple llama-run | + xargs -ti ln -s llama-cli %buildroot%_datadir/bash-completion/completions/{} %check -export LD_LIBRARY_PATH=$PWD/%_cmake__builddir/src:$PWD/%_cmake__builddir/ggml/src -%_cmake__builddir/bin/llama-cli --version |& grep -Fx 'version: %version (%release)' +# Local paths are more useful for debugging because they are not stripped by default. +%dnl export LD_LIBRARY_PATH=%buildroot%_libdir:%buildroot%_libexecdir/llama PATH+=:%buildroot%_bindir +export LD_LIBRARY_PATH=$PWD/%_cmake__builddir/bin PATH+=:$PWD/%_cmake__builddir/bin +llama-cli --version +llama-cli --version |& grep -Fx 'version: %version (%release)' # test-eval-callback wants network. %ctest -j1 -E test-eval-callback -PATH=%buildroot%_bindir:$PATH llama-cli -m %_datadir/tinyllamas/stories260K.gguf -p "Hello" -s 42 -n 500 llama-cli -m %_datadir/tinyllamas/stories260K.gguf -p "Once upon a time" -s 55 -n 33 | grep 'Once upon a time, there was a boy named Tom. Tom had a big box of colors.'
%files + +%files -n libllama +%_libdir/libllama.so.0.0.%version +%_libdir/libggml.so.0.0.%version +%_libdir/libggml-base.so.0.0.%version + +%files -n libllama-devel +%_libdir/libllama.so +%_libdir/libggml.so +%_libdir/libggml-base.so +%_includedir/llama*.h +%_includedir/gguf.h +%_includedir/ggml*.h +%_cmakedir/ggml +%_cmakedir/llama +%_pkgconfigdir/llama.pc + +%files cpu %define _customdocdir %_docdir/%name %doc LICENSE README.md docs build-options.txt %_bindir/llama-* %_bindir/convert*.py %_datadir/%name +%dir %_libexecdir/llama +%_libexecdir/llama/libggml-cpu*.so +%_datadir/bash-completion/completions/llama-* -%files -n libllama -%_libdir/libggml.so.0.0.%version -%_libdir/libllama.so.0.0.%version - -%files -n libllama-devel -%_includedir/ggml*.h -%_includedir/llama.h -%_cmakedir/llama -%_pkgconfigdir/llama.pc -%_libdir/libggml.so -%_libdir/libllama.so +%if_with cuda +%files cuda +%dir %_libexecdir/llama +%_libexecdir/llama/libggml-cuda.so +%endif %changelog +* Mon Mar 10 2025 Vitaly Chikunov 1:4855-alt1 +- Update to b4855 (2025-03-07). +- Enable CUDA backend (for NVIDIA GPU) in llama.cpp-cuda package. +- Disable BLAS backend (issues/12282). +- Install bash-completions. + * Tue Jul 23 2024 Vitaly Chikunov 1:3441-alt1 - Update to b3441 (2024-07-23). - spec: Package libllama and libllama-devel (ALT#50962).