From fadfd1897380a682fb4759d898060c7d40591a97 Mon Sep 17 00:00:00 2001 From: Vitaly Chikunov Date: Mon, 10 Mar 2025 03:32:02 +0300 Subject: [PATCH] 1:4855-alt1 - Update to b4855 (2025-03-07). - Enable CUDA backend (for NVIDIA GPU) in llama.cpp-cuda package. - Disable BLAS backend (issues/12282). - Install bash-completions. --- .gear/llama.cpp.spec | 160 ++++++++++++++++++++++++++++++------------- 1 file changed, 113 insertions(+), 47 deletions(-) diff --git a/.gear/llama.cpp.spec b/.gear/llama.cpp.spec index eaaaca41d..c26c8a825 100644 --- a/.gear/llama.cpp.spec +++ b/.gear/llama.cpp.spec @@ -3,34 +3,41 @@ %define _stripped_files_terminate_build 1 %set_verify_elf_method strict +%ifarch x86_64 +%def_with cuda +%else +%def_without cuda +%endif + Name: llama.cpp -Version: 3441 +Version: 4855 Release: alt1 Epoch: 1 Summary: LLM inference in C/C++ License: MIT Group: Sciences/Computer science Url: https://github.com/ggerganov/llama.cpp -Requires: libllama = %EVR +ExcludeArch: %ix86 +Requires: %name-cpu = %EVR +%if_with cuda +Requires: %name-cuda = %EVR +%filter_from_requires /(libcudart\.so\.12)/d +%filter_from_requires /debug64(libcuda\.so\.1)/d +%endif -ExclusiveArch: aarch64 x86_64 Source: %name-%version.tar Source1: kompute-0.tar -AutoReqProv: nopython3 -Requires: python3 -Requires: python3(argparse) -Requires: python3(glob) -Requires: python3(os) -Requires: python3(pip) -Requires: python3(struct) -%add_findreq_skiplist %_datadir/%name/examples/* - BuildRequires(pre): rpm-macros-cmake BuildRequires: cmake BuildRequires: gcc-c++ BuildRequires: libcurl-devel -BuildRequires: libopenblas-devel +BuildRequires: libgomp-devel +BuildRequires: libstdc++-devel-static +%if_with cuda +BuildRequires: gcc12-c++ +BuildRequires: nvidia-cuda-devel-static +%endif %{?!_without_check:%{?!_disable_check: BuildRequires: ctest BuildRequires: tinyllamas-gguf @@ -38,10 +45,10 @@ BuildRequires: tinyllamas-gguf %description Plain C/C++ implementation (of inference of many LLM models) without 
-dependencies. AVX, AVX2 and AVX512 support for x86 architectures. +dependencies. AVX, AVX2, AVX512, and AMX support for x86 architectures. Mixed F16/F32 precision. 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use. -Runs on the CPU. +Supports CPU, GPU, and hybrid CPU+GPU inference. Supported models: @@ -51,24 +58,24 @@ Supported models: Persimmon 8B, MPT, Bloom, Yi models, StableLM models, Deepseek models, Qwen models, PLaMo-13B, Phi models, GPT-2, Orion 14B, InternLM2, CodeShell, Gemma, Mamba, Grok-1, Xverse, Command-R models, SEA-LION, - GritLM-7B + GritLM-8x7B, OLMo, GPT-NeoX + Pythia + GritLM-7B + GritLM-8x7B, OLMo, GPT-NeoX + Pythia, Snowflake-Arctic + MoE, Smaug, Poro 34B, Bitnet b1.58 models, Flan T5, Open Elm models, + ChatGLM3-6b + ChatGLM4-9b + GLMEdge-1.5b + GLMEdge-4b, SmolLM, + EXAONE-3.0-7.8B-Instruct, FalconMamba Models, Jais, Bielik-11B-v2.3, + RWKV-6, QRWKV-6, GigaChat-20B-A3B Multimodal models: LLaVA 1.5 models, BakLLaVA, Obsidian, ShareGPT4V, MobileVLM 1.7B/3B - models, Yi-VL, Mini CPM, Moondream, Bunny + models, Yi-VL, Mini CPM, Moondream, Bunny, GLM-EDGE, Qwen2-VL -NOTE 1: You will need to: +NOTE 1: For data format conversion script to work you will need to: pip3 install -r /usr/share/llama.cpp/requirements.txt -for data format conversion scripts to work. - NOTE 2: - MODELS ARE NOT PROVIDED. You need to download them from original - sites and place them into "./models" directory. - - For example, LLaMA downloaded via public torrent link is 220 GB. + MODELS ARE NOT PROVIDED. You'll need to download them from the original + sites (or Hugging Face Hub). Overall this is all raw and EXPERIMENTAL, no warranty, no support. @@ -87,34 +94,67 @@ Requires: libllama = %EVR %description -n libllama-devel %summary. 
+%package cpu +Summary: %name tools including backend for CPU +Group: Sciences/Computer science +Requires: libllama = %EVR +AutoReqProv: nopython3 +Requires: python3 +Requires: python3(argparse) +Requires: python3(glob) +Requires: python3(os) +Requires: python3(pip) +Requires: python3(struct) +%add_findreq_skiplist %_datadir/%name/examples/* + +%description cpu +%summary. + +%package cuda +Summary: %name backend for NVIDIA GPU +Group: Sciences/Computer science +Requires: libnvidia-ptxjitcompiler +Requires: %name-cpu = %EVR + +%description cuda +%summary. + %prep %setup tar xf %SOURCE1 -C ggml/src/ggml-kompute cat <<-EOF >> cmake/build-info.cmake set(BUILD_NUMBER %version) + set(GGML_BUILD_NUMBER %version) set(BUILD_COMMIT "%release") EOF -sed -i '/lib\/pkgconfig/s/lib/${CMAKE_INSTALL_LIBDIR}/' CMakeLists.txt sed -i '/POSITION_INDEPENDENT_CODE/s/PROPERTIES/& SOVERSION 0.0.%version/' ggml/src/CMakeLists.txt src/CMakeLists.txt -sed -i 's/@PROJECT_VERSION@/0.0.%version/' cmake/llama.pc.in +sed -i 's/POSITION_INDEPENDENT_CODE/SOVERSION 0.0.%version &/' ggml/cmake/ggml-config.cmake.in %build # Unless -DCMAKE_SKIP_BUILD_RPATH=yes CMake fails to strip build time RPATH # from (installed) binaries. +export NVCC_PREPEND_FLAGS=-ccbin=g++-12 %cmake \ -DCMAKE_SKIP_BUILD_RPATH=yes \ - -DGGML_BLAS=ON \ - -DGGML_BLAS_VENDOR=OpenBLAS \ + -DLLAMA_BUILD_TESTS=ON \ -DLLAMA_CURL=ON \ - -DLLAMA_BUILD_TESTS=OFF \ + -DGGML_BACKEND_DL=ON \ + -DGGML_CPU=ON \ +%ifarch x86_64 + -DGGML_CPU_ALL_VARIANTS=ON \ +%endif +%if_with cuda + -DGGML_CUDA=ON \ + -DCMAKE_CUDA_ARCHITECTURES='52-virtual;80-virtual' \ +%endif %nil grep -E 'LLAMA|GGML' %_cmake__builddir/CMakeCache.txt | sort | tee build-options.txt %cmake_build find -name '*.py' | xargs sed -i '1s|#!/usr/bin/env python3|#!%__python3|' +LD_LIBRARY_PATH=%_cmake__builddir/bin %_cmake__builddir/bin/llama-cli --completion-bash > llama.bash %install %cmake_install - # Python requirements files. 
install -Dpm644 requirements.txt -t %buildroot%_datadir/%name cp -a requirements -t %buildroot%_datadir/%name @@ -124,40 +164,66 @@ cp -rp grammars -t %buildroot%_datadir/%name # Not all examples. install -Dp examples/*.sh -t %buildroot%_datadir/%name/examples install -Dp examples/*.py -t %buildroot%_datadir/%name/examples - -# llava belongs to examples which we don't install. -rm %buildroot%_libdir/libllava_shared.so +# We need to run the tests, not install them. +rm %buildroot%_bindir/test-* +# Completions. +install -Dpm644 llama.bash %buildroot%_datadir/bash-completion/completions/llama-cli +printf '%%s\n' llama-server llama-simple llama-run | + xargs -ti ln -s llama-cli %buildroot%_datadir/bash-completion/completions/{} %check -export LD_LIBRARY_PATH=$PWD/%_cmake__builddir/src:$PWD/%_cmake__builddir/ggml/src -%_cmake__builddir/bin/llama-cli --version |& grep -Fx 'version: %version (%release)' +# Local paths are more useful for debugging because they are not stripped by default. +%dnl export LD_LIBRARY_PATH=%buildroot%_libdir:%buildroot%_libexecdir/llama PATH+=:%buildroot%_bindir +export LD_LIBRARY_PATH=$PWD/%_cmake__builddir/bin PATH+=:$PWD/%_cmake__builddir/bin +llama-cli --version +llama-cli --version |& grep -Fx 'version: %version (%release)' # test-eval-callback wants network. %ctest -j1 -E test-eval-callback -PATH=%buildroot%_bindir:$PATH llama-cli -m %_datadir/tinyllamas/stories260K.gguf -p "Hello" -s 42 -n 500 llama-cli -m %_datadir/tinyllamas/stories260K.gguf -p "Once upon a time" -s 55 -n 33 | grep 'Once upon a time, there was a boy named Tom. Tom had a big box of colors.'
%files + +%files -n libllama +%_libdir/libllama.so.0.0.%version +%_libdir/libggml.so.0.0.%version +%_libdir/libggml-base.so.0.0.%version + +%files -n libllama-devel +%_libdir/libllama.so +%_libdir/libggml.so +%_libdir/libggml-base.so +%_includedir/llama*.h +%_includedir/gguf.h +%_includedir/ggml*.h +%_cmakedir/ggml +%_cmakedir/llama +%_pkgconfigdir/llama.pc + +%files cpu %define _customdocdir %_docdir/%name %doc LICENSE README.md docs build-options.txt %_bindir/llama-* %_bindir/convert*.py %_datadir/%name +%dir %_libexecdir/llama +%_libexecdir/llama/libggml-cpu*.so +%_datadir/bash-completion/completions/llama-* -%files -n libllama -%_libdir/libggml.so.0.0.%version -%_libdir/libllama.so.0.0.%version - -%files -n libllama-devel -%_includedir/ggml*.h -%_includedir/llama.h -%_cmakedir/llama -%_pkgconfigdir/llama.pc -%_libdir/libggml.so -%_libdir/libllama.so +%if_with cuda +%files cuda +%dir %_libexecdir/llama +%_libexecdir/llama/libggml-cuda.so +%endif %changelog +* Mon Mar 10 2025 Vitaly Chikunov 1:4855-alt1 +- Update to b4855 (2025-03-07). +- Enable CUDA backend (for NVIDIA GPU) in llama.cpp-cuda package. +- Disable BLAS backend (issues/12282). +- Install bash-completions. + * Tue Jul 23 2024 Vitaly Chikunov 1:3441-alt1 - Update to b3441 (2024-07-23). - spec: Package libllama and libllama-devel (ALT#50962).