diff --git a/.devops/full-cuda.Dockerfile b/.devops/full-cuda.Dockerfile index 360602d65..77a9ddc14 100644 --- a/.devops/full-cuda.Dockerfile +++ b/.devops/full-cuda.Dockerfile @@ -14,7 +14,8 @@ ARG CUDA_DOCKER_ARCH=all RUN apt-get update && \ apt-get install -y build-essential python3 python3-pip git -COPY requirements.txt requirements.txt +COPY requirements.txt requirements.txt +COPY requirements requirements RUN pip install --upgrade pip setuptools wheel \ && pip install -r requirements.txt diff --git a/.devops/full-rocm.Dockerfile b/.devops/full-rocm.Dockerfile index 6c521e9b4..8b9633dc4 100644 --- a/.devops/full-rocm.Dockerfile +++ b/.devops/full-rocm.Dockerfile @@ -23,7 +23,8 @@ ARG ROCM_DOCKER_ARCH=\ gfx1101 \ gfx1102 -COPY requirements.txt requirements.txt +COPY requirements.txt requirements.txt +COPY requirements requirements RUN pip install --upgrade pip setuptools wheel \ && pip install -r requirements.txt diff --git a/.devops/full.Dockerfile b/.devops/full.Dockerfile index 687628b35..cef1297d3 100644 --- a/.devops/full.Dockerfile +++ b/.devops/full.Dockerfile @@ -5,7 +5,8 @@ FROM ubuntu:$UBUNTU_VERSION as build RUN apt-get update && \ apt-get install -y build-essential python3 python3-pip git -COPY requirements.txt requirements.txt +COPY requirements.txt requirements.txt +COPY requirements requirements RUN pip install --upgrade pip setuptools wheel \ && pip install -r requirements.txt diff --git a/.devops/main-intel.Dockerfile b/.devops/main-intel.Dockerfile new file mode 100644 index 000000000..572e5d8ea --- /dev/null +++ b/.devops/main-intel.Dockerfile @@ -0,0 +1,28 @@ +ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04 + +FROM intel/oneapi-basekit:$ONEAPI_VERSION as build + +ARG LLAMA_SYCL_F16=OFF +RUN apt-get update && \ + apt-get install -y git + +WORKDIR /app + +COPY . . + +RUN mkdir build && \ + cd build && \ + if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \ + echo "LLAMA_SYCL_F16 is set" && \ + export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \ + fi && \ + cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \ + cmake --build . --config Release --target main + +FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime + +COPY --from=build /app/build/bin/main /main + +ENV LC_ALL=C.utf8 + +ENTRYPOINT [ "/main" ] diff --git a/.devops/main-rocm.Dockerfile b/.devops/main-rocm.Dockerfile index 789deff6d..0a706dc73 100644 --- a/.devops/main-rocm.Dockerfile +++ b/.devops/main-rocm.Dockerfile @@ -23,7 +23,8 @@ ARG ROCM_DOCKER_ARCH=\ gfx1101 \ gfx1102 -COPY requirements.txt requirements.txt +COPY requirements.txt requirements.txt +COPY requirements requirements RUN pip install --upgrade pip setuptools wheel \ && pip install -r requirements.txt diff --git a/.devops/main-vulkan.Dockerfile b/.devops/main-vulkan.Dockerfile new file mode 100644 index 000000000..bca460365 --- /dev/null +++ b/.devops/main-vulkan.Dockerfile @@ -0,0 +1,29 @@ +ARG UBUNTU_VERSION=jammy + +FROM ubuntu:$UBUNTU_VERSION as build + +# Install build tools +RUN apt update && apt install -y git build-essential cmake wget + +# Install Vulkan SDK +RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \ + wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \ + apt update -y && \ + apt-get install -y vulkan-sdk + +# Build it +WORKDIR /app +COPY . . +RUN mkdir build && \ + cd build && \ + cmake .. -DLLAMA_VULKAN=1 && \ + cmake --build . --config Release --target main + +# Clean up +WORKDIR / +RUN cp /app/build/bin/main /main && \ + rm -rf /app + +ENV LC_ALL=C.utf8 + +ENTRYPOINT [ "/main" ] diff --git a/.devops/nix/apps.nix b/.devops/nix/apps.nix new file mode 100644 index 000000000..b8a12cc0a --- /dev/null +++ b/.devops/nix/apps.nix @@ -0,0 +1,22 @@ +{ + perSystem = + { config, lib, ... }: + { + apps = + let + inherit (config.packages) default; + binaries = [ + "llama" + "llama-embedding" + "llama-server" + "quantize" + "train-text-from-scratch" + ]; + mkApp = name: { + type = "app"; + program = "${default}/bin/${name}"; + }; + in + lib.genAttrs binaries mkApp; + }; +} diff --git a/.devops/nix/devshells.nix b/.devops/nix/devshells.nix new file mode 100644 index 000000000..1862f0f08 --- /dev/null +++ b/.devops/nix/devshells.nix @@ -0,0 +1,13 @@ +{ + perSystem = + { config, lib, ... }: + { + devShells = + lib.concatMapAttrs + (name: package: { + ${name} = package.passthru.shell; + ${name + "-extra"} = package.passthru.shell-extra; + }) + config.packages; + }; +} diff --git a/.devops/nix/docker.nix b/.devops/nix/docker.nix new file mode 100644 index 000000000..d607b4575 --- /dev/null +++ b/.devops/nix/docker.nix @@ -0,0 +1,37 @@ +{ + lib, + dockerTools, + buildEnv, + llama-cpp, + interactive ? true, + coreutils, +}: + +# A tar that can be fed into `docker load`: +# +# $ nix build .#llamaPackages.docker +# $ docker load < result + +# For details and variations cf. +# - https://nixos.org/manual/nixpkgs/unstable/#ssec-pkgs-dockerTools-buildLayeredImage +# - https://discourse.nixos.org/t/a-faster-dockertools-buildimage-prototype/16922 +# - https://nixery.dev/ + +# Approximate (compressed) sizes, at the time of writing, are: +# +# .#llamaPackages.docker: 125M; +# .#llamaPackagesCuda.docker: 537M; +# .#legacyPackages.aarch64-linux.llamaPackagesXavier.docker: 415M. + +dockerTools.buildLayeredImage { + name = llama-cpp.pname; + tag = "latest"; + + contents = + [ llama-cpp ] + ++ lib.optionals interactive [ + coreutils + dockerTools.binSh + dockerTools.caCertificates + ]; +} diff --git a/.devops/nix/jetson-support.nix b/.devops/nix/jetson-support.nix new file mode 100644 index 000000000..78e2e40e0 --- /dev/null +++ b/.devops/nix/jetson-support.nix @@ -0,0 +1,39 @@ +{ inputs, ... }: +{ + perSystem = + { + config, + system, + lib, + pkgsCuda, + ... + }: + { + legacyPackages = + let + caps.llamaPackagesXavier = "7.2"; + caps.llamaPackagesOrin = "8.7"; + caps.llamaPackagesTX2 = "6.2"; + caps.llamaPackagesNano = "5.3"; + + pkgsFor = + cap: + import inputs.nixpkgs { + inherit system; + config = { + cudaSupport = true; + cudaCapabilities = [ cap ]; + cudaEnableForwardCompat = false; + inherit (pkgsCuda.config) allowUnfreePredicate; + }; + }; + in + builtins.mapAttrs (name: cap: (pkgsFor cap).callPackage ./scope.nix { }) caps; + + packages = lib.optionalAttrs (system == "aarch64-linux") { + jetson-xavier = config.legacyPackages.llamaPackagesXavier.llama-cpp; + jetson-orin = config.legacyPackages.llamaPackagesOrin.llama-cpp; + jetson-nano = config.legacyPackages.llamaPackagesNano.llama-cpp; + }; + }; +} diff --git a/.devops/nix/nixpkgs-instances.nix b/.devops/nix/nixpkgs-instances.nix new file mode 100644 index 000000000..4a2f81c4b --- /dev/null +++ b/.devops/nix/nixpkgs-instances.nix @@ -0,0 +1,47 @@ +{ inputs, ... }: +{ + # The _module.args definitions are passed on to modules as arguments. E.g. + # the module `{ pkgs ... }: { /* config */ }` implicitly uses + # `_module.args.pkgs` (defined in this case by flake-parts). + perSystem = + { system, ... }: + { + _module.args = { + # Note: bringing up https://zimbatm.com/notes/1000-instances-of-nixpkgs + # again, the below creates several nixpkgs instances which the + # flake-centric CLI will be forced to evaluate e.g. on `nix flake show`. + # + # This is currently "slow" and "expensive", on a certain scale. + # This also isn't "right" in that this hinders dependency injection at + # the level of flake inputs. This might get removed in the foreseeable + # future. + # + # Note that you can use these expressions without Nix + # (`pkgs.callPackage ./devops/nix/scope.nix { }` is the entry point). + + pkgsCuda = import inputs.nixpkgs { + inherit system; + # Ensure dependencies use CUDA consistently (e.g. that openmpi, ucc, + # and ucx are built with CUDA support) + config.cudaSupport = true; + config.allowUnfreePredicate = + p: + builtins.all + ( + license: + license.free + || builtins.elem license.shortName [ + "CUDA EULA" + "cuDNN EULA" + ] + ) + (p.meta.licenses or [ p.meta.license ]); + }; + # Ensure dependencies use ROCm consistently + pkgsRocm = import inputs.nixpkgs { + inherit system; + config.rocmSupport = true; + }; + }; + }; +} diff --git a/.devops/nix/package.nix b/.devops/nix/package.nix new file mode 100644 index 000000000..815db6a2d --- /dev/null +++ b/.devops/nix/package.nix @@ -0,0 +1,290 @@ +{ + lib, + config, + stdenv, + mkShell, + cmake, + ninja, + pkg-config, + git, + python3, + mpi, + openblas, # TODO: Use the generic `blas` so users could switch between alternative implementations + cudaPackages, + darwin, + rocmPackages, + vulkan-headers, + vulkan-loader, + clblast, + useBlas ? builtins.all (x: !x) [ + useCuda + useMetalKit + useOpenCL + useRocm + useVulkan + ], + useCuda ? config.cudaSupport, + useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin && !useOpenCL, + useMpi ? false, # Increases the runtime closure size by ~700M + useOpenCL ? false, + useRocm ? config.rocmSupport, + useVulkan ? false, + llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake +}@inputs: + +let + inherit (lib) + cmakeBool + cmakeFeature + optionals + strings + versionOlder + ; + + # It's necessary to consistently use backendStdenv when building with CUDA support, + # otherwise we get libstdc++ errors downstream. + stdenv = throw "Use effectiveStdenv instead"; + effectiveStdenv = if useCuda then cudaPackages.backendStdenv else inputs.stdenv; + + suffices = + lib.optionals useBlas [ "BLAS" ] + ++ lib.optionals useCuda [ "CUDA" ] + ++ lib.optionals useMetalKit [ "MetalKit" ] + ++ lib.optionals useMpi [ "MPI" ] + ++ lib.optionals useOpenCL [ "OpenCL" ] + ++ lib.optionals useRocm [ "ROCm" ] + ++ lib.optionals useVulkan [ "Vulkan" ]; + + pnameSuffix = + strings.optionalString (suffices != [ ]) + "-${strings.concatMapStringsSep "-" strings.toLower suffices}"; + descriptionSuffix = + strings.optionalString (suffices != [ ]) + ", accelerated with ${strings.concatStringsSep ", " suffices}"; + + # TODO: package the Python in this repository in a Nix-like way. + # It'd be nice to migrate to buildPythonPackage, as well as ensure this repo + # is PEP 517-compatible, and ensure the correct .dist-info is generated. + # https://peps.python.org/pep-0517/ + llama-python = python3.withPackages ( + ps: [ + ps.numpy + ps.sentencepiece + ] + ); + + # TODO(Green-Sky): find a better way to opt-into the heavy ml python runtime + llama-python-extra = python3.withPackages ( + ps: [ + ps.numpy + ps.sentencepiece + ps.tiktoken + ps.torchWithoutCuda + ps.transformers + ] + ); + + # apple_sdk is supposed to choose sane defaults, no need to handle isAarch64 + # separately + darwinBuildInputs = + with darwin.apple_sdk.frameworks; + [ + Accelerate + CoreVideo + CoreGraphics + ] + ++ optionals useMetalKit [ MetalKit ]; + + cudaBuildInputs = with cudaPackages; [ + cuda_cccl.dev # + + # A temporary hack for reducing the closure size, remove once cudaPackages + # have stopped using lndir: https://github.com/NixOS/nixpkgs/issues/271792 + cuda_cudart.dev + cuda_cudart.lib + cuda_cudart.static + libcublas.dev + libcublas.lib + libcublas.static + ]; + + rocmBuildInputs = with rocmPackages; [ + clr + hipblas + rocblas + ]; + + vulkanBuildInputs = [ + vulkan-headers + vulkan-loader + ]; +in + +effectiveStdenv.mkDerivation ( + finalAttrs: { + pname = "llama-cpp${pnameSuffix}"; + version = llamaVersion; + + # Note: none of the files discarded here are visible in the sandbox or + # affect the output hash. This also means they can be modified without + # triggering a rebuild. + src = lib.cleanSourceWith { + filter = + name: type: + let + noneOf = builtins.all (x: !x); + baseName = baseNameOf name; + in + noneOf [ + (lib.hasSuffix ".nix" name) # Ignore *.nix files when computing outPaths + (lib.hasSuffix ".md" name) # Ignore *.md changes whe computing outPaths + (lib.hasPrefix "." baseName) # Skip hidden files and directories + (baseName == "flake.lock") + ]; + src = lib.cleanSource ../../.; + }; + + postPatch = '' + substituteInPlace ./ggml-metal.m \ + --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";" + + # TODO: Package up each Python script or service appropriately. + # If we were to migrate to buildPythonPackage and prepare the `pyproject.toml`, + # we could make those *.py into setuptools' entrypoints + substituteInPlace ./*.py --replace "/usr/bin/env python" "${llama-python}/bin/python" + ''; + + nativeBuildInputs = + [ + cmake + ninja + pkg-config + git + ] + ++ optionals useCuda [ + cudaPackages.cuda_nvcc + + # TODO: Replace with autoAddDriverRunpath + # once https://github.com/NixOS/nixpkgs/pull/275241 has been merged + cudaPackages.autoAddOpenGLRunpathHook + ]; + + buildInputs = + optionals effectiveStdenv.isDarwin darwinBuildInputs + ++ optionals useCuda cudaBuildInputs + ++ optionals useMpi [ mpi ] + ++ optionals useOpenCL [ clblast ] + ++ optionals useRocm rocmBuildInputs + ++ optionals useVulkan vulkanBuildInputs; + + cmakeFlags = + [ + (cmakeBool "LLAMA_NATIVE" false) + (cmakeBool "LLAMA_BUILD_SERVER" true) + (cmakeBool "BUILD_SHARED_LIBS" true) + (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true) + (cmakeBool "LLAMA_BLAS" useBlas) + (cmakeBool "LLAMA_CLBLAST" useOpenCL) + (cmakeBool "LLAMA_CUBLAS" useCuda) + (cmakeBool "LLAMA_HIPBLAS" useRocm) + (cmakeBool "LLAMA_METAL" useMetalKit) + (cmakeBool "LLAMA_MPI" useMpi) + (cmakeBool "LLAMA_VULKAN" useVulkan) + ] + ++ optionals useCuda [ + ( + with cudaPackages.flags; + cmakeFeature "CMAKE_CUDA_ARCHITECTURES" ( + builtins.concatStringsSep ";" (map dropDot cudaCapabilities) + ) + ) + ] + ++ optionals useRocm [ + (cmakeFeature "CMAKE_C_COMPILER" "hipcc") + (cmakeFeature "CMAKE_CXX_COMPILER" "hipcc") + + # Build all targets supported by rocBLAS. When updating search for TARGET_LIST_ROCM + # in https://github.com/ROCmSoftwarePlatform/rocBLAS/blob/develop/CMakeLists.txt + # and select the line that matches the current nixpkgs version of rocBLAS. + # Should likely use `rocmPackages.clr.gpuTargets`. + "-DAMDGPU_TARGETS=gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102" + ] + ++ optionals useMetalKit [ (lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1") ] + ++ optionals useBlas [ (lib.cmakeFeature "LLAMA_BLAS_VENDOR" "OpenBLAS") ]; + + # TODO(SomeoneSerge): It's better to add proper install targets at the CMake level, + # if they haven't been added yet. + postInstall = '' + mv $out/bin/main $out/bin/llama + mv $out/bin/server $out/bin/llama-server + mkdir -p $out/include + cp $src/llama.h $out/include/ + ''; + + # Define the shells here, but don't add in the inputsFrom to avoid recursion. + passthru = { + inherit + useBlas + useCuda + useMetalKit + useMpi + useOpenCL + useRocm + useVulkan + ; + + shell = mkShell { + name = "shell-${finalAttrs.finalPackage.name}"; + description = "contains numpy and sentencepiece"; + buildInputs = [ llama-python ]; + inputsFrom = [ finalAttrs.finalPackage ]; + shellHook = '' + addToSearchPath "LD_LIBRARY_PATH" "${lib.getLib effectiveStdenv.cc.cc}/lib" + ''; + }; + + shell-extra = mkShell { + name = "shell-extra-${finalAttrs.finalPackage.name}"; + description = "contains numpy, sentencepiece, torchWithoutCuda, and transformers"; + buildInputs = [ llama-python-extra ]; + inputsFrom = [ finalAttrs.finalPackage ]; + }; + }; + + meta = { + # Configurations we don't want even the CI to evaluate. Results in the + # "unsupported platform" messages. This is mostly a no-op, because + # cudaPackages would've refused to evaluate anyway. + badPlatforms = optionals (useCuda || useOpenCL) lib.platforms.darwin; + + # Configurations that are known to result in build failures. Can be + # overridden by importing Nixpkgs with `allowBroken = true`. + broken = (useMetalKit && !effectiveStdenv.isDarwin); + + description = "Inference of LLaMA model in pure C/C++${descriptionSuffix}"; + homepage = "https://github.com/ggerganov/llama.cpp/"; + license = lib.licenses.mit; + + # Accommodates `nix run` and `lib.getExe` + mainProgram = "llama"; + + # These people might respond, on the best effort basis, if you ping them + # in case of Nix-specific regressions or for reviewing Nix-specific PRs. + # Consider adding yourself to this list if you want to ensure this flake + # stays maintained and you're willing to invest your time. Do not add + # other people without their consent. Consider removing people after + # they've been unreachable for long periods of time. + + # Note that lib.maintainers is defined in Nixpkgs, but you may just add + # an attrset following the same format as in + # https://github.com/NixOS/nixpkgs/blob/f36a80e54da29775c78d7eff0e628c2b4e34d1d7/maintainers/maintainer-list.nix + maintainers = with lib.maintainers; [ + philiptaron + SomeoneSerge + ]; + + # Extend `badPlatforms` instead + platforms = lib.platforms.all; + }; + } +) diff --git a/.devops/nix/scope.nix b/.devops/nix/scope.nix new file mode 100644 index 000000000..78530c9e8 --- /dev/null +++ b/.devops/nix/scope.nix @@ -0,0 +1,19 @@ +{ + lib, + newScope, + llamaVersion ? "0.0.0", +}: + +# We're using `makeScope` instead of just writing out an attrset +# because it allows users to apply overlays later using `overrideScope'`. +# Cf. https://noogle.dev/f/lib/makeScope + +lib.makeScope newScope ( + self: { + inherit llamaVersion; + llama-cpp = self.callPackage ./package.nix { }; + docker = self.callPackage ./docker.nix { }; + docker-min = self.callPackage ./docker.nix { interactive = false; }; + sif = self.callPackage ./sif.nix { }; + } +) diff --git a/.devops/nix/sif.nix b/.devops/nix/sif.nix new file mode 100644 index 000000000..7535ca0f3 --- /dev/null +++ b/.devops/nix/sif.nix @@ -0,0 +1,27 @@ +{ + lib, + singularity-tools, + llama-cpp, + bashInteractive, + interactive ? false, +}: + +let + optionalInt = cond: x: if cond then x else 0; +in +singularity-tools.buildImage rec { + inherit (llama-cpp) name; + contents = [ llama-cpp ] ++ lib.optionals interactive [ bashInteractive ]; + + # These are excessive (but safe) for most variants. Building singularity + # images requires superuser privileges, so we build them inside a VM in a + # writable image of pre-determined size. + # + # ROCm is currently affected by https://github.com/NixOS/nixpkgs/issues/276846 + # + # Expected image sizes: + # - cpu/blas: 150M, + # - cuda, all gencodes: 560M, + diskSize = 4096 + optionalInt llama-cpp.useRocm 16384; + memSize = diskSize; +} diff --git a/.devops/server-cuda.Dockerfile b/.devops/server-cuda.Dockerfile new file mode 100644 index 000000000..4f83904bc --- /dev/null +++ b/.devops/server-cuda.Dockerfile @@ -0,0 +1,32 @@ +ARG UBUNTU_VERSION=22.04 +# This needs to generally match the container host's environment. +ARG CUDA_VERSION=11.7.1 +# Target the CUDA build image +ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} +# Target the CUDA runtime image +ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION} + +FROM ${BASE_CUDA_DEV_CONTAINER} as build + +# Unless otherwise specified, we make a fat build. +ARG CUDA_DOCKER_ARCH=all + +RUN apt-get update && \ + apt-get install -y build-essential git + +WORKDIR /app + +COPY . . + +# Set nvcc architecture +ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH} +# Enable cuBLAS +ENV LLAMA_CUBLAS=1 + +RUN make + +FROM ${BASE_CUDA_RUN_CONTAINER} as runtime + +COPY --from=build /app/server /server + +ENTRYPOINT [ "/server" ] diff --git a/.devops/server-intel.Dockerfile b/.devops/server-intel.Dockerfile new file mode 100644 index 000000000..312f2df80 --- /dev/null +++ b/.devops/server-intel.Dockerfile @@ -0,0 +1,28 @@ +ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04 + +FROM intel/oneapi-basekit:$ONEAPI_VERSION as build + +ARG LLAMA_SYCL_F16=OFF +RUN apt-get update && \ + apt-get install -y git + +WORKDIR /app + +COPY . . + +RUN mkdir build && \ + cd build && \ + if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \ + echo "LLAMA_SYCL_F16 is set" && \ + export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \ + fi && \ + cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \ + cmake --build . --config Release --target server + +FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime + +COPY --from=build /app/build/bin/server /server + +ENV LC_ALL=C.utf8 + +ENTRYPOINT [ "/server" ] diff --git a/.devops/server-rocm.Dockerfile b/.devops/server-rocm.Dockerfile new file mode 100644 index 000000000..e9a31647c --- /dev/null +++ b/.devops/server-rocm.Dockerfile @@ -0,0 +1,45 @@ +ARG UBUNTU_VERSION=22.04 + +# This needs to generally match the container host's environment. +ARG ROCM_VERSION=5.6 + +# Target the CUDA build image +ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete + +FROM ${BASE_ROCM_DEV_CONTAINER} as build + +# Unless otherwise specified, we make a fat build. +# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878 +# This is mostly tied to rocBLAS supported archs. +ARG ROCM_DOCKER_ARCH=\ + gfx803 \ + gfx900 \ + gfx906 \ + gfx908 \ + gfx90a \ + gfx1010 \ + gfx1030 \ + gfx1100 \ + gfx1101 \ + gfx1102 + +COPY requirements.txt requirements.txt +COPY requirements requirements + +RUN pip install --upgrade pip setuptools wheel \ + && pip install -r requirements.txt + +WORKDIR /app + +COPY . . + +# Set nvcc architecture +ENV GPU_TARGETS=${ROCM_DOCKER_ARCH} +# Enable ROCm +ENV LLAMA_HIPBLAS=1 +ENV CC=/opt/rocm/llvm/bin/clang +ENV CXX=/opt/rocm/llvm/bin/clang++ + +RUN make + +ENTRYPOINT [ "/app/server" ] diff --git a/.devops/server-vulkan.Dockerfile b/.devops/server-vulkan.Dockerfile new file mode 100644 index 000000000..e0add6fc3 --- /dev/null +++ b/.devops/server-vulkan.Dockerfile @@ -0,0 +1,29 @@ +ARG UBUNTU_VERSION=jammy + +FROM ubuntu:$UBUNTU_VERSION as build + +# Install build tools +RUN apt update && apt install -y git build-essential cmake wget + +# Install Vulkan SDK +RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \ + wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \ + apt update -y && \ + apt-get install -y vulkan-sdk + +# Build it +WORKDIR /app +COPY . . +RUN mkdir build && \ + cd build && \ + cmake .. -DLLAMA_VULKAN=1 && \ + cmake --build . --config Release --target server + +# Clean up +WORKDIR / +RUN cp /app/build/bin/server /server && \ + rm -rf /app + +ENV LC_ALL=C.utf8 + +ENTRYPOINT [ "/server" ] diff --git a/.devops/server.Dockerfile b/.devops/server.Dockerfile new file mode 100644 index 000000000..134588fe2 --- /dev/null +++ b/.devops/server.Dockerfile @@ -0,0 +1,20 @@ +ARG UBUNTU_VERSION=22.04 + +FROM ubuntu:$UBUNTU_VERSION as build + +RUN apt-get update && \ + apt-get install -y build-essential git + +WORKDIR /app + +COPY . . + +RUN make + +FROM ubuntu:$UBUNTU_VERSION as runtime + +COPY --from=build /app/server /server + +ENV LC_ALL=C.utf8 + +ENTRYPOINT [ "/server" ] diff --git a/.devops/tools.sh b/.devops/tools.sh index 9d999315f..3a7d274e4 100755 --- a/.devops/tools.sh +++ b/.devops/tools.sh @@ -13,6 +13,8 @@ elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then ./quantize "$@" elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then ./main "$@" +elif [[ "$arg1" == '--finetune' || "$arg1" == '-f' ]]; then + ./finetune "$@" elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then echo "Converting PTH to GGML..." for i in `ls $1/$2/ggml-model-f16.bin*`; do @@ -34,6 +36,8 @@ else echo " ex: --outtype f16 \"/models/7B/\" " echo " --quantize (-q): Optimize with quantization process ggml" echo " ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2" + echo " --finetune (-f): Run finetune command to create a lora finetune of the model" + echo " See documentation for finetune for command-line parameters" echo " --all-in-one (-a): Execute --convert & --quantize" echo " ex: \"/models/\" 7B" echo " --server (-s): Run a model on the server" diff --git a/.ecrc b/.ecrc index b682057dd..a3351f4e6 100644 --- a/.ecrc +++ b/.ecrc @@ -1,4 +1,5 @@ { + "Exclude": ["^\\.gitmodules$"], "Disable": { "IndentSize": true } diff --git a/.editorconfig b/.editorconfig index f8245b85c..16d16b3b5 100644 --- a/.editorconfig +++ b/.editorconfig @@ -15,8 +15,14 @@ indent_size = 4 [Makefile] indent_style = tab +[scripts/*.mk] +indent_style = tab + [prompts/*.txt] insert_final_newline = unset [examples/server/public/*] indent_size = 2 + +[examples/llama.swiftui/llama.swiftui.xcodeproj/*] +indent_style = tab diff --git a/.flake8 b/.flake8 index 113ca5fd3..18fba2c15 100644 --- a/.flake8 +++ b/.flake8 @@ -1,2 +1,3 @@ [flake8] max-line-length = 125 +ignore = W503 diff --git a/.github/ISSUE_TEMPLATE/bug.md b/.github/ISSUE_TEMPLATE/bug.md new file mode 100644 index 000000000..49812832c --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug.md @@ -0,0 +1,11 @@ +--- +name: Bug template +about: Used to report bugs in llama.cpp +labels: ["bug-unconfirmed"] +assignees: '' + +--- + +Please include information about your system, the steps to reproduce the bug, and the version of llama.cpp that you are using. If possible, please provide a minimal code example that reproduces the bug. + +If the bug concerns the server, please try to reproduce it first using the [server test scenario framework](https://github.com/ggerganov/llama.cpp/tree/master/examples/server/tests). diff --git a/.github/ISSUE_TEMPLATE/custom.md b/.github/ISSUE_TEMPLATE/custom.md deleted file mode 100644 index 8fd955356..000000000 --- a/.github/ISSUE_TEMPLATE/custom.md +++ /dev/null @@ -1,185 +0,0 @@ ---- -name: Issue and enhancement template -about: Used to report issues and request enhancements for llama.cpp -title: "[User] Insert summary of your issue or enhancement.." -labels: '' -assignees: '' - ---- - -# Prerequisites - -Please answer the following questions for yourself before submitting an issue. - -- [ ] I am running the latest code. Development is very rapid so there are no tagged versions as of now. -- [ ] I carefully followed the [README.md](https://github.com/ggerganov/llama.cpp/blob/master/README.md). -- [ ] I [searched using keywords relevant to my issue](https://docs.github.com/en/issues/tracking-your-work-with-issues/filtering-and-searching-issues-and-pull-requests) to make sure that I am creating a new issue that is not already open (or closed). -- [ ] I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new bug or useful enhancement to share. - -# Expected Behavior - -Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do. - -# Current Behavior - -Please provide a detailed written description of what `llama.cpp` did, instead. - -# Environment and Context - -Please provide detailed information about your computer setup. This is important in case the issue is not reproducible except for under certain specific conditions. - -* Physical (or virtual) hardware you are using, e.g. for Linux: - -`$ lscpu` - -* Operating System, e.g. for Linux: - -`$ uname -a` - -* SDK version, e.g. for Linux: - -``` -$ python3 --version -$ make --version -$ g++ --version -``` - -# Failure Information (for bugs) - -Please help provide information about the failure if this is a bug. If it is not a bug, please remove the rest of this template. - -# Steps to Reproduce - -Please provide detailed steps for reproducing the issue. We are not sitting in front of your screen, so the more detail the better. - -1. step 1 -2. step 2 -3. step 3 -4. etc. - -# Failure Logs - -Please include any relevant log snippets or files. If it works under one configuration but not under another, please provide logs for both configurations and their corresponding outputs so it is easy to see where behavior changes. - -Also, please try to **avoid using screenshots** if at all possible. Instead, copy/paste the console output and use [Github's markdown](https://docs.github.com/en/get-started/writing-on-github/getting-started-with-writing-and-formatting-on-github/basic-writing-and-formatting-syntax) to cleanly format your logs for easy readability. - -Example environment info: -``` -llama.cpp$ git log | head -1 -commit 2af23d30434a677c6416812eea52ccc0af65119c - -llama.cpp$ lscpu | egrep "AMD|Flags" -Vendor ID: AuthenticAMD -Model name: AMD Ryzen Threadripper 1950X 16-Core Processor -Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid amd_dcm aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb hw_pstate ssbd ibpb vmmcall fsgsbase bmi1 avx2 smep bmi2 rdseed adx smap clflushopt sha_ni xsaveopt xsavec xgetbv1 xsaves clzero irperf xsaveerptr arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif overflow_recov succor smca sme sev -Virtualization: AMD-V - -llama.cpp$ python3 --version -Python 3.10.9 - -llama.cpp$ pip list | egrep "torch|numpy|sentencepiece" -numpy 1.24.2 -numpydoc 1.5.0 -sentencepiece 0.1.97 -torch 1.13.1 -torchvision 0.14.1 - -llama.cpp$ make --version | head -1 -GNU Make 4.3 - -$ md5sum ./models/65B/ggml-model-q4_0.bin -dbdd682cce80e2d6e93cefc7449df487 ./models/65B/ggml-model-q4_0.bin -``` - -Example run with the Linux command [perf](https://www.brendangregg.com/perf.html) -``` -llama.cpp$ perf stat ./main -m ./models/65B/ggml-model-q4_0.bin -t 16 -n 1024 -p "Please close your issue when it has been answered." -main: seed = 1679149377 -llama_model_load: loading model from './models/65B/ggml-model-q4_0.bin' - please wait ... -llama_model_load: n_vocab = 32000 -llama_model_load: n_ctx = 512 -llama_model_load: n_embd = 8192 -llama_model_load: n_mult = 256 -llama_model_load: n_head = 64 -llama_model_load: n_layer = 80 -llama_model_load: n_rot = 128 -llama_model_load: f16 = 2 -llama_model_load: n_ff = 22016 -llama_model_load: n_parts = 8 -llama_model_load: ggml ctx size = 41477.73 MB -llama_model_load: memory_size = 2560.00 MB, n_mem = 40960 -llama_model_load: loading model part 1/8 from './models/65B/ggml-model-q4_0.bin' -llama_model_load: .......................................................................................... done -llama_model_load: model size = 4869.09 MB / num tensors = 723 -llama_model_load: loading model part 2/8 from './models/65B/ggml-model-q4_0.bin.1' -llama_model_load: .......................................................................................... done -llama_model_load: model size = 4869.09 MB / num tensors = 723 -llama_model_load: loading model part 3/8 from './models/65B/ggml-model-q4_0.bin.2' -llama_model_load: .......................................................................................... done -llama_model_load: model size = 4869.09 MB / num tensors = 723 -llama_model_load: loading model part 4/8 from './models/65B/ggml-model-q4_0.bin.3' -llama_model_load: .......................................................................................... done -llama_model_load: model size = 4869.09 MB / num tensors = 723 -llama_model_load: loading model part 5/8 from './models/65B/ggml-model-q4_0.bin.4' -llama_model_load: .......................................................................................... done -llama_model_load: model size = 4869.09 MB / num tensors = 723 -llama_model_load: loading model part 6/8 from './models/65B/ggml-model-q4_0.bin.5' -llama_model_load: .......................................................................................... done -llama_model_load: model size = 4869.09 MB / num tensors = 723 -llama_model_load: loading model part 7/8 from './models/65B/ggml-model-q4_0.bin.6' -llama_model_load: .......................................................................................... done -llama_model_load: model size = 4869.09 MB / num tensors = 723 -llama_model_load: loading model part 8/8 from './models/65B/ggml-model-q4_0.bin.7' -llama_model_load: .......................................................................................... done -llama_model_load: model size = 4869.09 MB / num tensors = 723 - -system_info: n_threads = 16 / 32 | AVX = 1 | AVX2 = 1 | AVX512 = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | VSX = 0 | - -main: prompt: 'Please close your issue when it has been answered.' -main: number of tokens in prompt = 11 - 1 -> '' - 12148 -> 'Please' - 3802 -> ' close' - 596 -> ' your' - 2228 -> ' issue' - 746 -> ' when' - 372 -> ' it' - 756 -> ' has' - 1063 -> ' been' - 7699 -> ' answered' - 29889 -> '.' - -sampling parameters: temp = 0.800000, top_k = 40, top_p = 0.950000, repeat_last_n = 64, repeat_penalty = 1.300000 - - -Please close your issue when it has been answered. -@duncan-donut: I'm trying to figure out what kind of "support" you need for this script and why, exactly? Is there a question about how the code works that hasn't already been addressed in one or more comments below this ticket, or are we talking something else entirely like some sorta bugfixing job because your server setup is different from mine?? -I can understand if your site needs to be running smoothly and you need help with a fix of sorts but there should really be nothing wrong here that the code itself could not handle. And given that I'm getting reports about how it works perfectly well on some other servers, what exactly are we talking? A detailed report will do wonders in helping us get this resolved for ya quickly so please take your time and describe the issue(s) you see as clearly & concisely as possible!! -@duncan-donut: I'm not sure if you have access to cPanel but you could try these instructions. It is worth a shot! Let me know how it goes (or what error message, exactly!) when/if ya give that code a go? [end of text] - - -main: mem per token = 71159620 bytes -main: load time = 19309.95 ms -main: sample time = 168.62 ms -main: predict time = 223895.61 ms / 888.47 ms per token -main: total time = 246406.42 ms - - Performance counter stats for './main -m ./models/65B/ggml-model-q4_0.bin -t 16 -n 1024 -p Please close your issue when it has been answered.': - - 3636882.89 msec task-clock # 14.677 CPUs utilized - 13509 context-switches # 3.714 /sec - 2436 cpu-migrations # 0.670 /sec - 10476679 page-faults # 2.881 K/sec - 13133115082869 cycles # 3.611 GHz (16.77%) - 29314462753 stalled-cycles-frontend # 0.22% frontend cycles idle (16.76%) - 10294402631459 stalled-cycles-backend # 78.39% backend cycles idle (16.74%) - 23479217109614 instructions # 1.79 insn per cycle - # 0.44 stalled cycles per insn (16.76%) - 2353072268027 branches # 647.002 M/sec (16.77%) - 1998682780 branch-misses # 0.08% of all branches (16.76%) - - 247.802177522 seconds time elapsed - - 3618.573072000 seconds user - 18.491698000 seconds sys -``` diff --git a/.github/ISSUE_TEMPLATE/enhancement.md b/.github/ISSUE_TEMPLATE/enhancement.md new file mode 100644 index 000000000..dcffda750 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/enhancement.md @@ -0,0 +1,28 @@ +--- +name: Enhancement template +about: Used to request enhancements for llama.cpp +labels: ["enhancement"] +assignees: '' + +--- + +# Prerequisites + +Please answer the following questions for yourself before submitting an issue. + +- [ ] I am running the latest code. Development is very rapid so there are no tagged versions as of now. +- [ ] I carefully followed the [README.md](https://github.com/ggerganov/llama.cpp/blob/master/README.md). +- [ ] I [searched using keywords relevant to my issue](https://docs.github.com/en/issues/tracking-your-work-with-issues/filtering-and-searching-issues-and-pull-requests) to make sure that I am creating a new issue that is not already open (or closed). +- [ ] I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new bug or useful enhancement to share. + +# Feature Description + +Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do as an enhancement. + +# Motivation + +Please provide a detailed written description of reasons why this feature is necessary and how it is useful to `llama.cpp` users. + +# Possible Implementation + +If you have an idea as to how it can be implemented, please write a detailed description. Feel free to give links to external sources or share visuals that might be helpful to understand the details better. diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 5af497a3c..03d76d455 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -37,6 +37,8 @@ jobs: - name: Build id: make_build + env: + LLAMA_FATAL_WARNINGS: 1 run: | CC=gcc-8 make -j $(nproc) @@ -65,14 +67,14 @@ jobs: run: | mkdir build cd build - cmake .. + cmake .. -DLLAMA_FATAL_WARNINGS=ON cmake --build . --config Release -j $(nproc) - name: Test id: cmake_test run: | cd build - ctest --verbose --timeout 900 + ctest -L main --verbose --timeout 900 ubuntu-latest-cmake-sanitizer: runs-on: ubuntu-latest @@ -100,14 +102,14 @@ jobs: run: | mkdir build cd build - cmake .. -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} + cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} cmake --build . --config ${{ matrix.build_type }} -j $(nproc) - name: Test id: cmake_test run: | cd build - ctest --verbose --timeout 900 + ctest -L main --verbose --timeout 900 ubuntu-latest-cmake-mpi: runs-on: ubuntu-latest @@ -141,8 +143,93 @@ jobs: id: cmake_test run: | cd build - ctest --verbose + ctest -L main --verbose + ubuntu-22-cmake-sycl: + runs-on: ubuntu-22.04 + + continue-on-error: true + + steps: + - uses: actions/checkout@v2 + + - name: add oneAPI to apt + shell: bash + run: | + cd /tmp + wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB + sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB + rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB + sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main" + + - name: install oneAPI dpcpp compiler + shell: bash + run: | + sudo apt update + sudo apt install intel-oneapi-compiler-dpcpp-cpp + + - name: install oneAPI MKL library + shell: bash + run: | + sudo apt install intel-oneapi-mkl-devel + + - name: Clone + id: checkout + uses: actions/checkout@v3 + + - name: Build + id: cmake_build + run: | + source /opt/intel/oneapi/setvars.sh + mkdir build + cd build + cmake -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx .. + cmake --build . --config Release -j $(nproc) + + ubuntu-22-cmake-sycl-fp16: + runs-on: ubuntu-22.04 + + continue-on-error: true + + steps: + - uses: actions/checkout@v2 + + - name: add oneAPI to apt + shell: bash + run: | + cd /tmp + wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB + sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB + rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB + sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main" + + - name: install oneAPI dpcpp compiler + shell: bash + run: | + sudo apt update + sudo apt install intel-oneapi-compiler-dpcpp-cpp + + - name: install oneAPI MKL library + shell: bash + run: | + sudo apt install intel-oneapi-mkl-devel + + - name: Clone + id: checkout + uses: actions/checkout@v3 + + - name: Build + id: cmake_build + run: | + source /opt/intel/oneapi/setvars.sh + mkdir build + cd build + cmake -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON .. + cmake --build . --config Release -j $(nproc) + + # TODO: build with LLAMA_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know + # how to debug it. + # ref: https://github.com/ggerganov/llama.cpp/actions/runs/7131777249/job/19420981052#step:5:1124 macOS-latest-make: runs-on: macos-latest @@ -159,15 +246,21 @@ jobs: - name: Build id: make_build + env: + LLAMA_FATAL_WARNINGS: 1 run: | - make -j $(sysctl -n hw.logicalcpu) + LLAMA_NO_METAL=1 make -j $(sysctl -n hw.logicalcpu) - name: Test id: make_test run: | - make tests -j $(sysctl -n hw.logicalcpu) - make test -j $(sysctl -n hw.logicalcpu) + LLAMA_NO_METAL=1 make tests -j $(sysctl -n hw.logicalcpu) + LLAMA_NO_METAL=1 make test -j $(sysctl -n hw.logicalcpu) + # TODO: build with LLAMA_METAL=OFF because test-backend-ops fail on "Apple Paravirtual device" and I don't know + # how to debug it. + # ref: https://github.com/ggerganov/llama.cpp/actions/runs/7132125951/job/19422043567?pr=4359#step:5:6584 + # would be great if we fix these macOS-latest-cmake: runs-on: macos-latest @@ -188,14 +281,14 @@ jobs: sysctl -a mkdir build cd build - cmake .. + cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL=OFF .. cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) - name: Test id: cmake_test run: | cd build - ctest --verbose --timeout 900 + ctest -L main --verbose --timeout 900 macOS-latest-cmake-ios: runs-on: macos-latest @@ -288,6 +381,8 @@ jobs: OPENBLAS_VERSION: 0.3.23 OPENCL_VERSION: 2023.04.17 CLBLAST_VERSION: 1.6.0 + SDE_VERSION: 9.33.0-2024-01-07 + VULKAN_VERSION: 1.3.261.1 strategy: matrix: @@ -304,6 +399,10 @@ jobs: defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"' - build: 'openblas' defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"' + - build: 'kompute' + defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON' + - build: 'vulkan' + defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_VULKAN=ON -DBUILD_SHARED_LIBS=ON' steps: - name: Clone @@ -312,6 +411,12 @@ jobs: with: fetch-depth: 0 + - name: Clone Kompute submodule + id: clone_kompute + if: ${{ matrix.build == 'kompute' }} + run: | + git submodule update --init kompute + - name: Download OpenCL SDK id: get_opencl if: ${{ matrix.build == 'clblast' }} @@ -346,6 +451,15 @@ jobs: $lib = $(join-path $msvc 'bin\Hostx64\x64\lib.exe') & $lib /machine:x64 "/def:${env:RUNNER_TEMP}/openblas/lib/libopenblas.def" "/out:${env:RUNNER_TEMP}/openblas/lib/openblas.lib" /name:openblas.dll + - name: Install Vulkan SDK + id: get_vulkan + if: ${{ matrix.build == 'kompute' || matrix.build == 'vulkan' }} + run: | + curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe" + & "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install + Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}" + Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin" + - name: Build id: cmake_build run: | @@ -383,10 +497,23 @@ jobs: - name: Test id: cmake_test - if: ${{ matrix.build != 'clblast' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }} # Test AVX-512 only when possible + # not all machines have native AVX-512 + if: ${{ matrix.build != 'clblast' && matrix.build != 'kompute' && matrix.build != 'vulkan' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }} run: | cd build - ctest -C Release --verbose --timeout 900 + ctest -L main -C Release --verbose --timeout 900 + + - name: Test (Intel SDE) + id: cmake_test_sde + if: ${{ matrix.build == 'avx512' && env.HAS_AVX512F == '0' }} # use Intel SDE for AVX-512 emulation + run: | + curl.exe -o $env:RUNNER_TEMP/sde.tar.xz -L "https://downloadmirror.intel.com/813591/sde-external-${env:SDE_VERSION}-win.tar.xz" + # for some weird reason windows tar doesn't like sde tar.xz + 7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar.xz + 7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar + $sde = $(join-path $env:RUNNER_TEMP sde-external-${env:SDE_VERSION}-win/sde.exe) + cd build + & $sde -future -- ctest -L main -C Release --verbose --timeout 900 - name: Determine tag name id: tag @@ -485,6 +612,66 @@ jobs: path: | cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip + windows-latest-cmake-sycl: + runs-on: windows-latest + defaults: + run: + shell: bash + + env: + WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/62641e01-1e8d-4ace-91d6-ae03f7f8a71f/w_BaseKit_p_2024.0.0.49563_offline.exe + WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel + + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: Install + run: scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL + + - name: Build + id: cmake_build + run: examples/sycl/win-build-sycl.bat + + ios-xcode-build: + runs-on: macos-latest + + steps: + - name: Checkout code + uses: actions/checkout@v3 + + - name: Build Xcode project + run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' build + + android-build: + runs-on: ubuntu-latest + + steps: + - name: Clone + uses: actions/checkout@v3 + + - name: Set up JDK + uses: actions/setup-java@v3 + with: + java-version: 17 + distribution: zulu + + - name: Setup Android SDK + uses: android-actions/setup-android@v3 + with: + log-accepted-android-sdk-licenses: false + + - name: Build + run: | + cd examples/llama.android + + # Skip armeabi-v7a for now (https://github.com/llvm/llvm-project/issues/65820). + ./gradlew build --no-daemon -Pskip-armeabi-v7a + # freeBSD-latest: # runs-on: macos-12 # steps: diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 9c90c77ac..94f9161fc 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -28,13 +28,18 @@ jobs: config: - { tag: "light", dockerfile: ".devops/main.Dockerfile", platforms: "linux/amd64,linux/arm64" } - { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" } + - { tag: "server", dockerfile: ".devops/server.Dockerfile", platforms: "linux/amd64,linux/arm64" } # NOTE(canardletter): The CUDA builds on arm64 are very slow, so I # have disabled them for now until the reason why # is understood. - { tag: "light-cuda", dockerfile: ".devops/main-cuda.Dockerfile", platforms: "linux/amd64" } - { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" } + - { tag: "server-cuda", dockerfile: ".devops/server-cuda.Dockerfile", platforms: "linux/amd64" } - { tag: "light-rocm", dockerfile: ".devops/main-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" } - { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" } + - { tag: "server-rocm", dockerfile: ".devops/server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" } + - { tag: "light-intel", dockerfile: ".devops/main-intel.Dockerfile", platforms: "linux/amd64" } + - { tag: "server-intel", dockerfile: ".devops/server-intel.Dockerfile", platforms: "linux/amd64" } steps: - name: Check out the repo uses: actions/checkout@v3 @@ -52,6 +57,36 @@ jobs: username: ${{ github.repository_owner }} password: ${{ secrets.GITHUB_TOKEN }} + # https://github.com/jlumbroso/free-disk-space/tree/54081f138730dfa15788a46383842cd2f914a1be#example + - name: Free Disk Space (Ubuntu) + uses: jlumbroso/free-disk-space@main + with: + # this might remove tools that are actually needed, + # if set to "true" but frees about 6 GB + tool-cache: false + + # all of these default to true, but feel free to set to + # "false" if necessary for your workflow + android: true + dotnet: true + haskell: true + large-packages: true + docker-images: true + swap-storage: true + + - name: Determine tag name + id: tag + shell: bash + run: | + BUILD_NUMBER="$(git rev-list --count HEAD)" + SHORT_HASH="$(git rev-parse --short=7 HEAD)" + if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then + echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT + else + SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-') + echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT + fi + - name: Build and push Docker image (versioned) if: github.event_name == 'push' uses: docker/build-push-action@v4 @@ -59,7 +94,7 @@ jobs: context: . push: true platforms: ${{ matrix.config.platforms }} - tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}" + tags: "ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}" file: ${{ matrix.config.dockerfile }} - name: Build and push Docker image (tagged) @@ -68,5 +103,5 @@ jobs: context: . push: ${{ github.event_name == 'push' }} platforms: ${{ matrix.config.platforms }} - tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}" + tags: "ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }},ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }}-${{ steps.tag.outputs.name }}" file: ${{ matrix.config.dockerfile }} diff --git a/.github/workflows/editorconfig.yml b/.github/workflows/editorconfig.yml index b4e535acf..0e0993cd4 100644 --- a/.github/workflows/editorconfig.yml +++ b/.github/workflows/editorconfig.yml @@ -1,6 +1,12 @@ name: EditorConfig Checker on: + workflow_dispatch: # allows manual triggering + inputs: + create_release: + description: 'Create new release' + required: true + type: boolean push: branches: - master diff --git a/.github/workflows/nix-ci-aarch64.yml b/.github/workflows/nix-ci-aarch64.yml new file mode 100644 index 000000000..8d0a3fd7f --- /dev/null +++ b/.github/workflows/nix-ci-aarch64.yml @@ -0,0 +1,61 @@ +name: Nix aarch64 builds + +on: + workflow_dispatch: # allows manual triggering + schedule: + # Rebuild daily rather than on every push because QEMU is expensive (e.g. + # 1.5h instead of minutes with the cold cache). + # + # randint(0, 59), randint(0, 23) + - cron: '26 12 * * *' + # But also rebuild if we touched any of the Nix expressions: + push: + branches: + - master + paths: ['**/*.nix', 'flake.lock'] + pull_request: + types: [opened, synchronize, reopened] + paths: ['**/*.nix', 'flake.lock'] + +jobs: + nix-build-aarch64: + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - name: Install QEMU + # Copy-paste from https://github.com/orgs/community/discussions/8305#discussioncomment-5888654 + run: | + sudo apt-get update + sudo apt-get install -y qemu-user-static qemu-system-aarch64 + sudo usermod -a -G kvm $USER + - name: Install Nix + uses: DeterminateSystems/nix-installer-action@v9 + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + extra-conf: | + extra-platforms = aarch64-linux + extra-system-features = nixos-test kvm + extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org + extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E= + - uses: DeterminateSystems/magic-nix-cache-action@v2 + with: + upstream-cache: https://${{ matrix.cachixName }}.cachix.org + - name: Set-up cachix to push the results to + uses: cachix/cachix-action@v13 + with: + authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}' + name: llama-cpp + - name: Show all output paths + run: > + nix run github:nix-community/nix-eval-jobs + -- --gc-roots-dir gcroot + --flake + ".#packages.aarch64-linux" + - name: Build + run: > + nix run github:Mic92/nix-fast-build + -- --skip-cached --no-nom + --systems aarch64-linux + --flake + ".#checks.aarch64-linux" diff --git a/.github/workflows/nix-ci.yml b/.github/workflows/nix-ci.yml new file mode 100644 index 000000000..01c5a9d5a --- /dev/null +++ b/.github/workflows/nix-ci.yml @@ -0,0 +1,68 @@ +name: Nix CI + +on: + workflow_dispatch: # allows manual triggering + push: + branches: + - master + pull_request: + types: [opened, synchronize, reopened] + +jobs: + nix-eval: + strategy: + fail-fast: false + matrix: + os: [ ubuntu-latest, macos-latest ] + runs-on: ${{ matrix.os }} + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - name: Install Nix + uses: DeterminateSystems/nix-installer-action@v9 + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + extra-conf: | + extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org + extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E= + - uses: DeterminateSystems/magic-nix-cache-action@v2 + with: + upstream-cache: https://${{ matrix.cachixName }}.cachix.org + - name: List all flake outputs + run: nix flake show --all-systems + - name: Show all output paths + run: > + nix run github:nix-community/nix-eval-jobs + -- --gc-roots-dir gcroot + --flake + ".#packages.$(nix eval --raw --impure --expr builtins.currentSystem)" + nix-build: + strategy: + fail-fast: false + matrix: + os: [ ubuntu-latest, macos-latest ] + runs-on: ${{ matrix.os }} + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - name: Install Nix + uses: DeterminateSystems/nix-installer-action@v9 + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + extra-conf: | + extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org + extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E= + - uses: DeterminateSystems/magic-nix-cache-action@v2 + with: + upstream-cache: https://${{ matrix.cachixName }}.cachix.org + - name: Set-up cachix to push the results to + uses: cachix/cachix-action@v13 + with: + authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}' + name: llama-cpp + - name: Build + run: > + nix run github:Mic92/nix-fast-build + -- --skip-cached --no-nom + --flake + ".#checks.$(nix eval --raw --impure --expr builtins.currentSystem)" diff --git a/.github/workflows/nix-flake-update.yml b/.github/workflows/nix-flake-update.yml new file mode 100644 index 000000000..3a6a96e26 --- /dev/null +++ b/.github/workflows/nix-flake-update.yml @@ -0,0 +1,22 @@ +name: update-flake-lock +on: + workflow_dispatch: + schedule: + - cron: '0 0 * * 0' # runs weekly on Sunday at 00:00 + +jobs: + lockfile: + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - name: Install Nix + uses: DeterminateSystems/nix-installer-action@main + - name: Update flake.lock + uses: DeterminateSystems/update-flake-lock@main + with: + pr-title: "nix: update flake.lock" + pr-labels: | + nix + pr-reviewers: philiptaron,SomeoneSerge + token: ${{ secrets.FLAKE_TOKEN }} diff --git a/.github/workflows/nix-publish-flake.yml b/.github/workflows/nix-publish-flake.yml new file mode 100644 index 000000000..2c3c1ebda --- /dev/null +++ b/.github/workflows/nix-publish-flake.yml @@ -0,0 +1,36 @@ +# Make the flake discoverable on https://flakestry.dev and https://flakehub.com/flakes +name: "Publish a flake to flakestry & flakehub" +on: + push: + tags: + - "*" + workflow_dispatch: + inputs: + tag: + description: "The existing tag to publish" + type: "string" + required: true +jobs: + flakestry-publish: + runs-on: ubuntu-latest + permissions: + id-token: "write" + contents: "read" + steps: + - uses: flakestry/flakestry-publish@main + with: + version: "${{ inputs.tag || github.ref_name }}" + flakehub-publish: + runs-on: "ubuntu-latest" + permissions: + id-token: "write" + contents: "read" + steps: + - uses: "actions/checkout@v4" + with: + ref: "${{ (inputs.tag != null) && format('refs/tags/{0}', inputs.tag) || '' }}" + - uses: "DeterminateSystems/nix-installer-action@main" + - uses: "DeterminateSystems/flakehub-push@main" + with: + visibility: "public" + tag: "${{ inputs.tag }}" diff --git a/.github/workflows/python-check-requirements.yml b/.github/workflows/python-check-requirements.yml new file mode 100644 index 000000000..92e1108b3 --- /dev/null +++ b/.github/workflows/python-check-requirements.yml @@ -0,0 +1,29 @@ +name: Python check requirements.txt + +on: + push: + paths: + - 'scripts/check-requirements.sh' + - 'convert*.py' + - 'requirements.txt' + - 'requirements/*.txt' + pull_request: + paths: + - 'scripts/check-requirements.sh' + - 'convert*.py' + - 'requirements.txt' + - 'requirements/*.txt' + +jobs: + python-check-requirements: + runs-on: ubuntu-latest + name: check-requirements + steps: + - name: Check out source repository + uses: actions/checkout@v3 + - name: Set up Python environment + uses: actions/setup-python@v4 + with: + python-version: "3.11" + - name: Run check-requirements.sh script + run: bash scripts/check-requirements.sh nocleanup diff --git a/.github/workflows/python-lint.yml b/.github/workflows/python-lint.yml new file mode 100644 index 000000000..ea0a05ea1 --- /dev/null +++ b/.github/workflows/python-lint.yml @@ -0,0 +1,20 @@ +name: flake8 Lint + +on: [push, pull_request] + +jobs: + flake8-lint: + runs-on: ubuntu-latest + name: Lint + steps: + - name: Check out source repository + uses: actions/checkout@v3 + - name: Set up Python environment + uses: actions/setup-python@v4 + with: + python-version: "3.11" + - name: flake8 Lint + uses: py-actions/flake8@v2 + with: + ignore: "E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704,W503" + exclude: "examples/*,examples/*/**,*/**/__init__.py" diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml new file mode 100644 index 000000000..ed27dc528 --- /dev/null +++ b/.github/workflows/server.yml @@ -0,0 +1,127 @@ +# Server build and tests +name: Server + +on: + workflow_dispatch: # allows manual triggering + push: + branches: + - master + - test/server-add-ci-test # FIXME remove + paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*'] + pull_request: + types: [opened, synchronize, reopened] + paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*'] + +jobs: + server: + runs-on: ubuntu-latest + + strategy: + matrix: + build: [noavx, avx2, avx, avx512, cublas, clblast, openblas, kompute, vulkan] + sanitizer: [ADDRESS, THREAD, UNDEFINED] + build_type: [Debug, Release] + include: + - build: 'noavx' + defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF' + image: ubuntu:latest + - build: 'avx2' + defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON' + image: ubuntu:latest + - build: 'avx' + defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF' + image: ubuntu:latest + - build: 'avx512' + defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON' + image: ubuntu:latest + experimental: true + - build: 'cublas' + defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON' + image: nvidia/cuda:12.3.1-devel-ubuntu22.04 + arch_not_available: true # require nvidia docker engine + - build: 'clblast' + defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON' + image: ubuntu:latest + arch_not_available: true + - build: 'openblas' + defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS' + image: ubuntu:latest + - build: 'kompute' + defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON' + image: ubuntu:latest + arch_not_available: true + - build: 'vulkan' + defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_VULKAN=ON' + image: ubuntu:latest + arch_not_available: true + + container: + image: ${{ matrix.image }} + ports: + - 8888 + options: --cpus 4 + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v3 + + - name: Dependencies + id: depends + run: | + apt-get update + apt-get -y install \ + build-essential \ + pkg-config \ + git \ + cmake \ + python3-pip \ + wget \ + psmisc + + - name: Download CLBlast + id: get_clblast + if: ${{ matrix.build == 'clblast' }} + run: | + apt install -y libclblast-dev + + - name: Download OpenBLAS + id: get_openblas + if: ${{ matrix.build == 'openblas' }} + run: | + apt-get -y install libopenblas-dev + + - name: Install Vulkan SDK + id: get_vulkan + if: ${{ matrix.build == 'kompute' || matrix.build == 'vulkan' }} + run: | + wget -qO- https://packages.lunarg.com/lunarg-signing-key-pub.asc | tee /etc/apt/trusted.gpg.d/lunarg.asc + wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list http://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list + apt-get update + apt-get -y install vulkan-sdk + + - name: Build + id: cmake_build + run: | + mkdir build + cd build + cmake .. -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ${{ matrix.defines }} + cmake --build . --config ${{ matrix.build_type }} -j $(nproc) --target server + + - name: Tests dependencies + id: test_dependencies + run: | + pip install -r examples/server/tests/requirements.txt + + - name: Download models + id: download_models + run: | + cd examples/server/tests + ../../../scripts/hf.sh --repo ggml-org/models --file tinyllamas/stories260K.gguf + + - name: Tests + id: server_integration_test + continue-on-error: ${{ matrix.experimental || matrix.arch_not_available }} + run: | + cd examples/server/tests + PORT=8888 ./tests.sh diff --git a/.gitignore b/.gitignore index 471cf90d5..62b6b8b1a 100644 --- a/.gitignore +++ b/.gitignore @@ -10,10 +10,12 @@ *.gcno *.gcda *.dot +*.bat *.metallib .DS_Store .build/ .cache/ +.ccls-cache/ .direnv/ .envrc .swiftpm @@ -21,11 +23,13 @@ .clang-tidy .vs/ .vscode/ +.idea/ lcov-report/ gcovr-report/ -build*/ +build* +cmake-build-* out/ tmp/ @@ -41,12 +45,16 @@ models-mnt /embedding /gguf /gguf-llama-simple +/imatrix /infill /libllama.so /llama-bench -/llava +/llava-cli +/lookahead +/lookup /main /metal +/passkey /perplexity /q8dot /quantize @@ -62,8 +70,9 @@ models-mnt /speculative /parallel /train-text-from-scratch +/tokenize /vdot -build-info.h +/common/build-info.cpp arm_neon.h compile_commands.json CMakeSettings.json @@ -82,17 +91,4 @@ examples/jeopardy/results.txt poetry.lock poetry.toml - -# Test binaries -tests/test-grammar-parser -tests/test-llama-grammar -tests/test-double-float -tests/test-grad0 -tests/test-opt -tests/test-quantize-fns -tests/test-quantize-perf -tests/test-sampling -tests/test-tokenizer-0-llama -tests/test-tokenizer-0-falcon -tests/test-tokenizer-1-llama -tests/test-tokenizer-1-bpe +nppBackup diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 000000000..b7e8b8ff2 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "kompute"] + path = kompute + url = https://github.com/nomic-ai/kompute.git diff --git a/CMakeLists.txt b/CMakeLists.txt index 6af42a6c2..48880f720 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,6 @@ -cmake_minimum_required(VERSION 3.13) # for add_link_options +cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories. project("llama.cpp" C CXX) +include(CheckIncludeFileCXX) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) @@ -10,7 +11,7 @@ endif() set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) -if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR) +if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR) set(LLAMA_STANDALONE ON) # configure project version @@ -43,15 +44,20 @@ else() endif() # general +option(BUILD_SHARED_LIBS "build shared libraries" OFF) option(LLAMA_STATIC "llama: static link libraries" OFF) option(LLAMA_NATIVE "llama: enable -march=native flag" ON) option(LLAMA_LTO "llama: enable link time optimization" OFF) +option(LLAMA_CCACHE "llama: use ccache if available" ON) # debug option(LLAMA_ALL_WARNINGS "llama: enable all compiler warnings" ON) option(LLAMA_ALL_WARNINGS_3RD_PARTY "llama: enable all compiler warnings in 3rd party libs" OFF) option(LLAMA_GPROF "llama: enable gprof" OFF) +# build +option(LLAMA_FATAL_WARNINGS "llama: enable -Werror flag" OFF) + # sanitizers option(LLAMA_SANITIZE_THREAD "llama: enable thread sanitizer" OFF) option(LLAMA_SANITIZE_ADDRESS "llama: enable address sanitizer" OFF) @@ -75,6 +81,10 @@ if (NOT MSVC) option(LLAMA_F16C "llama: enable F16C" ${INS_ENB}) endif() +if (WIN32) + set(LLAMA_WIN_VER "0x602" CACHE STRING "llama: Windows Version") +endif() + # 3rd party libs option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON) option(LLAMA_BLAS "llama: use BLAS" OFF) @@ -82,6 +92,7 @@ set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor") option(LLAMA_CUBLAS "llama: use CUDA" OFF) #option(LLAMA_CUDA_CUBLAS "llama: use cuBLAS for prompt processing" OFF) option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF) +option(LLAMA_CUDA_FORCE_MMQ "llama: use mmq kernels instead of cuBLAS" OFF) set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels") set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels") option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some calculations" OFF) @@ -89,76 +100,71 @@ set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING "llama: max. batch size for using peer access") option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF) +option(LLAMA_HIP_UMA "llama: use HIP unified memory architecture" OFF) option(LLAMA_CLBLAST "llama: use CLBlast" OFF) +option(LLAMA_VULKAN "llama: use Vulkan" OFF) +option(LLAMA_VULKAN_CHECK_RESULTS "llama: run Vulkan op checks" OFF) +option(LLAMA_VULKAN_DEBUG "llama: enable Vulkan debug output" OFF) +option(LLAMA_VULKAN_VALIDATE "llama: enable Vulkan validation" OFF) +option(LLAMA_VULKAN_RUN_TESTS "llama: run Vulkan tests" OFF) option(LLAMA_METAL "llama: use Metal" ${LLAMA_METAL_DEFAULT}) option(LLAMA_METAL_NDEBUG "llama: disable Metal debugging" OFF) +option(LLAMA_METAL_SHADER_DEBUG "llama: compile Metal with -fno-fast-math" OFF) +option(LLAMA_METAL_EMBED_LIBRARY "llama: embed Metal library" OFF) +option(LLAMA_KOMPUTE "llama: use Kompute" OFF) option(LLAMA_MPI "llama: use MPI" OFF) -option(LLAMA_K_QUANTS "llama: use k-quants" ON) option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF) +option(LLAMA_SYCL "llama: use SYCL" OFF) +option(LLAMA_SYCL_F16 "llama: use 16 bit floats for sycl calculations" OFF) +option(LLAMA_CPU_HBM "llama: use memkind for CPU HBM" OFF) -option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE}) -option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE}) -option(LLAMA_BUILD_SERVER "llama: build server example" ON) +option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE}) +option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE}) +option(LLAMA_BUILD_SERVER "llama: build server example" ON) -# -# Build info header -# +# add perf arguments +option(LLAMA_PERF "llama: enable perf" OFF) -# Generate initial build-info.h +# Required for relocatable CMake package include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake) -if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/.git") - set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/.git") - - # Is git submodule - if(NOT IS_DIRECTORY "${GIT_DIR}") - file(READ ${GIT_DIR} REAL_GIT_DIR_LINK) - string(REGEX REPLACE "gitdir: (.*)\n$" "\\1" REAL_GIT_DIR ${REAL_GIT_DIR_LINK}) - set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/${REAL_GIT_DIR}") - endif() - - # Add a custom target for build-info.h - add_custom_target(BUILD_INFO ALL DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.h") - - # Add a custom command to rebuild build-info.h when .git/index changes - add_custom_command( - OUTPUT "${CMAKE_CURRENT_SOURCE_DIR}/build-info.h" - COMMENT "Generating build details from Git" - COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION} -DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake" - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} - DEPENDS "${GIT_DIR}/index" - VERBATIM - ) -else() - message(WARNING "Git repository not found; to enable automatic generation of build info, make sure Git is installed and the project is a Git repository.") -endif() - # # Compile flags # -set(CMAKE_CXX_STANDARD 11) +if (LLAMA_SYCL) + set(CMAKE_CXX_STANDARD 17) +else() + set(CMAKE_CXX_STANDARD 11) +endif() + set(CMAKE_CXX_STANDARD_REQUIRED true) set(CMAKE_C_STANDARD 11) set(CMAKE_C_STANDARD_REQUIRED true) set(THREADS_PREFER_PTHREAD_FLAG ON) + find_package(Threads REQUIRED) include(CheckCXXCompilerFlag) +# enable libstdc++ assertions for debug builds +if (CMAKE_SYSTEM_NAME MATCHES "Linux") + add_compile_definitions($<$:_GLIBCXX_ASSERTIONS>) +endif() + if (NOT MSVC) if (LLAMA_SANITIZE_THREAD) add_compile_options(-fsanitize=thread) - link_libraries(-fsanitize=thread) + link_libraries (-fsanitize=thread) endif() if (LLAMA_SANITIZE_ADDRESS) add_compile_options(-fsanitize=address -fno-omit-frame-pointer) - link_libraries(-fsanitize=address) + link_libraries (-fsanitize=address) endif() if (LLAMA_SANITIZE_UNDEFINED) add_compile_options(-fsanitize=undefined) - link_libraries(-fsanitize=undefined) + link_libraries (-fsanitize=undefined) endif() endif() @@ -177,9 +183,9 @@ if (APPLE AND LLAMA_ACCELERATE) endif() if (LLAMA_METAL) - find_library(FOUNDATION_LIBRARY Foundation REQUIRED) - find_library(METAL_FRAMEWORK Metal REQUIRED) - find_library(METALKIT_FRAMEWORK MetalKit REQUIRED) + find_library(FOUNDATION_LIBRARY Foundation REQUIRED) + find_library(METAL_FRAMEWORK Metal REQUIRED) + find_library(METALKIT_FRAMEWORK MetalKit REQUIRED) message(STATUS "Metal framework found") set(GGML_HEADERS_METAL ggml-metal.h) @@ -194,7 +200,59 @@ if (LLAMA_METAL) #add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/") # copy ggml-metal.metal to bin directory - configure_file(ggml-metal.metal bin/ggml-metal.metal COPYONLY) + configure_file(ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY) + + if (LLAMA_METAL_EMBED_LIBRARY) + enable_language(ASM) + add_compile_definitions(GGML_METAL_EMBED_LIBRARY) + + set(METALLIB_SOURCE "${CMAKE_SOURCE_DIR}/ggml-metal.metal") + file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/autogenerated") + set(EMBED_METALLIB_ASSEMBLY "${CMAKE_BINARY_DIR}/autogenerated/ggml-embed-metallib.s") + + add_custom_command( + OUTPUT ${EMBED_METALLIB_ASSEMBLY} + COMMAND echo ".section __DATA,__ggml_metallib" > ${EMBED_METALLIB_ASSEMBLY} + COMMAND echo ".globl _ggml_metallib_start" >> ${EMBED_METALLIB_ASSEMBLY} + COMMAND echo "_ggml_metallib_start:" >> ${EMBED_METALLIB_ASSEMBLY} + COMMAND echo ".incbin \\\"${METALLIB_SOURCE}\\\"" >> ${EMBED_METALLIB_ASSEMBLY} + COMMAND echo ".globl _ggml_metallib_end" >> ${EMBED_METALLIB_ASSEMBLY} + COMMAND echo "_ggml_metallib_end:" >> ${EMBED_METALLIB_ASSEMBLY} + DEPENDS ${METALLIB_SOURCE} + COMMENT "Generate assembly for embedded Metal library" + ) + + set(GGML_SOURCES_METAL ${GGML_SOURCES_METAL} ${EMBED_METALLIB_ASSEMBLY}) + endif() + + if (LLAMA_METAL_SHADER_DEBUG) + # custom command to do the following: + # xcrun -sdk macosx metal -fno-fast-math -c ggml-metal.metal -o ggml-metal.air + # xcrun -sdk macosx metallib ggml-metal.air -o default.metallib + # + # note: this is the only way I found to disable fast-math in Metal. it's ugly, but at least it works + # disabling fast math is needed in order to pass tests/test-backend-ops + # note: adding -fno-inline fixes the tests when using MTL_SHADER_VALIDATION=1 + # note: unfortunately, we have to call it default.metallib instead of ggml.metallib + # ref: https://github.com/ggerganov/whisper.cpp/issues/1720 + set(XC_FLAGS -fno-fast-math -fno-inline -g) + if (LLAMA_QKK_64) + set(XC_FLAGS ${XC_FLAGS} -DQK_K=64) + endif() + + add_custom_command( + OUTPUT ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib + COMMAND xcrun -sdk macosx metal ${XC_FLAGS} -c ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air + COMMAND xcrun -sdk macosx metallib ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib + DEPENDS ggml-metal.metal + COMMENT "Compiling Metal kernels" + ) + + add_custom_target( + ggml-metal ALL + DEPENDS ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib + ) + endif() set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${FOUNDATION_LIBRARY} @@ -223,7 +281,11 @@ if (LLAMA_BLAS) if (${LLAMA_BLAS_VENDOR} MATCHES "Generic") pkg_check_modules(DepBLAS REQUIRED blas) elseif (${LLAMA_BLAS_VENDOR} MATCHES "OpenBLAS") - pkg_check_modules(DepBLAS REQUIRED openblas) + # As of openblas v0.3.22, the 64-bit is named openblas64.pc + pkg_check_modules(DepBLAS openblas64) + if (NOT DepBLAS_FOUND) + pkg_check_modules(DepBLAS REQUIRED openblas) + endif() elseif (${LLAMA_BLAS_VENDOR} MATCHES "FLAME") pkg_check_modules(DepBLAS REQUIRED blis) elseif (${LLAMA_BLAS_VENDOR} MATCHES "ATLAS") @@ -262,14 +324,17 @@ if (LLAMA_BLAS) endif() message(STATUS "BLAS found, Includes: ${BLAS_INCLUDE_DIRS}") + add_compile_options(${BLAS_LINKER_FLAGS}) + add_compile_definitions(GGML_USE_OPENBLAS) + if (${BLAS_INCLUDE_DIRS} MATCHES "mkl" AND (${LLAMA_BLAS_VENDOR} MATCHES "Generic" OR ${LLAMA_BLAS_VENDOR} MATCHES "Intel")) add_compile_definitions(GGML_BLAS_USE_MKL) endif() - set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${BLAS_LIBRARIES}) - set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${BLAS_INCLUDE_DIRS}) + set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${BLAS_LIBRARIES}) + set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${BLAS_INCLUDE_DIRS}) else() message(WARNING "BLAS not found, please refer to " "https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors" @@ -277,13 +342,8 @@ if (LLAMA_BLAS) endif() endif() -if (LLAMA_K_QUANTS) - set(GGML_HEADERS_EXTRA k_quants.h) - set(GGML_SOURCES_EXTRA k_quants.c) - add_compile_definitions(GGML_USE_K_QUANTS) - if (LLAMA_QKK_64) - add_compile_definitions(GGML_QKK_64) - endif() +if (LLAMA_QKK_64) + add_compile_definitions(GGML_QKK_64) endif() if (LLAMA_CUBLAS) @@ -299,12 +359,12 @@ if (LLAMA_CUBLAS) set(GGML_SOURCES_CUDA ggml-cuda.cu) add_compile_definitions(GGML_USE_CUBLAS) -# if (LLAMA_CUDA_CUBLAS) -# add_compile_definitions(GGML_CUDA_CUBLAS) -# endif() if (LLAMA_CUDA_FORCE_DMMV) add_compile_definitions(GGML_CUDA_FORCE_DMMV) endif() + if (LLAMA_CUDA_FORCE_MMQ) + add_compile_definitions(GGML_CUDA_FORCE_MMQ) + endif() add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X}) add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y}) if (DEFINED LLAMA_CUDA_DMMV_Y) @@ -317,11 +377,18 @@ if (LLAMA_CUBLAS) add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${LLAMA_CUDA_PEER_MAX_BATCH_SIZE}) if (LLAMA_STATIC) - set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static) + if (WIN32) + # As of 12.3.1 CUDA Tookit for Windows does not offer a static cublas library + set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas CUDA::cublasLt) + else () + set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static) + endif() else() set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt) endif() + set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cuda_driver) + if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES) # 52 == lowest CUDA 12 standard # 60 == f16 CUDA intrinsics @@ -331,6 +398,7 @@ if (LLAMA_CUBLAS) set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics else() set(CMAKE_CUDA_ARCHITECTURES "52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics + #set(CMAKE_CUDA_ARCHITECTURES "") # use this to compile much faster, but only F16 models work endif() endif() message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}") @@ -345,15 +413,20 @@ if (LLAMA_MPI) find_package(MPI) if (MPI_C_FOUND) message(STATUS "MPI found") + set(GGML_HEADERS_MPI ggml-mpi.h) - set(GGML_SOURCES_MPI ggml-mpi.c ggml-mpi.h) + set(GGML_SOURCES_MPI ggml-mpi.c) + add_compile_definitions(GGML_USE_MPI) add_compile_definitions(${MPI_C_COMPILE_DEFINITIONS}) + if (NOT MSVC) add_compile_options(-Wno-cast-qual) endif() + set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${MPI_C_LIBRARIES}) set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${MPI_C_INCLUDE_DIRS}) + # Even if you're only using the C header, C++ programs may bring in MPI # C++ functions, so more linkage is needed if (MPI_CXX_FOUND) @@ -380,97 +453,394 @@ if (LLAMA_CLBLAST) endif() endif() +if (LLAMA_VULKAN) + find_package(Vulkan) + if (Vulkan_FOUND) + message(STATUS "Vulkan found") + + set(GGML_HEADERS_VULKAN ggml-vulkan.h) + set(GGML_SOURCES_VULKAN ggml-vulkan.cpp) + + add_compile_definitions(GGML_USE_VULKAN) + + if (LLAMA_VULKAN_CHECK_RESULTS) + add_compile_definitions(GGML_VULKAN_CHECK_RESULTS) + endif() + + if (LLAMA_VULKAN_DEBUG) + add_compile_definitions(GGML_VULKAN_DEBUG) + endif() + + if (LLAMA_VULKAN_VALIDATE) + add_compile_definitions(GGML_VULKAN_VALIDATE) + endif() + + if (LLAMA_VULKAN_RUN_TESTS) + add_compile_definitions(GGML_VULKAN_RUN_TESTS) + endif() + + set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} Vulkan::Vulkan) + else() + message(WARNING "Vulkan not found") + endif() +endif() + if (LLAMA_HIPBLAS) list(APPEND CMAKE_PREFIX_PATH /opt/rocm) if (NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang") message(WARNING "Only LLVM is supported for HIP, hint: CC=/opt/rocm/llvm/bin/clang") endif() + if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang") message(WARNING "Only LLVM is supported for HIP, hint: CXX=/opt/rocm/llvm/bin/clang++") endif() - find_package(hip) - find_package(hipblas) - find_package(rocblas) + find_package(hip REQUIRED) + find_package(hipblas REQUIRED) + find_package(rocblas REQUIRED) - if (${hipblas_FOUND} AND ${hip_FOUND}) - message(STATUS "HIP and hipBLAS found") - add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS) - add_library(ggml-rocm OBJECT ggml-cuda.cu ggml-cuda.h) - if (BUILD_SHARED_LIBS) - set_target_properties(ggml-rocm PROPERTIES POSITION_INDEPENDENT_CODE ON) - endif() - if (LLAMA_CUDA_FORCE_DMMV) - target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_DMMV) - endif() - target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X}) - target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y}) - target_compile_definitions(ggml-rocm PRIVATE K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER}) - set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX) - target_link_libraries(ggml-rocm PRIVATE hip::device PUBLIC hip::host roc::rocblas roc::hipblas) + message(STATUS "HIP and hipBLAS found") - if (LLAMA_STATIC) - message(FATAL_ERROR "Static linking not supported for HIP/ROCm") - endif() - set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ggml-rocm) + set(GGML_HEADERS_ROCM ggml-cuda.h) + set(GGML_SOURCES_ROCM ggml-cuda.cu) + + add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS) + + if (LLAMA_HIP_UMA) + add_compile_definitions(GGML_HIP_UMA) + endif() + + if (LLAMA_CUDA_FORCE_DMMV) + add_compile_definitions(GGML_CUDA_FORCE_DMMV) + endif() + + if (LLAMA_CUDA_FORCE_MMQ) + add_compile_definitions(GGML_CUDA_FORCE_MMQ) + endif() + + add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X}) + add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y}) + add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER}) + + set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX) + + if (LLAMA_STATIC) + message(FATAL_ERROR "Static linking not supported for HIP/ROCm") + endif() + + set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} hip::device PUBLIC hip::host roc::rocblas roc::hipblas) +endif() + +if (LLAMA_SYCL) + if ( NOT DEFINED ENV{ONEAPI_ROOT}) + message(FATAL_ERROR "Not detect ENV {ONEAPI_ROOT}, please install oneAPI & source it, like: source /opt/intel/oneapi/setvars.sh") + endif() + #todo: AOT + + find_package(IntelSYCL REQUIRED) + + message(STATUS "SYCL found") + + add_compile_definitions(GGML_USE_SYCL) + + if (LLAMA_SYCL_F16) + add_compile_definitions(GGML_SYCL_F16) + endif() + + add_compile_options(-I./) #include DPCT + add_compile_options(-I/${SYCL_INCLUDE_DIR}) + + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -L${MKLROOT}/lib") + + set(GGML_HEADERS_SYCL ggml-sycl.h) + set(GGML_SOURCES_SYCL ggml-sycl.cpp) + + if (WIN32) + set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} -fsycl sycl7 OpenCL mkl_sycl_blas_dll.lib mkl_intel_ilp64_dll.lib mkl_sequential_dll.lib mkl_core_dll.lib) else() - message(WARNING "hipBLAS or HIP not found. Try setting CMAKE_PREFIX_PATH=/opt/rocm") + set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} -fsycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread) + endif() +endif() + +if (LLAMA_KOMPUTE) + add_compile_definitions(VULKAN_HPP_DISPATCH_LOADER_DYNAMIC=1) + find_package(Vulkan COMPONENTS glslc REQUIRED) + find_program(glslc_executable NAMES glslc HINTS Vulkan::glslc) + if (NOT glslc_executable) + message(FATAL_ERROR "glslc not found") + endif() + + function(compile_shader) + set(options) + set(oneValueArgs) + set(multiValueArgs SOURCES) + cmake_parse_arguments(compile_shader "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + foreach(source ${compile_shader_SOURCES}) + get_filename_component(filename ${source} NAME) + set(spv_file ${filename}.spv) + add_custom_command( + OUTPUT ${spv_file} + DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${source} + ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/common.comp + ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_getrows.comp + ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_mul_mv_q_n_pre.comp + ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_mul_mv_q_n.comp + COMMAND ${glslc_executable} --target-env=vulkan1.2 -o ${spv_file} ${CMAKE_CURRENT_SOURCE_DIR}/${source} + COMMENT "Compiling ${source} to ${spv_file}" + ) + + get_filename_component(RAW_FILE_NAME ${spv_file} NAME) + set(FILE_NAME "shader${RAW_FILE_NAME}") + string(REPLACE ".comp.spv" ".h" HEADER_FILE ${FILE_NAME}) + string(TOUPPER ${HEADER_FILE} HEADER_FILE_DEFINE) + string(REPLACE "." "_" HEADER_FILE_DEFINE "${HEADER_FILE_DEFINE}") + set(OUTPUT_HEADER_FILE "${HEADER_FILE}") + message(STATUS "${HEADER_FILE} generating ${HEADER_FILE_DEFINE}") + if(CMAKE_GENERATOR MATCHES "Visual Studio") + add_custom_command( + OUTPUT ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_BINARY_DIR}/bin/$/xxd -i ${RAW_FILE_NAME} >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE} + DEPENDS ${spv_file} xxd + COMMENT "Converting to hpp: ${FILE_NAME} ${CMAKE_BINARY_DIR}/bin/$/xxd" + ) + else() + add_custom_command( + OUTPUT ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_BINARY_DIR}/bin/xxd -i ${RAW_FILE_NAME} >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE} + DEPENDS ${spv_file} xxd + COMMENT "Converting to hpp: ${FILE_NAME} ${CMAKE_BINARY_DIR}/bin/xxd" + ) + endif() + endforeach() + endfunction() + + if (EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/kompute/CMakeLists.txt") + message(STATUS "Kompute found") + set(KOMPUTE_OPT_LOG_LEVEL Error CACHE STRING "Kompute log level") + add_subdirectory(kompute) + + # Compile our shaders + compile_shader(SOURCES + kompute-shaders/op_scale.comp + kompute-shaders/op_scale_8.comp + kompute-shaders/op_add.comp + kompute-shaders/op_addrow.comp + kompute-shaders/op_mul.comp + kompute-shaders/op_silu.comp + kompute-shaders/op_relu.comp + kompute-shaders/op_gelu.comp + kompute-shaders/op_softmax.comp + kompute-shaders/op_norm.comp + kompute-shaders/op_rmsnorm.comp + kompute-shaders/op_diagmask.comp + kompute-shaders/op_mul_mat_mat_f32.comp + kompute-shaders/op_mul_mat_f16.comp + kompute-shaders/op_mul_mat_q8_0.comp + kompute-shaders/op_mul_mat_q4_0.comp + kompute-shaders/op_mul_mat_q4_1.comp + kompute-shaders/op_mul_mat_q6_k.comp + kompute-shaders/op_getrows_f16.comp + kompute-shaders/op_getrows_q4_0.comp + kompute-shaders/op_getrows_q4_1.comp + kompute-shaders/op_getrows_q6_k.comp + kompute-shaders/op_rope_f16.comp + kompute-shaders/op_rope_f32.comp + kompute-shaders/op_cpy_f16_f16.comp + kompute-shaders/op_cpy_f16_f32.comp + kompute-shaders/op_cpy_f32_f16.comp + kompute-shaders/op_cpy_f32_f32.comp + ) + + # Create a custom target for our generated shaders + add_custom_target(generated_shaders DEPENDS + shaderop_scale.h + shaderop_scale_8.h + shaderop_add.h + shaderop_addrow.h + shaderop_mul.h + shaderop_silu.h + shaderop_relu.h + shaderop_gelu.h + shaderop_softmax.h + shaderop_norm.h + shaderop_rmsnorm.h + shaderop_diagmask.h + shaderop_mul_mat_mat_f32.h + shaderop_mul_mat_f16.h + shaderop_mul_mat_q8_0.h + shaderop_mul_mat_q4_0.h + shaderop_mul_mat_q4_1.h + shaderop_mul_mat_q6_k.h + shaderop_getrows_f16.h + shaderop_getrows_q4_0.h + shaderop_getrows_q4_1.h + shaderop_getrows_q6_k.h + shaderop_rope_f16.h + shaderop_rope_f32.h + shaderop_cpy_f16_f16.h + shaderop_cpy_f16_f32.h + shaderop_cpy_f32_f16.h + shaderop_cpy_f32_f32.h + ) + + # Create a custom command that depends on the generated_shaders + add_custom_command( + OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp + COMMAND ${CMAKE_COMMAND} -E touch ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp + DEPENDS generated_shaders + COMMENT "Ensuring shaders are generated before compiling ggml-kompute.cpp" + ) + + # Add the stamp to the main sources to ensure dependency tracking + set(GGML_SOURCES_KOMPUTE ggml-kompute.cpp ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp) + set(GGML_HEADERS_KOMPUTE ggml-kompute.h ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp) + + add_compile_definitions(GGML_USE_KOMPUTE) + + set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} kompute) + set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${CMAKE_BINARY_DIR}) + else() + message(WARNING "Kompute not found") + endif() +endif() + +if (LLAMA_CPU_HBM) + find_library(memkind memkind REQUIRED) + + add_compile_definitions(GGML_USE_CPU_HBM) + + target_link_libraries(ggml PUBLIC memkind) +endif() + +if (LLAMA_PERF) + add_compile_definitions(GGML_PERF) +endif() + +function(get_flags CCID CCVER) + set(C_FLAGS "") + set(CXX_FLAGS "") + + if (CCID MATCHES "Clang") + set(C_FLAGS -Wunreachable-code-break -Wunreachable-code-return) + set(CXX_FLAGS -Wunreachable-code-break -Wunreachable-code-return -Wmissing-prototypes -Wextra-semi) + + if ( + (CCID STREQUAL "Clang" AND CCVER VERSION_GREATER_EQUAL 3.8.0) OR + (CCID STREQUAL "AppleClang" AND CCVER VERSION_GREATER_EQUAL 7.3.0) + ) + list(APPEND C_FLAGS -Wdouble-promotion) + endif() + elseif (CCID STREQUAL "GNU") + set(C_FLAGS -Wdouble-promotion) + set(CXX_FLAGS -Wno-array-bounds) + + if (CCVER VERSION_GREATER_EQUAL 7.1.0) + list(APPEND CXX_FLAGS -Wno-format-truncation) + endif() + if (CCVER VERSION_GREATER_EQUAL 8.1.0) + list(APPEND CXX_FLAGS -Wextra-semi) + endif() + endif() + + set(GF_C_FLAGS ${C_FLAGS} PARENT_SCOPE) + set(GF_CXX_FLAGS ${CXX_FLAGS} PARENT_SCOPE) +endfunction() + +if (LLAMA_FATAL_WARNINGS) + if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") + list(APPEND C_FLAGS -Werror) + list(APPEND CXX_FLAGS -Werror) + elseif (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") + add_compile_options(/WX) endif() endif() if (LLAMA_ALL_WARNINGS) if (NOT MSVC) - set(warning_flags -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function) - set(c_flags -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration) - set(cxx_flags -Wmissing-declarations -Wmissing-noreturn) - set(host_cxx_flags "") + list(APPEND WARNING_FLAGS -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function) + list(APPEND C_FLAGS -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes + -Werror=implicit-int -Werror=implicit-function-declaration) + list(APPEND CXX_FLAGS -Wmissing-declarations -Wmissing-noreturn) - if (CMAKE_C_COMPILER_ID MATCHES "Clang") - set(warning_flags ${warning_flags} -Wunreachable-code-break -Wunreachable-code-return) - set(host_cxx_flags ${host_cxx_flags} -Wmissing-prototypes -Wextra-semi) + list(APPEND C_FLAGS ${WARNING_FLAGS}) + list(APPEND CXX_FLAGS ${WARNING_FLAGS}) - if ( - (CMAKE_C_COMPILER_ID STREQUAL "Clang" AND CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL 3.8.0) OR - (CMAKE_C_COMPILER_ID STREQUAL "AppleClang" AND CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL 7.3.0) - ) - set(c_flags ${c_flags} -Wdouble-promotion) - endif() - elseif (CMAKE_C_COMPILER_ID STREQUAL "GNU") - set(c_flags ${c_flags} -Wdouble-promotion) - set(host_cxx_flags ${host_cxx_flags} -Wno-array-bounds) + get_flags(${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}) - if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 7.1.0) - set(host_cxx_flags ${host_cxx_flags} -Wno-format-truncation) - endif() - if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 8.1.0) - set(host_cxx_flags ${host_cxx_flags} -Wextra-semi) - endif() - endif() + add_compile_options("$<$:${C_FLAGS};${GF_C_FLAGS}>" + "$<$:${CXX_FLAGS};${GF_CXX_FLAGS}>") else() # todo : msvc + set(C_FLAGS "") + set(CXX_FLAGS "") + endif() +endif() + +set(CUDA_CXX_FLAGS "") + +if (LLAMA_CUBLAS) + set(CUDA_FLAGS -use_fast_math) + + if (LLAMA_FATAL_WARNINGS) + list(APPEND CUDA_FLAGS -Werror all-warnings) endif() - set(c_flags ${c_flags} ${warning_flags}) - set(cxx_flags ${cxx_flags} ${warning_flags}) - add_compile_options("$<$:${c_flags}>" - "$<$:${cxx_flags}>" - "$<$:${host_cxx_flags}>") + if (LLAMA_ALL_WARNINGS AND NOT MSVC) + set(NVCC_CMD ${CMAKE_CUDA_COMPILER} .c) + if (NOT CMAKE_CUDA_HOST_COMPILER STREQUAL "") + list(APPEND NVCC_CMD -ccbin ${CMAKE_CUDA_HOST_COMPILER}) + endif() + execute_process( + COMMAND ${NVCC_CMD} -Xcompiler --version + OUTPUT_VARIABLE CUDA_CCFULLVER + ERROR_QUIET + ) + + if (NOT CUDA_CCFULLVER MATCHES clang) + set(CUDA_CCID "GNU") + execute_process( + COMMAND ${NVCC_CMD} -Xcompiler "-dumpfullversion -dumpversion" + OUTPUT_VARIABLE CUDA_CCVER + ERROR_QUIET + ) + else() + if (CUDA_CCFULLVER MATCHES Apple) + set(CUDA_CCID "AppleClang") + else() + set(CUDA_CCID "Clang") + endif() + string(REGEX REPLACE "^.* version ([0-9.]*).*$" "\\1" CUDA_CCVER ${CUDA_CCFULLVER}) + endif() + + message("-- CUDA host compiler is ${CUDA_CCID} ${CUDA_CCVER}") + + get_flags(${CUDA_CCID} ${CUDA_CCVER}) + list(APPEND CUDA_CXX_FLAGS ${CXX_FLAGS} ${GF_CXX_FLAGS}) # This is passed to -Xcompiler later + endif() + + if (NOT MSVC) + list(APPEND CUDA_CXX_FLAGS -Wno-pedantic) + endif() endif() -if (NOT MSVC) - set(cuda_flags -Wno-pedantic) -endif() -set(cuda_flags ${cxx_flags} -use_fast_math ${cuda_flags}) - -list(JOIN host_cxx_flags " " cuda_host_flags) # pass host compiler flags as a single argument -if (NOT cuda_host_flags STREQUAL "") - set(cuda_flags ${cuda_flags} -Xcompiler ${cuda_host_flags}) -endif() - -add_compile_options("$<$:${cuda_flags}>") - if (WIN32) add_compile_definitions(_CRT_SECURE_NO_WARNINGS) @@ -489,15 +859,37 @@ if (LLAMA_LTO) endif() endif() +if (LLAMA_CCACHE) + find_program(LLAMA_CCACHE_FOUND ccache) + if (LLAMA_CCACHE_FOUND) + set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ccache) + set(ENV{CCACHE_SLOPPINESS} time_macros) + message(STATUS "ccache found, compilation results will be cached. Disable with LLAMA_CCACHE=OFF.") + else() + message(STATUS "Warning: ccache not found - consider installing it for faster compilation or disable this warning with LLAMA_CCACHE=OFF") + endif () +endif() + +# this version of Apple ld64 is buggy +execute_process( + COMMAND ${CMAKE_C_COMPILER} ${CMAKE_EXE_LINKER_FLAGS} -Wl,-v + ERROR_VARIABLE output + OUTPUT_QUIET +) + +if (output MATCHES "dyld-1015\.7") + add_compile_definitions(HAVE_BUGGY_APPLE_LINKER) +endif() + # Architecture specific # TODO: probably these flags need to be tweaked on some architectures # feel free to update the Makefile for your architecture and send a pull request or issue message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}") if (MSVC) - string(TOLOWER "${CMAKE_GENERATOR_PLATFORM}" CMAKE_GENERATOR_PLATFORM_LWR) - message(STATUS "CMAKE_GENERATOR_PLATFORM: ${CMAKE_GENERATOR_PLATFORM}") + string(TOLOWER "${CMAKE_GENERATOR_PLATFORM}" CMAKE_GENERATOR_PLATFORM_LWR) + message(STATUS "CMAKE_GENERATOR_PLATFORM: ${CMAKE_GENERATOR_PLATFORM}") else () - set(CMAKE_GENERATOR_PLATFORM_LWR "") + set(CMAKE_GENERATOR_PLATFORM_LWR "") endif () if (NOT MSVC) @@ -512,38 +904,63 @@ if (NOT MSVC) endif() endif() -if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") OR (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64") OR ("${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "arm64")) +set(ARCH_FLAGS "") + +if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR CMAKE_GENERATOR_PLATFORM_LWR STREQUAL "arm64" OR + (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND + CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm.*|ARM64)$")) message(STATUS "ARM detected") if (MSVC) + add_compile_definitions(__aarch64__) # MSVC defines _M_ARM64 instead add_compile_definitions(__ARM_NEON) add_compile_definitions(__ARM_FEATURE_FMA) - add_compile_definitions(__ARM_FEATURE_DOTPROD) - # add_compile_definitions(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) # MSVC doesn't support vdupq_n_f16, vld1q_f16, vst1q_f16 - add_compile_definitions(__aarch64__) # MSVC defines _M_ARM64 instead + + set(CMAKE_REQUIRED_FLAGS_PREV ${CMAKE_REQUIRED_FLAGS}) + string(JOIN " " CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS} "/arch:armv8.2") + check_cxx_source_compiles("#include \nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD) + if (GGML_COMPILER_SUPPORT_DOTPROD) + add_compile_definitions(__ARM_FEATURE_DOTPROD) + endif () + check_cxx_source_compiles("#include \nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC) + if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC) + add_compile_definitions(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + endif () + set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_PREV}) else() check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E) if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "") - add_compile_options(-mfp16-format=ieee) + list(APPEND ARCH_FLAGS -mfp16-format=ieee) endif() if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6") # Raspberry Pi 1, Zero - add_compile_options(-mfpu=neon-fp-armv8 -mno-unaligned-access) + list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access) endif() if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7") - # Raspberry Pi 2 - add_compile_options(-mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations) + if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Android") + # Android armeabi-v7a + list(APPEND ARCH_FLAGS -mfpu=neon-vfpv4 -mno-unaligned-access -funsafe-math-optimizations) + else() + # Raspberry Pi 2 + list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations) + endif() endif() if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8") + # Android arm64-v8a # Raspberry Pi 3, 4, Zero 2 (32-bit) - add_compile_options(-mno-unaligned-access) + list(APPEND ARCH_FLAGS -mno-unaligned-access) endif() endif() -elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "^(x86_64|i686|amd64|x64)$" ) +elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR + (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND + CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64)$")) message(STATUS "x86 detected") if (MSVC) + # instruction set detection for MSVC only + if (LLAMA_NATIVE) + include(cmake/FindSIMD.cmake) + endif () if (LLAMA_AVX512) - add_compile_options($<$:/arch:AVX512>) - add_compile_options($<$:/arch:AVX512>) + list(APPEND ARCH_FLAGS /arch:AVX512) # MSVC has no compile-time flags enabling specific # AVX512 extensions, neither it defines the # macros corresponding to the extensions. @@ -557,47 +974,66 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GE add_compile_definitions($<$:__AVX512VNNI__>) endif() elseif (LLAMA_AVX2) - add_compile_options($<$:/arch:AVX2>) - add_compile_options($<$:/arch:AVX2>) + list(APPEND ARCH_FLAGS /arch:AVX2) elseif (LLAMA_AVX) - add_compile_options($<$:/arch:AVX>) - add_compile_options($<$:/arch:AVX>) + list(APPEND ARCH_FLAGS /arch:AVX) endif() else() if (LLAMA_NATIVE) - add_compile_options(-march=native) + list(APPEND ARCH_FLAGS -march=native) endif() if (LLAMA_F16C) - add_compile_options(-mf16c) + list(APPEND ARCH_FLAGS -mf16c) endif() if (LLAMA_FMA) - add_compile_options(-mfma) + list(APPEND ARCH_FLAGS -mfma) endif() if (LLAMA_AVX) - add_compile_options(-mavx) + list(APPEND ARCH_FLAGS -mavx) endif() if (LLAMA_AVX2) - add_compile_options(-mavx2) + list(APPEND ARCH_FLAGS -mavx2) endif() if (LLAMA_AVX512) - add_compile_options(-mavx512f) - add_compile_options(-mavx512bw) + list(APPEND ARCH_FLAGS -mavx512f) + list(APPEND ARCH_FLAGS -mavx512bw) endif() if (LLAMA_AVX512_VBMI) - add_compile_options(-mavx512vbmi) + list(APPEND ARCH_FLAGS -mavx512vbmi) endif() if (LLAMA_AVX512_VNNI) - add_compile_options(-mavx512vnni) + list(APPEND ARCH_FLAGS -mavx512vnni) endif() endif() elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64") message(STATUS "PowerPC detected") - add_compile_options(-mcpu=native -mtune=native) - #TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be) + if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le") + list(APPEND ARCH_FLAGS -mcpu=powerpc64le) + else() + list(APPEND ARCH_FLAGS -mcpu=native -mtune=native) + #TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be) + endif() else() message(STATUS "Unknown architecture") endif() +add_compile_options("$<$:${ARCH_FLAGS}>") +add_compile_options("$<$:${ARCH_FLAGS}>") + +if (LLAMA_CUBLAS) + list(APPEND CUDA_CXX_FLAGS ${ARCH_FLAGS}) + list(JOIN CUDA_CXX_FLAGS " " CUDA_CXX_FLAGS_JOINED) # pass host compiler flags as a single argument + if (NOT CUDA_CXX_FLAGS_JOINED STREQUAL "") + list(APPEND CUDA_FLAGS -Xcompiler ${CUDA_CXX_FLAGS_JOINED}) + endif() + add_compile_options("$<$:${CUDA_FLAGS}>") +endif() + +if (MINGW) + # Target Windows 8 for PrefetchVirtualMemory + add_compile_definitions(_WIN32_WINNT=${LLAMA_WIN_VER}) +endif() + # # POSIX conformance # @@ -653,11 +1089,6 @@ endif() # ggml -if (GGML_USE_CPU_HBM) - add_definitions(-DGGML_USE_CPU_HBM) - find_library(memkind memkind REQUIRED) -endif() - add_library(ggml OBJECT ggml.c ggml.h @@ -665,21 +1096,26 @@ add_library(ggml OBJECT ggml-alloc.h ggml-backend.c ggml-backend.h - ${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA} - ${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL} - ${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL} - ${GGML_SOURCES_MPI} ${GGML_HEADERS_MPI} - ${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA} + ggml-quants.c + ggml-quants.h + ${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA} + ${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL} + ${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL} + ${GGML_SOURCES_MPI} ${GGML_HEADERS_MPI} + ${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA} + ${GGML_SOURCES_SYCL} ${GGML_HEADERS_SYCL} + ${GGML_SOURCES_KOMPUTE} ${GGML_HEADERS_KOMPUTE} + ${GGML_SOURCES_VULKAN} ${GGML_HEADERS_VULKAN} + ${GGML_SOURCES_ROCM} ${GGML_HEADERS_ROCM} ) target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES}) -target_compile_features(ggml PUBLIC c_std_11) # don't bump +target_compile_features (ggml PUBLIC c_std_11) # don't bump + target_link_libraries(ggml PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS}) -if (GGML_USE_CPU_HBM) - target_link_libraries(ggml PUBLIC memkind) -endif() add_library(ggml_static STATIC $) + if (BUILD_SHARED_LIBS) set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON) add_library(ggml_shared SHARED $) @@ -695,7 +1131,8 @@ add_library(llama ) target_include_directories(llama PUBLIC .) -target_compile_features(llama PUBLIC cxx_std_11) # don't bump +target_compile_features (llama PUBLIC cxx_std_11) # don't bump + target_link_libraries(llama PRIVATE ggml ${LLAMA_EXTRA_LIBS} @@ -745,8 +1182,8 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/LlamaConfig.cmake ${CMAKE_CURRENT_BINARY_DIR}/LlamaConfigVersion.cmake DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/Llama) -set(GGML_PUBLIC_HEADERS "ggml.h" - "${GGML_HEADERS_CUDA}" "${GGML_HEADERS_OPENCL}" +set(GGML_PUBLIC_HEADERS "ggml.h" "ggml-alloc.h" "ggml-backend.h" + "${GGML_HEADERS_CUDA}" "${GGML_HEADERS_OPENCL}" "${GGML_HEADERS_METAL}" "${GGML_HEADERS_MPI}" "${GGML_HEADERS_EXTRA}") set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}") diff --git a/Makefile b/Makefile index 04104bee8..f03faf6ed 100644 --- a/Makefile +++ b/Makefile @@ -1,14 +1,15 @@ # Define the default target now so that it is always the first target BUILD_TARGETS = \ - main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \ - simple batched batched-bench save-load-state server embd-input-test gguf llama-bench llava baby-llama beam-search \ - speculative infill benchmark-matmult parallel finetune export-lora tests/test-c.o + main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \ + simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search \ + speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey tests/test-c.o # Binaries only useful for tests TEST_TARGETS = \ tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \ tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \ - tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe + tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope \ + tests/test-backend-ops tests/test-model-load-cancel tests/test-autorelease # Code coverage output files COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report @@ -25,20 +26,6 @@ ifndef UNAME_M UNAME_M := $(shell uname -m) endif -ifeq '' '$(findstring clang,$(shell $(CC) --version))' - CC_IS_GCC=1 - CC_VER := $(shell $(CC) -dumpfullversion -dumpversion | awk -F. '{ printf("%02d%02d%02d", $$1, $$2, $$3) }') -else - CC_IS_CLANG=1 - ifeq '' '$(findstring Apple LLVM,$(shell $(CC) --version))' - CC_IS_LLVM_CLANG=1 - else - CC_IS_APPLE_CLANG=1 - endif - CC_VER := $(shell $(CC) --version | sed -n 's/^.* version \([0-9.]*\).*$$/\1/p' \ - | awk -F. '{ printf("%02d%02d%02d", $$1, $$2, $$3) }') -endif - # Mac OS + Arm can report x86_64 # ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789 ifeq ($(UNAME_S),Darwin) @@ -56,10 +43,6 @@ ifeq ($(UNAME_S),Darwin) endif endif -ifneq '' '$(or $(filter clean,$(MAKECMDGOALS)),$(LLAMA_METAL))' -BUILD_TARGETS += metal -endif - default: $(BUILD_TARGETS) test: $(TEST_TARGETS) @@ -78,7 +61,7 @@ test: $(TEST_TARGETS) ./$$test_target; \ fi; \ if [ $$? -ne 0 ]; then \ - printf 'Test $$test_target FAILED!\n\n' $$test_target; \ + printf 'Test %s FAILED!\n\n' $$test_target; \ failures=$$(( failures + 1 )); \ else \ printf 'Test %s passed.\n\n' $$test_target; \ @@ -114,20 +97,34 @@ endif # # keep standard at C11 and C++11 -MK_CPPFLAGS = -I. -Icommon -MK_CFLAGS = -std=c11 -fPIC -MK_CXXFLAGS = -std=c++11 -fPIC +MK_CPPFLAGS = -I. -Icommon +MK_CFLAGS = -std=c11 -fPIC +MK_CXXFLAGS = -std=c++11 -fPIC +MK_NVCCFLAGS = -std=c++11 # -Ofast tends to produce faster code, but may not be available for some compilers. ifdef LLAMA_FAST -MK_CFLAGS += -Ofast -MK_HOST_CXXFLAGS += -Ofast -MK_CUDA_CXXFLAGS += -O3 +MK_CFLAGS += -Ofast +HOST_CXXFLAGS += -Ofast +MK_NVCCFLAGS += -O3 else -MK_CFLAGS += -O3 -MK_CXXFLAGS += -O3 +MK_CFLAGS += -O3 +MK_CXXFLAGS += -O3 +MK_NVCCFLAGS += -O3 endif +ifndef LLAMA_NO_CCACHE +CCACHE := $(shell which ccache) +ifdef CCACHE +export CCACHE_SLOPPINESS = time_macros +$(info I ccache found, compilation results will be cached. Disable with LLAMA_NO_CCACHE.) +CC := $(CCACHE) $(CC) +CXX := $(CCACHE) $(CXX) +else +$(info I ccache not found. Consider installing it for faster compilation.) +endif # CCACHE +endif # LLAMA_NO_CCACHE + # clock_gettime came in POSIX.1b (1993) # CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional # posix_memalign came in POSIX.1-2001 / SUSv3 @@ -174,6 +171,10 @@ ifdef LLAMA_DEBUG MK_CFLAGS += -O0 -g MK_CXXFLAGS += -O0 -g MK_LDFLAGS += -g + + ifeq ($(UNAME_S),Linux) + MK_CPPFLAGS += -D_GLIBCXX_ASSERTIONS + endif else MK_CPPFLAGS += -DNDEBUG endif @@ -215,28 +216,14 @@ MK_CFLAGS += $(WARN_FLAGS) -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmis -Werror=implicit-function-declaration MK_CXXFLAGS += $(WARN_FLAGS) -Wmissing-declarations -Wmissing-noreturn -ifeq ($(CC_IS_CLANG), 1) - # clang options - MK_CFLAGS += -Wunreachable-code-break -Wunreachable-code-return - MK_HOST_CXXFLAGS += -Wunreachable-code-break -Wunreachable-code-return -Wmissing-prototypes -Wextra-semi +ifeq ($(LLAMA_FATAL_WARNINGS),1) + MK_CFLAGS += -Werror + MK_CXXFLAGS += -Werror +endif - ifneq '' '$(and $(CC_IS_LLVM_CLANG),$(filter 1,$(shell expr $(CC_VER) \>= 030800)))' - MK_CFLAGS += -Wdouble-promotion - endif - ifneq '' '$(and $(CC_IS_APPLE_CLANG),$(filter 1,$(shell expr $(CC_VER) \>= 070300)))' - MK_CFLAGS += -Wdouble-promotion - endif -else - # gcc options - MK_CFLAGS += -Wdouble-promotion - MK_HOST_CXXFLAGS += -Wno-array-bounds - - ifeq ($(shell expr $(CC_VER) \>= 070100), 1) - MK_HOST_CXXFLAGS += -Wno-format-truncation - endif - ifeq ($(shell expr $(CC_VER) \>= 080100), 1) - MK_HOST_CXXFLAGS += -Wextra-semi - endif +# this version of Apple ld64 is buggy +ifneq '' '$(findstring dyld-1015.7,$(shell $(CC) $(LDFLAGS) -Wl,-v 2>&1))' + MK_CPPFLAGS += -DHAVE_BUGGY_APPLE_LINKER endif # OS specific @@ -284,8 +271,8 @@ ifndef RISCV ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64)) # Use all CPU extensions that are available: - MK_CFLAGS += -march=native -mtune=native - MK_HOST_CXXFLAGS += -march=native -mtune=native + MK_CFLAGS += -march=native -mtune=native + HOST_CXXFLAGS += -march=native -mtune=native # Usage AVX-only #MK_CFLAGS += -mfma -mf16c -mavx @@ -296,19 +283,31 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64)) #MK_CXXFLAGS += -mssse3 endif -# The stack is only 16-byte aligned on Windows, so don't let gcc emit aligned moves. -# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412 -# https://github.com/ggerganov/llama.cpp/issues/2922 ifneq '' '$(findstring mingw,$(shell $(CC) -dumpmachine))' + # The stack is only 16-byte aligned on Windows, so don't let gcc emit aligned moves. + # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412 + # https://github.com/ggerganov/llama.cpp/issues/2922 MK_CFLAGS += -Xassembler -muse-unaligned-vector-move MK_CXXFLAGS += -Xassembler -muse-unaligned-vector-move + + # Target Windows 8 for PrefetchVirtualMemory + MK_CPPFLAGS += -D_WIN32_WINNT=0x602 endif ifneq ($(filter aarch64%,$(UNAME_M)),) # Apple M1, M2, etc. # Raspberry Pi 3, 4, Zero 2 (64-bit) + # Nvidia Jetson MK_CFLAGS += -mcpu=native MK_CXXFLAGS += -mcpu=native + JETSON_RELEASE_INFO = $(shell jetson_release) + ifdef JETSON_RELEASE_INFO + ifneq ($(filter TX2%,$(JETSON_RELEASE_INFO)),) + JETSON_EOL_MODULE_DETECT = 1 + CC = aarch64-unknown-linux-gnu-gcc + cxx = aarch64-unknown-linux-gnu-g++ + endif + endif endif ifneq ($(filter armv6%,$(UNAME_M)),) @@ -337,18 +336,20 @@ ifneq ($(filter ppc64%,$(UNAME_M)),) endif endif +ifneq ($(filter ppc64le%,$(UNAME_M)),) + MK_CFLAGS += -mcpu=powerpc64le + MK_CXXFLAGS += -mcpu=powerpc64le + CUDA_POWER_ARCH = 1 +endif + else MK_CFLAGS += -march=rv64gcv -mabi=lp64d MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d endif -ifndef LLAMA_NO_K_QUANTS - MK_CPPFLAGS += -DGGML_USE_K_QUANTS - OBJS += k_quants.o ifdef LLAMA_QKK_64 MK_CPPFLAGS += -DGGML_QKK_64 endif -endif ifndef LLAMA_NO_ACCELERATE # Mac OS - include Accelerate framework. @@ -365,7 +366,7 @@ ifdef LLAMA_MPI MK_CPPFLAGS += -DGGML_USE_MPI MK_CFLAGS += -Wno-cast-qual MK_CXXFLAGS += -Wno-cast-qual - OBJS += ggml-mpi.o + OBJS += ggml-mpi.o endif # LLAMA_MPI ifdef LLAMA_OPENBLAS @@ -380,59 +381,75 @@ ifdef LLAMA_BLIS endif # LLAMA_BLIS ifdef LLAMA_CUBLAS - MK_CPPFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include - MK_LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib - OBJS += ggml-cuda.o - NVCCFLAGS = --forward-unknown-to-host-compiler -use_fast_math + MK_CPPFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include -I/usr/local/cuda/targets/aarch64-linux/include + MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib -L/usr/local/cuda/targets/aarch64-linux/lib -L/usr/lib/wsl/lib + OBJS += ggml-cuda.o + MK_NVCCFLAGS += -use_fast_math +ifdef LLAMA_FATAL_WARNINGS + MK_NVCCFLAGS += -Werror all-warnings +endif # LLAMA_FATAL_WARNINGS +ifndef JETSON_EOL_MODULE_DETECT + MK_NVCCFLAGS += --forward-unknown-to-host-compiler +endif # JETSON_EOL_MODULE_DETECT +ifdef LLAMA_DEBUG + MK_NVCCFLAGS += -lineinfo +endif # LLAMA_DEBUG ifdef LLAMA_CUDA_NVCC - NVCC = $(LLAMA_CUDA_NVCC) + NVCC = $(CCACHE) $(LLAMA_CUDA_NVCC) else - NVCC = nvcc + NVCC = $(CCACHE) nvcc endif #LLAMA_CUDA_NVCC ifdef CUDA_DOCKER_ARCH - NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH) -else - NVCCFLAGS += -arch=native + MK_NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH) +else ifndef CUDA_POWER_ARCH + MK_NVCCFLAGS += -arch=native endif # CUDA_DOCKER_ARCH ifdef LLAMA_CUDA_FORCE_DMMV - NVCCFLAGS += -DGGML_CUDA_FORCE_DMMV + MK_NVCCFLAGS += -DGGML_CUDA_FORCE_DMMV endif # LLAMA_CUDA_FORCE_DMMV +ifdef LLAMA_CUDA_FORCE_MMQ + MK_NVCCFLAGS += -DGGML_CUDA_FORCE_MMQ +endif # LLAMA_CUDA_FORCE_MMQ ifdef LLAMA_CUDA_DMMV_X - NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X) + MK_NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X) else - NVCCFLAGS += -DGGML_CUDA_DMMV_X=32 + MK_NVCCFLAGS += -DGGML_CUDA_DMMV_X=32 endif # LLAMA_CUDA_DMMV_X ifdef LLAMA_CUDA_MMV_Y - NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y) + MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y) else ifdef LLAMA_CUDA_DMMV_Y - NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_DMMV_Y) # for backwards compatibility + MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_DMMV_Y) # for backwards compatibility else - NVCCFLAGS += -DGGML_CUDA_MMV_Y=1 + MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=1 endif # LLAMA_CUDA_MMV_Y ifdef LLAMA_CUDA_F16 - NVCCFLAGS += -DGGML_CUDA_F16 + MK_NVCCFLAGS += -DGGML_CUDA_F16 endif # LLAMA_CUDA_F16 ifdef LLAMA_CUDA_DMMV_F16 - NVCCFLAGS += -DGGML_CUDA_F16 + MK_NVCCFLAGS += -DGGML_CUDA_F16 endif # LLAMA_CUDA_DMMV_F16 ifdef LLAMA_CUDA_KQUANTS_ITER - NVCCFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER) + MK_NVCCFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER) else - NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2 + MK_NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2 endif ifdef LLAMA_CUDA_PEER_MAX_BATCH_SIZE - NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=$(LLAMA_CUDA_PEER_MAX_BATCH_SIZE) + MK_NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=$(LLAMA_CUDA_PEER_MAX_BATCH_SIZE) else - NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 + MK_NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 endif # LLAMA_CUDA_PEER_MAX_BATCH_SIZE #ifdef LLAMA_CUDA_CUBLAS -# NVCCFLAGS += -DGGML_CUDA_CUBLAS +# MK_NVCCFLAGS += -DGGML_CUDA_CUBLAS #endif # LLAMA_CUDA_CUBLAS ifdef LLAMA_CUDA_CCBIN - NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN) + MK_NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN) endif ggml-cuda.o: ggml-cuda.cu ggml-cuda.h - $(NVCC) $(NVCCFLAGS) -c $< -o $@ +ifdef JETSON_EOL_MODULE_DETECT + $(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@ +else + $(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@ +endif # JETSON_EOL_MODULE_DETECT endif # LLAMA_CUBLAS ifdef LLAMA_CLBLAST @@ -453,14 +470,48 @@ ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h $(CXX) $(CXXFLAGS) -c $< -o $@ endif # LLAMA_CLBLAST +ifdef LLAMA_VULKAN + MK_CPPFLAGS += -DGGML_USE_VULKAN + MK_LDFLAGS += -lvulkan + OBJS += ggml-vulkan.o + +ifdef LLAMA_VULKAN_CHECK_RESULTS + MK_CPPFLAGS += -DGGML_VULKAN_CHECK_RESULTS +endif + +ifdef LLAMA_VULKAN_DEBUG + MK_CPPFLAGS += -DGGML_VULKAN_DEBUG +endif + +ifdef LLAMA_VULKAN_VALIDATE + MK_CPPFLAGS += -DGGML_VULKAN_VALIDATE +endif + +ifdef LLAMA_VULKAN_RUN_TESTS + MK_CPPFLAGS += -DGGML_VULKAN_RUN_TESTS +endif + +ggml-vulkan.o: ggml-vulkan.cpp ggml-vulkan.h + $(CXX) $(CXXFLAGS) -c $< -o $@ +endif # LLAMA_VULKAN + ifdef LLAMA_HIPBLAS - ROCM_PATH ?= /opt/rocm - HIPCC ?= $(ROCM_PATH)/bin/hipcc - GPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch) + + ifeq ($(wildcard /opt/rocm),) + ROCM_PATH ?= /usr + GPU_TARGETS ?= $(shell $(shell which amdgpu-arch)) + else + ROCM_PATH ?= /opt/rocm + GPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch) + endif + HIPCC ?= $(CCACHE) $(ROCM_PATH)/bin/hipcc LLAMA_CUDA_DMMV_X ?= 32 LLAMA_CUDA_MMV_Y ?= 1 LLAMA_CUDA_KQUANTS_ITER ?= 2 MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS +ifdef LLAMA_HIP_UMA + MK_CPPFLAGS += -DGGML_HIP_UMA +endif # LLAMA_HIP_UMA MK_LDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib MK_LDFLAGS += -lhipblas -lamdhip64 -lrocblas HIPFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS)) @@ -482,11 +533,29 @@ ifdef LLAMA_METAL ifdef LLAMA_METAL_NDEBUG MK_CPPFLAGS += -DGGML_METAL_NDEBUG endif +ifdef LLAMA_METAL_EMBED_LIBRARY + MK_CPPFLAGS += -DGGML_METAL_EMBED_LIBRARY + OBJS += ggml-metal-embed.o +endif endif # LLAMA_METAL ifdef LLAMA_METAL ggml-metal.o: ggml-metal.m ggml-metal.h $(CC) $(CFLAGS) -c $< -o $@ + +ifdef LLAMA_METAL_EMBED_LIBRARY +ggml-metal-embed.o: ggml-metal.metal + @echo "Embedding Metal library" + $(eval TEMP_ASSEMBLY=$(shell mktemp)) + @echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY) + @echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY) + @echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY) + @echo ".incbin \"$<\"" >> $(TEMP_ASSEMBLY) + @echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY) + @echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY) + @$(AS) $(TEMP_ASSEMBLY) -o $@ + @rm -f ${TEMP_ASSEMBLY} +endif endif # LLAMA_METAL ifdef LLAMA_MPI @@ -494,21 +563,23 @@ ggml-mpi.o: ggml-mpi.c ggml-mpi.h $(CC) $(CFLAGS) -c $< -o $@ endif # LLAMA_MPI -ifndef LLAMA_NO_K_QUANTS -k_quants.o: k_quants.c k_quants.h - $(CC) $(CFLAGS) -c $< -o $@ -endif # LLAMA_NO_K_QUANTS +GF_CC := $(CC) +include scripts/get-flags.mk # combine build flags with cmdline overrides -override CFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CFLAGS) $(CFLAGS) -override CXXFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CXXFLAGS) $(CXXFLAGS) -override CUDA_CXXFLAGS := $(MK_CUDA_CXXFLAGS) $(CUDA_CXXFLAGS) -override HOST_CXXFLAGS := $(MK_HOST_CXXFLAGS) $(HOST_CXXFLAGS) -override LDFLAGS := $(MK_LDFLAGS) $(LDFLAGS) +override CPPFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) +override CFLAGS := $(CPPFLAGS) $(MK_CFLAGS) $(GF_CFLAGS) $(CFLAGS) +BASE_CXXFLAGS := $(MK_CXXFLAGS) $(CXXFLAGS) +override CXXFLAGS := $(BASE_CXXFLAGS) $(HOST_CXXFLAGS) $(GF_CXXFLAGS) $(CPPFLAGS) +override NVCCFLAGS := $(MK_NVCCFLAGS) $(NVCCFLAGS) +override LDFLAGS := $(MK_LDFLAGS) $(LDFLAGS) -# save CXXFLAGS before we add host-only options -NVCCFLAGS := $(NVCCFLAGS) $(CXXFLAGS) $(CUDA_CXXFLAGS) -Wno-pedantic -Xcompiler "$(HOST_CXXFLAGS)" -override CXXFLAGS += $(HOST_CXXFLAGS) +# identify CUDA host compiler +ifdef LLAMA_CUBLAS +GF_CC := $(NVCC) $(NVCCFLAGS) 2>/dev/null .c -Xcompiler +include scripts/get-flags.mk +CUDA_CXXFLAGS := $(BASE_CXXFLAGS) $(GF_CXXFLAGS) -Wno-pedantic +endif # # Print build information @@ -522,8 +593,19 @@ $(info I CFLAGS: $(CFLAGS)) $(info I CXXFLAGS: $(CXXFLAGS)) $(info I NVCCFLAGS: $(NVCCFLAGS)) $(info I LDFLAGS: $(LDFLAGS)) -$(info I CC: $(shell $(CC) --version | head -n 1)) -$(info I CXX: $(shell $(CXX) --version | head -n 1)) +$(info I CC: $(shell $(CC) --version | head -n 1)) +$(info I CXX: $(shell $(CXX) --version | head -n 1)) +ifdef LLAMA_CUBLAS +$(info I NVCC: $(shell $(NVCC) --version | tail -n 1)) +CUDA_VERSION := $(shell nvcc --version | grep -oP 'release (\K[0-9]+\.[0-9])') +ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1) +ifndef CUDA_DOCKER_ARCH +ifndef CUDA_POWER_ARCH +$(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be explicitly provided via CUDA_DOCKER_ARCH) +endif # CUDA_POWER_ARCH +endif # CUDA_DOCKER_ARCH +endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1) +endif # LLAMA_CUBLAS $(info ) # @@ -539,13 +621,16 @@ ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h $(CC) $(CFLAGS) -c $< -o $@ -OBJS += ggml-alloc.o ggml-backend.o +ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h + $(CC) $(CFLAGS) -c $< -o $@ + +OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o llama.o: llama.cpp ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h $(CXX) $(CXXFLAGS) -c $< -o $@ -COMMON_H_DEPS = common/common.h common/sampling.h build-info.h common/log.h -COMMON_DEPS = $(COMMON_H_DEPS) common.o sampling.o grammar-parser.o +COMMON_H_DEPS = common/common.h common/sampling.h common/log.h +COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o common.o: common/common.cpp $(COMMON_H_DEPS) $(CXX) $(CXXFLAGS) -c $< -o $@ @@ -565,115 +650,166 @@ train.o: common/train.cpp common/train.h libllama.so: llama.o ggml.o $(OBJS) $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS) +libllama.a: llama.o ggml.o $(OBJS) $(COMMON_DEPS) + ar rcs libllama.a llama.o ggml.o $(OBJS) $(COMMON_DEPS) + clean: - rm -vrf *.o tests/*.o *.so *.dll benchmark-matmult build-info.h *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS) + rm -vrf *.o tests/*.o *.so *.a *.dll benchmark-matmult common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS) + find examples pocs -type f -name "*.o" -delete # # Examples # -main: examples/main/main.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) +# $< is the first prerequisite, i.e. the source file. +# Explicitly compile this to an object file so that it can be cached with ccache. +# The source file is then filtered out from $^ (the list of all prerequisites) and the object file is added instead. + +# Helper function that replaces .c, .cpp, and .cu file endings with .o: +GET_OBJ_FILE = $(patsubst %.c,%.o,$(patsubst %.cpp,%.o,$(patsubst %.cu,%.o,$(1)))) + +main: examples/main/main.cpp ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) @echo @echo '==== Run ./main -h for help. ====' @echo -infill: examples/infill/infill.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) +infill: examples/infill/infill.cpp ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -simple: examples/simple/simple.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) +simple: examples/simple/simple.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -batched: examples/batched/batched.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) +tokenize: examples/tokenize/tokenize.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -batched-bench: examples/batched-bench/batched-bench.cpp build-info.h ggml.o llama.o common.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) +batched: examples/batched/batched.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -quantize: examples/quantize/quantize.cpp build-info.h ggml.o llama.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) +batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.o llama.o common.o $(OBJS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.h ggml.o llama.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) +quantize: examples/quantize/quantize.cpp build-info.o ggml.o llama.o $(OBJS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -perplexity: examples/perplexity/perplexity.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) +quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.o ggml.o llama.o $(OBJS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -embedding: examples/embedding/embedding.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) +perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) +imatrix: examples/imatrix/imatrix.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp build-info.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS) - $(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2) +embedding: examples/embedding/embedding.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -$(LIB_PRE)embdinput$(DSO_EXT): examples/embd-input/embd-input.h examples/embd-input/embd-input-lib.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) - $(CXX) --shared $(CXXFLAGS) $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) +save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) +server: examples/server/server.cpp examples/server/oai.hpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h examples/llava/llava.h examples/llava/llava.cpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual + $(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h %.hpp $< examples/llava/clip.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) -o $@ $(LDFLAGS) $(LWINSOCK2) -embd-input-test: $(LIB_PRE)embdinput$(DSO_EXT) examples/embd-input/embd-input-test.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %$(DSO_EXT),$(filter-out %.h,$(filter-out %.hpp,$^))) -o $@ $(LDFLAGS) -L. -lembdinput - -gguf: examples/gguf/gguf.cpp ggml.o llama.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) +gguf: examples/gguf/gguf.cpp ggml.o $(OBJS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp ggml.o llama.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-bench: examples/llama-bench/llama-bench.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) +llama-bench: examples/llama-bench/llama-bench.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llava: examples/llava/llava.cpp examples/llava/llava-utils.h examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -Wno-cast-qual +libllava.a: examples/llava/llava.cpp examples/llava/llava.h examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h common/base64.hpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) + $(CXX) $(CXXFLAGS) -static -fPIC -c $< -o $@ -Wno-cast-qual + +llava-cli: examples/llava/llava-cli.cpp examples/llava/clip.h examples/llava/clip.cpp examples/llava/llava.h examples/llava/llava.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual + $(CXX) $(CXXFLAGS) -c examples/llava/llava.cpp -o $(call GET_OBJ_FILE, examples/llava/llava.cpp) + $(CXX) $(CXXFLAGS) $(filter-out %.h $< examples/llava/clip.cpp examples/llava/llava.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) $(call GET_OBJ_FILE, examples/llava/llava.cpp) -o $@ $(LDFLAGS) baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -beam-search: examples/beam-search/beam-search.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) +beam-search: examples/beam-search/beam-search.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -finetune: examples/finetune/finetune.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) +finetune: examples/finetune/finetune.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -export-lora: examples/export-lora/export-lora.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) +export-lora: examples/export-lora/export-lora.cpp ggml.o common/common.h $(OBJS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -speculative: examples/speculative/speculative.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) +speculative: examples/speculative/speculative.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -parallel: examples/parallel/parallel.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) +parallel: examples/parallel/parallel.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -ifdef LLAMA_METAL -metal: examples/metal/metal.cpp ggml.o $(OBJS) - $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) -endif +lookahead: examples/lookahead/lookahead.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + +lookup: examples/lookup/lookup.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + +passkey: examples/passkey/passkey.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) ifeq ($(UNAME_S),Darwin) swift: examples/batched.swift (cd examples/batched.swift; make build) endif -build-info.h: $(wildcard .git/index) scripts/build-info.sh - @sh scripts/build-info.sh $(CC) > $@.tmp +common/build-info.cpp: $(wildcard .git/index) scripts/build-info.sh + @sh scripts/build-info.sh "$(CC)" > $@.tmp @if ! cmp -s $@.tmp $@; then \ mv $@.tmp $@; \ else \ rm $@.tmp; \ fi +build-info.o: common/build-info.cpp + $(CXX) $(CXXFLAGS) -c $(filter-out %.h,$^) -o $@ + # # Tests # tests: $(TEST_TARGETS) -benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) +benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.o ggml.o $(OBJS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) run-benchmark-matmult: benchmark-matmult ./$@ @@ -681,46 +817,80 @@ run-benchmark-matmult: benchmark-matmult .PHONY: run-benchmark-matmult swift vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS) - $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) q8dot: pocs/vdot/q8dot.cpp ggml.o $(OBJS) - $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -tests/test-llama-grammar: tests/test-llama-grammar.cpp build-info.h ggml.o $(COMMON_DEPS) grammar-parser.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) +tests/test-llama-grammar: tests/test-llama-grammar.cpp ggml.o grammar-parser.o $(OBJS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -tests/test-grammar-parser: tests/test-grammar-parser.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) +tests/test-grammar-parser: tests/test-grammar-parser.cpp ggml.o llama.o grammar-parser.o $(OBJS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -tests/test-double-float: tests/test-double-float.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) +tests/test-double-float: tests/test-double-float.cpp ggml.o $(OBJS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -tests/test-grad0: tests/test-grad0.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) +tests/test-grad0: tests/test-grad0.cpp ggml.o $(OBJS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -tests/test-opt: tests/test-opt.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) +tests/test-opt: tests/test-opt.cpp ggml.o $(OBJS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -tests/test-quantize-fns: tests/test-quantize-fns.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) +tests/test-quantize-fns: tests/test-quantize-fns.cpp ggml.o $(OBJS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -tests/test-quantize-perf: tests/test-quantize-perf.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) +tests/test-quantize-perf: tests/test-quantize-perf.cpp ggml.o $(OBJS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -tests/test-sampling: tests/test-sampling.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) +tests/test-sampling: tests/test-sampling.cpp ggml.o llama.o $(OBJS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) +tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) +tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) +tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) +tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + +tests/test-rope: tests/test-rope.cpp ggml.o $(OBJS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) tests/test-c.o: tests/test-c.c llama.h $(CC) $(CFLAGS) -c $(filter-out %.h,$^) -o $@ + +tests/test-backend-ops: tests/test-backend-ops.cpp ggml.o $(OBJS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + +tests/test-model-load-cancel: tests/test-model-load-cancel.cpp ggml.o llama.o tests/get-model.cpp $(COMMON_DEPS) $(OBJS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + +tests/test-autorelease: tests/test-autorelease.cpp ggml.o llama.o tests/get-model.cpp $(COMMON_DEPS) $(OBJS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + +tests/test-chat-template: tests/test-chat-template.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) diff --git a/Package.swift b/Package.swift index 4ab055b19..b24c9204a 100644 --- a/Package.swift +++ b/Package.swift @@ -2,33 +2,14 @@ import PackageDescription -#if arch(arm) || arch(arm64) -let platforms: [SupportedPlatform]? = [ - .macOS(.v12), - .iOS(.v14), - .watchOS(.v4), - .tvOS(.v14) -] -let exclude: [String] = [] -let resources: [Resource] = [ - .process("ggml-metal.metal") -] -let additionalSources: [String] = ["ggml-metal.m"] -let additionalSettings: [CSetting] = [ - .unsafeFlags(["-fno-objc-arc"]), - .define("GGML_USE_METAL") -] -#else -let platforms: [SupportedPlatform]? = nil -let exclude: [String] = ["ggml-metal.metal"] -let resources: [Resource] = [] -let additionalSources: [String] = [] -let additionalSettings: [CSetting] = [] -#endif - let package = Package( name: "llama", - platforms: platforms, + platforms: [ + .macOS(.v12), + .iOS(.v14), + .watchOS(.v4), + .tvOS(.v14) + ], products: [ .library(name: "llama", targets: ["llama"]), ], @@ -36,26 +17,40 @@ let package = Package( .target( name: "llama", path: ".", - exclude: exclude, + exclude: [ + "cmake", + "examples", + "scripts", + "models", + "tests", + "CMakeLists.txt", + "ggml-cuda.cu", + "ggml-cuda.h", + "Makefile" + ], sources: [ "ggml.c", "llama.cpp", "ggml-alloc.c", "ggml-backend.c", - "k_quants.c", - ] + additionalSources, - resources: resources, + "ggml-quants.c", + "ggml-metal.m", + ], + resources: [ + .process("ggml-metal.metal") + ], publicHeadersPath: "spm-headers", cSettings: [ .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]), - .define("GGML_USE_K_QUANTS"), - .define("GGML_USE_ACCELERATE") + .define("GGML_USE_ACCELERATE"), + .unsafeFlags(["-fno-objc-arc"]), + .define("GGML_USE_METAL"), // NOTE: NEW_LAPACK will required iOS version 16.4+ // We should consider add this in the future when we drop support for iOS 14 // (ref: ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc) // .define("ACCELERATE_NEW_LAPACK"), // .define("ACCELERATE_LAPACK_ILP64") - ] + additionalSettings, + ], linkerSettings: [ .linkedFramework("Accelerate") ] diff --git a/README-sycl.md b/README-sycl.md new file mode 100644 index 000000000..dd5bf9dea --- /dev/null +++ b/README-sycl.md @@ -0,0 +1,494 @@ +# llama.cpp for SYCL + +- [Background](#background) +- [OS](#os) +- [Intel GPU](#intel-gpu) +- [Docker](#docker) +- [Linux](#linux) +- [Windows](#windows) +- [Environment Variable](#environment-variable) +- [Known Issue](#known-issue) +- [Q&A](#q&a) +- [Todo](#todo) + +## Background + +SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators—such as CPUs, GPUs, and FPGAs. It is a single-source embedded domain-specific language based on pure C++17. + +oneAPI is a specification that is open and standards-based, supporting multiple architecture types including but not limited to GPU, CPU, and FPGA. The spec has both direct programming and API-based programming paradigms. + +Intel uses the SYCL as direct programming language to support CPU, GPUs and FPGAs. + +To avoid to re-invent the wheel, this code refer other code paths in llama.cpp (like OpenBLAS, cuBLAS, CLBlast). We use a open-source tool [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) (Commercial release [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) migrate to SYCL. + +The llama.cpp for SYCL is used to support Intel GPUs. + +For Intel CPU, recommend to use llama.cpp for X86 (Intel MKL building). + +## OS + +|OS|Status|Verified| +|-|-|-| +|Linux|Support|Ubuntu 22.04, Fedora Silverblue 39| +|Windows|Support|Windows 11| + + +## Intel GPU + +### Verified + +|Intel GPU| Status | Verified Model| +|-|-|-| +|Intel Data Center Max Series| Support| Max 1550| +|Intel Data Center Flex Series| Support| Flex 170| +|Intel Arc Series| Support| Arc 770, 730M| +|Intel built-in Arc GPU| Support| built-in Arc GPU in Meteor Lake| +|Intel iGPU| Support| iGPU in i5-1250P, i7-1260P, i7-1165G7| + +Note: If the EUs (Execution Unit) in iGPU is less than 80, the inference speed will be too slow to use. + +### Memory + +The memory is a limitation to run LLM on GPUs. + +When run llama.cpp, there is print log to show the applied memory on GPU. You could know how much memory to be used in your case. Like `llm_load_tensors: buffer size = 3577.56 MiB`. + +For iGPU, please make sure the shared memory from host memory is enough. For llama-2-7b.Q4_0, recommend the host memory is 8GB+. + +For dGPU, please make sure the device memory is enough. For llama-2-7b.Q4_0, recommend the device memory is 4GB+. + +## Docker + +Note: +- Only docker on Linux is tested. Docker on WSL may not work. +- You may need to install Intel GPU driver on the host machine (See the [Linux](#linux) section to know how to do that) + +### Build the image + +You can choose between **F16** and **F32** build. F16 is faster for long-prompt inference. + + +```sh +# For F16: +#docker build -t llama-cpp-sycl --build-arg="LLAMA_SYCL_F16=ON" -f .devops/main-intel.Dockerfile . + +# Or, for F32: +docker build -t llama-cpp-sycl -f .devops/main-intel.Dockerfile . + +# Note: you can also use the ".devops/main-server.Dockerfile", which compiles the "server" example +``` + +### Run + +```sh +# Firstly, find all the DRI cards: +ls -la /dev/dri +# Then, pick the card that you want to use. + +# For example with "/dev/dri/card1" +docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-sycl -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 +``` + +## Linux + +### Setup Environment + +1. Install Intel GPU driver. + +a. Please install Intel GPU driver by official guide: [Install GPU Drivers](https://dgpu-docs.intel.com/driver/installation.html). + +Note: for iGPU, please install the client GPU driver. + +b. Add user to group: video, render. + +```sh +sudo usermod -aG render username +sudo usermod -aG video username +``` + +Note: re-login to enable it. + +c. Check + +```sh +sudo apt install clinfo +sudo clinfo -l +``` + +Output (example): + +``` +Platform #0: Intel(R) OpenCL Graphics + `-- Device #0: Intel(R) Arc(TM) A770 Graphics + + +Platform #0: Intel(R) OpenCL HD Graphics + `-- Device #0: Intel(R) Iris(R) Xe Graphics [0x9a49] +``` + +2. Install Intel® oneAPI Base toolkit. + +a. Please follow the procedure in [Get the Intel® oneAPI Base Toolkit ](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html). + +Recommend to install to default folder: **/opt/intel/oneapi**. + +Following guide use the default folder as example. If you use other folder, please modify the following guide info with your folder. + +b. Check + +```sh +source /opt/intel/oneapi/setvars.sh + +sycl-ls +``` + +There should be one or more level-zero devices. Please confirm that at least one GPU is present, like **[ext_oneapi_level_zero:gpu:0]**. + +Output (example): +``` +[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.10.0.17_160000] +[opencl:cpu:1] Intel(R) OpenCL, 13th Gen Intel(R) Core(TM) i7-13700K OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000] +[opencl:gpu:2] Intel(R) OpenCL Graphics, Intel(R) Arc(TM) A770 Graphics OpenCL 3.0 NEO [23.30.26918.50] +[ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Arc(TM) A770 Graphics 1.3 [1.3.26918] + +``` + +2. Build locally: + +Note: +- You can choose between **F16** and **F32** build. F16 is faster for long-prompt inference. +- By default, it will build for all binary files. It will take more time. To reduce the time, we recommend to build for **example/main** only. + +```sh +mkdir -p build +cd build +source /opt/intel/oneapi/setvars.sh + +# For FP16: +#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON + +# Or, for FP32: +cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx + +# Build example/main only +#cmake --build . --config Release --target main + +# Or, build all binary +cmake --build . --config Release -v + +cd .. +``` + +or + +```sh +./examples/sycl/build.sh +``` + +### Run + +1. Put model file to folder **models** + +You could download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) as example. + +2. Enable oneAPI running environment + +``` +source /opt/intel/oneapi/setvars.sh +``` + +3. List device ID + +Run without parameter: + +```sh +./build/bin/ls-sycl-device + +# or running the "main" executable and look at the output log: + +./build/bin/main +``` + +Check the ID in startup log, like: + +``` +found 4 SYCL devices: + Device 0: Intel(R) Arc(TM) A770 Graphics, compute capability 1.3, + max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136 + Device 1: Intel(R) FPGA Emulation Device, compute capability 1.2, + max compute_units 24, max work group size 67108864, max sub group size 64, global mem size 67065057280 + Device 2: 13th Gen Intel(R) Core(TM) i7-13700K, compute capability 3.0, + max compute_units 24, max work group size 8192, max sub group size 64, global mem size 67065057280 + Device 3: Intel(R) Arc(TM) A770 Graphics, compute capability 3.0, + max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136 + +``` + +|Attribute|Note| +|-|-| +|compute capability 1.3|Level-zero running time, recommended | +|compute capability 3.0|OpenCL running time, slower than level-zero in most cases| + +4. Set device ID and execute llama.cpp + +Set device ID = 0 by **GGML_SYCL_DEVICE=0** + +```sh +GGML_SYCL_DEVICE=0 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 +``` +or run by script: + +```sh +./examples/sycl/run_llama2.sh +``` + +Note: + +- By default, mmap is used to read model file. In some cases, it leads to the hang issue. Recommend to use parameter **--no-mmap** to disable mmap() to skip this issue. + + +5. Check the device ID in output + +Like: +``` +Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device +``` + +## Windows + +### Setup Environment + +1. Install Intel GPU driver. + +Please install Intel GPU driver by official guide: [Install GPU Drivers](https://www.intel.com/content/www/us/en/products/docs/discrete-gpus/arc/software/drivers.html). + +Note: **The driver is mandatory for compute function**. + +2. Install Visual Studio. + +Please install [Visual Studio](https://visualstudio.microsoft.com/) which impact oneAPI environment enabling in Windows. + +3. Install Intel® oneAPI Base toolkit. + +a. Please follow the procedure in [Get the Intel® oneAPI Base Toolkit ](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html). + +Recommend to install to default folder: **C:\Program Files (x86)\Intel\oneAPI**. + +Following guide uses the default folder as example. If you use other folder, please modify the following guide info with your folder. + +b. Enable oneAPI running environment: + +- In Search, input 'oneAPI'. + +Search & open "Intel oneAPI command prompt for Intel 64 for Visual Studio 2022" + +- In Run: + +In CMD: +``` +"C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 +``` + +c. Check GPU + +In oneAPI command line: + +``` +sycl-ls +``` + +There should be one or more level-zero devices. Please confirm that at least one GPU is present, like **[ext_oneapi_level_zero:gpu:0]**. + +Output (example): +``` +[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.10.0.17_160000] +[opencl:cpu:1] Intel(R) OpenCL, 11th Gen Intel(R) Core(TM) i7-1185G7 @ 3.00GHz OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000] +[opencl:gpu:2] Intel(R) OpenCL Graphics, Intel(R) Iris(R) Xe Graphics OpenCL 3.0 NEO [31.0.101.5186] +[ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Iris(R) Xe Graphics 1.3 [1.3.28044] +``` + +4. Install cmake & make + +a. Download & install cmake for Windows: https://cmake.org/download/ + +b. Download & install mingw-w64 make for Windows provided by w64devkit + +- Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases). + +- Extract `w64devkit` on your pc. + +- Add the **bin** folder path in the Windows system PATH environment, like `C:\xxx\w64devkit\bin\`. + +### Build locally: + +In oneAPI command line window: + +``` +mkdir -p build +cd build +@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force + +:: for FP16 +:: faster for long-prompt inference +:: cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON + +:: for FP32 +cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release + + +:: build example/main only +:: make main + +:: build all binary +make -j +cd .. +``` + +or + +``` +.\examples\sycl\win-build-sycl.bat +``` + +Note: + +- By default, it will build for all binary files. It will take more time. To reduce the time, we recommend to build for **example/main** only. + +### Run + +1. Put model file to folder **models** + +You could download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) as example. + +2. Enable oneAPI running environment + +- In Search, input 'oneAPI'. + +Search & open "Intel oneAPI command prompt for Intel 64 for Visual Studio 2022" + +- In Run: + +In CMD: +``` +"C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 +``` + +3. List device ID + +Run without parameter: + +``` +build\bin\ls-sycl-device.exe + +or + +build\bin\main.exe +``` + +Check the ID in startup log, like: + +``` +found 4 SYCL devices: + Device 0: Intel(R) Arc(TM) A770 Graphics, compute capability 1.3, + max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136 + Device 1: Intel(R) FPGA Emulation Device, compute capability 1.2, + max compute_units 24, max work group size 67108864, max sub group size 64, global mem size 67065057280 + Device 2: 13th Gen Intel(R) Core(TM) i7-13700K, compute capability 3.0, + max compute_units 24, max work group size 8192, max sub group size 64, global mem size 67065057280 + Device 3: Intel(R) Arc(TM) A770 Graphics, compute capability 3.0, + max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136 + +``` + +|Attribute|Note| +|-|-| +|compute capability 1.3|Level-zero running time, recommended | +|compute capability 3.0|OpenCL running time, slower than level-zero in most cases| + +4. Set device ID and execute llama.cpp + +Set device ID = 0 by **set GGML_SYCL_DEVICE=0** + +``` +set GGML_SYCL_DEVICE=0 +build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 +``` +or run by script: + +``` +.\examples\sycl\win-run-llama2.bat +``` + +Note: + +- By default, mmap is used to read model file. In some cases, it leads to the hang issue. Recommend to use parameter **--no-mmap** to disable mmap() to skip this issue. + + +5. Check the device ID in output + +Like: +``` +Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device +``` + +## Environment Variable + +#### Build + +|Name|Value|Function| +|-|-|-| +|LLAMA_SYCL|ON (mandatory)|Enable build with SYCL code path.
For FP32/FP16, LLAMA_SYCL=ON is mandatory.| +|LLAMA_SYCL_F16|ON (optional)|Enable FP16 build with SYCL code path. Faster for long-prompt inference.
For FP32, not set it.| +|CMAKE_C_COMPILER|icx|Use icx compiler for SYCL code path| +|CMAKE_CXX_COMPILER|icpx (Linux), icx (Windows)|use icpx/icx for SYCL code path| + +#### Running + + +|Name|Value|Function| +|-|-|-| +|GGML_SYCL_DEVICE|0 (default) or 1|Set the device id used. Check the device ids by default running output| +|GGML_SYCL_DEBUG|0 (default) or 1|Enable log function by macro: GGML_SYCL_DEBUG| + +## Known Issue + +- Hang during startup + + llama.cpp use mmap as default way to read model file and copy to GPU. In some system, memcpy will be abnormal and block. + + Solution: add **--no-mmap** or **--mmap 0**. + +## Q&A + +- Error: `error while loading shared libraries: libsycl.so.7: cannot open shared object file: No such file or directory`. + + Miss to enable oneAPI running environment. + + Install oneAPI base toolkit and enable it by: `source /opt/intel/oneapi/setvars.sh`. + +- In Windows, no result, not error. + + Miss to enable oneAPI running environment. + +- Meet compile error. + + Remove folder **build** and try again. + +- I can **not** see **[ext_oneapi_level_zero:gpu:0]** afer install GPU driver in Linux. + + Please run **sudo sycl-ls**. + + If you see it in result, please add video/render group to your ID: + + ``` + sudo usermod -aG render username + sudo usermod -aG video username + ``` + + Then **relogin**. + + If you do not see it, please check the installation GPU steps again. + +## Todo + +- Support multiple cards. diff --git a/README.md b/README.md index ce63c6f0e..3bc512af0 100644 --- a/README.md +++ b/README.md @@ -2,17 +2,18 @@ ![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png) -[![Actions Status](https://github.com/ggerganov/llama.cpp/workflows/CI/badge.svg)](https://github.com/ggerganov/llama.cpp/actions) [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT) [Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml) -Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++ +Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++ ### Hot topics -- LLaVA support: https://github.com/ggerganov/llama.cpp/pull/3436 -- ‼️ BPE tokenizer update: existing Falcon and Starcoder `.gguf` models will need to be reconverted: [#3252](https://github.com/ggerganov/llama.cpp/pull/3252) +- Support for chat templates: [Wiki (contributions welcome)](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) +- Support for Gemma models: https://github.com/ggerganov/llama.cpp/pull/5631 +- Non-linear quantization IQ4_NL: https://github.com/ggerganov/llama.cpp/pull/5590 +- Looking for contributions to improve and maintain the `server` example: https://github.com/ggerganov/llama.cpp/issues/4216 ---- @@ -28,17 +29,14 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
  • Get the Code
  • Build
  • BLAS Build
  • -
  • Prepare Data & Run
  • +
  • Prepare and Quantize
  • +
  • Run the quantized model
  • Memory/Disk Requirements
  • Quantization
  • Interactive mode
  • Constrained output with grammars
  • -
  • Instruction mode with Alpaca
  • -
  • Using OpenLLaMA
  • -
  • Using GPT4All
  • -
  • Using Pygmalion 7B & Metharme 7B
  • -
  • Obtaining the Facebook LLaMA original model and Stanford Alpaca model data
  • -
  • Verifying the model files
  • +
  • Instruct mode
  • +
  • Obtaining and using the Facebook LLaMA 2 model
  • Seminal papers and background on the models
  • Perplexity (measuring model quality)
  • Android
  • @@ -53,18 +51,20 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++ ## Description -The main goal of `llama.cpp` is to run the LLaMA model using 4-bit integer quantization on a MacBook +The main goal of `llama.cpp` is to enable LLM inference with minimal setup and state-of-the-art performance on a wide +variety of hardware - locally and in the cloud. -- Plain C/C++ implementation without dependencies -- Apple silicon first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks +- Plain C/C++ implementation without any dependencies +- Apple silicon is a first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks - AVX, AVX2 and AVX512 support for x86 architectures -- Mixed F16 / F32 precision -- 2-bit, 3-bit, 4-bit, 5-bit, 6-bit and 8-bit integer quantization support -- CUDA, Metal and OpenCL GPU backend support +- 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use +- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP) +- Vulkan, SYCL, and (partial) OpenCL backend support +- CPU+GPU hybrid inference to partially accelerate models larger than the total VRAM capacity -The original implementation of `llama.cpp` was [hacked in an evening](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022). -Since then, the project has improved significantly thanks to many contributions. This project is mainly for educational purposes and serves -as the main playground for developing new features for the [ggml](https://github.com/ggerganov/ggml) library. +Since its [inception](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022), the project has +improved significantly thanks to many contributions. It is the main playground for developing new features for the +[ggml](https://github.com/ggerganov/ggml) library. **Supported platforms:** @@ -72,55 +72,95 @@ as the main playground for developing new features for the [ggml](https://github - [X] Linux - [X] Windows (via CMake) - [X] Docker +- [X] FreeBSD **Supported models:** +Typically finetunes of the base models below are supported as well. + - [X] LLaMA 🦙 - [x] LLaMA 2 🦙🦙 +- [X] [Mistral 7B](https://huggingface.co/mistralai/Mistral-7B-v0.1) +- [x] [Mixtral MoE](https://huggingface.co/models?search=mistral-ai/Mixtral) - [X] Falcon -- [X] [Alpaca](https://github.com/ggerganov/llama.cpp#instruction-mode-with-alpaca) -- [X] [GPT4All](https://github.com/ggerganov/llama.cpp#using-gpt4all) - [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca) and [Chinese LLaMA-2 / Alpaca-2](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2) - [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne) -- [X] [Vicuna](https://github.com/ggerganov/llama.cpp/discussions/643#discussioncomment-5533894) - [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/) -- [X] [OpenBuddy 🐶 (Multilingual)](https://github.com/OpenBuddy/OpenBuddy) -- [X] [Pygmalion/Metharme](#using-pygmalion-7b--metharme-7b) -- [X] [WizardLM](https://github.com/nlpxucan/WizardLM) - [X] [Baichuan 1 & 2](https://huggingface.co/models?search=baichuan-inc/Baichuan) + [derivations](https://huggingface.co/hiyouga/baichuan-7b-sft) - [X] [Aquila 1 & 2](https://huggingface.co/models?search=BAAI/Aquila) - [X] [Starcoder models](https://github.com/ggerganov/llama.cpp/pull/3187) -- [X] [Mistral AI v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) - [X] [Refact](https://huggingface.co/smallcloudai/Refact-1_6B-fim) - [X] [Persimmon 8B](https://github.com/ggerganov/llama.cpp/pull/3410) - [X] [MPT](https://github.com/ggerganov/llama.cpp/pull/3417) - [X] [Bloom](https://github.com/ggerganov/llama.cpp/pull/3553) +- [x] [Yi models](https://huggingface.co/models?search=01-ai/Yi) +- [X] [StableLM models](https://huggingface.co/stabilityai) +- [x] [Deepseek models](https://huggingface.co/models?search=deepseek-ai/deepseek) +- [x] [Qwen models](https://huggingface.co/models?search=Qwen/Qwen) +- [x] [PLaMo-13B](https://github.com/ggerganov/llama.cpp/pull/3557) +- [x] [Phi models](https://huggingface.co/models?search=microsoft/phi) +- [x] [GPT-2](https://huggingface.co/gpt2) +- [x] [Orion 14B](https://github.com/ggerganov/llama.cpp/pull/5118) +- [x] [InternLM2](https://huggingface.co/models?search=internlm2) +- [x] [CodeShell](https://github.com/WisdomShell/codeshell) +- [x] [Gemma](https://ai.google.dev/gemma) + +**Multimodal models:** + +- [x] [LLaVA 1.5 models](https://huggingface.co/collections/liuhaotian/llava-15-653aac15d994e992e2677a7e) +- [x] [BakLLaVA](https://huggingface.co/models?search=SkunkworksAI/Bakllava) +- [x] [Obsidian](https://huggingface.co/NousResearch/Obsidian-3B-V0.5) +- [x] [ShareGPT4V](https://huggingface.co/models?search=Lin-Chen/ShareGPT4V) +- [x] [MobileVLM 1.7B/3B models](https://huggingface.co/models?search=mobileVLM) +- [x] [Yi-VL](https://huggingface.co/models?search=Yi-VL) **Bindings:** - Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python) - Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp) -- Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp), [hlhr202/llama-node](https://github.com/hlhr202/llama-node) +- Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp) +- JS/TS (llama.cpp server client): [lgrammel/modelfusion](https://modelfusion.dev/integration/model-provider/llamacpp) +- JavaScript/Wasm (works in browser): [tangledgroup/llama-cpp-wasm](https://github.com/tangledgroup/llama-cpp-wasm) - Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb) -- Rust: [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp) +- Rust (nicer API): [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp) +- Rust (more direct bindings): [utilityai/llama-cpp-rs](https://github.com/utilityai/llama-cpp-rs) - C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp) - Scala 3: [donderom/llm4s](https://github.com/donderom/llm4s) - Clojure: [phronmophobic/llama.clj](https://github.com/phronmophobic/llama.clj) - React Native: [mybigday/llama.rn](https://github.com/mybigday/llama.rn) - Java: [kherud/java-llama.cpp](https://github.com/kherud/java-llama.cpp) +- Zig: [deins/llama.cpp.zig](https://github.com/Deins/llama.cpp.zig) +- Flutter/Dart: [netdur/llama_cpp_dart](https://github.com/netdur/llama_cpp_dart) **UI:** +Unless otherwise noted these projects are open-source with permissive licensing: + +- [iohub/collama](https://github.com/iohub/coLLaMA) +- [janhq/jan](https://github.com/janhq/jan) (AGPL) - [nat/openplayground](https://github.com/nat/openplayground) -- [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui) +- [Faraday](https://faraday.dev/) (proprietary) +- [LMStudio](https://lmstudio.ai/) (proprietary) +- [LocalAI](https://github.com/mudler/LocalAI) (MIT) +- [LostRuins/koboldcpp](https://github.com/LostRuins/koboldcpp) (AGPL) +- [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile) +- [nomic-ai/gpt4all](https://github.com/nomic-ai/gpt4all) +- [ollama/ollama](https://github.com/ollama/ollama) +- [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui) (AGPL) +- [psugihara/FreeChat](https://github.com/psugihara/FreeChat) +- [cztomsik/ava](https://github.com/cztomsik/ava) (MIT) +- [ptsochantaris/emeltal](https://github.com/ptsochantaris/emeltal) +- [pythops/tenere](https://github.com/pythops/tenere) (AGPL) +- [semperai/amica](https://github.com/semperai/amica) - [withcatai/catai](https://github.com/withcatai/catai) +- [Mobile-Artificial-Intelligence/maid](https://github.com/Mobile-Artificial-Intelligence/maid) (MIT) --- Here is a typical run using LLaMA v2 13B on M2 Ultra: -```java +``` $ make -j && ./main -m models/llama-13b-v2/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e I llama.cpp build info: I UNAME_S: Darwin @@ -204,7 +244,7 @@ https://user-images.githubusercontent.com/1991296/224442907-7693d4be-acaa-4e01-8 ## Usage -Here are the end-to-end binary build and model conversion steps for the LLaMA-7B model. +Here are the end-to-end binary build and model conversion steps for most supported models. ### Get the Code @@ -265,7 +305,7 @@ In order to build llama.cpp you have three different options. sudo pkg install gmake automake autoconf pkgconf llvm15 clinfo clover \ opencl clblast openblas - gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15 -j4 + gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15 -j4 ``` **Notes:** With this packages you can build llama.cpp with OPENBLAS and @@ -321,7 +361,7 @@ mpirun -hostfile hostfile -n 3 ./main -m ./models/7B/ggml-model-q4_0.gguf -n 128 ### BLAS Build -Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). BLAS doesn't affect the normal generation performance. There are currently three different implementations of it: +Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). Support with CPU-only BLAS implementations doesn't affect the normal generation performance. We may see generation performance improvements with GPU-involved BLAS implementations, e.g. cuBLAS, hipBLAS and CLBlast. There are currently several different BLAS implementations available for build and use: - #### Accelerate Framework: @@ -365,20 +405,37 @@ Building the program with BLAS support may lead to some performance improvements Check [BLIS.md](docs/BLIS.md) for more information. -- #### Intel MKL +- #### SYCL + SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators. - By default, `LLAMA_BLAS_VENDOR` is set to `Generic`, so if you already sourced intel environment script and assign `-DLLAMA_BLAS=ON` in cmake, the mkl version of Blas will automatically been selected. You may also specify it by: + llama.cpp based on SYCL is used to **support Intel GPU** (Data Center Max series, Flex series, Arc series, Built-in GPU and iGPU). - ```bash - mkdir build - cd build - cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx - cmake --build . --config Release - ``` + For detailed info, please refer to [llama.cpp for SYCL](README-sycl.md). + +- #### Intel oneMKL + Building through oneAPI compilers will make avx_vnni instruction set available for intel processors that do not support avx512 and avx512_vnni. Please note that this build config **does not support Intel GPU**. For Intel GPU support, please refer to [llama.cpp for SYCL](./README-sycl.md). + + - Using manual oneAPI installation: + By default, `LLAMA_BLAS_VENDOR` is set to `Generic`, so if you already sourced intel environment script and assign `-DLLAMA_BLAS=ON` in cmake, the mkl version of Blas will automatically been selected. Otherwise please install oneAPI and follow the below steps: + ```bash + mkdir build + cd build + source /opt/intel/oneapi/setvars.sh # You can skip this step if in oneapi-basekit docker image, only required for manual installation + cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_NATIVE=ON + cmake --build . --config Release + ``` + + - Using oneAPI docker image: + If you do not want to source the environment vars and install oneAPI manually, you can also build the code using intel docker container: [oneAPI-basekit](https://hub.docker.com/r/intel/oneapi-basekit). Then, you can use the commands given above. + + Check [Optimizing and Running LLaMA2 on Intel® CPU](https://www.intel.com/content/www/us/en/content-details/791610/optimizing-and-running-llama2-on-intel-cpu.html) for more information. - #### cuBLAS This provides BLAS acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads). + + For Jetson user, if you have Jetson Orin, you can try this: [Offical Support](https://www.jetson-ai-lab.com/tutorial_text-generation.html). If you are using an old model(nano/TX2), need some additional operations before compiling. + - Using `make`: ```bash make LLAMA_CUBLAS=1 @@ -411,22 +468,39 @@ Building the program with BLAS support may lead to some performance improvements This provides BLAS acceleration on HIP-supported AMD GPUs. Make sure to have ROCm installed. You can download it from your Linux distro's package manager or from here: [ROCm Quick Start (Linux)](https://rocm.docs.amd.com/en/latest/deploy/linux/quick_start.html). - Windows support is coming soon... - Using `make`: ```bash make LLAMA_HIPBLAS=1 ``` - - Using `CMake`: + - Using `CMake` for Linux (assuming a gfx1030-compatible AMD GPU): ```bash - mkdir build - cd build - CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ cmake .. -DLLAMA_HIPBLAS=ON - cmake --build . + CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ \ + cmake -H. -Bbuild -DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \ + && cmake --build build -- -j 16 + ``` + On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DLLAMA_HIP_UMA=ON"`. + However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs). + + - Using `make` (example for target gfx1030, build with 16 CPU threads): + ```bash + make -j16 LLAMA_HIPBLAS=1 LLAMA_HIP_UMA=1 AMDGPU_TARGETS=gxf1030 ``` + - Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU): + ```bash + set PATH=%HIP_PATH%\bin;%PATH% + mkdir build + cd build + cmake -G Ninja -DAMDGPU_TARGETS=gfx1100 -DLLAMA_HIPBLAS=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ .. + cmake --build . + ``` + Make sure that `AMDGPU_TARGETS` is set to the GPU arch you want to compile for. The above example uses `gfx1100` that corresponds to Radeon RX 7900XTX/XT/GRE. You can find a list of targets [here](https://llvm.org/docs/AMDGPUUsage.html#processors) + Find your gpu version string by matching the most significant version information from `rocminfo | grep gfx | head -1 | awk '{print $2}'` with the list of processors, e.g. `gfx1035` maps to `gfx1030`. + + The environment variable [`HIP_VISIBLE_DEVICES`](https://rocm.docs.amd.com/en/latest/understand/gpu_isolation.html#hip-visible-devices) can be used to specify which GPU(s) will be used. - If your GPU is not officialy supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 or 11.0.0 on RDNA3. + If your GPU is not officially supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 (e.g. gfx1030, gfx1031, or gfx1035) or 11.0.0 on RDNA3. The following compilation options are also available to tweak performance (yes, they refer to CUDA, not HIP, because it uses the same code as the cuBLAS version above): | Option | Legal values | Default | Description | @@ -539,34 +613,87 @@ Building the program with BLAS support may lead to some performance improvements You can get a list of platforms and devices from the `clinfo -l` command, etc. -### Prepare Data & Run +- #### Vulkan + + **With docker**: + + You don't need to install Vulkan SDK. It will be installed inside the container. + + ```sh + # Build the image + docker build -t llama-cpp-vulkan -f .devops/main-vulkan.Dockerfile . + + # Then, use it: + docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-vulkan -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 + ``` + + **Without docker**: + + Firstly, you need to make sure you have installed [Vulkan SDK](https://vulkan.lunarg.com/doc/view/latest/linux/getting_started_ubuntu.html) + + For example, on Ubuntu 22.04 (jammy), use the command below: + + ```bash + wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - + wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list + apt update -y + apt-get install -y vulkan-sdk + # To verify the installation, use the command below: + vulkaninfo + ``` + + Alternatively your package manager might be able to provide the appropiate libraries. For example for Ubuntu 22.04 you can install `libvulkan-dev` instead. + + Then, build llama.cpp using the cmake command below: + + ```bash + mkdir -p build + cd build + cmake .. -DLLAMA_VULKAN=1 + cmake --build . --config Release + # Test the output binary (with "-ngl 33" to offload all layers to GPU) + ./bin/main -m "PATH_TO_MODEL" -p "Hi you how are you" -n 50 -e -ngl 33 -t 4 + + # You should see in the output, ggml_vulkan detected your GPU. For example: + # ggml_vulkan: Using Intel(R) Graphics (ADL GT2) | uma: 1 | fp16: 1 | warp size: 32 + ``` + +### Prepare and Quantize + +To obtain the official LLaMA 2 weights please see the Obtaining and using the Facebook LLaMA 2 model section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face. ```bash -# obtain the original LLaMA model weights and place them in ./models +# obtain the official LLaMA model weights and place them in ./models ls ./models -65B 30B 13B 7B tokenizer_checklist.chk tokenizer.model - # [Optional] for models using BPE tokenizers - ls ./models - 65B 30B 13B 7B vocab.json +llama-2-7b tokenizer_checklist.chk tokenizer.model +# [Optional] for models using BPE tokenizers +ls ./models + vocab.json +# [Optional] for PyTorch .bin models like Mistral-7B +ls ./models + # install Python dependencies python3 -m pip install -r requirements.txt -# convert the 7B model to ggml FP16 format -python3 convert.py models/7B/ +# convert the model to ggml FP16 format +python3 convert.py models/mymodel/ - # [Optional] for models using BPE tokenizers - python convert.py models/7B/ --vocabtype bpe +# [Optional] for models using BPE tokenizers +python convert.py models/mymodel/ --vocab-type bpe -# quantize the model to 4-bits (using q4_0 method) -./quantize ./models/7B/ggml-model-f16.gguf ./models/7B/ggml-model-q4_0.gguf q4_0 +# quantize the model to 4-bits (using Q4_K_M method) +./quantize ./models/mymodel/ggml-model-f16.gguf ./models/mymodel/ggml-model-Q4_K_M.gguf Q4_K_M -# update the gguf filetype to current if older version is unsupported by another application -./quantize ./models/7B/ggml-model-q4_0.gguf ./models/7B/ggml-model-q4_0-v2.gguf COPY +# update the gguf filetype to current version if older version is now unsupported +./quantize ./models/mymodel/ggml-model-Q4_K_M.gguf ./models/mymodel/ggml-model-Q4_K_M-v2.gguf COPY +``` +### Run the quantized model -# run the inference -./main -m ./models/7B/ggml-model-q4_0.gguf -n 128 +```bash +# start inference on a gguf model +./main -m ./models/mymodel/ggml-model-Q4_K_M.gguf -n 128 ``` When running the larger models, make sure you have enough disk space to store all the intermediate files. @@ -587,7 +714,7 @@ From the unzipped folder, open a terminal/cmd window here and place a pre-conver As the models are currently fully loaded into memory, you will need adequate disk space to save them and sufficient RAM to load them. At the moment, memory and disk requirements are the same. -| Model | Original size | Quantized size (4-bit) | +| Model | Original size | Quantized size (Q4_0) | |------:|--------------:|-----------------------:| | 7B | 13 GB | 3.9 GB | | 13B | 24 GB | 7.8 GB | @@ -614,9 +741,21 @@ Several quantization methods are supported. They differ in the resulting model d | 13B | bits/weight | 16.0 | 4.5 | 5.0 | 5.5 | 6.0 | 8.5 | - [k-quants](https://github.com/ggerganov/llama.cpp/pull/1684) -- recent k-quants improvements +- recent k-quants improvements and new i-quants - [#2707](https://github.com/ggerganov/llama.cpp/pull/2707) - [#2807](https://github.com/ggerganov/llama.cpp/pull/2807) + - [#4773 - 2-bit i-quants (inference)](https://github.com/ggerganov/llama.cpp/pull/4773) + - [#4856 - 2-bit i-quants (inference)](https://github.com/ggerganov/llama.cpp/pull/4856) + - [#4861 - importance matrix](https://github.com/ggerganov/llama.cpp/pull/4861) + - [#4872 - MoE models](https://github.com/ggerganov/llama.cpp/pull/4872) + - [#4897 - 2-bit quantization](https://github.com/ggerganov/llama.cpp/pull/4897) + - [#4930 - imatrix for all k-quants](https://github.com/ggerganov/llama.cpp/pull/4930) + - [#4951 - imatrix on the GPU](https://github.com/ggerganov/llama.cpp/pull/4957) + - [#4969 - imatrix for legacy quants](https://github.com/ggerganov/llama.cpp/pull/4969) + - [#4996 - k-qunats tuning](https://github.com/ggerganov/llama.cpp/pull/4996) + - [#5060 - Q3_K_XS](https://github.com/ggerganov/llama.cpp/pull/5060) + - [#5196 - 3-bit i-quants](https://github.com/ggerganov/llama.cpp/pull/5196) + - [quantization tuning](https://github.com/ggerganov/llama.cpp/pull/5320), [another one](https://github.com/ggerganov/llama.cpp/pull/5334), and [another one](https://github.com/ggerganov/llama.cpp/pull/5361) ### Perplexity (measuring model quality) @@ -628,7 +767,7 @@ The time per token is measured on a MacBook M1 Pro 32GB RAM using 4 and 8 thread #### How to run -1. Download/extract: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research +1. Download/extract: https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip 2. Run `./perplexity -m models/7B/ggml-model-q4_0.gguf -f wiki.test.raw` 3. Output: ``` @@ -691,9 +830,9 @@ The `grammars/` folder contains a handful of sample grammars. To write your own, For authoring more complex JSON grammars, you can also check out https://grammar.intrinsiclabs.ai/, a browser app that lets you write TypeScript interfaces which it compiles to GBNF grammars that you can save for local use. Note that the app is built and maintained by members of the community, please file any issues or FRs on [its repo](http://github.com/intrinsiclabsai/gbnfgen) and not this one. -### Instruction mode with Alpaca +### Instruct mode -1. First, download the `ggml` Alpaca model into the `./models` folder +1. First, download and place the `ggml` model into the `./models` folder 2. Run the `main` tool like this: ``` @@ -719,50 +858,6 @@ cadaver, cauliflower, cabbage (vegetable), catalpa (tree) and Cailleach. > ``` -### Using [OpenLLaMA](https://github.com/openlm-research/open_llama) - -OpenLLaMA is an openly licensed reproduction of Meta's original LLaMA model. It uses the same architecture and is a drop-in replacement for the original LLaMA weights. - -- Download the [3B](https://huggingface.co/openlm-research/open_llama_3b), [7B](https://huggingface.co/openlm-research/open_llama_7b), or [13B](https://huggingface.co/openlm-research/open_llama_13b) model from Hugging Face. -- Convert the model to ggml FP16 format using `python convert.py ` - -### Using [GPT4All](https://github.com/nomic-ai/gpt4all) - -*Note: these instructions are likely obsoleted by the GGUF update* - -- Obtain the `tokenizer.model` file from LLaMA model and put it to `models` -- Obtain the `added_tokens.json` file from Alpaca model and put it to `models` -- Obtain the `gpt4all-lora-quantized.bin` file from GPT4All model and put it to `models/gpt4all-7B` -- It is distributed in the old `ggml` format which is now obsoleted -- You have to convert it to the new format using `convert.py`: - -```bash -python3 convert.py models/gpt4all-7B/gpt4all-lora-quantized.bin -``` - -- You can now use the newly generated `models/gpt4all-7B/ggml-model-q4_0.bin` model in exactly the same way as all other models - -- The newer GPT4All-J model is not yet supported! - -### Using Pygmalion 7B & Metharme 7B - -- Obtain the [LLaMA weights](#obtaining-the-facebook-llama-original-model-and-stanford-alpaca-model-data) -- Obtain the [Pygmalion 7B](https://huggingface.co/PygmalionAI/pygmalion-7b/) or [Metharme 7B](https://huggingface.co/PygmalionAI/metharme-7b) XOR encoded weights -- Convert the LLaMA model with [the latest HF convert script](https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/convert_llama_weights_to_hf.py) -- Merge the XOR files with the converted LLaMA weights by running the [xor_codec](https://huggingface.co/PygmalionAI/pygmalion-7b/blob/main/xor_codec.py) script -- Convert to `ggml` format using the `convert.py` script in this repo: -```bash -python3 convert.py pygmalion-7b/ --outtype q4_1 -``` -> The Pygmalion 7B & Metharme 7B weights are saved in [bfloat16](https://en.wikipedia.org/wiki/Bfloat16_floating-point_format) precision. If you wish to convert to `ggml` without quantizating, please specify the `--outtype` as `f32` instead of `f16`. - - -### Obtaining the Facebook LLaMA original model and Stanford Alpaca model data - -- **Under no circumstances should IPFS, magnet links, or any other links to model downloads be shared anywhere in this repository, including in issues, discussions, or pull requests. They will be immediately deleted.** -- The LLaMA models are officially distributed by Facebook and will **never** be provided through this repository. -- Refer to [Facebook's LLaMA repository](https://github.com/facebookresearch/llama/pull/73/files) if you need to request access to the model data. - ### Obtaining and using the Facebook LLaMA 2 model - Refer to [Facebook's LLaMA download page](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) if you want to access the model data. @@ -774,20 +869,6 @@ python3 convert.py pygmalion-7b/ --outtype q4_1 - [LLaMA 2 13B chat](https://huggingface.co/TheBloke/Llama-2-13B-chat-GGUF) - [LLaMA 2 70B chat](https://huggingface.co/TheBloke/Llama-2-70B-chat-GGUF) -### Verifying the model files - -Please verify the [sha256 checksums](SHA256SUMS) of all downloaded model files to confirm that you have the correct model data files before creating an issue relating to your model files. -- The following python script will verify if you have all possible latest files in your self-installed `./models` subdirectory: - -```bash -# run the verification script -./scripts/verify-checksum-models.py -``` - -- On linux or macOS it is also possible to run the following commands to verify if you have all possible latest files in your self-installed `./models` subdirectory: - - On Linux: `sha256sum --ignore-missing -c SHA256SUMS` - - on macOS: `shasum -a 256 --ignore-missing -c SHA256SUMS` - ### Seminal papers and background on the models If your issue is with model generation quality, then please at least scan the following links and papers to understand the limitations of LLaMA models. This is especially important when choosing an appropriate model size and appreciating both the significant and subtle differences between LLaMA models and ChatGPT: @@ -872,19 +953,22 @@ Place your desired model into the `~/llama.cpp/models/` directory and execute th * Create a folder to store big models & intermediate files (ex. /llama/models) #### Images -We have two Docker images available for this project: +We have three Docker images available for this project: 1. `ghcr.io/ggerganov/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`) 2. `ghcr.io/ggerganov/llama.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`) +3. `ghcr.io/ggerganov/llama.cpp:server`: This image only includes the server executable file. (platforms: `linux/amd64`, `linux/arm64`) Additionally, there the following images, similar to the above: - `ghcr.io/ggerganov/llama.cpp:full-cuda`: Same as `full` but compiled with CUDA support. (platforms: `linux/amd64`) - `ghcr.io/ggerganov/llama.cpp:light-cuda`: Same as `light` but compiled with CUDA support. (platforms: `linux/amd64`) +- `ghcr.io/ggerganov/llama.cpp:server-cuda`: Same as `server` but compiled with CUDA support. (platforms: `linux/amd64`) - `ghcr.io/ggerganov/llama.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`) - `ghcr.io/ggerganov/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`) +- `ghcr.io/ggerganov/llama.cpp:server-rocm`: Same as `server` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`) -The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](.devops/) and the Gitlab Action defined in [.github/workflows/docker.yml](.github/workflows/docker.yml). If you need different settings (for example, a different CUDA or ROCm library, you'll need to build the images locally for now). +The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](.github/workflows/docker.yml). If you need different settings (for example, a different CUDA or ROCm library, you'll need to build the images locally for now). #### Usage @@ -908,6 +992,12 @@ or with a light image: docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 ``` +or with a server image: + +```bash +docker run -v /path/to/models:/models -p 8000:8000 ghcr.io/ggerganov/llama.cpp:server -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 +``` + ### Docker With CUDA Assuming one has the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia-container-toolkit) properly installed on Linux, or is using a GPU enabled cloud, `cuBLAS` should be accessible inside the container. @@ -917,6 +1007,7 @@ Assuming one has the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia ```bash docker build -t local/llama.cpp:full-cuda -f .devops/full-cuda.Dockerfile . docker build -t local/llama.cpp:light-cuda -f .devops/main-cuda.Dockerfile . +docker build -t local/llama.cpp:server-cuda -f .devops/server-cuda.Dockerfile . ``` You may want to pass in some different `ARGS`, depending on the CUDA environment supported by your container host, as well as the GPU architecture. @@ -930,6 +1021,7 @@ The resulting images, are essentially the same as the non-CUDA images: 1. `local/llama.cpp:full-cuda`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. 2. `local/llama.cpp:light-cuda`: This image only includes the main executable file. +3. `local/llama.cpp:server-cuda`: This image only includes the server executable file. #### Usage @@ -938,6 +1030,7 @@ After building locally, Usage is similar to the non-CUDA examples, but you'll ne ```bash docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1 docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1 +docker run --gpus all -v /path/to/models:/models local/llama.cpp:server-cuda -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1 ``` ### Contributing @@ -957,12 +1050,13 @@ docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m / - There are no strict rules for the code style, but try to follow the patterns in the code (indentation, spaces, etc.). Vertical alignment makes things more readable and easier to batch edit - Clean-up any trailing whitespaces, use 4 spaces for indentation, brackets on the same line, `void * ptr`, `int & a` - See [good first issues](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions +- Tensors store data in row-major order. We refer to dimension 0 as columns, 1 as rows, 2 as matrices +- Matrix multiplication is unconventional: [`z = ggml_mul_mat(ctx, x, y)`](https://github.com/ggerganov/llama.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means `zT = x @ yT` ### Docs - [main](./examples/main/README.md) - [server](./examples/server/README.md) -- [embd-input](./examples/embd-input/README.md) - [jeopardy](./examples/jeopardy/README.md) - [BLIS](./docs/BLIS.md) - [Performance troubleshooting](./docs/token_generation_performance_tips.md) diff --git a/SHA256SUMS b/SHA256SUMS deleted file mode 100644 index ca4d5a4a5..000000000 --- a/SHA256SUMS +++ /dev/null @@ -1,40 +0,0 @@ -700df0d3013b703a806d2ae7f1bfb8e59814e3d06ae78be0c66368a50059f33d models/7B/consolidated.00.pth -666a4bb533b303bdaf89e1b6a3b6f93535d868de31d903afdc20983dc526c847 models/7B/ggml-model-f16.bin -ec2f2d1f0dfb73b72a4cbac7fa121abbe04c37ab327125a38248f930c0f09ddf models/7B/ggml-model-q4_0.bin -ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/7B/ggml-model-q4_1.bin -ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/7B/ggml-model-q5_0.bin -ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/7B/ggml-model-q5_1.bin -7e89e242ddc0dd6f060b43ca219ce8b3e8f08959a72cb3c0855df8bb04d46265 models/7B/params.json -745bf4e29a4dd6f411e72976d92b452da1b49168a4f41c951cfcc8051823cf08 models/13B/consolidated.00.pth -d5ccbcc465c71c0de439a5aeffebe8344c68a519bce70bc7f9f92654ee567085 models/13B/consolidated.01.pth -2b206e9b21fb1076f11cafc624e2af97c9e48ea09312a0962153acc20d45f808 models/13B/ggml-model-f16.bin -fad169e6f0f575402cf75945961cb4a8ecd824ba4da6be2af831f320c4348fa5 models/13B/ggml-model-q4_0.bin -ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/13B/ggml-model-q4_1.bin -ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/13B/ggml-model-q5_0.bin -ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/13B/ggml-model-q5_1.bin -4ab77bec4d4405ccb66a97b282574c89a94417e3c32e5f68f37e2876fc21322f models/13B/params.json -e23294a58552d8cdec5b7e8abb87993b97ea6eced4178ff2697c02472539d067 models/30B/consolidated.00.pth -4e077b7136c7ae2302e954860cf64930458d3076fcde9443f4d0e939e95903ff models/30B/consolidated.01.pth -24a87f01028cbd3a12de551dcedb712346c0b5cbdeff1454e0ddf2df9b675378 models/30B/consolidated.02.pth -1adfcef71420886119544949767f6a56cb6339b4d5fcde755d80fe68b49de93b models/30B/consolidated.03.pth -7e1b524061a9f4b27c22a12d6d2a5bf13b8ebbea73e99f218809351ed9cf7d37 models/30B/ggml-model-f16.bin -d2a441403944819492ec8c2002cc36fa38468149bfb4b7b4c52afc7bd9a7166d models/30B/ggml-model-q4_0.bin -ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/30B/ggml-model-q4_1.bin -ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/30B/ggml-model-q5_0.bin -ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/30B/ggml-model-q5_1.bin -2c07118ea98d69dbe7810d88520e30288fa994751b337f8fca02b171955f44cb models/30B/params.json -135c563f6b3938114458183afb01adc9a63bef3d8ff7cccc3977e5d3664ecafe models/65B/consolidated.00.pth -9a600b37b19d38c7e43809485f70d17d1dc12206c07efa83bc72bb498a568bde models/65B/consolidated.01.pth -e7babf7c5606f165a3756f527cb0fedc4f83e67ef1290391e52fb1cce5f26770 models/65B/consolidated.02.pth -73176ffb426b40482f2aa67ae1217ef79fbbd1fff5482bae5060cdc5a24ab70e models/65B/consolidated.03.pth -882e6431d0b08a8bc66261a0d3607da21cbaeafa96a24e7e59777632dbdac225 models/65B/consolidated.04.pth -a287c0dfe49081626567c7fe87f74cce5831f58e459b427b5e05567641f47b78 models/65B/consolidated.05.pth -72b4eba67a1a3b18cb67a85b70f8f1640caae9b40033ea943fb166bd80a7b36b models/65B/consolidated.06.pth -d27f5b0677d7ff129ceacd73fd461c4d06910ad7787cf217b249948c3f3bc638 models/65B/consolidated.07.pth -60758f2384d74e423dffddfd020ffed9d3bb186ebc54506f9c4a787d0f5367b0 models/65B/ggml-model-f16.bin -cde053439fa4910ae454407e2717cc46cc2c2b4995c00c93297a2b52e790fa92 models/65B/ggml-model-q4_0.bin -ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/65B/ggml-model-q4_1.bin -ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/65B/ggml-model-q5_0.bin -ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/65B/ggml-model-q5_1.bin -999ed1659b469ccc2a941714c0a9656fa571d17c9f7c8c7589817ca90edef51b models/65B/params.json -9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 models/tokenizer.model diff --git a/awq-py/README.md b/awq-py/README.md new file mode 100644 index 000000000..16e68d027 --- /dev/null +++ b/awq-py/README.md @@ -0,0 +1,116 @@ +# AWQ: Activation-aware Weight Quantization for LLM - version apply to llamacpp +[[Paper](https://arxiv.org/abs/2306.00978)][[Original Repo](https://github.com/mit-han-lab/llm-awq)][[Easy-to-use Repo](https://github.com/casper-hansen/AutoAWQ)] + +**Supported models:** + +- [X] LLaMA +- [x] LLaMA 2 +- [X] MPT +- [X] Mistral AI v0.1 +- [ ] Bloom +- [ ] Mixtral MoE + +**TODO:** +- [x] Update version work with both MPT and MPT-AWQ model +- [ ] Add OPT model +- [ ] Add Bloom model +- [ ] Add Mixtral MoE +- [ ] Support w3, w2 + + +## Contents + +- [Install](##Install) +- [Convert](##Convert) +- [Quantize](##Quantize) +- [Test](##Test) +- [Benchmark](##Benchmark) +- [Results](##Results) + +## Install +Install requirements +```bash +pip install -r requirements.txt +``` +Get the pre-computed AWQ search results for multiple model families, including LLaMA, LLaMA2, MPT, OPT +```bash +git clone https://huggingface.co/datasets/mit-han-lab/awq-model-zoo awq_cache +``` + +## Convert +Example for llama model +```bash +# For llama7b and llama2 models +python convert.py models/llama-7b/ --awq-path awq_cache/llama-7b-w4-g128.pt --outfile models/llama_7b_fp16.gguf +# For mistral and mpt models +python convert-hf-to-gguf.py models/mpt-7b/ --awq-path awq_cache/mpt-7b-w4-g128.pt --outfile models/mpt_7b_fp16.gguf +``` + +## Quantize +```bash +# We only benchmark and confirm the results on q4_0, q4_1, and q2_k types. +./quantize models/llama_7b_fp16.gguf models/llama_7b_q4_0.gguf q4_0 +``` + +## Test +```bash +# For all models. +./build/bin/main -m models/llama_7b_q4_0.gguf -n 128 --prompt "Once upon a time" +``` + +## Benchmark +The perplexity measurements in table above are done against the `wikitext2` test dataset (https://paperswithcode.com/dataset/wikitext-2), with context length of 512. +```bash +# For llama and llama2, and mistral models. +./perplexity -m models/llama_7b_q4_0.gguf -f datasets/wikitext-2-raw/wiki.test.raw +``` + +## Results +Results are run on OpenBLAS (CPU) and CuBLAS (GPU) for fair comparison +We use three types of llamacpp quantization methods to work with our version, including q4_0, q4_1, and q2_k + +### Llama 7B (Build with OpenBLAS) + +| Model | Measure | F16 | Q4_0 | Q4_1 | Q2_K | +|-----------:|--------------|-------:|-------:|-------:|-------:| +|Llama 7B | perplexity | 5.9066 | 6.1214 | 6.0643 | 6.5808 | +|Llama 7B | file size | 12.9G | 3.5G | 3.9G | 2.7G | +|Llama 7B | bits/weight | 16.0 | 4.5 | 5.0 | 2.6 | +|AWQ-LLama 7B| perplexity | 5.9175 | 6.0252 | 5.9987 | 6.3692 | +|AWQ-LLama 7B| file size | 12.9G | 3.5G | 3.9G | 2.7G | +|AWQ-LLama 7B| bits/weight | 16.0 | 4.5 | 5.0 | 2.6 | + + +### Llama2 7B (Build with CuBLAS) + +| Model | Measure | F16 | Q4_0 | Q4_1 | Q2_K | +|------------:|--------------|-------:|-------:|-------:|-------:| +|Llama2 7B | perplexity | 5.8664 | 6.0260 | 6.0656 | 6.4496 | +|Llama2 7B | file size | 12.9G | 3.5G | 3.9G | 2.7G | +|Llama2 7B | bits/weight | 16.0 | 4.5 | 5.0 | 2.6 | +|AWQ-LLama2 7B| perplexity | 5.8801 | 6.0054 | 5.9849 | 6.3650 | +|AWQ-LLama2 7B| file size | 12.9G | 3.5G | 3.9G | 2.7G | +|AWQ-LLama2 7B| bits/weight | 16.0 | 4.5 | 5.0 | 2.6 | + + +### Mistral 7B v0.1 (Build with CuBLAS) + +| Model | Measure | F16 | Q4_0 | Q4_1 | Q2_K | +|-------------:|--------------|-------:|-------:|-------:|-------:| +|Mistral 7B | perplexity | 5.6931 | 5.8202 | 5.8268 | 6.1645 | +|Mistral 7B | file size | 14.5G | 4.1G | 4.5G | 3.1G | +|Mistral 7B | bits/weight | 16.0 | 4.5 | 5.0 | 2.6 | +|AWQ-Mistral 7B| perplexity | 5.6934 | 5.8020 | 5.7691 | 6.0426 | +|AWQ-Mistral 7B| file size | 14.5G | 4.1G | 4.5G | 3.1G | +|AWQ-Mistral 7B| bits/weight | 16.0 | 4.5 | 5.0 | 2.6 | + +### MPT 7B (Build with OpenBLAS) + +| Model | Measure | F16 | Q4_0 | Q4_1 | Q2_K | +|---------:|--------------|-------:|-------:|-------:|--------:| +|MPT 7B | perplexity | 8.4369 | 8.7956 | 8.6265 | 11.4913 | +|MPT 7B | file size | 13.7G | 3.9G | 4.3G | 2.8G | +|MPT 7B | bits/weight | 16.0 | 4.5 | 5.0 | 2.6 | +|AWQ-MPT 7B| perplexity | 8.4944 | 8.7053 | 8.6750 | 10.2873| +|AWQ-MPT 7B| file size | 13.7G | 3.9G | 4.3G | 2.8G | +|AWQ-MPT 7B| bits/weight | 16.0 | 4.5 | 5.0 | 2.6 | diff --git a/awq-py/awq/apply_awq.py b/awq-py/awq/apply_awq.py new file mode 100644 index 000000000..11132c5d2 --- /dev/null +++ b/awq-py/awq/apply_awq.py @@ -0,0 +1,254 @@ +""" +Implements the AWQ for llama.cpp use cases. +Original paper: https://arxiv.org/abs/2306.00978 + +This code is based on versions of the AWQ implementation found in the following repositories: +* https://github.com/mit-han-lab/llm-awq +* https://github.com/casper-hansen/AutoAWQ +""" + +import os +import torch +import torch.nn as nn + +from transformers import AutoModelForCausalLM, AutoConfig +from transformers.models.bloom.modeling_bloom import BloomGelu +from transformers.models.llama.modeling_llama import LlamaRMSNorm +from transformers.activations import GELUActivation + + +class ScaledActivation(nn.Module): + """ + ScaledActivation module wraps an existing activation function and applies a + scale factor to its output. + + Args: + module (nn.Module): The activation function to be scaled. + scales (torch.Tensor): A tensor of size (num_features,) containing the initial + scale factors for each feature. + + Returns: + torch.Tensor: The scaled output of the activation function. + """ + + def __init__(self, module, scales): + super().__init__() + self.act = module + self.scales = nn.Parameter(scales.data) + + def forward(self, x): + return self.act(x) / self.scales.view(1, 1, -1).to(x.device) + + +def set_op_by_name(layer, name, new_module): + """ + Set the new module for given module's name. + + Args: + layer (nn.Module): The layer in which to replace the submodule. + name (str): The path to the submodule to be replaced, using dot notation + to access nested modules. + new_module (nn.Module): The new module to replace the existing one. + """ + levels = name.split(".") + if len(levels) > 1: + mod_ = layer + for l_idx in range(len(levels) - 1): + if levels[l_idx].isdigit(): + mod_ = mod_[int(levels[l_idx])] + else: + mod_ = getattr(mod_, levels[l_idx]) + setattr(mod_, levels[-1], new_module) + else: + setattr(layer, name, new_module) + + +def get_op_by_name(module, op_name): + """ + Retrieves a submodule within a given layer based on its name. + + Args: + module (nn.Module): The layer containing the submodule to find. + op_name (str): The name of the submodule. + + Returns: + nn.Module: The requested submodule found within the given layer. + + Raises: + ValueError: If the specified submodule cannot be found within the layer. + """ + for name, m in module.named_modules(): + if name == op_name: + return m + raise ValueError(f"Cannot find op {op_name} in module {module}") + + +@torch.no_grad() +def scale_ln_fcs(ln, fcs, scales): + """ + Scales the weights of a LayerNorm and a list of fully-connected layers proportionally. + + Args: + ln (nn.LayerNorm): The LayerNorm module to be scaled. + fcs (List[nn.Linear]): A list of fully-connected layers to be scaled. + scales (torch.Tensor): A 1D tensor of size (num_features,). + """ + + if not isinstance(fcs, list): + fcs = [fcs] + + scales = scales.to(ln.weight.device) + + ln.weight.div_(scales) + if hasattr(ln, "bias") and ln.bias is not None: + ln.bias.div_(scales) + + for fc in fcs: + fc.weight.mul_(scales.view(1, -1)) + + for p in ln.parameters(): + assert torch.isnan(p).sum() == 0 + for fc in fcs: + for p in fc.parameters(): + assert torch.isnan(p).sum() == 0 + + +@torch.no_grad() +def scale_fc_fc(fc1, fc2, scales): + """ + Scales the weights of two fully-connected layers in a specific pattern. + + Args: + fc1 (nn.Linear): The first fully-connected layer to be scaled. + fc2 (nn.Linear): The second fully-connected layer to be scaled. + scales (torch.Tensor): A 1D tensor of size (num_features,). + """ + assert isinstance(fc1, nn.Linear) + assert isinstance(fc2, nn.Linear) + + scales = scales.to(fc1.weight.device) + + fc1.weight[-scales.size(0):].div_(scales.view(-1, 1)) + if fc1.bias is not None: + fc1.bias.div_(scales.view(-1)) + + fc2.weight.mul_(scales.view(1, -1)) + + for p in fc1.parameters(): + assert torch.isnan(p).sum() == 0 + for p in fc2.parameters(): + assert torch.isnan(p).sum() == 0 + + +@torch.no_grad() +def scale_gelu_fc(gelu, fc, scales): + """ + Scales the weight of a GELU activation and a fully-connected layer proportionally. + + Args: + gelu (Union[nn.GELU, BloomGelu, GELUActivation]): The GELU activation module to be scaled. + fc (nn.Linear): The fully-connected layer to be scaled. + scales (torch.Tensor): A 1D tensor of size (num_features,). + + Raises: + TypeError: If the `gelu` module is not of type `nn.GELU`, `BloomGelu`, or `GELUActivation`. + TypeError: If the `fc` module is not of type `nn.Linear`. + """ + assert isinstance(gelu, (nn.GELU, BloomGelu, GELUActivation)) + assert isinstance(fc, nn.Linear) + + fc.weight.mul_(scales.view(1, -1).to(fc.weight.device)) + + for p in fc.parameters(): + assert torch.isnan(p).sum() == 0 + + +def apply_scale(module, scales_list, input_feat_dict=None): + """ + Applies different scaling strategies to layers based on their type and hierarchy within a given module. + + Args: + module (nn.Module): The module containing the layers to be scaled. + scales_list (List[Tuple[str, List[str], torch.Tensor]]): A list of tuples containing: + * prev_op_name (str): The name of the preceding operation or module, + relative to which the layers to be scaled are located. + * layer_names (List[str]): A list of names of the layers to be scaled, relative to the preceding operation. + * scales (torch.Tensor): A 1D tensor of size (num_features,) containing the scaling factors for each feature. + input_feat_dict (Optional[Dict[str, torch.Tensor]]): A dictionary mapping layer names to their corresponding + input features (optional). + """ + for prev_op_name, layer_names, scales in scales_list: + prev_op = get_op_by_name(module, prev_op_name) + layers = [get_op_by_name(module, name) for name in layer_names] + + prev_op.cuda() + for layer in layers: + layer.cuda() + scales.cuda() + + if isinstance(prev_op, nn.Linear): + assert len(layers) == 1 + scale_fc_fc(prev_op, layers[0], scales) + elif isinstance(prev_op, (nn.LayerNorm, LlamaRMSNorm)) or "rmsnorm" in str(prev_op.__class__).lower(): + scale_ln_fcs(prev_op, layers, scales) + elif isinstance(prev_op, (nn.GELU, BloomGelu, GELUActivation)): + new_module = ScaledActivation(prev_op, scales) + set_op_by_name(module, prev_op_name, new_module) + scale_gelu_fc(prev_op, layers[0], scales) + else: + raise NotImplementedError(f"prev_op {type(prev_op)} not supported yet!") + + # apply the scaling to input feat if given; prepare it for clipping + if input_feat_dict is not None: + for layer_name in layer_names: + inp = input_feat_dict[layer_name] + inp.div_(scales.view(1, -1).to(inp.device)) + + prev_op.cpu() + for layer in layers: + layer.cpu() + scales.cpu() + + +@torch.no_grad() +def apply_clip(module, clip_list): + """ + Applies element-wise clipping to the weight of a specific layer within a given module. + + Args: + module (nn.Module): The module containing the layer to be clipped. + clip_list (List[Tuple[str, torch.Tensor]]): A list of tuples containing: + * name (str): The name of the layer to be clipped, relative to the root of the module. + * max_val (torch.Tensor): A 1D or 2D tensor defining the upper bound for each element of the layer's weight. + """ + for name, max_val in clip_list: + layer = get_op_by_name(module, name) + layer.cuda() + max_val = max_val.to(layer.weight.device) + org_shape = layer.weight.shape + layer.weight.data = layer.weight.data.reshape(*max_val.shape[:2], -1) + layer.weight.data = torch.clamp(layer.weight.data, -max_val, max_val) + layer.weight.data = layer.weight.data.reshape(org_shape) + layer.cpu() + + +def add_scale_weights(model_path, scale_path, tmp_path): + """ + Adds pre-computed Activation Weight Quantization (AWQ) results to a model, + including scaling factors and clipping bounds. + + Args: + model_path (str): Path to the pre-trained model to be equipped with AWQ. + scale_path (str): Path to the AWQ scale factors (.pt file). + tmp_path (str): Path to the temporary directory where the equipped model will be saved. + """ + config = AutoConfig.from_pretrained(model_path, trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained( + model_path, config=config, trust_remote_code=True + ) + model.eval() + awq_results = torch.load(str(scale_path), map_location="cpu") + apply_scale(model, awq_results["scale"]) + apply_clip(model, awq_results["clip"]) + model.save_pretrained(str(tmp_path)) + os.system(f"cp {str(model_path)}/tokenizer* {str(tmp_path)}") diff --git a/awq-py/requirements.txt b/awq-py/requirements.txt new file mode 100644 index 000000000..991896116 --- /dev/null +++ b/awq-py/requirements.txt @@ -0,0 +1,2 @@ +torch>=2.1.1 +transformers>=4.32.0 diff --git a/build.zig b/build.zig index 0b74cee48..c0af454dc 100644 --- a/build.zig +++ b/build.zig @@ -10,7 +10,6 @@ const Maker = struct { builder: *std.build.Builder, target: CrossTarget, optimize: Mode, - config_header: *ConfigHeader, enable_lto: bool, include_dirs: ArrayList([]const u8), @@ -41,26 +40,24 @@ const Maker = struct { const commit_hash = try std.ChildProcess.exec( .{ .allocator = builder.allocator, .argv = &.{ "git", "rev-parse", "HEAD" } }, ); - const config_header = builder.addConfigHeader( - .{ .style = .blank, .include_path = "build-info.h" }, - .{ - .BUILD_NUMBER = 0, - .BUILD_COMMIT = commit_hash.stdout[0 .. commit_hash.stdout.len - 1], // omit newline - .BUILD_COMPILER = builder.fmt("Zig {s}", .{zig_version}), - .BUILD_TARGET = try target.allocDescription(builder.allocator), - }, - ); + try std.fs.cwd().writeFile("common/build-info.cpp", builder.fmt( + \\int LLAMA_BUILD_NUMBER = {}; + \\char const *LLAMA_COMMIT = "{s}"; + \\char const *LLAMA_COMPILER = "Zig {s}"; + \\char const *LLAMA_BUILD_TARGET = "{s}"; + \\ + , .{ 0, commit_hash.stdout[0 .. commit_hash.stdout.len - 1], zig_version, try target.allocDescription(builder.allocator) })); var m = Maker{ .builder = builder, .target = target, .optimize = builder.standardOptimizeOption(.{}), - .config_header = config_header, .enable_lto = false, .include_dirs = ArrayList([]const u8).init(builder.allocator), .cflags = ArrayList([]const u8).init(builder.allocator), .cxxflags = ArrayList([]const u8).init(builder.allocator), .objs = ArrayList(*Compile).init(builder.allocator), }; + try m.addCFlag("-std=c11"); try m.addCxxFlag("-std=c++11"); try m.addProjectInclude(&.{}); @@ -72,7 +69,7 @@ const Maker = struct { const o = m.builder.addObject(.{ .name = name, .target = m.target, .optimize = m.optimize }); if (o.target.getAbi() != .msvc) o.defineCMacro("_GNU_SOURCE", null); - o.addConfigHeader(m.config_header); + if (std.mem.endsWith(u8, src, ".c")) { o.addCSourceFiles(&.{src}, m.cflags.items); o.linkLibC(); @@ -85,7 +82,6 @@ const Maker = struct { o.linkLibCpp(); } } - o.addConfigHeader(m.config_header); for (m.include_dirs.items) |i| o.addIncludePath(.{ .path = i }); o.want_lto = m.enable_lto; return o; @@ -105,7 +101,6 @@ const Maker = struct { // linkLibCpp already add (libc++ + libunwind + libc) e.linkLibCpp(); } - e.addConfigHeader(m.config_header); m.builder.installArtifact(e); e.want_lto = m.enable_lto; return e; @@ -116,30 +111,28 @@ pub fn build(b: *std.build.Builder) !void { var make = try Maker.init(b); make.enable_lto = b.option(bool, "lto", "Enable LTO optimization, (default: false)") orelse false; - if (b.option(bool, "k-quants", "Enable K-quants, (default: true)") orelse true) { - try make.addFlag("-DGGML_USE_K_QUANTS"); - const k_quants = make.obj("k_quants", "k_quants.c"); - try make.objs.append(k_quants); - } - const ggml = make.obj("ggml", "ggml.c"); const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c"); const ggml_backend = make.obj("ggml-backend", "ggml-backend.c"); + const ggml_quants = make.obj("ggml-quants", "ggml-quants.c"); const llama = make.obj("llama", "llama.cpp"); + const buildinfo = make.obj("common", "common/build-info.cpp"); const common = make.obj("common", "common/common.cpp"); const console = make.obj("console", "common/console.cpp"); const sampling = make.obj("sampling", "common/sampling.cpp"); const grammar_parser = make.obj("grammar-parser", "common/grammar-parser.cpp"); const train = make.obj("train", "common/train.cpp"); + const clip = make.obj("clip", "examples/llava/clip.cpp"); + const llava = make.obj("llava", "examples/llava/llava.cpp"); - _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, sampling, console, grammar_parser }); - _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common }); - _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common }); - _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common }); - _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, train }); - _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, train }); + _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, sampling, console, grammar_parser }); + _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo }); + _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo }); + _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo }); + _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, train }); + _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, train }); - const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, sampling, grammar_parser }); + const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, sampling, grammar_parser, clip, llava }); if (server.target.isWindows()) { server.linkSystemLibrary("ws2_32"); } diff --git a/ci/README.md b/ci/README.md index 65cfe63eb..406470519 100644 --- a/ci/README.md +++ b/ci/README.md @@ -22,4 +22,8 @@ bash ./ci/run.sh ./tmp/results ./tmp/mnt # with CUDA support GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt + +# with SYCL support +source /opt/intel/oneapi/setvars.sh +GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt ``` diff --git a/ci/run.sh b/ci/run.sh index 2e3343831..f3a29c2e9 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -10,6 +10,9 @@ # # with CUDA support # GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt # +# # with SYCL support +# GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt +# if [ -z "$2" ]; then echo "usage: $0 " @@ -22,14 +25,32 @@ mkdir -p "$2" OUT=$(realpath "$1") MNT=$(realpath "$2") -rm -v $OUT/*.log -rm -v $OUT/*.exit -rm -v $OUT/*.md +rm -f "$OUT/*.log" +rm -f "$OUT/*.exit" +rm -f "$OUT/*.md" sd=`dirname $0` cd $sd/../ SRC=`pwd` +CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON" + +if [ ! -z ${GG_BUILD_METAL} ]; then + CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_METAL_SHADER_DEBUG=ON" +fi + +if [ ! -z ${GG_BUILD_CUDA} ]; then + CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_CUBLAS=1" +fi + +if [ ! -z ${GG_BUILD_SYCL} ]; then + if [ -z ${ONEAPI_ROOT} ]; then + echo "Not detected ONEAPI_ROOT, please install oneAPI base toolkit and enable it by:\n source /opt/intel/oneapi/setvars.sh" + exit 1 + fi + + CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_SYCL=1 DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON" +fi ## helpers # download a file if it does not exist or if it is outdated @@ -81,10 +102,10 @@ function gg_run_ctest_debug { set -e - (time cmake -DCMAKE_BUILD_TYPE=Debug .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log - (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log + (time cmake -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log + (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log - (time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log + (time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log set +e } @@ -109,13 +130,13 @@ function gg_run_ctest_release { set -e - (time cmake -DCMAKE_BUILD_TYPE=Release .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log - (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log + (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log + (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log if [ -z ${GG_BUILD_LOW_PERF} ]; then - (time ctest --output-on-failure ) 2>&1 | tee -a $OUT/${ci}-ctest.log + (time ctest --output-on-failure -L main ) 2>&1 | tee -a $OUT/${ci}-ctest.log else - (time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log + (time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log fi set +e @@ -131,6 +152,61 @@ function gg_sum_ctest_release { gg_printf '```\n' } +function gg_get_model { + local gguf_3b="$MNT/models/open-llama/3B-v2/ggml-model-f16.gguf" + local gguf_7b="$MNT/models/open-llama/7B-v2/ggml-model-f16.gguf" + if [[ -s $gguf_3b ]]; then + echo -n "$gguf_3b" + elif [[ -s $gguf_7b ]]; then + echo -n "$gguf_7b" + else + echo >&2 "No model found. Can't run gg_run_ctest_with_model." + exit 1 + fi +} + +function gg_run_ctest_with_model_debug { + cd ${SRC} + + local model; model=$(gg_get_model) + cd build-ci-debug + set -e + (LLAMACPP_TEST_MODELFILE="$model" time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log + set +e + cd .. +} + +function gg_run_ctest_with_model_release { + cd ${SRC} + + local model; model=$(gg_get_model) + cd build-ci-release + set -e + (LLAMACPP_TEST_MODELFILE="$model" time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log + set +e + cd .. +} + +function gg_sum_ctest_with_model_debug { + gg_printf '### %s\n\n' "${ci}" + + gg_printf 'Runs ctest with model files in debug mode\n' + gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)" + gg_printf '```\n' + gg_printf '%s\n' "$(cat $OUT/${ci}-ctest.log)" + gg_printf '```\n' +} + +function gg_sum_ctest_with_model_release { + gg_printf '### %s\n\n' "${ci}" + + gg_printf 'Runs ctest with model files in release mode\n' + gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)" + gg_printf '```\n' + gg_printf '%s\n' "$(cat $OUT/${ci}-ctest.log)" + gg_printf '```\n' +} + # open_llama_3b_v2 function gg_run_open_llama_3b_v2 { @@ -143,7 +219,7 @@ function gg_run_open_llama_3b_v2 { gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/pytorch_model.bin gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/generation_config.json - gg_wget models-mnt/wikitext/ https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip + gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/ head -n 60 models-mnt/wikitext/wikitext-2-raw/wiki.test.raw > models-mnt/wikitext/wikitext-2-raw/wiki.test-60.raw @@ -154,8 +230,8 @@ function gg_run_open_llama_3b_v2 { set -e - (time cmake -DCMAKE_BUILD_TYPE=Release -DLLAMA_QKK_64=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log - (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log + (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_QKK_64=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log + (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log python3 ../convert.py ${path_models} @@ -208,6 +284,8 @@ function gg_run_open_llama_3b_v2 { (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log + (time ./bin/imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log + (time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log function check_ppl { @@ -235,6 +313,8 @@ function gg_run_open_llama_3b_v2 { check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log + # lora function compare_ppl { qnt="$1" @@ -276,7 +356,6 @@ function gg_run_open_llama_3b_v2 { (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log compare_ppl "q8_0 / f16 base shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log - set +e } @@ -286,6 +365,7 @@ function gg_sum_open_llama_3b_v2 { gg_printf 'OpenLLaMA 3B-v2:\n' gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)" gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)" + gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)" gg_printf '- lora:\n%s\n' "$(cat $OUT/${ci}-lora-ppl.log)" gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)" gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)" @@ -321,7 +401,7 @@ function gg_run_open_llama_7b_v2 { gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/pytorch_model-00002-of-00002.bin gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/generation_config.json - gg_wget models-mnt/wikitext/ https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip + gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/ path_models="../models-mnt/open-llama/7B-v2" @@ -331,8 +411,8 @@ function gg_run_open_llama_7b_v2 { set -e - (time cmake -DCMAKE_BUILD_TYPE=Release -DLLAMA_CUBLAS=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log - (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log + (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUBLAS=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log + (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log python3 ../convert.py ${path_models} @@ -385,6 +465,8 @@ function gg_run_open_llama_7b_v2 { (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log + (time ./bin/imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log + (time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log function check_ppl { @@ -412,6 +494,8 @@ function gg_run_open_llama_7b_v2 { check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log + # lora function compare_ppl { qnt="$1" @@ -463,6 +547,7 @@ function gg_sum_open_llama_7b_v2 { gg_printf 'OpenLLaMA 7B-v2:\n' gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)" gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)" + gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)" gg_printf '- lora:\n%s\n' "$(cat $OUT/${ci}-lora-ppl.log)" gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)" gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)" @@ -483,17 +568,69 @@ function gg_sum_open_llama_7b_v2 { #gg_printf '- shakespeare (q8_0 / f16 base lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log)" } +# bge-small + +function gg_run_embd_bge_small { + cd ${SRC} + + gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/config.json + gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/resolve/main/tokenizer.model + gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/tokenizer_config.json + gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/special_tokens_map.json + gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/resolve/main/pytorch_model.bin + gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/sentence_bert_config.json + gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/vocab.txt + gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/modules.json + gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/config.json + + gg_wget models-mnt/bge-small/1_Pooling https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/1_Pooling/config.json + + path_models="../models-mnt/bge-small" + + rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release + + set -e + + (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log + (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log + + python3 ../convert-hf-to-gguf.py ${path_models} + + model_f16="${path_models}/ggml-model-f16.gguf" + model_q8_0="${path_models}/ggml-model-q8_0.gguf" + + ./bin/quantize ${model_f16} ${model_q8_0} q8_0 + + (time ./bin/embedding --model ${model_f16} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log + (time ./bin/embedding --model ${model_q8_0} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log + + set +e +} + +function gg_sum_embd_bge_small { + gg_printf '### %s\n\n' "${ci}" + + gg_printf 'BGE Small (BERT):\n' + gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)" + gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)" + gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)" +} + ## main if [ -z ${GG_BUILD_LOW_PERF} ]; then + # Create symlink: ./llama.cpp/models-mnt -> $MNT/models/models-mnt rm -rf ${SRC}/models-mnt - mnt_models=${MNT}/models mkdir -p ${mnt_models} ln -sfn ${mnt_models} ${SRC}/models-mnt - python3 -m pip install -r ${SRC}/requirements.txt - python3 -m pip install --editable gguf-py + # Create a fresh python3 venv and enter it + python3 -m venv "$MNT/venv" + source "$MNT/venv/bin/activate" + + pip install -r ${SRC}/requirements.txt --disable-pip-version-check + pip install --editable gguf-py --disable-pip-version-check fi ret=0 @@ -502,12 +639,16 @@ test $ret -eq 0 && gg_run ctest_debug test $ret -eq 0 && gg_run ctest_release if [ -z ${GG_BUILD_LOW_PERF} ]; then + test $ret -eq 0 && gg_run embd_bge_small + if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then if [ -z ${GG_BUILD_CUDA} ]; then test $ret -eq 0 && gg_run open_llama_3b_v2 else test $ret -eq 0 && gg_run open_llama_7b_v2 fi + test $ret -eq 0 && gg_run ctest_with_model_debug + test $ret -eq 0 && gg_run ctest_with_model_release fi fi diff --git a/cmake/FindSIMD.cmake b/cmake/FindSIMD.cmake new file mode 100644 index 000000000..33377ec44 --- /dev/null +++ b/cmake/FindSIMD.cmake @@ -0,0 +1,100 @@ +include(CheckCSourceRuns) + +set(AVX_CODE " + #include + int main() + { + __m256 a; + a = _mm256_set1_ps(0); + return 0; + } +") + +set(AVX512_CODE " + #include + int main() + { + __m512i a = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0); + __m512i b = a; + __mmask64 equality_mask = _mm512_cmp_epi8_mask(a, b, _MM_CMPINT_EQ); + return 0; + } +") + +set(AVX2_CODE " + #include + int main() + { + __m256i a = {0}; + a = _mm256_abs_epi16(a); + __m256i x; + _mm256_extract_epi64(x, 0); // we rely on this in our AVX2 code + return 0; + } +") + +set(FMA_CODE " + #include + int main() + { + __m256 acc = _mm256_setzero_ps(); + const __m256 d = _mm256_setzero_ps(); + const __m256 p = _mm256_setzero_ps(); + acc = _mm256_fmadd_ps( d, p, acc ); + return 0; + } +") + +macro(check_sse type flags) + set(__FLAG_I 1) + set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS}) + foreach (__FLAG ${flags}) + if (NOT ${type}_FOUND) + set(CMAKE_REQUIRED_FLAGS ${__FLAG}) + check_c_source_runs("${${type}_CODE}" HAS_${type}_${__FLAG_I}) + if (HAS_${type}_${__FLAG_I}) + set(${type}_FOUND TRUE CACHE BOOL "${type} support") + set(${type}_FLAGS "${__FLAG}" CACHE STRING "${type} flags") + endif() + math(EXPR __FLAG_I "${__FLAG_I}+1") + endif() + endforeach() + set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE}) + + if (NOT ${type}_FOUND) + set(${type}_FOUND FALSE CACHE BOOL "${type} support") + set(${type}_FLAGS "" CACHE STRING "${type} flags") + endif() + + mark_as_advanced(${type}_FOUND ${type}_FLAGS) +endmacro() + +# flags are for MSVC only! +check_sse("AVX" " ;/arch:AVX") +if (NOT ${AVX_FOUND}) + set(LLAMA_AVX OFF) +else() + set(LLAMA_AVX ON) +endif() + +check_sse("AVX2" " ;/arch:AVX2") +check_sse("FMA" " ;/arch:AVX2") +if ((NOT ${AVX2_FOUND}) OR (NOT ${FMA_FOUND})) + set(LLAMA_AVX2 OFF) +else() + set(LLAMA_AVX2 ON) +endif() + +check_sse("AVX512" " ;/arch:AVX512") +if (NOT ${AVX512_FOUND}) + set(LLAMA_AVX512 OFF) +else() + set(LLAMA_AVX512 ON) +endif() diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index fbb0ff095..f79acfef1 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -1,8 +1,52 @@ # common + +# Build info header +# + +if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/../.git") + set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../.git") + + # Is git submodule + if(NOT IS_DIRECTORY "${GIT_DIR}") + file(READ ${GIT_DIR} REAL_GIT_DIR_LINK) + string(REGEX REPLACE "gitdir: (.*)\n$" "\\1" REAL_GIT_DIR ${REAL_GIT_DIR_LINK}) + string(FIND "${REAL_GIT_DIR}" "/" SLASH_POS) + if (SLASH_POS EQUAL 0) + set(GIT_DIR "${REAL_GIT_DIR}") + else() + set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../${REAL_GIT_DIR}") + endif() + endif() + + set(GIT_INDEX "${GIT_DIR}/index") +else() + message(WARNING "Git repository not found; to enable automatic generation of build info, make sure Git is installed and the project is a Git repository.") + set(GIT_INDEX "") +endif() + +# Add a custom command to rebuild build-info.cpp when .git/index changes +add_custom_command( + OUTPUT "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp" + COMMENT "Generating build details from Git" + COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION} + -DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/../scripts/gen-build-info-cpp.cmake" + WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/.." + DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in" ${GIT_INDEX} + VERBATIM +) +set(TARGET build_info) +add_library(${TARGET} OBJECT build-info.cpp) +if (BUILD_SHARED_LIBS) + set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON) +endif() + + set(TARGET common) -add_library(${TARGET} OBJECT +add_library(${TARGET} STATIC + base64.hpp common.h common.cpp sampling.h @@ -21,4 +65,4 @@ endif() target_include_directories(${TARGET} PUBLIC .) target_compile_features(${TARGET} PUBLIC cxx_std_11) -target_link_libraries(${TARGET} PRIVATE llama) +target_link_libraries(${TARGET} PRIVATE build_info PUBLIC llama) diff --git a/common/base64.hpp b/common/base64.hpp new file mode 100644 index 000000000..563247a6e --- /dev/null +++ b/common/base64.hpp @@ -0,0 +1,392 @@ +/* +This is free and unencumbered software released into the public domain. + +Anyone is free to copy, modify, publish, use, compile, sell, or +distribute this software, either in source code form or as a compiled +binary, for any purpose, commercial or non-commercial, and by any +means. + +In jurisdictions that recognize copyright laws, the author or authors +of this software dedicate any and all copyright interest in the +software to the public domain. We make this dedication for the benefit +of the public at large and to the detriment of our heirs and +successors. We intend this dedication to be an overt act of +relinquishment in perpetuity of all present and future rights to this +software under copyright law. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +OTHER DEALINGS IN THE SOFTWARE. + +For more information, please refer to +*/ + +#ifndef PUBLIC_DOMAIN_BASE64_HPP_ +#define PUBLIC_DOMAIN_BASE64_HPP_ + +#include +#include +#include +#include + +class base64_error : public std::runtime_error +{ +public: + using std::runtime_error::runtime_error; +}; + +class base64 +{ +public: + enum class alphabet + { + /** the alphabet is detected automatically */ + auto_, + /** the standard base64 alphabet is used */ + standard, + /** like `standard` except that the characters `+` and `/` are replaced by `-` and `_` respectively*/ + url_filename_safe + }; + + enum class decoding_behavior + { + /** if the input is not padded, the remaining bits are ignored */ + moderate, + /** if a padding character is encounter decoding is finished */ + loose + }; + + /** + Encodes all the elements from `in_begin` to `in_end` to `out`. + + @warning The source and destination cannot overlap. The destination must be able to hold at least + `required_encode_size(std::distance(in_begin, in_end))`, otherwise the behavior depends on the output iterator. + + @tparam Input_iterator the source; the returned elements are cast to `std::uint8_t` and should not be greater than + 8 bits + @tparam Output_iterator the destination; the elements written to it are from the type `char` + @param in_begin the beginning of the source + @param in_end the ending of the source + @param out the destination iterator + @param alphabet which alphabet should be used + @returns the iterator to the next element past the last element copied + @throws see `Input_iterator` and `Output_iterator` + */ + template + static Output_iterator encode(Input_iterator in_begin, Input_iterator in_end, Output_iterator out, + alphabet alphabet = alphabet::standard) + { + constexpr auto pad = '='; + const char* alpha = alphabet == alphabet::url_filename_safe + ? "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_" + : "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; + + while (in_begin != in_end) { + std::uint8_t i0 = 0, i1 = 0, i2 = 0; + + // first character + i0 = static_cast(*in_begin); + ++in_begin; + + *out = alpha[i0 >> 2 & 0x3f]; + ++out; + + // part of first character and second + if (in_begin != in_end) { + i1 = static_cast(*in_begin); + ++in_begin; + + *out = alpha[((i0 & 0x3) << 4) | (i1 >> 4 & 0x0f)]; + ++out; + } else { + *out = alpha[(i0 & 0x3) << 4]; + ++out; + + // last padding + *out = pad; + ++out; + + // last padding + *out = pad; + ++out; + + break; + } + + // part of second character and third + if (in_begin != in_end) { + i2 = static_cast(*in_begin); + ++in_begin; + + *out = alpha[((i1 & 0xf) << 2) | (i2 >> 6 & 0x03)]; + ++out; + } else { + *out = alpha[(i1 & 0xf) << 2]; + ++out; + + // last padding + *out = pad; + ++out; + + break; + } + + // rest of third + *out = alpha[i2 & 0x3f]; + ++out; + } + + return out; + } + /** + Encodes a string. + + @param str the string that should be encoded + @param alphabet which alphabet should be used + @returns the encoded base64 string + @throws see base64::encode() + */ + static std::string encode(const std::string& str, alphabet alphabet = alphabet::standard) + { + std::string result; + + result.reserve(required_encode_size(str.length()) + 1); + + encode(str.begin(), str.end(), std::back_inserter(result), alphabet); + + return result; + } + /** + Encodes a char array. + + @param buffer the char array + @param size the size of the array + @param alphabet which alphabet should be used + @returns the encoded string + */ + static std::string encode(const char* buffer, std::size_t size, alphabet alphabet = alphabet::standard) + { + std::string result; + + result.reserve(required_encode_size(size) + 1); + + encode(buffer, buffer + size, std::back_inserter(result), alphabet); + + return result; + } + /** + Decodes all the elements from `in_begin` to `in_end` to `out`. `in_begin` may point to the same location as `out`, + in other words: inplace decoding is possible. + + @warning The destination must be able to hold at least `required_decode_size(std::distance(in_begin, in_end))`, + otherwise the behavior depends on the output iterator. + + @tparam Input_iterator the source; the returned elements are cast to `char` + @tparam Output_iterator the destination; the elements written to it are from the type `std::uint8_t` + @param in_begin the beginning of the source + @param in_end the ending of the source + @param out the destination iterator + @param alphabet which alphabet should be used + @param behavior the behavior when an error was detected + @returns the iterator to the next element past the last element copied + @throws base64_error depending on the set behavior + @throws see `Input_iterator` and `Output_iterator` + */ + template + static Output_iterator decode(Input_iterator in_begin, Input_iterator in_end, Output_iterator out, + alphabet alphabet = alphabet::auto_, + decoding_behavior behavior = decoding_behavior::moderate) + { + //constexpr auto pad = '='; + std::uint8_t last = 0; + auto bits = 0; + + while (in_begin != in_end) { + auto c = *in_begin; + ++in_begin; + + if (c == '=') { + break; + } + + auto part = _base64_value(alphabet, c); + + // enough bits for one byte + if (bits + 6 >= 8) { + *out = (last << (8 - bits)) | (part >> (bits - 2)); + ++out; + + bits -= 2; + } else { + bits += 6; + } + + last = part; + } + + // check padding + if (behavior != decoding_behavior::loose) { + while (in_begin != in_end) { + auto c = *in_begin; + ++in_begin; + + if (c != '=') { + throw base64_error("invalid base64 character."); + } + } + } + + return out; + } + /** + Decodes a string. + + @param str the base64 encoded string + @param alphabet which alphabet should be used + @param behavior the behavior when an error was detected + @returns the decoded string + @throws see base64::decode() + */ + static std::string decode(const std::string& str, alphabet alphabet = alphabet::auto_, + decoding_behavior behavior = decoding_behavior::moderate) + { + std::string result; + + result.reserve(max_decode_size(str.length())); + + decode(str.begin(), str.end(), std::back_inserter(result), alphabet, behavior); + + return result; + } + /** + Decodes a string. + + @param buffer the base64 encoded buffer + @param size the size of the buffer + @param alphabet which alphabet should be used + @param behavior the behavior when an error was detected + @returns the decoded string + @throws see base64::decode() + */ + static std::string decode(const char* buffer, std::size_t size, alphabet alphabet = alphabet::auto_, + decoding_behavior behavior = decoding_behavior::moderate) + { + std::string result; + + result.reserve(max_decode_size(size)); + + decode(buffer, buffer + size, std::back_inserter(result), alphabet, behavior); + + return result; + } + /** + Decodes a string inplace. + + @param[in,out] str the base64 encoded string + @param alphabet which alphabet should be used + @param behavior the behavior when an error was detected + @throws base64::decode_inplace() + */ + static void decode_inplace(std::string& str, alphabet alphabet = alphabet::auto_, + decoding_behavior behavior = decoding_behavior::moderate) + { + str.resize(decode(str.begin(), str.end(), str.begin(), alphabet, behavior) - str.begin()); + } + /** + Decodes a char array inplace. + + @param[in,out] str the string array + @param size the length of the array + @param alphabet which alphabet should be used + @param behavior the behavior when an error was detected + @returns the pointer to the next element past the last element decoded + @throws base64::decode_inplace() + */ + static char* decode_inplace(char* str, std::size_t size, alphabet alphabet = alphabet::auto_, + decoding_behavior behavior = decoding_behavior::moderate) + { + return decode(str, str + size, str, alphabet, behavior); + } + /** + Returns the required decoding size for a given size. The value is calculated with the following formula: + + $$ + \lceil \frac{size}{4} \rceil \cdot 3 + $$ + + @param size the size of the encoded input + @returns the size of the resulting decoded buffer; this the absolute maximum + */ + static std::size_t max_decode_size(std::size_t size) noexcept + { + return (size / 4 + (size % 4 ? 1 : 0)) * 3; + } + /** + Returns the required encoding size for a given size. The value is calculated with the following formula: + + $$ + \lceil \frac{size}{3} \rceil \cdot 4 + $$ + + @param size the size of the decoded input + @returns the size of the resulting encoded buffer + */ + static std::size_t required_encode_size(std::size_t size) noexcept + { + return (size / 3 + (size % 3 ? 1 : 0)) * 4; + } + +private: + static std::uint8_t _base64_value(alphabet& alphabet, char c) + { + if (c >= 'A' && c <= 'Z') { + return c - 'A'; + } else if (c >= 'a' && c <= 'z') { + return c - 'a' + 26; + } else if (c >= '0' && c <= '9') { + return c - '0' + 52; + } + + // comes down to alphabet + if (alphabet == alphabet::standard) { + if (c == '+') { + return 62; + } else if (c == '/') { + return 63; + } + } else if (alphabet == alphabet::url_filename_safe) { + if (c == '-') { + return 62; + } else if (c == '_') { + return 63; + } + } // auto detect + else { + if (c == '+') { + alphabet = alphabet::standard; + + return 62; + } else if (c == '/') { + alphabet = alphabet::standard; + + return 63; + } else if (c == '-') { + alphabet = alphabet::url_filename_safe; + + return 62; + } else if (c == '_') { + alphabet = alphabet::url_filename_safe; + + return 63; + } + } + + throw base64_error("invalid base64 character."); + } +}; + +#endif // !PUBLIC_DOMAIN_BASE64_HPP_ diff --git a/common/build-info.cpp.in b/common/build-info.cpp.in new file mode 100644 index 000000000..0b945aa68 --- /dev/null +++ b/common/build-info.cpp.in @@ -0,0 +1,4 @@ +int LLAMA_BUILD_NUMBER = @BUILD_NUMBER@; +char const *LLAMA_COMMIT = "@BUILD_COMMIT@"; +char const *LLAMA_COMPILER = "@BUILD_COMPILER@"; +char const *LLAMA_BUILD_TARGET = "@BUILD_TARGET@"; diff --git a/common/common.cpp b/common/common.cpp index ce14d66b8..ec596f5a0 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1,5 +1,4 @@ #include "common.h" -#include "build-info.h" #include "llama.h" #include @@ -13,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -42,6 +42,14 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif +#if (defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)) +#define GGML_USE_CUBLAS_SYCL +#endif + +#if (defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)) || defined(GGML_USE_VULKAN) +#define GGML_USE_CUBLAS_SYCL_VULKAN +#endif + int32_t get_num_physical_cores() { #ifdef __linux__ // enumerate the set of thread siblings, num entries is num cores @@ -91,6 +99,19 @@ void process_escapes(std::string& input) { case '\'': input[output_idx++] = '\''; break; case '\"': input[output_idx++] = '\"'; break; case '\\': input[output_idx++] = '\\'; break; + case 'x': + // Handle \x12, etc + if (input_idx + 2 < input_len) { + const char x[3] = { input[input_idx + 1], input[input_idx + 2], 0 }; + char *err_p = nullptr; + const long val = std::strtol(x, &err_p, 16); + if (err_p == x + 2) { + input_idx += 2; + input[output_idx++] = char(val); + break; + } + } + // fall through default: input[output_idx++] = '\\'; input[output_idx++] = input[input_idx]; break; } @@ -103,11 +124,26 @@ void process_escapes(std::string& input) { } bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { + bool result = true; + try { + if (!gpt_params_parse_ex(argc, argv, params)) { + gpt_print_usage(argc, argv, gpt_params()); + exit(0); + } + } + catch (const std::invalid_argument & ex) { + fprintf(stderr, "%s\n", ex.what()); + gpt_print_usage(argc, argv, gpt_params()); + exit(1); + } + return result; +} + +bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { bool invalid_param = false; std::string arg; - gpt_params default_params; const std::string arg_prefix = "--"; - llama_sampling_params & sparams = params.sampling_params; + llama_sampling_params & sparams = params.sparams; for (int i = 1; i < argc; i++) { arg = argv[i]; @@ -139,6 +175,24 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { if (params.n_threads_batch <= 0) { params.n_threads_batch = std::thread::hardware_concurrency(); } + } else if (arg == "-td" || arg == "--threads-draft") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_threads_draft = std::stoi(argv[i]); + if (params.n_threads_draft <= 0) { + params.n_threads_draft = std::thread::hardware_concurrency(); + } + } else if (arg == "-tbd" || arg == "--threads-batch-draft") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_threads_batch_draft = std::stoi(argv[i]); + if (params.n_threads_batch_draft <= 0) { + params.n_threads_batch_draft = std::thread::hardware_concurrency(); + } } else if (arg == "-p" || arg == "--prompt") { if (++i >= argc) { invalid_param = true; @@ -157,6 +211,23 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { params.prompt_cache_all = true; } else if (arg == "--prompt-cache-ro") { params.prompt_cache_ro = true; + } else if (arg == "-bf" || arg == "--binary-file") { + if (++i >= argc) { + invalid_param = true; + break; + } + std::ifstream file(argv[i], std::ios::binary); + if (!file) { + fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); + invalid_param = true; + break; + } + // store the external file name in params + params.prompt_file = argv[i]; + std::ostringstream ss; + ss << file.rdbuf(); + params.prompt = ss.str(); + fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), argv[i]); } else if (arg == "-f" || arg == "--file") { if (++i >= argc) { invalid_param = true; @@ -192,6 +263,20 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { break; } params.n_ctx = std::stoi(argv[i]); + } else if (arg == "--grp-attn-n" || arg == "-gan") { + if (++i >= argc) { + invalid_param = true; + break; + } + + params.grp_attn_n = std::stoi(argv[i]); + } else if (arg == "--grp-attn-w" || arg == "-gaw") { + if (++i >= argc) { + invalid_param = true; + break; + } + + params.grp_attn_w = std::stoi(argv[i]); } else if (arg == "--rope-freq-base") { if (++i >= argc) { invalid_param = true; @@ -204,26 +289,84 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { break; } params.rope_freq_scale = std::stof(argv[i]); + } else if (arg == "--rope-scaling") { + if (++i >= argc) { + invalid_param = true; + break; + } + std::string value(argv[i]); + /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; } + else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; } + else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; } + else { invalid_param = true; break; } } else if (arg == "--rope-scale") { if (++i >= argc) { invalid_param = true; break; } params.rope_freq_scale = 1.0f/std::stof(argv[i]); - } else if (arg == "--memory-f32") { - params.memory_f16 = false; + } else if (arg == "--yarn-orig-ctx") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.yarn_orig_ctx = std::stoi(argv[i]); + } else if (arg == "--yarn-ext-factor") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.yarn_ext_factor = std::stof(argv[i]); + } else if (arg == "--yarn-attn-factor") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.yarn_attn_factor = std::stof(argv[i]); + } else if (arg == "--yarn-beta-fast") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.yarn_beta_fast = std::stof(argv[i]); + } else if (arg == "--yarn-beta-slow") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.yarn_beta_slow = std::stof(argv[i]); + } else if (arg == "--samplers") { + if (++i >= argc) { + invalid_param = true; + break; + } + const auto sampler_names = string_split(argv[i], ';'); + sparams.samplers_sequence = sampler_types_from_names(sampler_names, true); + } else if (arg == "--sampling-seq") { + if (++i >= argc) { + invalid_param = true; + break; + } + sparams.samplers_sequence = sampler_types_from_chars(argv[i]); } else if (arg == "--top-p") { if (++i >= argc) { invalid_param = true; break; } sparams.top_p = std::stof(argv[i]); + } else if (arg == "--min-p") { + if (++i >= argc) { + invalid_param = true; + break; + } + sparams.min_p = std::stof(argv[i]); } else if (arg == "--temp") { if (++i >= argc) { invalid_param = true; break; } sparams.temp = std::stof(argv[i]); + sparams.temp = std::max(sparams.temp, 0.0f); } else if (arg == "--tfs") { if (++i >= argc) { invalid_param = true; @@ -241,25 +384,38 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { invalid_param = true; break; } - sparams.repeat_last_n = std::stoi(argv[i]); + sparams.penalty_last_n = std::stoi(argv[i]); + sparams.n_prev = std::max(sparams.n_prev, sparams.penalty_last_n); } else if (arg == "--repeat-penalty") { if (++i >= argc) { invalid_param = true; break; } - sparams.repeat_penalty = std::stof(argv[i]); + sparams.penalty_repeat = std::stof(argv[i]); } else if (arg == "--frequency-penalty") { if (++i >= argc) { invalid_param = true; break; } - sparams.frequency_penalty = std::stof(argv[i]); + sparams.penalty_freq = std::stof(argv[i]); } else if (arg == "--presence-penalty") { if (++i >= argc) { invalid_param = true; break; } - sparams.presence_penalty = std::stof(argv[i]); + sparams.penalty_present = std::stof(argv[i]); + } else if (arg == "--dynatemp-range") { + if (++i >= argc) { + invalid_param = true; + break; + } + sparams.dynatemp_range = std::stof(argv[i]); + } else if (arg == "--dynatemp-exp") { + if (++i >= argc) { + invalid_param = true; + break; + } + sparams.dynatemp_exponent = std::stof(argv[i]); } else if (arg == "--mirostat") { if (++i >= argc) { invalid_param = true; @@ -341,6 +497,18 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { break; } params.n_sequences = std::stoi(argv[i]); + } else if (arg == "--p-accept" || arg == "-pa") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.p_accept = std::stof(argv[i]); + } else if (arg == "--p-split" || arg == "-ps") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.p_split = std::stof(argv[i]); } else if (arg == "-m" || arg == "--model") { if (++i >= argc) { invalid_param = true; @@ -364,7 +532,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { invalid_param = true; break; } - params.lora_adapter.push_back(std::make_tuple(argv[i], 1.0f)); + params.lora_adapter.emplace_back(argv[i], 1.0f); params.use_mmap = false; } else if (arg == "--lora-scaled") { if (++i >= argc) { @@ -376,7 +544,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { invalid_param = true; break; } - params.lora_adapter.push_back(std::make_tuple(lora_adapter, std::stof(argv[i]))); + params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i])); params.use_mmap = false; } else if (arg == "--lora-base") { if (++i >= argc) { @@ -404,8 +572,18 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { params.interactive_first = true; } else if (arg == "-ins" || arg == "--instruct") { params.instruct = true; + } else if (arg == "-cml" || arg == "--chatml") { + params.chatml = true; } else if (arg == "--infill") { params.infill = true; + } else if (arg == "-dkvc" || arg == "--dump-kv-cache") { + params.dump_kv_cache = true; + } else if (arg == "-nkvo" || arg == "--no-kv-offload") { + params.no_kv_offload = true; + } else if (arg == "-ctk" || arg == "--cache-type-k") { + params.cache_type_k = argv[++i]; + } else if (arg == "-ctv" || arg == "--cache-type-v") { + params.cache_type_v = argv[++i]; } else if (arg == "--multiline-input") { params.multiline_input = true; } else if (arg == "--simple-io") { @@ -421,75 +599,97 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { invalid_param = true; break; } -#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD params.n_gpu_layers = std::stoi(argv[i]); -#else - fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n"); - fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); -#endif + if (!llama_supports_gpu_offload()) { + fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n"); + fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); + } } else if (arg == "--gpu-layers-draft" || arg == "-ngld" || arg == "--n-gpu-layers-draft") { if (++i >= argc) { invalid_param = true; break; } -#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD params.n_gpu_layers_draft = std::stoi(argv[i]); -#else - fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers-draft option will be ignored\n"); - fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); -#endif + if (!llama_supports_gpu_offload()) { + fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers-draft option will be ignored\n"); + fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); + } } else if (arg == "--main-gpu" || arg == "-mg") { if (++i >= argc) { invalid_param = true; break; } -#ifdef GGML_USE_CUBLAS params.main_gpu = std::stoi(argv[i]); -#else - fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.\n"); -#endif +#ifndef GGML_USE_CUBLAS_SYCL + fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the main GPU has no effect.\n"); +#endif // GGML_USE_CUBLAS_SYCL + } else if (arg == "--split-mode" || arg == "-sm") { + if (++i >= argc) { + invalid_param = true; + break; + } + std::string arg_next = argv[i]; + if (arg_next == "none") { + params.split_mode = LLAMA_SPLIT_MODE_NONE; + } else if (arg_next == "layer") { + params.split_mode = LLAMA_SPLIT_MODE_LAYER; + } else if (arg_next == "row") { + params.split_mode = LLAMA_SPLIT_MODE_ROW; + } else { + invalid_param = true; + break; + } +#ifndef GGML_USE_CUBLAS_SYCL + fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the split mode has no effect.\n"); +#endif // GGML_USE_CUBLAS_SYCL + } else if (arg == "--tensor-split" || arg == "-ts") { if (++i >= argc) { invalid_param = true; break; } -#ifdef GGML_USE_CUBLAS std::string arg_next = argv[i]; // split string by , and / const std::regex regex{R"([,/]+)"}; std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1}; std::vector split_arg{it, {}}; - GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES); - - for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i) { + if (split_arg.size() >= llama_max_devices()) { + invalid_param = true; + break; + } + for (size_t i = 0; i < llama_max_devices(); ++i) { if (i < split_arg.size()) { params.tensor_split[i] = std::stof(split_arg[i]); } else { params.tensor_split[i] = 0.0f; } } -#else - fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n"); -#endif // GGML_USE_CUBLAS - } else if (arg == "--no-mul-mat-q" || arg == "-nommq") { -#ifdef GGML_USE_CUBLAS - params.mul_mat_q = false; -#else - fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Disabling mul_mat_q kernels has no effect.\n"); -#endif // GGML_USE_CUBLAS +#ifndef GGML_USE_CUBLAS_SYCL_VULKAN + fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL/Vulkan. Setting a tensor split has no effect.\n"); +#endif // GGML_USE_CUBLAS_SYCL } else if (arg == "--no-mmap") { params.use_mmap = false; } else if (arg == "--numa") { - params.numa = true; + if (++i >= argc) { + invalid_param = true; + break; + } + std::string value(argv[i]); + /**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; } + else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; } + else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; } + else { invalid_param = true; break; } } else if (arg == "--verbose-prompt") { params.verbose_prompt = true; + } else if (arg == "--no-display-prompt") { + params.display_prompt = false; } else if (arg == "-r" || arg == "--reverse-prompt") { if (++i >= argc) { invalid_param = true; break; } - params.antiprompt.push_back(argv[i]); + params.antiprompt.emplace_back(argv[i]); } else if (arg == "-ld" || arg == "--logdir") { if (++i >= argc) { invalid_param = true; @@ -500,6 +700,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { if (params.logdir.back() != DIRECTORY_SEPARATOR) { params.logdir += DIRECTORY_SEPARATOR; } + } else if (arg == "--save-all-logits" || arg == "--kl-divergence-base") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.logits_file = argv[i]; } else if (arg == "--perplexity" || arg == "--all-logits") { params.logits_all = true; } else if (arg == "--ppl-stride") { @@ -508,6 +714,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { break; } params.ppl_stride = std::stoi(argv[i]); + } else if (arg == "-ptc" || arg == "--print-token-count") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_print = std::stoi(argv[i]); } else if (arg == "--ppl-output-type") { if (++i >= argc) { invalid_param = true; @@ -522,6 +734,24 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { break; } params.hellaswag_tasks = std::stoi(argv[i]); + } else if (arg == "--winogrande") { + params.winogrande = true; + } else if (arg == "--winogrande-tasks") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.winogrande_tasks = std::stoi(argv[i]); + } else if (arg == "--multiple-choice") { + params.multiple_choice = true; + } else if (arg == "--multiple-choice-tasks") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.multiple_choice_tasks = std::stoi(argv[i]); + } else if (arg == "--kl-divergence") { + params.kl_divergence = true; } else if (arg == "--ignore-eos") { params.ignore_eos = true; } else if (arg == "--no-penalize-nl") { @@ -546,10 +776,11 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { break; } } else if (arg == "-h" || arg == "--help") { - gpt_print_usage(argc, argv, default_params); -#ifndef LOG_DISABLE_LOGS - log_print_usage(); -#endif // LOG_DISABLE_LOGS + return false; + + } else if (arg == "--version") { + fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT); + fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET); exit(0); } else if (arg == "--random-prompt") { params.random_prompt = true; @@ -572,7 +803,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { invalid_param = true; break; } - params.grammar = argv[i]; + sparams.grammar = argv[i]; } else if (arg == "--grammar-file") { if (++i >= argc) { invalid_param = true; @@ -587,8 +818,49 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { std::copy( std::istreambuf_iterator(file), std::istreambuf_iterator(), - std::back_inserter(params.grammar) + std::back_inserter(sparams.grammar) ); + } else if (arg == "--override-kv") { + if (++i >= argc) { + invalid_param = true; + break; + } + char * sep = strchr(argv[i], '='); + if (sep == nullptr || sep - argv[i] >= 128) { + fprintf(stderr, "error: Malformed KV override: %s\n", argv[i]); + invalid_param = true; + break; + } + struct llama_model_kv_override kvo; + std::strncpy(kvo.key, argv[i], sep - argv[i]); + kvo.key[sep - argv[i]] = 0; + sep++; + if (strncmp(sep, "int:", 4) == 0) { + sep += 4; + kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT; + kvo.int_value = std::atol(sep); + } else if (strncmp(sep, "float:", 6) == 0) { + sep += 6; + kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT; + kvo.float_value = std::atof(sep); + } else if (strncmp(sep, "bool:", 5) == 0) { + sep += 5; + kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL; + if (std::strcmp(sep, "true") == 0) { + kvo.bool_value = true; + } else if (std::strcmp(sep, "false") == 0) { + kvo.bool_value = false; + } else { + fprintf(stderr, "error: Invalid boolean value for KV override: %s\n", argv[i]); + invalid_param = true; + break; + } + } else { + fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]); + invalid_param = true; + break; + } + params.kv_overrides.push_back(kvo); #ifndef LOG_DISABLE_LOGS // Parse args for logging parameters } else if ( log_param_single_parse( argv[i] ) ) { @@ -609,46 +881,58 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { // End of Parse args for logging parameters #endif // LOG_DISABLE_LOGS } else { - fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); - gpt_print_usage(argc, argv, default_params); - exit(1); + throw std::invalid_argument("error: unknown argument: " + arg); } } if (invalid_param) { - fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); - gpt_print_usage(argc, argv, default_params); - exit(1); + throw std::invalid_argument("error: invalid parameter for argument: " + arg); } if (params.prompt_cache_all && (params.interactive || params.interactive_first || params.instruct)) { - fprintf(stderr, "error: --prompt-cache-all not supported in interactive mode yet\n"); - gpt_print_usage(argc, argv, default_params); - exit(1); + + throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n"); } if (params.escape) { process_escapes(params.prompt); process_escapes(params.input_prefix); process_escapes(params.input_suffix); + process_escapes(sparams.cfg_negative_prompt); for (auto & antiprompt : params.antiprompt) { process_escapes(antiprompt); } } + if (!params.kv_overrides.empty()) { + params.kv_overrides.emplace_back(); + params.kv_overrides.back().key[0] = 0; + } + return true; } void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { - const llama_sampling_params & sparams = params.sampling_params; + const llama_sampling_params & sparams = params.sparams; + std::string sampler_type_chars; + std::string sampler_type_names; + for (const auto sampler_type : sparams.samplers_sequence) { + sampler_type_chars += static_cast(sampler_type); + sampler_type_names += sampler_type_to_name_string(sampler_type) + ";"; + } + sampler_type_names.pop_back(); + + printf("\n"); printf("usage: %s [options]\n", argv[0]); printf("\n"); printf("options:\n"); printf(" -h, --help show this help message and exit\n"); + printf(" --version show version and build info\n"); printf(" -i, --interactive run in interactive mode\n"); printf(" --interactive-first run in interactive mode and wait for input right away\n"); printf(" -ins, --instruct run in instruction mode (use with Alpaca models)\n"); + printf(" -cml, --chatml run in chatml mode (use with ChatML-compatible models)\n"); printf(" --multiline-input allows you to write or paste multiple lines without ending each in '\\'\n"); printf(" -r PROMPT, --reverse-prompt PROMPT\n"); printf(" halt generation at PROMPT, return control in interactive mode\n"); @@ -658,6 +942,10 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" -t N, --threads N number of threads to use during generation (default: %d)\n", params.n_threads); printf(" -tb N, --threads-batch N\n"); printf(" number of threads to use during batch and prompt processing (default: same as --threads)\n"); + printf(" -td N, --threads-draft N"); + printf(" number of threads to use during generation (default: same as --threads)\n"); + printf(" -tbd N, --threads-batch-draft N\n"); + printf(" number of threads to use during batch and prompt processing (default: same as --threads-draft)\n"); printf(" -p PROMPT, --prompt PROMPT\n"); printf(" prompt to start generation with (default: empty)\n"); printf(" -e, --escape process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n"); @@ -671,17 +959,25 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" --in-suffix STRING string to suffix after user inputs with (default: empty)\n"); printf(" -f FNAME, --file FNAME\n"); printf(" prompt file to start generation.\n"); + printf(" -bf FNAME, --binary-file FNAME\n"); + printf(" binary file containing multiple choice tasks.\n"); printf(" -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict); printf(" -c N, --ctx-size N size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx); printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); + printf(" --samplers samplers that will be used for generation in the order, separated by \';\'\n"); + printf(" (default: %s)\n", sampler_type_names.c_str()); + printf(" --sampling-seq simplified sequence for samplers that will be used (default: %s)\n", sampler_type_chars.c_str()); printf(" --top-k N top-k sampling (default: %d, 0 = disabled)\n", sparams.top_k); printf(" --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p); + printf(" --min-p N min-p sampling (default: %.1f, 0.0 = disabled)\n", (double)sparams.min_p); printf(" --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)sparams.tfs_z); printf(" --typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)sparams.typical_p); - printf(" --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", sparams.repeat_last_n); - printf(" --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)sparams.repeat_penalty); - printf(" --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.presence_penalty); - printf(" --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.frequency_penalty); + printf(" --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", sparams.penalty_last_n); + printf(" --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)sparams.penalty_repeat); + printf(" --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.penalty_present); + printf(" --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.penalty_freq); + printf(" --dynatemp-range N dynamic temperature range (default: %.1f, 0.0 = disabled)\n", (double)sparams.dynatemp_range); + printf(" --dynatemp-exp N dynamic temperature exponent (default: %.1f)\n", (double)sparams.dynatemp_exponent); printf(" --mirostat N use Mirostat sampling.\n"); printf(" Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n"); printf(" (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", sparams.mirostat); @@ -698,60 +994,97 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" --cfg-negative-prompt-file FNAME\n"); printf(" negative prompt file to use for guidance. (default: empty)\n"); printf(" --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", sparams.cfg_scale); - printf(" --rope-scale N RoPE context linear scaling factor, inverse of --rope-freq-scale\n"); + printf(" --rope-scaling {none,linear,yarn}\n"); + printf(" RoPE frequency scaling method, defaults to linear unless specified by the model\n"); + printf(" --rope-scale N RoPE context scaling factor, expands context by a factor of N\n"); printf(" --rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: loaded from model)\n"); - printf(" --rope-freq-scale N RoPE frequency linear scaling factor (default: loaded from model)\n"); + printf(" --rope-freq-scale N RoPE frequency scaling factor, expands context by a factor of 1/N\n"); + printf(" --yarn-orig-ctx N YaRN: original context size of model (default: 0 = model training context size)\n"); + printf(" --yarn-ext-factor N YaRN: extrapolation mix factor (default: 1.0, 0.0 = full interpolation)\n"); + printf(" --yarn-attn-factor N YaRN: scale sqrt(t) or attention magnitude (default: 1.0)\n"); + printf(" --yarn-beta-slow N YaRN: high correction dim or alpha (default: %.1f)\n", params.yarn_beta_slow); + printf(" --yarn-beta-fast N YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast); printf(" --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n"); printf(" --no-penalize-nl do not penalize newline token\n"); - printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); - printf(" not recommended: doubles context memory required and no measurable increase in quality\n"); printf(" --temp N temperature (default: %.1f)\n", (double)sparams.temp); printf(" --logits-all return logits for all tokens in the batch (default: disabled)\n"); printf(" --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f\n"); printf(" --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks); + printf(" --winogrande compute Winogrande score over random tasks from datafile supplied with -f\n"); + printf(" --winogrande-tasks N number of tasks to use when computing the Winogrande score (default: %zu)\n", params.winogrande_tasks); + printf(" --multiple-choice compute multiple choice score over random tasks from datafile supplied with -f\n"); + printf(" --multiple-choice-tasks N number of tasks to use when computing the multiple choice score (default: %zu)\n", params.winogrande_tasks); + printf(" --kl-divergence computes KL-divergence to logits provided via --kl-divergence-base\n"); printf(" --keep N number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep); printf(" --draft N number of tokens to draft for speculative decoding (default: %d)\n", params.n_draft); printf(" --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks); printf(" -np N, --parallel N number of parallel sequences to decode (default: %d)\n", params.n_parallel); printf(" -ns N, --sequences N number of sequences to decode (default: %d)\n", params.n_sequences); + printf(" -pa N, --p-accept N speculative decoding accept probability (default: %.1f)\n", (double)params.p_accept); + printf(" -ps N, --p-split N speculative decoding split probability (default: %.1f)\n", (double)params.p_split); printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n"); printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA. see examples/llava/README.md\n"); printf(" --image IMAGE_FILE path to an image file. use with multimodal models\n"); - if (llama_mlock_supported()) { + if (llama_supports_mlock()) { printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n"); } - if (llama_mmap_supported()) { + if (llama_supports_mmap()) { printf(" --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n"); } - printf(" --numa attempt optimizations that help on some NUMA systems\n"); + printf(" --numa TYPE attempt optimizations that help on some NUMA systems\n"); + printf(" - distribute: spread execution evenly over all nodes\n"); + printf(" - isolate: only spawn threads on CPUs on the node that execution started on\n"); + printf(" - numactl: use the CPU map provided by numactl\n"); printf(" if run without this previously, it is recommended to drop the system page cache before using this\n"); printf(" see https://github.com/ggerganov/llama.cpp/issues/1437\n"); -#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD - printf(" -ngl N, --n-gpu-layers N\n"); - printf(" number of layers to store in VRAM\n"); - printf(" -ngld N, --n-gpu-layers-draft N\n"); - printf(" number of layers to store in VRAM for the draft model\n"); - printf(" -ts SPLIT --tensor-split SPLIT\n"); - printf(" how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n"); - printf(" -mg i, --main-gpu i the GPU to use for scratch and small tensors\n"); -#ifdef GGML_USE_CUBLAS - printf(" -nommq, --no-mul-mat-q\n"); - printf(" use " GGML_CUBLAS_NAME " instead of custom mul_mat_q " GGML_CUDA_NAME " kernels.\n"); - printf(" Not recommended since this is both slower and uses more VRAM.\n"); -#endif // GGML_USE_CUBLAS -#endif - printf(" --verbose-prompt print prompt before generation\n"); - fprintf(stderr, " --simple-io use basic IO for better compatibility in subprocesses and limited consoles\n"); + if (llama_supports_gpu_offload()) { + printf(" -ngl N, --n-gpu-layers N\n"); + printf(" number of layers to store in VRAM\n"); + printf(" -ngld N, --n-gpu-layers-draft N\n"); + printf(" number of layers to store in VRAM for the draft model\n"); + printf(" -sm SPLIT_MODE, --split-mode SPLIT_MODE\n"); + printf(" how to split the model across multiple GPUs, one of:\n"); + printf(" - none: use one GPU only\n"); + printf(" - layer (default): split layers and KV across GPUs\n"); + printf(" - row: split rows across GPUs\n"); + printf(" -ts SPLIT, --tensor-split SPLIT\n"); + printf(" fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n"); + printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n"); + printf(" or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu); + } + printf(" --verbose-prompt print a verbose prompt before generation (default: %s)\n", params.verbose_prompt ? "true" : "false"); + printf(" --no-display-prompt don't print prompt at generation (default: %s)\n", !params.display_prompt ? "true" : "false"); + printf(" -gan N, --grp-attn-n N\n"); + printf(" group-attention factor (default: %d)\n", params.grp_attn_n); + printf(" -gaw N, --grp-attn-w N\n"); + printf(" group-attention width (default: %.1f)\n", (double)params.grp_attn_w); + printf(" -dkvc, --dump-kv-cache\n"); + printf(" verbose print of the KV cache\n"); + printf(" -nkvo, --no-kv-offload\n"); + printf(" disable KV offload\n"); + printf(" -ctk TYPE, --cache-type-k TYPE\n"); + printf(" KV cache data type for K (default: %s)\n", params.cache_type_k.c_str()); + printf(" -ctv TYPE, --cache-type-v TYPE\n"); + printf(" KV cache data type for V (default: %s)\n", params.cache_type_v.c_str()); + printf(" --simple-io use basic IO for better compatibility in subprocesses and limited consoles\n"); printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); printf(" --lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap)\n"); printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n"); printf(" -m FNAME, --model FNAME\n"); printf(" model path (default: %s)\n", params.model.c_str()); printf(" -md FNAME, --model-draft FNAME\n"); - printf(" draft model for speculative decoding (default: %s)\n", params.model.c_str()); + printf(" draft model for speculative decoding\n"); printf(" -ld LOGDIR, --logdir LOGDIR\n"); printf(" path under which to save YAML logs (no logging if unset)\n"); + printf(" --override-kv KEY=TYPE:VALUE\n"); + printf(" advanced option to override model metadata by key. may be specified multiple times.\n"); + printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n"); + printf(" -ptc N, --print-token-count N\n"); + printf(" print token count every N tokens (default: %d)\n", params.n_print); printf("\n"); +#ifndef LOG_DISABLE_LOGS + log_print_usage(); +#endif // LOG_DISABLE_LOGS } std::string get_system_info(const gpt_params & params) { @@ -784,6 +1117,104 @@ std::string gpt_random_prompt(std::mt19937 & rng) { GGML_UNREACHABLE(); } +// +// String utils +// + +std::vector string_split(std::string input, char separator) { + std::vector parts; + size_t separator_pos = input.find(separator); + while (separator_pos != std::string::npos) { + std::string part = input.substr(0, separator_pos); + parts.emplace_back(part); + input = input.substr(separator_pos + 1); + separator_pos = input.find(separator); + } + parts.emplace_back(input); + return parts; +} + +std::vector sampler_types_from_names(const std::vector & names, bool allow_alt_names) { + std::unordered_map sampler_canonical_name_map { + {"top_k", llama_sampler_type::TOP_K}, + {"top_p", llama_sampler_type::TOP_P}, + {"typical_p", llama_sampler_type::TYPICAL_P}, + {"min_p", llama_sampler_type::MIN_P}, + {"tfs_z", llama_sampler_type::TFS_Z}, + {"temperature", llama_sampler_type::TEMPERATURE} + }; + + // since samplers names are written multiple ways + // make it ready for both system names and input names + std::unordered_map sampler_alt_name_map { + {"top-k", llama_sampler_type::TOP_K}, + {"top-p", llama_sampler_type::TOP_P}, + {"nucleus", llama_sampler_type::TOP_P}, + {"typical-p", llama_sampler_type::TYPICAL_P}, + {"typical", llama_sampler_type::TYPICAL_P}, + {"min-p", llama_sampler_type::MIN_P}, + {"tfs-z", llama_sampler_type::TFS_Z}, + {"tfs", llama_sampler_type::TFS_Z}, + {"temp", llama_sampler_type::TEMPERATURE} + }; + + std::vector sampler_types; + sampler_types.reserve(names.size()); + for (const auto & name : names) + { + auto sampler_item = sampler_canonical_name_map.find(name); + if (sampler_item != sampler_canonical_name_map.end()) + { + sampler_types.push_back(sampler_item->second); + } + else + { + if (allow_alt_names) + { + sampler_item = sampler_alt_name_map.find(name); + if (sampler_item != sampler_alt_name_map.end()) + { + sampler_types.push_back(sampler_item->second); + } + } + } + } + return sampler_types; +} + +std::vector sampler_types_from_chars(const std::string & names_string) { + std::unordered_map sampler_name_map { + {'k', llama_sampler_type::TOP_K}, + {'p', llama_sampler_type::TOP_P}, + {'y', llama_sampler_type::TYPICAL_P}, + {'m', llama_sampler_type::MIN_P}, + {'f', llama_sampler_type::TFS_Z}, + {'t', llama_sampler_type::TEMPERATURE} + }; + + std::vector sampler_types; + sampler_types.reserve(names_string.size()); + for (const auto & c : names_string) { + const auto sampler_item = sampler_name_map.find(c); + if (sampler_item != sampler_name_map.end()) { + sampler_types.push_back(sampler_item->second); + } + } + return sampler_types; +} + +std::string sampler_type_to_name_string(llama_sampler_type sampler_type) { + switch (sampler_type) { + case llama_sampler_type::TOP_K: return "top_k"; + case llama_sampler_type::TFS_Z: return "tfs_z"; + case llama_sampler_type::TYPICAL_P: return "typical_p"; + case llama_sampler_type::TOP_P: return "top_p"; + case llama_sampler_type::MIN_P: return "min_p"; + case llama_sampler_type::TEMPERATURE: return "temperature"; + default : return ""; + } +} + // // Model utils // @@ -795,27 +1226,69 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & mparams.n_gpu_layers = params.n_gpu_layers; } mparams.main_gpu = params.main_gpu; + mparams.split_mode = params.split_mode; mparams.tensor_split = params.tensor_split; mparams.use_mmap = params.use_mmap; mparams.use_mlock = params.use_mlock; + if (params.kv_overrides.empty()) { + mparams.kv_overrides = NULL; + } else { + GGML_ASSERT(params.kv_overrides.back().key[0] == 0 && "KV overrides not terminated with empty key"); + mparams.kv_overrides = params.kv_overrides.data(); + } return mparams; } +static ggml_type kv_cache_type_from_str(const std::string & s) { + if (s == "f32") { + return GGML_TYPE_F32; + } + if (s == "f16") { + return GGML_TYPE_F16; + } + if (s == "q8_0") { + return GGML_TYPE_Q8_0; + } + if (s == "q4_0") { + return GGML_TYPE_Q4_0; + } + if (s == "q4_1") { + return GGML_TYPE_Q4_1; + } + if (s == "q5_0") { + return GGML_TYPE_Q5_0; + } + if (s == "q5_1") { + return GGML_TYPE_Q5_1; + } + + throw std::runtime_error("Invalid cache type: " + s); +} + struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) { auto cparams = llama_context_default_params(); - cparams.n_ctx = params.n_ctx; - cparams.n_batch = params.n_batch; - cparams.n_threads = params.n_threads; - cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch; - cparams.mul_mat_q = params.mul_mat_q; - cparams.seed = params.seed; - cparams.f16_kv = params.memory_f16; - cparams.logits_all = params.logits_all; - cparams.embedding = params.embedding; - cparams.rope_freq_base = params.rope_freq_base; - cparams.rope_freq_scale = params.rope_freq_scale; + cparams.n_ctx = params.n_ctx; + cparams.n_batch = params.n_batch; + cparams.n_threads = params.n_threads; + cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch; + cparams.mul_mat_q = params.mul_mat_q; + cparams.seed = params.seed; + cparams.logits_all = params.logits_all; + cparams.embedding = params.embedding; + cparams.rope_scaling_type = params.rope_scaling_type; + cparams.rope_freq_base = params.rope_freq_base; + cparams.rope_freq_scale = params.rope_freq_scale; + cparams.yarn_ext_factor = params.yarn_ext_factor; + cparams.yarn_attn_factor = params.yarn_attn_factor; + cparams.yarn_beta_fast = params.yarn_beta_fast; + cparams.yarn_beta_slow = params.yarn_beta_slow; + cparams.yarn_orig_ctx = params.yarn_orig_ctx; + cparams.offload_kqv = !params.no_kv_offload; + + cparams.type_k = kv_cache_type_from_str(params.cache_type_k); + cparams.type_v = kv_cache_type_from_str(params.cache_type_v); return cparams; } @@ -831,7 +1304,7 @@ void llama_batch_add( const std::vector & seq_ids, bool logits) { batch.token [batch.n_tokens] = id; - batch.pos [batch.n_tokens] = pos, + batch.pos [batch.n_tokens] = pos; batch.n_seq_id[batch.n_tokens] = seq_ids.size(); for (size_t i = 0; i < seq_ids.size(); ++i) { batch.seq_id[batch.n_tokens][i] = seq_ids[i]; @@ -878,15 +1351,15 @@ std::tuple llama_init_from_gpt_par } if (params.ignore_eos) { - params.sampling_params.logit_bias[llama_token_eos(lctx)] = -INFINITY; + params.sparams.logit_bias[llama_token_eos(model)] = -INFINITY; } { LOG("warming up the model with an empty run\n"); - std::vector tmp = { llama_token_bos(lctx), llama_token_eos(lctx), }; + std::vector tmp = { llama_token_bos(model), llama_token_eos(model), }; llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0)); - llama_kv_cache_tokens_rm(lctx, -1, -1); + llama_kv_cache_clear(lctx); llama_reset_timings(lctx); } @@ -939,7 +1412,7 @@ std::string llama_token_to_piece(const struct llama_context * ctx, llama_token t } std::string llama_detokenize_spm(llama_context * ctx, const std::vector & tokens) { - const llama_token bos_id = llama_token_bos(ctx); + const llama_token bos_id = llama_token_bos(llama_get_model(ctx)); std::string piece; std::string result; @@ -972,6 +1445,12 @@ std::string llama_detokenize_bpe(llama_context * ctx, const std::vector & prompt_tokens, const char * model_desc) { - const llama_sampling_params & sparams = params.sampling_params; + const llama_sampling_params & sparams = params.sparams; - fprintf(stream, "build_commit: %s\n", BUILD_COMMIT); - fprintf(stream, "build_number: %d\n", BUILD_NUMBER); - fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false"); - fprintf(stream, "cpu_has_avx: %s\n", ggml_cpu_has_avx() ? "true" : "false"); - fprintf(stream, "cpu_has_avx2: %s\n", ggml_cpu_has_avx2() ? "true" : "false"); - fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false"); + fprintf(stream, "build_commit: %s\n", LLAMA_COMMIT); + fprintf(stream, "build_number: %d\n", LLAMA_BUILD_NUMBER); + fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false"); + fprintf(stream, "cpu_has_avx: %s\n", ggml_cpu_has_avx() ? "true" : "false"); + fprintf(stream, "cpu_has_avx_vnni: %s\n", ggml_cpu_has_avx_vnni() ? "true" : "false"); + fprintf(stream, "cpu_has_avx2: %s\n", ggml_cpu_has_avx2() ? "true" : "false"); + fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false"); fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false"); fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false"); - fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false"); - fprintf(stream, "cpu_has_cublas: %s\n", ggml_cpu_has_cublas() ? "true" : "false"); - fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false"); - fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false"); - fprintf(stream, "cpu_has_gpublas: %s\n", ggml_cpu_has_gpublas() ? "true" : "false"); - fprintf(stream, "cpu_has_neon: %s\n", ggml_cpu_has_neon() ? "true" : "false"); - fprintf(stream, "cpu_has_f16c: %s\n", ggml_cpu_has_f16c() ? "true" : "false"); - fprintf(stream, "cpu_has_fp16_va: %s\n", ggml_cpu_has_fp16_va() ? "true" : "false"); - fprintf(stream, "cpu_has_wasm_simd: %s\n", ggml_cpu_has_wasm_simd() ? "true" : "false"); - fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false"); - fprintf(stream, "cpu_has_sse3: %s\n", ggml_cpu_has_sse3() ? "true" : "false"); - fprintf(stream, "cpu_has_vsx: %s\n", ggml_cpu_has_vsx() ? "true" : "false"); + fprintf(stream, "cpu_has_cublas: %s\n", ggml_cpu_has_cublas() ? "true" : "false"); + fprintf(stream, "cpu_has_vulkan: %s\n", ggml_cpu_has_vulkan() ? "true" : "false"); + fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false"); + fprintf(stream, "cpu_has_kompute: %s\n", ggml_cpu_has_kompute() ? "true" : "false"); + fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false"); + fprintf(stream, "cpu_has_gpublas: %s\n", ggml_cpu_has_gpublas() ? "true" : "false"); + fprintf(stream, "cpu_has_neon: %s\n", ggml_cpu_has_neon() ? "true" : "false"); + fprintf(stream, "cpu_has_f16c: %s\n", ggml_cpu_has_f16c() ? "true" : "false"); + fprintf(stream, "cpu_has_fp16_va: %s\n", ggml_cpu_has_fp16_va() ? "true" : "false"); + fprintf(stream, "cpu_has_wasm_simd: %s\n", ggml_cpu_has_wasm_simd() ? "true" : "false"); + fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false"); + fprintf(stream, "cpu_has_sse3: %s\n", ggml_cpu_has_sse3() ? "true" : "false"); + fprintf(stream, "cpu_has_vsx: %s\n", ggml_cpu_has_vsx() ? "true" : "false"); + fprintf(stream, "cpu_has_matmul_int8: %s\n", ggml_cpu_has_matmul_int8() ? "true" : "false"); #ifdef NDEBUG fprintf(stream, "debug: false\n"); @@ -1178,13 +1661,13 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx); fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false"); fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n"); - fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.frequency_penalty); - dump_string_yaml_multiline(stream, "grammar", params.grammar.c_str()); + fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.penalty_freq); + dump_string_yaml_multiline(stream, "grammar", sparams.grammar.c_str()); fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n"); fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false"); fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks); - const auto logit_bias_eos = sparams.logit_bias.find(llama_token_eos(lctx)); + const auto logit_bias_eos = sparams.logit_bias.find(llama_token_eos(llama_get_model(lctx))); const bool ignore_eos = logit_bias_eos != sparams.logit_bias.end() && logit_bias_eos->second == -INFINITY; fprintf(stream, "ignore_eos: %s # default: false\n", ignore_eos ? "true" : "false"); @@ -1221,7 +1704,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l } fprintf(stream, "lora_base: %s\n", params.lora_base.c_str()); fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu); - fprintf(stream, "memory_f32: %s # default: false\n", !params.memory_f16 ? "true" : "false"); + fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep); fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat); fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau); fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta); @@ -1235,17 +1718,16 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false"); fprintf(stream, "no_mul_mat_q: %s # default: false\n", !params.mul_mat_q ? "true" : "false"); fprintf(stream, "no_penalize_nl: %s # default: false\n", !sparams.penalize_nl ? "true" : "false"); - fprintf(stream, "numa: %s # default: false\n", params.numa ? "true" : "false"); fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type); fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride); - fprintf(stream, "presence_penalty: %f # default: 0.0\n", sparams.presence_penalty); + fprintf(stream, "presence_penalty: %f # default: 0.0\n", sparams.penalty_present); dump_string_yaml_multiline(stream, "prompt", params.prompt.c_str()); fprintf(stream, "prompt_cache: %s\n", params.path_prompt_cache.c_str()); fprintf(stream, "prompt_cache_all: %s # default: false\n", params.prompt_cache_all ? "true" : "false"); fprintf(stream, "prompt_cache_ro: %s # default: false\n", params.prompt_cache_ro ? "true" : "false"); dump_vector_int_yaml(stream, "prompt_tokens", prompt_tokens); fprintf(stream, "random_prompt: %s # default: false\n", params.random_prompt ? "true" : "false"); - fprintf(stream, "repeat_penalty: %f # default: 1.1\n", sparams.repeat_penalty); + fprintf(stream, "repeat_penalty: %f # default: 1.1\n", sparams.penalty_repeat); fprintf(stream, "reverse_prompt:\n"); for (std::string ap : params.antiprompt) { @@ -1260,18 +1742,95 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l fprintf(stream, "rope_freq_base: %f # default: 10000.0\n", params.rope_freq_base); fprintf(stream, "rope_freq_scale: %f # default: 1.0\n", params.rope_freq_scale); - fprintf(stream, "seed: %d # default: -1 (random seed)\n", params.seed); + fprintf(stream, "seed: %u # default: -1 (random seed)\n", params.seed); fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false"); fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false"); fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp); - const std::vector tensor_split_vector(params.tensor_split, params.tensor_split + LLAMA_MAX_DEVICES); + const std::vector tensor_split_vector(params.tensor_split, params.tensor_split + llama_max_devices()); dump_vector_float_yaml(stream, "tensor_split", tensor_split_vector); fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z); - fprintf(stream, "threads: %d # default: %d\n", params.n_threads, std::thread::hardware_concurrency()); + fprintf(stream, "threads: %d # default: %u\n", params.n_threads, std::thread::hardware_concurrency()); fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k); fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p); + fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p); fprintf(stream, "typical_p: %f # default: 1.0\n", sparams.typical_p); fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false"); + fprintf(stream, "display_prompt: %s # default: true\n", params.display_prompt ? "true" : "false"); +} + +// +// KV cache utils +// + +void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size) { + static const char slot_chars[] = ".123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+"; + + printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d", + view.n_cells, view.n_max_seq, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx); + + llama_kv_cache_view_cell * c_curr = view.cells; + llama_seq_id * cs_curr = view.cells_sequences; + + for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_max_seq) { + if (i % row_size == 0) { + printf("\n%5d: ", i); + } + int seq_count = 0; + for (int j = 0; j < view.n_max_seq; j++) { + if (cs_curr[j] >= 0) { seq_count++; } + } + putchar(slot_chars[std::min(sizeof(slot_chars) - 2, size_t(seq_count))]); + } + + printf("\n=== Done dumping\n"); +} + +void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) { + static const char slot_chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; + + printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d\n", + view.n_cells, view.n_max_seq, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx); + + std::unordered_map seqs; + llama_kv_cache_view_cell * c_curr = view.cells; + llama_seq_id * cs_curr = view.cells_sequences; + + for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_max_seq) { + for (int j = 0; j < view.n_max_seq; j++) { + if (cs_curr[j] < 0) { continue; } + if (seqs.find(cs_curr[j]) == seqs.end()) { + if (seqs.size() + 1 >= sizeof(slot_chars)) { break; } + const size_t sz = seqs.size(); + seqs[cs_curr[j]] = sz; + } + } + if (seqs.size() + 1 >= sizeof(slot_chars)) { break; } + } + + printf("=== Sequence legend: "); + for (const auto & it : seqs) { + printf("%zu=%d, ", it.second, it.first); + } + printf("'+'=other sequence ids"); + + c_curr = view.cells; + cs_curr = view.cells_sequences; + for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_max_seq) { + if (i % row_size == 0) { + printf("\n%5d: ", i); + } + for (int j = 0; j < view.n_max_seq; j++) { + if (cs_curr[j] >= 0) { + const auto & it = seqs.find(cs_curr[j]); + putchar(it != seqs.end() ? int(slot_chars[it->second]) : '+'); + } else { + putchar('.'); + } + } + putchar(' '); + } + + printf("\n=== Done dumping\n"); } diff --git a/common/common.h b/common/common.h index 65d3d20cd..3e21579b0 100644 --- a/common/common.h +++ b/common/common.h @@ -9,6 +9,7 @@ #define LOG_NO_FILE_LINE_FUNCTION #include "log.h" +#include #include #include #include @@ -25,38 +26,60 @@ #define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0) #define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0) -#define print_build_info() do { \ - fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); \ - fprintf(stderr, "%s: built with %s for %s\n", __func__, BUILD_COMPILER, BUILD_TARGET); \ +#define print_build_info() do { \ + fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \ + fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \ } while(0) +// build info +extern int LLAMA_BUILD_NUMBER; +extern char const *LLAMA_COMMIT; +extern char const *LLAMA_COMPILER; +extern char const *LLAMA_BUILD_TARGET; + // // CLI argument parsing // int32_t get_num_physical_cores(); struct gpt_params { - uint32_t seed = -1; // RNG seed - int32_t n_threads = get_num_physical_cores(); - int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads) - int32_t n_predict = -1; // new tokens to predict - int32_t n_ctx = 512; // context size - int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS) - int32_t n_keep = 0; // number of tokens to keep from initial prompt - int32_t n_draft = 16; // number of tokens to draft during speculative decoding - int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited) - int32_t n_parallel = 1; // number of parallel sequences to decode - int32_t n_sequences = 1; // number of sequences to decode - int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default) - int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default) - int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors - float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs - int32_t n_beams = 0; // if non-zero then use beam search of given width. - float rope_freq_base = 0.0f; // RoPE base frequency - float rope_freq_scale = 0.0f; // RoPE frequency scaling factor + uint32_t seed = -1; // RNG seed + + int32_t n_threads = get_num_physical_cores(); + int32_t n_threads_draft = -1; + int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads) + int32_t n_threads_batch_draft = -1; + int32_t n_predict = -1; // new tokens to predict + int32_t n_ctx = 512; // context size + int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS) + int32_t n_keep = 0; // number of tokens to keep from initial prompt + int32_t n_draft = 8; // number of tokens to draft during speculative decoding + int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited) + int32_t n_parallel = 1; // number of parallel sequences to decode + int32_t n_sequences = 1; // number of sequences to decode + float p_accept = 0.5f; // speculative decoding accept probability + float p_split = 0.1f; // speculative decoding split probability + int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default) + int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default) + llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs + int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors + float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs + int32_t n_beams = 0; // if non-zero then use beam search of given width. + int32_t grp_attn_n = 1; // group-attention factor + int32_t grp_attn_w = 512; // group-attention width + int32_t n_print = -1; // print token count every n tokens (-1 = disabled) + float rope_freq_base = 0.0f; // RoPE base frequency + float rope_freq_scale = 0.0f; // RoPE frequency scaling factor + float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor + float yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor + float yarn_beta_fast = 32.0f; // YaRN low correction dim + float yarn_beta_slow = 1.0f; // YaRN high correction dim + int32_t yarn_orig_ctx = 0; // YaRN original context length + int32_t rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED; + ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED; // // sampling parameters - struct llama_sampling_params sampling_params; + struct llama_sampling_params sparams; std::string model = "models/7B/ggml-model-f16.gguf"; // model path std::string model_draft = ""; // draft model for speculative decoding @@ -66,9 +89,11 @@ struct gpt_params { std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state std::string input_prefix = ""; // string to prefix user inputs with std::string input_suffix = ""; // string to suffix user inputs with - std::string grammar = ""; // optional BNF-like grammar to constrain sampling std::vector antiprompt; // string upon seeing which more user input is prompted std::string logdir = ""; // directory in which to save YAML log files + std::string logits_file = ""; // file for saving *all* logits + + std::vector kv_overrides; // TODO: avoid tuple, use struct std::vector> lora_adapter; // lora adapter path with user defined scale @@ -78,14 +103,22 @@ struct gpt_params { int ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line // (which is more convenient to use for plotting) // - bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt + bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score + bool winogrande = false; // compute Winogrande score over random tasks from datafile supplied in prompt + size_t winogrande_tasks= 0; // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed + + bool multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt + size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed + + bool kl_divergence = false; // compute KL-divergence + bool mul_mat_q = true; // if true, use mul_mat_q kernels instead of cuBLAS - bool memory_f16 = true; // use f16 instead of f32 for memory kv bool random_prompt = false; // do not randomize prompt if none provided bool use_color = false; // use color to distinguish generations and inputs bool interactive = false; // interactive mode + bool chatml = false; // chatml mode (used for models trained on chatml syntax) bool prompt_cache_all = false; // save user input and generations to prompt cache bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it @@ -102,15 +135,22 @@ struct gpt_params { bool logits_all = false; // return logits for all tokens in the batch bool use_mmap = true; // use mmap for faster loads bool use_mlock = false; // use mlock to keep model in memory - bool numa = false; // attempt optimizations that help on some NUMA systems bool verbose_prompt = false; // print prompt tokens before generation + bool display_prompt = true; // print prompt before generation bool infill = false; // use infill mode + bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes + bool no_kv_offload = false; // disable KV offloading + + std::string cache_type_k = "f16"; // KV cache data type for the K + std::string cache_type_v = "f16"; // KV cache data type for the V // multimodal models (see examples/llava) std::string mmproj = ""; // path to multimodal projector - std::string image = ""; // path to an image file + std::string image = ""; // path to an image file }; +bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params); + bool gpt_params_parse(int argc, char ** argv, gpt_params & params); void gpt_print_usage(int argc, char ** argv, const gpt_params & params); @@ -121,6 +161,15 @@ std::string gpt_random_prompt(std::mt19937 & rng); void process_escapes(std::string& input); +// +// String utils +// + +std::vector sampler_types_from_names(const std::vector & names, bool allow_alt_names); +std::vector sampler_types_from_chars(const std::string & names_string); +std::vector string_split(std::string input, char separator); +std::string sampler_type_to_name_string(llama_sampler_type sampler_type); + // // Model utils // @@ -182,6 +231,10 @@ std::string llama_detokenize_bpe( llama_context * ctx, const std::vector & tokens); +// Uses the value from the model metadata if possible, otherwise +// defaults to true when model type is SPM, otherwise false. +bool llama_should_add_bos_token(const llama_model * model); + // // YAML utils // @@ -195,3 +248,13 @@ std::string get_sortable_timestamp(); void dump_non_result_info_yaml( FILE * stream, const gpt_params & params, const llama_context * lctx, const std::string & timestamp, const std::vector & prompt_tokens, const char * model_desc); + +// +// KV cache utils +// + +// Dump the KV cache view with the number of sequences per cell. +void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size = 80); + +// Dump the KV cache view showing individual sequences in each cell (long output). +void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40); diff --git a/common/grammar-parser.cpp b/common/grammar-parser.cpp index 5a545a807..bf89a96f3 100644 --- a/common/grammar-parser.cpp +++ b/common/grammar-parser.cpp @@ -190,7 +190,7 @@ namespace grammar_parser { pos = parse_space(pos + 1, is_nested); } else if (*pos == '*' || *pos == '+' || *pos == '?') { // repetition operator if (last_sym_start == out_elements.size()) { - throw std::runtime_error(std::string("expecting preceeding item to */+/? at ") + pos); + throw std::runtime_error(std::string("expecting preceding item to */+/? at ") + pos); } // apply transformation to previous symbol (last_sym_start to end) according to @@ -399,7 +399,7 @@ namespace grammar_parser { void print_grammar(FILE * file, const parse_state & state) { try { std::map symbol_id_names; - for (auto kv : state.symbol_ids) { + for (const auto & kv : state.symbol_ids) { symbol_id_names[kv.second] = kv.first; } for (size_t i = 0, end = state.rules.size(); i < end; i++) { diff --git a/common/log.h b/common/log.h index 70e7e4ca2..e4e1b9f4f 100644 --- a/common/log.h +++ b/common/log.h @@ -61,13 +61,13 @@ // #define LOG_TARGET stderr // #include "log.h" // -// The log target can also be redirected to a diffrent function +// The log target can also be redirected to a different function // like so: // -// #define LOG_TARGET log_handler_diffrent() +// #define LOG_TARGET log_handler_different() // #include "log.h" // -// FILE* log_handler_diffrent() +// FILE* log_handler_different() // { // return stderr; // } @@ -97,37 +97,56 @@ #define LOG_TEE_TARGET stderr #endif +// Utility for synchronizing log configuration state +// since std::optional was introduced only in c++17 +enum LogTriState +{ + LogTriStateSame, + LogTriStateFalse, + LogTriStateTrue +}; + // Utility to obtain "pid" like unique process id and use it when creating log files. inline std::string log_get_pid() { - static std::string pid; - if (pid.empty()) - { - // std::this_thread::get_id() is the most portable way of obtaining a "process id" - // it's not the same as "pid" but is unique enough to solve multiple instances - // trying to write to the same log. - std::stringstream ss; - ss << std::this_thread::get_id(); - pid = ss.str(); - } + static std::string pid; + if (pid.empty()) + { + // std::this_thread::get_id() is the most portable way of obtaining a "process id" + // it's not the same as "pid" but is unique enough to solve multiple instances + // trying to write to the same log. + std::stringstream ss; + ss << std::this_thread::get_id(); + pid = ss.str(); + } - return pid; + return pid; } // Utility function for generating log file names with unique id based on thread id. // invocation with log_filename_generator( "llama", "log" ) creates a string "llama..log" // where the number is a runtime id of the current thread. -#define log_filename_generator(log_file_basename, log_file_extension) log_filename_generator_impl(log_file_basename, log_file_extension) +#define log_filename_generator(log_file_basename, log_file_extension) log_filename_generator_impl(LogTriStateSame, log_file_basename, log_file_extension) // INTERNAL, DO NOT USE -inline std::string log_filename_generator_impl(const std::string & log_file_basename, const std::string & log_file_extension) +inline std::string log_filename_generator_impl(LogTriState multilog, const std::string & log_file_basename, const std::string & log_file_extension) { + static bool _multilog = false; + + if (multilog != LogTriStateSame) + { + _multilog = multilog == LogTriStateTrue; + } + std::stringstream buf; buf << log_file_basename; - buf << "."; - buf << log_get_pid(); + if (_multilog) + { + buf << "."; + buf << log_get_pid(); + } buf << "."; buf << log_file_extension; @@ -212,15 +231,6 @@ inline std::string log_filename_generator_impl(const std::string & log_file_base #define LOG_TEE_FLF_VAL ,"" #endif -// Utility for synchronizing log configuration state -// since std::optional was introduced only in c++17 -enum LogTriState -{ - LogTriStateSame, - LogTriStateFalse, - LogTriStateTrue -}; - // INTERNAL, DO NOT USE // USE LOG() INSTEAD // @@ -314,16 +324,23 @@ enum LogTriState #endif // INTERNAL, DO NOT USE -inline FILE *log_handler1_impl(bool change = false, LogTriState disable = LogTriStateSame, const std::string & filename = LOG_DEFAULT_FILE_NAME, FILE *target = nullptr) +inline FILE *log_handler1_impl(bool change = false, LogTriState append = LogTriStateSame, LogTriState disable = LogTriStateSame, const std::string & filename = LOG_DEFAULT_FILE_NAME, FILE *target = nullptr) { - static bool _initialized{false}; - static bool _disabled{(filename.empty() && target == nullptr)}; + static bool _initialized = false; + static bool _append = false; + static bool _disabled = filename.empty() && target == nullptr; static std::string log_current_filename{filename}; static FILE *log_current_target{target}; static FILE *logfile = nullptr; if (change) { + if (append != LogTriStateSame) + { + _append = append == LogTriStateTrue; + return logfile; + } + if (disable == LogTriStateTrue) { // Disable primary target @@ -376,7 +393,7 @@ inline FILE *log_handler1_impl(bool change = false, LogTriState disable = LogTri } } - logfile = fopen(filename.c_str(), "w"); + logfile = fopen(filename.c_str(), _append ? "a" : "w"); } if (!logfile) @@ -397,20 +414,20 @@ inline FILE *log_handler1_impl(bool change = false, LogTriState disable = LogTri } // INTERNAL, DO NOT USE -inline FILE *log_handler2_impl(bool change = false, LogTriState disable = LogTriStateSame, FILE *target = nullptr, const std::string & filename = LOG_DEFAULT_FILE_NAME) +inline FILE *log_handler2_impl(bool change = false, LogTriState append = LogTriStateSame, LogTriState disable = LogTriStateSame, FILE *target = nullptr, const std::string & filename = LOG_DEFAULT_FILE_NAME) { - return log_handler1_impl(change, disable, filename, target); + return log_handler1_impl(change, append, disable, filename, target); } // Disables logs entirely at runtime. // Makes LOG() and LOG_TEE() produce no output, -// untill enabled back. +// until enabled back. #define log_disable() log_disable_impl() // INTERNAL, DO NOT USE inline FILE *log_disable_impl() { - return log_handler1_impl(true, LogTriStateTrue); + return log_handler1_impl(true, LogTriStateSame, LogTriStateTrue); } // Enables logs at runtime. @@ -419,19 +436,31 @@ inline FILE *log_disable_impl() // INTERNAL, DO NOT USE inline FILE *log_enable_impl() { - return log_handler1_impl(true, LogTriStateFalse); + return log_handler1_impl(true, LogTriStateSame, LogTriStateFalse); } // Sets target fir logs, either by a file name or FILE* pointer (stdout, stderr, or any valid FILE*) #define log_set_target(target) log_set_target_impl(target) // INTERNAL, DO NOT USE -inline FILE *log_set_target_impl(const std::string & filename) { return log_handler1_impl(true, LogTriStateSame, filename); } -inline FILE *log_set_target_impl(FILE *target) { return log_handler2_impl(true, LogTriStateSame, target); } +inline FILE *log_set_target_impl(const std::string & filename) { return log_handler1_impl(true, LogTriStateSame, LogTriStateSame, filename); } +inline FILE *log_set_target_impl(FILE *target) { return log_handler2_impl(true, LogTriStateSame, LogTriStateSame, target); } // INTERNAL, DO NOT USE inline FILE *log_handler() { return log_handler1_impl(); } +// Enable or disable creating separate log files for each run. +// can ONLY be invoked BEFORE first log use. +#define log_multilog(enable) log_filename_generator_impl((enable) ? LogTriStateTrue : LogTriStateFalse, "", "") +// Enable or disable append mode for log file. +// can ONLY be invoked BEFORE first log use. +#define log_append(enable) log_append_impl(enable) +// INTERNAL, DO NOT USE +inline FILE *log_append_impl(bool enable) +{ + return log_handler1_impl(true, enable ? LogTriStateTrue : LogTriStateFalse, LogTriStateSame); +} + inline void log_test() { log_disable(); @@ -493,6 +522,18 @@ inline bool log_param_single_parse(const std::string & param) return true; } + if (param == "--log-new") + { + log_multilog(true); + return true; + } + + if (param == "--log-append") + { + log_append(true); + return true; + } + return false; } @@ -522,7 +563,9 @@ inline void log_print_usage() printf(" --log-disable Disable trace logs\n"); printf(" --log-enable Enable trace logs\n"); printf(" --log-file Specify a log filename (without extension)\n"); - printf(" Log file will be tagged with unique ID and written as \"..log\"\n"); /* */ + printf(" --log-new Create a separate new log file on start. " + "Each log file will have unique name: \"..log\"\n"); + printf(" --log-append Don't truncate the old log file.\n"); } #define log_dump_cmdline(argc, argv) log_dump_cmdline_impl(argc, argv) diff --git a/common/sampling.cpp b/common/sampling.cpp index 0b2466581..de4331a11 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -1,9 +1,9 @@ #include "sampling.h" -struct llama_sampling_context * llama_sampling_init(const struct gpt_params & params) { +struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params) { struct llama_sampling_context * result = new llama_sampling_context(); - result->params = params.sampling_params; + result->params = params; result->grammar = nullptr; // if there is a grammar, parse it @@ -13,6 +13,7 @@ struct llama_sampling_context * llama_sampling_init(const struct gpt_params & pa // will be empty (default) if there are parse errors if (result->parsed_grammar.rules.empty()) { fprintf(stderr, "%s: failed to parse grammar\n", __func__); + delete result; return nullptr; } @@ -23,7 +24,7 @@ struct llama_sampling_context * llama_sampling_init(const struct gpt_params & pa grammar_rules.size(), result->parsed_grammar.symbol_ids.at("root")); } - result->prev.resize(params.n_ctx); + result->prev.resize(params.n_prev); return result; } @@ -39,6 +40,7 @@ void llama_sampling_free(struct llama_sampling_context * ctx) { void llama_sampling_reset(llama_sampling_context * ctx) { if (ctx->grammar != NULL) { llama_grammar_free(ctx->grammar); + ctx->grammar = NULL; } if (!ctx->parsed_grammar.rules.empty()) { @@ -66,25 +68,106 @@ void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * ds dst->prev = src->prev; } -llama_token llama_sampling_sample( +llama_token llama_sampling_last(llama_sampling_context * ctx) { + return ctx->prev.back(); +} + +std::string llama_sampling_prev_str(llama_sampling_context * ctx_sampling, llama_context * ctx_main, int n) { + const int size = ctx_sampling->prev.size(); + + n = std::min(n, size); + + std::string result; + + for (int i = size - n; i < size; i++) { + result += llama_token_to_piece(ctx_main, ctx_sampling->prev[i]); + } + + return result; +} + +std::string llama_sampling_print(const llama_sampling_params & params) { + char result[1024]; + + snprintf(result, sizeof(result), + "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n" + "\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, typical_p = %.3f, temp = %.3f\n" + "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f", + params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present, + params.top_k, params.tfs_z, params.top_p, params.min_p, params.typical_p, params.temp, + params.mirostat, params.mirostat_eta, params.mirostat_tau); + + return std::string(result); +} + +std::string llama_sampling_order_print(const llama_sampling_params & params) { + std::string result = "CFG -> Penalties "; + if (params.mirostat == 0) { + for (auto sampler_type : params.samplers_sequence) { + const auto sampler_type_name = sampler_type_to_name_string(sampler_type); + if (!sampler_type_name.empty()) { + result += "-> " + sampler_type_name + " "; + } + } + } else { + result += "-> mirostat "; + } + + return result; +} + +// no reasons to expose this function in header +static void sampler_queue( + struct llama_context * ctx_main, + const llama_sampling_params & params, + llama_token_data_array & cur_p, + size_t min_keep) { + const float temp = params.temp; + const float dynatemp_range = params.dynatemp_range; + const float dynatemp_exponent = params.dynatemp_exponent; + const int32_t top_k = params.top_k; + const float top_p = params.top_p; + const float min_p = params.min_p; + const float tfs_z = params.tfs_z; + const float typical_p = params.typical_p; + const std::vector & samplers_sequence = params.samplers_sequence; + + for (auto sampler_type : samplers_sequence) { + switch (sampler_type) { + case llama_sampler_type::TOP_K : llama_sample_top_k (ctx_main, &cur_p, top_k, min_keep); break; + case llama_sampler_type::TFS_Z : llama_sample_tail_free(ctx_main, &cur_p, tfs_z, min_keep); break; + case llama_sampler_type::TYPICAL_P: llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep); break; + case llama_sampler_type::TOP_P : llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep); break; + case llama_sampler_type::MIN_P : llama_sample_min_p (ctx_main, &cur_p, min_p, min_keep); break; + case llama_sampler_type::TEMPERATURE: + if (dynatemp_range > 0) { + float dynatemp_min = std::max(0.0f, temp - dynatemp_range); + float dynatemp_max = std::max(0.0f, temp + dynatemp_range); + llama_sample_entropy(ctx_main, &cur_p, dynatemp_min, dynatemp_max, dynatemp_exponent); + } else { + llama_sample_temp(ctx_main, &cur_p, temp); + } + break; + default : break; + } + } +} + +static llama_token llama_sampling_sample_impl( struct llama_sampling_context * ctx_sampling, struct llama_context * ctx_main, struct llama_context * ctx_cfg, - const int idx) { - const int n_ctx = llama_n_ctx(ctx_main); - const int n_vocab = llama_n_vocab(llama_get_model(ctx_main)); - + const int idx, + bool is_resampling) { // Add a parameter to indicate if we are resampling const llama_sampling_params & params = ctx_sampling->params; + const int n_vocab = llama_n_vocab(llama_get_model(ctx_main)); + const float temp = params.temp; - const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k; - const float top_p = params.top_p; - const float tfs_z = params.tfs_z; - const float typical_p = params.typical_p; - const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n; - const float repeat_penalty = params.repeat_penalty; - const float alpha_presence = params.presence_penalty; - const float alpha_frequency = params.frequency_penalty; + const int32_t penalty_last_n = params.penalty_last_n < 0 ? params.n_prev : params.penalty_last_n; + const float penalty_repeat = params.penalty_repeat; + const float penalty_freq = params.penalty_freq; + const float penalty_present = params.penalty_present; const int mirostat = params.mirostat; const float mirostat_tau = params.mirostat_tau; const float mirostat_eta = params.mirostat_eta; @@ -95,13 +178,27 @@ llama_token llama_sampling_sample( llama_token id = 0; + // Get a pointer to the logits float * logits = llama_get_logits_ith(ctx_main, idx); - // Apply params.logit_bias map + // Declare original_logits at the beginning of the function scope + std::vector original_logits; + + if (!is_resampling) { + // Only make a copy of the original logits if we are not in the resampling phase, not sure if I actually have to do this. + original_logits = std::vector(logits, logits + llama_n_vocab(llama_get_model(ctx_main))); + } + + // apply params.logit_bias map for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) { logits[it->first] += it->second; } + if (ctx_cfg) { + float * logits_guidance = llama_get_logits_ith(ctx_cfg, idx); + llama_sample_apply_guidance(ctx_main, logits, logits_guidance, params.cfg_scale); + } + cur.clear(); for (llama_token token_id = 0; token_id < n_vocab; token_id++) { @@ -110,25 +207,19 @@ llama_token llama_sampling_sample( llama_token_data_array cur_p = { cur.data(), cur.size(), false }; - if (ctx_cfg) { - llama_sample_classifier_free_guidance(ctx_main, &cur_p, ctx_cfg, params.cfg_scale); - } - // apply penalties - if (!prev.empty()) { - const float nl_logit = logits[llama_token_nl(ctx_main)]; - const int last_n_repeat = std::min(std::min((int)prev.size(), repeat_last_n), n_ctx); + const auto& penalty_tokens = params.use_penalty_prompt_tokens ? params.penalty_prompt_tokens : prev; + const int penalty_tokens_used_size = std::min((int)penalty_tokens.size(), penalty_last_n); + if (penalty_tokens_used_size) { + const float nl_logit = logits[llama_token_nl(llama_get_model(ctx_main))]; - llama_sample_repetition_penalty(ctx_main, &cur_p, - prev.data() + prev.size() - last_n_repeat, - last_n_repeat, repeat_penalty); - llama_sample_frequency_and_presence_penalties(ctx_main, &cur_p, - prev.data() + prev.size() - last_n_repeat, - last_n_repeat, alpha_frequency, alpha_presence); + llama_sample_repetition_penalties(ctx_main, &cur_p, + penalty_tokens.data() + penalty_tokens.size() - penalty_tokens_used_size, + penalty_tokens_used_size, penalty_repeat, penalty_freq, penalty_present); if (!penalize_nl) { for (size_t idx = 0; idx < cur_p.size; idx++) { - if (cur_p.data[idx].id == llama_token_nl(ctx_main)) { + if (cur_p.data[idx].id == llama_token_nl(llama_get_model(ctx_main))) { cur_p.data[idx].logit = nl_logit; break; } @@ -136,12 +227,17 @@ llama_token llama_sampling_sample( } } - if (ctx_sampling->grammar != NULL) { + // If we are in the resampling phase, apply grammar checks before sampling logic + if (is_resampling && ctx_sampling->grammar != NULL) { llama_sample_grammar(ctx_main, &cur_p, ctx_sampling->grammar); } - if (temp <= 0) { - // Greedy sampling + if (temp < 0.0) { + // greedy sampling, with probs + llama_sample_softmax(ctx_main, &cur_p); + id = cur_p.data[0].id; + } else if (temp == 0.0) { + // greedy sampling, no probs id = llama_sample_token_greedy(ctx_main, &cur_p); } else { if (mirostat == 1) { @@ -152,13 +248,10 @@ llama_token llama_sampling_sample( llama_sample_temp(ctx_main, &cur_p, temp); id = llama_sample_token_mirostat_v2(ctx_main, &cur_p, mirostat_tau, mirostat_eta, &ctx_sampling->mirostat_mu); } else { - // Temperature sampling - size_t min_keep = std::max(1, params.n_probs); - llama_sample_top_k (ctx_main, &cur_p, top_k, min_keep); - llama_sample_tail_free(ctx_main, &cur_p, tfs_z, min_keep); - llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep); - llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep); - llama_sample_temp (ctx_main, &cur_p, temp); + // temperature sampling + size_t min_keep = std::max(1, params.min_keep); + + sampler_queue(ctx_main, params, cur_p, min_keep); id = llama_sample_token(ctx_main, &cur_p); @@ -177,17 +270,49 @@ llama_token llama_sampling_sample( } } + if (ctx_sampling->grammar != NULL && !is_resampling) { + // Create an array with a single token data element for the sampled id + llama_token_data single_token_data = {id, logits[id], 0.0f}; + llama_token_data_array single_token_data_array = { &single_token_data, 1, false }; + + // Apply grammar constraints to the single token + llama_sample_grammar(ctx_main, &single_token_data_array, ctx_sampling->grammar); + + // Check if the token is valid according to the grammar by seeing if its logit has been set to -INFINITY + bool is_valid = single_token_data_array.data[0].logit != -INFINITY; + + // If the token is not valid according to the grammar, perform resampling + if (!is_valid) { + LOG("Resampling because token %d: '%s' does not meet grammar rules\n", id, llama_token_to_piece(ctx_main, id).c_str()); + + // Restore logits from the copy + std::copy(original_logits.begin(), original_logits.end(), logits); + + return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, true); // Pass true for is_resampling + } + } + return id; } +llama_token llama_sampling_sample( + struct llama_sampling_context * ctx_sampling, + struct llama_context * ctx_main, + struct llama_context * ctx_cfg, + const int idx) { + // Call the implementation function with is_resampling set to false by default + return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, false); +} + void llama_sampling_accept( struct llama_sampling_context * ctx_sampling, struct llama_context * ctx_main, - llama_token id) { + llama_token id, + bool apply_grammar) { ctx_sampling->prev.erase(ctx_sampling->prev.begin()); ctx_sampling->prev.push_back(id); - if (ctx_sampling->grammar != NULL) { + if (ctx_sampling->grammar != NULL && apply_grammar) { llama_grammar_accept_token(ctx_main, ctx_sampling->grammar, id); } } diff --git a/common/sampling.h b/common/sampling.h index 50afcbc12..95d875394 100644 --- a/common/sampling.h +++ b/common/sampling.h @@ -8,32 +8,58 @@ #include #include +// sampler types +enum class llama_sampler_type : char { + TOP_K = 'k', + TOP_P = 'p', + MIN_P = 'm', + TFS_Z = 'f', + TYPICAL_P = 'y', + TEMPERATURE = 't' +}; + // sampling parameters typedef struct llama_sampling_params { - int32_t top_k = 40; // <= 0 to use vocab size - float top_p = 0.95f; // 1.0 = disabled - float tfs_z = 1.00f; // 1.0 = disabled - float typical_p = 1.00f; // 1.0 = disabled - float temp = 0.80f; // 1.0 = disabled - float repeat_penalty = 1.10f; // 1.0 = disabled - int32_t repeat_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size) - float frequency_penalty = 0.00f; // 0.0 = disabled - float presence_penalty = 0.00f; // 0.0 = disabled - int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 - float mirostat_tau = 5.00f; // target entropy - float mirostat_eta = 0.10f; // learning rate + int32_t n_prev = 64; // number of previous tokens to remember + int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens. + int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens + int32_t top_k = 40; // <= 0 to use vocab size + float top_p = 0.95f; // 1.0 = disabled + float min_p = 0.05f; // 0.0 = disabled + float tfs_z = 1.00f; // 1.0 = disabled + float typical_p = 1.00f; // 1.0 = disabled + float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities + float dynatemp_range = 0.00f; // 0.0 = disabled + float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler + int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size) + float penalty_repeat = 1.10f; // 1.0 = disabled + float penalty_freq = 0.00f; // 0.0 = disabled + float penalty_present = 0.00f; // 0.0 = disabled + int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 + float mirostat_tau = 5.00f; // target entropy + float mirostat_eta = 0.10f; // learning rate + bool penalize_nl = true; // consider newlines as a repeatable token - bool penalize_nl = true; // consider newlines as a repeatable token + std::vector samplers_sequence = { + llama_sampler_type::TOP_K, + llama_sampler_type::TFS_Z, + llama_sampler_type::TYPICAL_P, + llama_sampler_type::TOP_P, + llama_sampler_type::MIN_P, + llama_sampler_type::TEMPERATURE + }; - int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens. + std::string grammar; // optional BNF-like grammar to constrain sampling // Classifier-Free Guidance // https://arxiv.org/abs/2306.17806 - std::string cfg_negative_prompt; // string to help guidance - float cfg_scale = 1.f; // How strong is guidance + std::string cfg_negative_prompt; // string to help guidance + float cfg_scale = 1.f; // how strong is guidance std::unordered_map logit_bias; // logit bias for specific tokens + std::vector penalty_prompt_tokens; + bool use_penalty_prompt_tokens = false; } llama_sampling_params; // general sampler context @@ -58,7 +84,7 @@ struct llama_sampling_context { #include "common.h" // Create a new sampling context instance. -struct llama_sampling_context * llama_sampling_init(const struct gpt_params & params); +struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params); void llama_sampling_free(struct llama_sampling_context * ctx); @@ -70,6 +96,18 @@ void llama_sampling_reset(llama_sampling_context * ctx); // Copy the sampler context void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst); +// Get the last sampled token +llama_token llama_sampling_last(llama_sampling_context * ctx); + +// Get a string representation of the last sampled tokens +std::string llama_sampling_prev_str(llama_sampling_context * ctx_sampling, llama_context * ctx_main, int n); + +// Print sampling parameters into a string +std::string llama_sampling_print(const llama_sampling_params & params); + +// Print sampling order into a string +std::string llama_sampling_order_print(const llama_sampling_params & params); + // this is a common sampling function used across the examples for convenience // it can serve as a starting point for implementing your own sampling function // Note: When using multiple sequences, it is the caller's responsibility to call @@ -96,4 +134,5 @@ llama_token llama_sampling_sample( void llama_sampling_accept( struct llama_sampling_context * ctx_sampling, struct llama_context * ctx_main, - llama_token id); + llama_token id, + bool apply_grammar); diff --git a/common/train.cpp b/common/train.cpp index 972eaefe0..0dbfd24df 100644 --- a/common/train.cpp +++ b/common/train.cpp @@ -31,7 +31,8 @@ struct train_state * init_train_state() { state->opt = new struct ggml_opt_context; state->opt->ctx = NULL; - state->opt->params = ggml_opt_default_params(GGML_OPT_ADAM); + state->opt->params = ggml_opt_default_params(GGML_OPT_TYPE_ADAM); + state->opt->params.graph_size = LLAMA_TRAIN_MAX_NODES; state->opt->loss_after = 0.0f; return state; @@ -70,7 +71,7 @@ void free_random_uniform_distribution(struct random_uniform_distribution * rnd) struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct random_normal_distribution * rnd) { float scale = 1.0f; // xavier - switch (tensor->n_dims) { + switch (ggml_n_dims(tensor)) { case 1: scale /= sqrtf((float) tensor->ne[0]); for (int i0 = 0; i0 < tensor->ne[0]; i0++) { @@ -118,7 +119,7 @@ struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct } struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struct random_uniform_distribution * rnd) { - switch (tensor->n_dims) { + switch (ggml_n_dims(tensor)) { case 1: for (int i0 = 0; i0 < tensor->ne[0]; i0++) { float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0]); @@ -182,25 +183,27 @@ float fclamp(const float v, const float min, const float max) { } void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) { - GGML_ASSERT(tensor->n_dims == 1); GGML_ASSERT(tensor->ne[0] == ne0); + GGML_ASSERT(tensor->ne[1] == 1); + GGML_ASSERT(tensor->ne[2] == 1); + GGML_ASSERT(tensor->ne[3] == 1); } void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) { - GGML_ASSERT(tensor->n_dims == 2); GGML_ASSERT(tensor->ne[0] == ne0); GGML_ASSERT(tensor->ne[1] == ne1); + GGML_ASSERT(tensor->ne[2] == 1); + GGML_ASSERT(tensor->ne[3] == 1); } void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) { - GGML_ASSERT(tensor->n_dims == 3); GGML_ASSERT(tensor->ne[0] == ne0); GGML_ASSERT(tensor->ne[1] == ne1); GGML_ASSERT(tensor->ne[2] == ne2); + GGML_ASSERT(tensor->ne[3] == 1); } void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) { - GGML_ASSERT(tensor->n_dims == 4); GGML_ASSERT(tensor->ne[0] == ne0); GGML_ASSERT(tensor->ne[1] == ne1); GGML_ASSERT(tensor->ne[2] == ne2); @@ -224,8 +227,8 @@ int64_t get_example_targets_batch( bool sample_random_offsets ) { GGML_ASSERT(samples_count > 0); - GGML_ASSERT(tokens_input->n_dims == 2); - GGML_ASSERT(target_probs->n_dims == 3); + GGML_ASSERT(ggml_is_matrix(tokens_input)); + GGML_ASSERT(ggml_is_3d(target_probs)); int64_t n_vocab = target_probs->ne[0]; int64_t n_tokens = tokens_input->ne[0]; int64_t n_batch = tokens_input->ne[1]; @@ -236,8 +239,8 @@ int64_t get_example_targets_batch( int64_t used_samples = 0; ggml_set_f32(target_probs, 0.0f); - llama_token bos = llama_token_bos(lctx); - llama_token eos = llama_token_eos(lctx); + llama_token bos = llama_token_bos(llama_get_model(lctx)); + llama_token eos = llama_token_eos(llama_get_model(lctx)); // printf("%s: example_id=%d n_batch=%d n_train_samples=%zu\n", __func__, example_id, n_batch, n_train_samples); for (int k=0; kparams.type = GGML_OPT_ADAM; + opt->params.type = GGML_OPT_TYPE_ADAM; GGUF_GET_KEY(fctx, opt->adam.fx_best, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_ADAM_BEST_LOSS); GGUF_GET_KEY(fctx, opt->adam.fx_prev, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS); @@ -565,7 +568,7 @@ void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_g copy_tensor_by_name(opt->adam.v, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS); copy_tensor_by_name(opt->adam.pf, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES); } else if (opt_type == LLM_KV_OPTIMIZER_TYPE_LBFGS) { - opt->params.type = GGML_OPT_LBFGS; + opt->params.type = GGML_OPT_TYPE_LBFGS; GGUF_GET_KEY(fctx, opt->params.lbfgs.m, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT); GGUF_GET_KEY(fctx, opt->lbfgs.fx_best, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS); @@ -600,7 +603,7 @@ void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context * gguf_set_val_bool(fctx, LLM_KV_OPTIMIZER_JUST_INITIALIZED, opt->just_initialized); switch (opt->params.type) { - case GGML_OPT_ADAM: + case GGML_OPT_TYPE_ADAM: { gguf_set_val_str(fctx, LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_ADAM); gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_ADAM_BEST_LOSS, opt->adam.fx_best); @@ -619,7 +622,7 @@ void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context * gguf_add_tensor(fctx, opt->adam.pf); } } break; - case GGML_OPT_LBFGS: + case GGML_OPT_TYPE_LBFGS: { gguf_set_val_str(fctx, LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_LBFGS); gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT, opt->params.lbfgs.m); @@ -924,7 +927,7 @@ size_t tokenize_file( for (llama_token token=0; token < n_vocab; ++token) { max_token_text_size = std::max( max_token_text_size, - strlen(llama_token_get_text(lctx, token))); + strlen(llama_token_get_text(llama_get_model(lctx), token))); } // upper bound of context byte length. @@ -1045,6 +1048,7 @@ struct train_params_common get_default_train_params_common() { params.n_batch = 8; params.n_gradient_accumulation = 1; params.n_epochs = -1; + params.n_gpu_layers = 0; params.custom_n_ctx = false; @@ -1080,6 +1084,7 @@ struct train_params_common get_default_train_params_common() { params.adam_beta2 = 0.999f; params.adam_gclip = 1.0f; params.adam_eps_f = 0.0f; + return params; } @@ -1102,7 +1107,7 @@ void print_common_train_usage(int /*argc*/, char ** /*argv*/, const struct train fprintf(stderr, " --sample-start STR Sets the starting point for samples after the specified pattern. If empty use every token position as sample start. (default '%s')\n", params->sample_start.c_str()); fprintf(stderr, " --include-sample-start Include the sample start in the samples. (default off)\n"); fprintf(stderr, " --escape process sample start escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n"); - fprintf(stderr, " --overlapping-samples Samples my overlap, will include sample-start of second and following samples. When off, samples will end at begin of next sample. (default off)\n"); + fprintf(stderr, " --overlapping-samples Samples may overlap, will include sample-start of second and following samples. When off, samples will end at begin of next sample. (default off)\n"); fprintf(stderr, " --fill-with-next-samples Samples shorter than context length will be followed by the next (shuffled) samples. (default off)\n"); fprintf(stderr, " --separate-with-eos When fill-with-next-samples, insert end-of-sequence token between samples.%s\n", params->separate_with_eos ? " (default)" : ""); fprintf(stderr, " --separate-with-bos When fill-with-next-samples, insert begin-of-sequence token between samples.%s\n", params->separate_with_bos ? " (default)" : ""); @@ -1133,6 +1138,7 @@ void print_common_train_usage(int /*argc*/, char ** /*argv*/, const struct train fprintf(stderr, " --adam-beta2 N AdamW beta2 in interval [0,1). How much to smooth the second moment of gradients. (default %f)\n", params->adam_beta2); fprintf(stderr, " --adam-gclip N AdamW gradient clipping. Disabled when zero. (default %f)\n", params->adam_gclip); fprintf(stderr, " --adam-epsf N AdamW epsilon for convergence test. Disabled when <= zero. (default %f)\n", params->adam_eps_f); + fprintf(stderr, " -ngl N, --n-gpu-layers N Number of model layers to offload to GPU (default %d)", params->n_gpu_layers); fprintf(stderr, "\n"); } @@ -1352,6 +1358,17 @@ bool consume_common_train_arg( return true; } params->adam_gclip = std::stof(argv[i]); + } else if (arg == "-ngl" || arg == "--n-gpu-layers") { + if (++i >= argc) { + *invalid_param = true; + return true; + } + if (llama_supports_gpu_offload()) { + params->n_gpu_layers = std::stoi(argv[i]); + } else { + fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n"); + fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); + } } else if (arg == "-h" || arg == "--help") { params->print_usage = true; return true; @@ -1425,7 +1442,7 @@ void train_opt_callback(void * vdata, int accum_step, float * sched, bool * canc int impr_plot = -(int)(1 + (opt->loss_before - opt->loss_after) * 10.0f + 0.5f); if (impr_plot > 0) impr_plot = 0; - if (std::isnan(opt->loss_before) || std::isnan(opt->loss_before)) impr_plot = 0; + if (std::isnan(opt->loss_before) || std::isnan(opt->loss_after)) impr_plot = 0; printf("%s: iter=%6d sample=%zu/%zu sched=%f loss=%f", __func__, opt->iter, std::min(1+train->shuffle_next_sample, train->shuffle_sample_count), train->shuffle_sample_count, *sched, opt->loss_after); diff --git a/common/train.h b/common/train.h index 42fa704b8..263d940c0 100644 --- a/common/train.h +++ b/common/train.h @@ -9,6 +9,8 @@ #include "ggml.h" #include "llama.h" +#define LLAMA_TRAIN_MAX_NODES 16384 + typedef std::string mt19937_state; struct train_state { @@ -44,6 +46,7 @@ struct train_params_common { int n_batch; int n_gradient_accumulation; int n_epochs; + int n_gpu_layers; bool custom_n_ctx; diff --git a/convert-baichuan-hf-to-gguf.py b/convert-baichuan-hf-to-gguf.py deleted file mode 100755 index 513a7516a..000000000 --- a/convert-baichuan-hf-to-gguf.py +++ /dev/null @@ -1,310 +0,0 @@ -#!/usr/bin/env python3 -# HF baichuan --> gguf conversion - -from __future__ import annotations - -import argparse -import json -import os -import struct -import sys -from pathlib import Path -from typing import TYPE_CHECKING, Any -import itertools -import numpy as np -import torch -from sentencepiece import SentencePieceProcessor # type: ignore[import] - -if 'NO_LOCAL_GGUF' not in os.environ: - sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf')) -import gguf - - -if TYPE_CHECKING: - from typing import TypeAlias - -NDArray: TypeAlias = 'np.ndarray[Any, Any]' - -# reverse HF permute back to original pth layout - - -def reverse_hf_permute(weights: NDArray, n_head: int, n_kv_head: int | None = None) -> NDArray: - if n_kv_head is not None and n_head != n_kv_head: - n_head //= n_kv_head - - return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) - .swapaxes(1, 2) - .reshape(weights.shape)) - -def reverse_hf_permute_part(weights: NDArray, n_part: int, n_head: int, n_head_kv: int| None = None) -> NDArray: - r = weights.shape[0] // 3 - return (reverse_hf_permute(weights[r * n_part : r * n_part + r, ...], n_head, n_head_kv)) - -def reverse_hf_part(weights: NDArray, n_part: int) -> NDArray: - r = weights.shape[0] // 3 - return weights[r * n_part : r * n_part + r, ...] - -def count_model_parts(dir_model: str) -> int: - num_parts = 0 - - for filename in os.listdir(dir_model): - if filename.startswith("pytorch_model-"): - num_parts += 1 - - if num_parts > 0: - print("gguf: found " + str(num_parts) + " model parts") - - return num_parts - - - -def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser(description="Convert a HuggingFace LLaMA model to a GGML compatible file") - parser.add_argument( - "--vocab-only", action="store_true", - help="extract only the vocab", - ) - parser.add_argument( - "--outfile", type=Path, - help="path to write to; default: based on input", - ) - parser.add_argument( - "model", type=Path, - help="directory containing model file, or model file itself (*.bin)", - ) - parser.add_argument( - "ftype", type=int, choices=[0, 1], default=1, nargs='?', - help="output format - use 0 for float32, 1 for float16", - ) - return parser.parse_args() - -args = parse_args() - -dir_model = args.model -ftype = args.ftype -if not dir_model.is_dir(): - print(f'Error: {args.model} is not a directory', file = sys.stderr) - sys.exit(1) - -# possible tensor data types -# ftype == 0 -> float32 -# ftype == 1 -> float16 - -# map from ftype to string -ftype_str = ["f32", "f16"] - -if args.outfile is not None: - fname_out = args.outfile -else: - # output in the same directory as the model by default - fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf' - -print("gguf: loading model "+dir_model.name) - -with open(dir_model / "config.json", "r", encoding="utf-8") as f: - hparams = json.load(f) -print("hello print: ",hparams["architectures"][0]) -if hparams["architectures"][0] != "BaichuanForCausalLM": - print("Model architecture not supported: " + hparams["architectures"][0]) - - sys.exit() - -# get number of model parts -num_parts = count_model_parts(dir_model) -print(f"num_parts:{num_parts}\n") -ARCH=gguf.MODEL_ARCH.BAICHUAN -gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH]) - -print("gguf: get model metadata") - -block_count = hparams["num_hidden_layers"] -head_count = hparams["num_attention_heads"] - -if "num_key_value_heads" in hparams: - head_count_kv = hparams["num_key_value_heads"] -else: - head_count_kv = head_count - -if "_name_or_path" in hparams: - hf_repo = hparams["_name_or_path"] -else: - hf_repo = "" - -if "max_sequence_length" in hparams: - ctx_length = hparams["max_sequence_length"] -elif "max_position_embeddings" in hparams: - ctx_length = hparams["max_position_embeddings"] -elif "model_max_length" in hparams: - ctx_length = hparams["model_max_length"] -else: - print("gguf: can not find ctx length parameter.") - - sys.exit() - - -gguf_writer.add_name(dir_model.name) -gguf_writer.add_source_hf_repo(hf_repo) -gguf_writer.add_tensor_data_layout("Meta AI original pth") -gguf_writer.add_context_length(ctx_length) -gguf_writer.add_embedding_length(hparams["hidden_size"]) -gguf_writer.add_block_count(block_count) -gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) -gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"]) -gguf_writer.add_head_count(head_count) -gguf_writer.add_head_count_kv(head_count_kv) -gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"]) - -if "rope_scaling" in hparams and hparams["rope_scaling"] != None and "factor" in hparams["rope_scaling"]: - if "type" in hparams["rope_scaling"]: - if hparams["rope_scaling"]["type"] == "linear": - gguf_writer.add_rope_scale_linear(hparams["rope_scaling"]["factor"]) - - -# TOKENIZATION - -print("gguf: get tokenizer metadata") - -tokens: list[bytes] = [] -scores: list[float] = [] -toktypes: list[int] = [] - -tokenizer_model_file = dir_model / 'tokenizer.model' -if not tokenizer_model_file.is_file(): - print(f'Error: Missing {tokenizer_model_file}', file = sys.stderr) - sys.exit(1) - -# vocab type sentencepiece -print("gguf: get sentencepiece tokenizer vocab, scores and token types") - -tokenizer = SentencePieceProcessor(str(tokenizer_model_file)) -vocab_size = hparams.get('vocab_size') -if vocab_size is None: - vocab_size = tokenizer.vocab_size() - -for i in range(vocab_size): - text: bytes - score: float - - piece = tokenizer.id_to_piece(i) - text = piece.encode("utf-8") - score = tokenizer.get_score(i) - - toktype = 1 # defualt to normal token type - if tokenizer.is_unknown(i): - toktype = 2 - if tokenizer.is_control(i): - toktype = 3 - - # toktype = 4 is user-defined = tokens from added_tokens.json - - if tokenizer.is_unused(i): - toktype = 5 - if tokenizer.is_byte(i): - toktype = 6 - - tokens.append(text) - scores.append(score) - toktypes.append(toktype) - -added_tokens_file = dir_model / 'added_tokens.json' -if added_tokens_file.is_file(): - with open(added_tokens_file, "r", encoding="utf-8") as f: - addtokens_json = json.load(f) - - print("gguf: get added tokens") - - for key in addtokens_json: - tokens.append( key.encode("utf-8") ) - scores.append(-1000.0) - toktypes.append(4) # user-defined token type - - -gguf_writer.add_tokenizer_model("llama") -gguf_writer.add_token_list(tokens) -gguf_writer.add_token_scores(scores) -gguf_writer.add_token_types(toktypes) - -special_vocab = gguf.SpecialVocab(dir_model) -special_vocab.add_to_gguf(gguf_writer) - -# TENSORS - -tensor_map = gguf.get_tensor_name_map(ARCH,block_count) - -# tensor info -print("gguf: get tensor metadata") - -if num_parts == 0: - part_names = iter(("pytorch_model.bin",)) -else: - part_names = ( - f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1) - ) - - -for part_name in part_names: - if args.vocab_only: - break - print("gguf: loading model part '" + part_name + "'") - model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu") - - tmp=model_part - for i in range(block_count): - if f"model.layers.{i}.self_attn.W_pack.weight" in model_part: - print(f"Unpacking and permuting layer {i}") - tmp[f"model.layers.{i}.self_attn.q_proj.weight"]=reverse_hf_permute_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"],0,head_count,head_count) - tmp[f"model.layers.{i}.self_attn.k_proj.weight"]=reverse_hf_permute_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"],1,head_count,head_count_kv) - tmp[f"model.layers.{i}.self_attn.v_proj.weight"]=reverse_hf_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"],2) - del tmp[f"model.layers.{i}.self_attn.W_pack.weight"] - - for name in model_part.keys(): - data = model_part[name] - # we don't need these - if name.endswith(".rotary_emb.inv_freq"): - continue - - old_dtype = data.dtype - - # convert any unsupported data types to float32 - if data.dtype != torch.float16 and data.dtype != torch.float32: - data = data.to(torch.float32) - - data = data.squeeze().numpy() - - # map tensor names - new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias")) - if new_name is None: - print("Can not map tensor '" + name + "'") - sys.exit() - - n_dims = len(data.shape) - data_dtype = data.dtype - - # if f32 desired, convert any float16 to float32 - if ftype == 0 and data_dtype == np.float16: - data = data.astype(np.float32) - - # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 - if ftype == 1 and data_dtype == np.float16 and n_dims == 1: - data = data.astype(np.float32) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: - data = data.astype(np.float16) - - print(name + " -> " + new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype)) - gguf_writer.add_tensor(new_name, data) - - -print("gguf: write header") -gguf_writer.write_header_to_file() -print("gguf: write metadata") -gguf_writer.write_kv_data_to_file() -if not args.vocab_only: - print("gguf: write tensors") - gguf_writer.write_tensors_to_file() - -gguf_writer.close() - -print(f"gguf: model successfully exported to '{fname_out}'") -print("") diff --git a/convert-bloom-hf-to-gguf.py b/convert-bloom-hf-to-gguf.py deleted file mode 100755 index 7bfc95ec1..000000000 --- a/convert-bloom-hf-to-gguf.py +++ /dev/null @@ -1,238 +0,0 @@ -#!/usr/bin/env python3 -# HF bloom --> gguf conversion - -from __future__ import annotations - -import argparse -import json -import os -import re -import struct -import sys -from pathlib import Path -from typing import Any - -import numpy as np -import torch -from transformers import AutoTokenizer # type: ignore[import] - -if 'NO_LOCAL_GGUF' not in os.environ: - sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf')) -import gguf - - -def count_model_parts(dir_model: Path) -> int: - num_parts = 0 - for filename in os.listdir(dir_model): - if filename.startswith("pytorch_model-"): - num_parts += 1 - - if num_parts > 0: - print("gguf: found " + str(num_parts) + " model parts") - return num_parts - - -# Supported Models: -# https://huggingface.co/bigscience/bloom-1b7 -# https://huggingface.co/bigscience/bloom-3b -# https://huggingface.co/bigscience/bloom-7b1 -# https://huggingface.co/Langboat/bloom-1b4-zh -def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser(description="Convert a Bloom model to a GGML compatible file") - parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab") - parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") - parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)") - parser.add_argument("ftype", type=int, help="output format - use 0 for float32, 1 for float16", choices=[0, 1], default = 1) - return parser.parse_args() - -args = parse_args() - -dir_model = args.model -ftype = args.ftype -if not dir_model.is_dir(): - print(f'Error: {args.model} is not a directory', file = sys.stderr) - sys.exit(1) - -# possible tensor data types -# ftype == 0 -> float32 -# ftype == 1 -> float16 - -# map from ftype to string -ftype_str = ["f32", "f16"] - -if args.outfile is not None: - fname_out = args.outfile -else: - # output in the same directory as the model by default - fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf' - -print("gguf: loading model "+dir_model.name) - -with open(dir_model / "config.json", "r", encoding="utf-8") as f: - hparams = json.load(f) - -if hparams["architectures"][0] != "BloomForCausalLM": - print("Model architecture not supported: " + hparams["architectures"][0]) - sys.exit(1) - -# get number of model parts -num_parts = count_model_parts(dir_model) - -ARCH=gguf.MODEL_ARCH.BLOOM -gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH]) - -print("gguf: get model metadata") - -block_count = hparams["n_layer"] - -gguf_writer.add_name("Bloom") -n_embed = hparams.get("hidden_size", hparams.get("n_embed")) -n_head = hparams.get("n_head", hparams.get("num_attention_heads")) -gguf_writer.add_context_length(hparams.get("seq_length", n_embed)) -gguf_writer.add_embedding_length(n_embed) -gguf_writer.add_feed_forward_length(4 * n_embed) -gguf_writer.add_block_count(block_count) -gguf_writer.add_head_count(n_head) -gguf_writer.add_head_count_kv(n_head) -gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"]) -gguf_writer.add_file_type(ftype) - -# TOKENIZATION - -print("gguf: get tokenizer metadata") - -tokens: list[bytearray] = [] -scores: list[float] = [] -toktypes: list[int] = [] - -# gpt2 tokenizer -gguf_writer.add_tokenizer_model("gpt2") - -print("gguf: get gpt2 tokenizer vocab") - -# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py -tokenizer = AutoTokenizer.from_pretrained(dir_model) - -# The number of tokens in tokenizer.json can differ from the expected vocab size. -# This causes downstream issues with mismatched tensor sizes when running the inference -vocab_size = hparams.get("vocab_size", len(tokenizer.vocab)) -assert max(tokenizer.vocab.values()) < vocab_size - -reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()} - -for i in range(vocab_size): - tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]") - scores.append(0.0) # dummy - toktypes.append(gguf.TokenType.NORMAL) - -gguf_writer.add_token_list(tokens) -gguf_writer.add_token_scores(scores) -gguf_writer.add_token_types(toktypes) - -special_vocab = gguf.SpecialVocab(dir_model, load_merges=True) -special_vocab.add_to_gguf(gguf_writer) - -# TENSORS - -tensor_map = gguf.get_tensor_name_map(ARCH, block_count) - -# params for qkv transform -n_head_kv = hparams.get("n_head_kv", n_head) -head_dim = n_embed // n_head - -# tensor info -print("gguf: get tensor metadata") - -if num_parts == 0: - part_names = iter(("pytorch_model.bin",)) -else: - part_names = ( - f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1) - ) - -for part_name in part_names: - if args.vocab_only: - break - print("gguf: loading model part '" + part_name + "'") - model_part = torch.load(dir_model / part_name, map_location="cpu") - - has_lm_head = True - if "lm_head.weight" not in model_part.keys() and "output.weight" not in model_part.keys(): - has_lm_head = False - - for original_name in model_part.keys(): - data = model_part[original_name] - name = re.sub(r'transformer\.', '', original_name) - - old_dtype = data.dtype - - # convert any unsupported data types to float32 - if data.dtype != torch.float16 and data.dtype != torch.float32: - data = data.to(torch.float32) - - data = data.squeeze().numpy() - - if re.match(r"h\.\d+\.self_attention\.query_key_value\.weight", name): - # Map bloom-style qkv_linear to gpt-style qkv_linear - # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252 # noqa - # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312 # noqa - qkv_weights = data.reshape((n_head, 3, n_embed // n_head, n_embed)) - data = np.concatenate( - (qkv_weights[:, 0, :, :].reshape((-1, n_embed)), - qkv_weights[:, 1, :, :].reshape((-1, n_embed)), - qkv_weights[:, 2, :, :].reshape((-1, n_embed))), - axis=0 - ) - print("re-format attention.linear_qkv.weight") - elif re.match(r"h\.\d+\.self_attention\.query_key_value\.bias", name): - qkv_bias = data.reshape((n_head, 3, n_embed // n_head)) - data = np.concatenate( - (qkv_bias[:, 0, :].reshape((n_embed,)), - qkv_bias[:, 1, :].reshape((n_embed,)), - qkv_bias[:, 2, :].reshape((n_embed,))), - axis=0 - ) - print("re-format attention.linear_qkv.bias") - - # map tensor names - new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) - if new_name is None: - print("Can not map tensor '" + name + "'") - sys.exit() - - n_dims = len(data.shape) - data_dtype = data.dtype - - # if f32 desired, convert any float16 to float32 - if ftype == 0 and data_dtype == np.float16: - data = data.astype(np.float32) - - # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 - if ftype == 1 and data_dtype == np.float16 and n_dims == 1: - data = data.astype(np.float32) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: - data = data.astype(np.float16) - - print(name, "=>", new_name + ", shape = " + str(data.shape) + ", " + str(old_dtype) + " --> " + str(data.dtype)) - - gguf_writer.add_tensor(new_name, data) - - if not has_lm_head and name == "word_embeddings.weight": - gguf_writer.add_tensor("output.weight", data) - print(name, "=>", "output.weight" + ", shape = " + str(data.shape) + ", " + str(old_dtype) + " --> " + str(data.dtype)) # noqa - - -print("gguf: write header") -gguf_writer.write_header_to_file() -print("gguf: write metadata") -gguf_writer.write_kv_data_to_file() -if not args.vocab_only: - print("gguf: write tensors") - gguf_writer.write_tensors_to_file() - -gguf_writer.close() - -print(f"gguf: model successfully exported to '{fname_out}'") -print("") diff --git a/convert-falcon-hf-to-gguf.py b/convert-falcon-hf-to-gguf.py deleted file mode 100755 index 9252e1c46..000000000 --- a/convert-falcon-hf-to-gguf.py +++ /dev/null @@ -1,250 +0,0 @@ -#!/usr/bin/env python3 -# HF falcon--> gguf conversion - -from __future__ import annotations - -import argparse -import contextlib -import json -import os -import struct -import sys -from pathlib import Path -from typing import Any - -import numpy as np -import torch -from transformers import AutoTokenizer # type: ignore[import] - -if 'NO_LOCAL_GGUF' not in os.environ: - sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf')) -import gguf - - -def count_model_parts(dir_model: Path, prefix: str) -> int: - num_parts = 0 - for filename in os.listdir(dir_model): - if filename.startswith(prefix): - num_parts += 1 - - if num_parts > 0: - print("gguf: found " + str(num_parts) + " model parts") - return num_parts - - -def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser(description="Convert a Falcon model to a GGML compatible file") - parser.add_argument( - "--vocab-only", action="store_true", - help="extract only the vocab", - ) - parser.add_argument( - "--outfile", type=Path, - help="path to write to; default: based on input", - ) - parser.add_argument( - "model", type=Path, - help="directory containing model file, or model file itself (*.bin)", - ) - parser.add_argument( - "ftype", type=int, choices=[0, 1], default=1, nargs='?', - help="output format - use 0 for float32, 1 for float16", - ) - return parser.parse_args() - -args = parse_args() - -dir_model = args.model -ftype = args.ftype -if not dir_model.is_dir(): - print(f'Error: {args.model} is not a directory', file = sys.stderr) - sys.exit(1) - -# possible tensor data types -# ftype == 0 -> float32 -# ftype == 1 -> float16 - -# map from ftype to string -ftype_str = ["f32", "f16"] - -if args.outfile is not None: - fname_out = args.outfile -else: - # output in the same directory as the model by default - fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf' - -print("gguf: loading model "+dir_model.name) - -with open(dir_model / "config.json", "r", encoding="utf-8") as f: - hparams = json.load(f) - -if hparams["architectures"][0] != "FalconForCausalLM": - print("Model architecture not supported: " + hparams["architectures"][0]) - - sys.exit(1) - -# get number of model parts -num_parts = count_model_parts(dir_model, "model-00") -if num_parts: - is_safetensors = True - from safetensors import safe_open -else: - is_safetensors = False - num_parts = count_model_parts(dir_model, "pytorch_model-") - -ARCH=gguf.MODEL_ARCH.FALCON -gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH]) - -print("gguf: get model metadata") - -block_count = hparams["num_hidden_layers"] - -gguf_writer.add_name("Falcon") -gguf_writer.add_context_length(2048) # not in config.json -gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform -gguf_writer.add_embedding_length(hparams["hidden_size"]) -gguf_writer.add_feed_forward_length(4 * hparams["hidden_size"]) -gguf_writer.add_block_count(block_count) -gguf_writer.add_head_count(hparams["num_attention_heads"]) -if "num_kv_heads" in hparams: - gguf_writer.add_head_count_kv(hparams["num_kv_heads"]) -else: - gguf_writer.add_head_count_kv(1) -gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"]) -gguf_writer.add_file_type(ftype) - -# TOKENIZATION - -print("gguf: get tokenizer metadata") - -tokens: list[bytearray] = [] -scores: list[float] = [] -toktypes: list[int] = [] - -# gpt2 tokenizer -gguf_writer.add_tokenizer_model("gpt2") - -print("gguf: get gpt2 tokenizer vocab") - -# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py -tokenizer = AutoTokenizer.from_pretrained(dir_model) - -# The number of tokens in tokenizer.json can differ from the expected vocab size. -# This causes downstream issues with mismatched tensor sizes when running the inference -vocab_size = hparams.get("vocab_size", len(tokenizer.vocab)) -assert max(tokenizer.vocab.values()) < vocab_size - -reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()} - -for i in range(vocab_size): - tokens.append(reverse_vocab[i]) - scores.append(0.0) # dummy - toktypes.append(gguf.TokenType.NORMAL) - -gguf_writer.add_token_list(tokens) -gguf_writer.add_token_scores(scores) -gguf_writer.add_token_types(toktypes) - -special_vocab = gguf.SpecialVocab(dir_model, load_merges = True) -special_vocab.add_to_gguf(gguf_writer) - -# TENSORS - -tensor_map = gguf.get_tensor_name_map(ARCH,block_count) - -# params for qkv transform -n_head = hparams["num_attention_heads"] -n_head_kv = hparams["num_kv_heads"] if "num_kv_heads" in hparams else 1 - -head_dim = hparams["hidden_size"] // n_head - -# tensor info -print("gguf: get tensor metadata") - -if num_parts == 0: - part_names = iter(("pytorch_model.bin",)) -elif is_safetensors: - part_names = ( - f"model-{n:05}-of-{num_parts:05}.safetensors" for n in range(1, num_parts + 1) - ) -else: - part_names = ( - f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1) - ) - -for part_name in part_names: - if args.vocab_only: - break - print("gguf: loading model part '" + part_name + "'") - if is_safetensors: - ctx = safe_open(dir_model / part_name, framework="pt", device="cpu") - else: - ctx = contextlib.nullcontext(torch.load(dir_model / part_name, map_location="cpu")) - - with ctx as model_part: - for name in model_part.keys(): - data = model_part.get_tensor(name) if is_safetensors else model_part[name] - - old_dtype = data.dtype - - # convert any unsupported data types to float32 - if data.dtype != torch.float16 and data.dtype != torch.float32: - data = data.to(torch.float32) - - # QKV tensor transform - # The original query_key_value tensor contains n_head_kv "kv groups", - # each consisting of n_head/n_head_kv query weights followed by one key - # and one value weight (shared by all query heads in the kv group). - # This layout makes it a big pain to work with in GGML. - # So we rearrange them here,, so that we have n_head query weights - # followed by n_head_kv key weights followed by n_head_kv value weights, - # in contiguous fashion. - # ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py - - if "query_key_value" in name: - qkv = data.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head) - q = qkv[:, :-2 ].reshape(n_head * head_dim, head_dim * n_head) - k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head) - v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head) - data = torch.cat((q,k,v)).reshape_as(data) - - data = data.squeeze().numpy() - - # map tensor names - new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias")) - if new_name is None: - print("Can not map tensor '" + name + "'") - sys.exit() - - n_dims = len(data.shape) - data_dtype = data.dtype - - # if f32 desired, convert any float16 to float32 - if ftype == 0 and data_dtype == np.float16: - data = data.astype(np.float32) - - # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 - if ftype == 1 and data_dtype == np.float16 and n_dims == 1: - data = data.astype(np.float32) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: - data = data.astype(np.float16) - - print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype)) - - gguf_writer.add_tensor(new_name, data) - - -print("gguf: write header") -gguf_writer.write_header_to_file() -print("gguf: write metadata") -gguf_writer.write_kv_data_to_file() -if not args.vocab_only: - print("gguf: write tensors") - gguf_writer.write_tensors_to_file() - -gguf_writer.close() - -print(f"gguf: model successfully exported to '{fname_out}'") -print("") diff --git a/convert-gptneox-hf-to-gguf.py b/convert-gptneox-hf-to-gguf.py deleted file mode 100755 index d4e85f518..000000000 --- a/convert-gptneox-hf-to-gguf.py +++ /dev/null @@ -1,212 +0,0 @@ -#!/usr/bin/env python3 -# HF gptneox--> gguf conversion - -from __future__ import annotations - -import argparse -import json -import os -import struct -import sys -from pathlib import Path -from typing import Any - -import numpy as np -import torch -from transformers import AutoTokenizer # type: ignore[import] - -if 'NO_LOCAL_GGUF' not in os.environ: - sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf')) -import gguf - - -def count_model_parts(dir_model: Path) -> int: - num_parts = 0 - for filename in os.listdir(dir_model): - if filename.startswith("pytorch_model-"): - num_parts += 1 - - if num_parts > 0: - print("gguf: found " + str(num_parts) + " model parts") - return num_parts - - -def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser(description="Convert a GPT-NeoX model to a GGML compatible file") - parser.add_argument( - "--vocab-only", action="store_true", - help="extract only the vocab", - ) - parser.add_argument( - "--outfile", type=Path, - help="path to write to; default: based on input", - ) - parser.add_argument( - "model", type=Path, - help="directory containing model file, or model file itself (*.bin)", - ) - parser.add_argument( - "ftype", type=int, choices=[0, 1], default=1, nargs='?', - help="output format - use 0 for float32, 1 for float16", - ) - return parser.parse_args() - -args = parse_args() - -dir_model = args.model -ftype = args.ftype -if not dir_model.is_dir(): - print(f'Error: {args.model} is not a directory', file = sys.stderr) - sys.exit(1) - -# possible tensor data types -# ftype == 0 -> float32 -# ftype == 1 -> float16 - -# map from ftype to string -ftype_str = ["f32", "f16"] - -if args.outfile is not None: - fname_out = args.outfile -else: - # output in the same directory as the model by default - fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf' - -print("gguf: loading model "+dir_model.name) - -with open(dir_model / "config.json", "r", encoding="utf-8") as f: - hparams = json.load(f) - -if hparams["architectures"][0] != "GPTNeoXForCausalLM": - print("Model architecture not supported: " + hparams["architectures"][0]) - - sys.exit() - -# get number of model parts -num_parts = count_model_parts(dir_model) - -ARCH=gguf.MODEL_ARCH.GPTNEOX -gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH]) - -print("gguf: get model metadata") - -block_count = hparams["num_hidden_layers"] - -gguf_writer.add_name(dir_model.name) -gguf_writer.add_context_length(hparams["max_position_embeddings"]) -gguf_writer.add_embedding_length(hparams["hidden_size"]) -gguf_writer.add_block_count(block_count) -gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) -gguf_writer.add_rope_dimension_count(int(hparams["rotary_pct"]*(hparams["hidden_size"]//hparams["num_attention_heads"]))) -gguf_writer.add_head_count(hparams["num_attention_heads"]) -gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True) -gguf_writer.add_layer_norm_eps(hparams["layer_norm_eps"]) - -# TOKENIZATION - -print("gguf: get tokenizer metadata") - -tokens: list[bytearray] = [] -scores: list[float] = [] -toktypes: list[int] = [] - -# gpt2 tokenizer -gguf_writer.add_tokenizer_model("gpt2") - -print("gguf: get gpt2 tokenizer vocab") - -# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py -tokenizer = AutoTokenizer.from_pretrained(dir_model) - -# The number of tokens in tokenizer.json can differ from the expected vocab size. -# This causes downstream issues with mismatched tensor sizes when running the inference -vocab_size = hparams.get("vocab_size", len(tokenizer.vocab)) -assert max(tokenizer.vocab.values()) < vocab_size - -reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()} - -for i in range(vocab_size): - tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]") - scores.append(0.0) # dummy - toktypes.append(gguf.TokenType.NORMAL) - -gguf_writer.add_token_list(tokens) -gguf_writer.add_token_scores(scores) -gguf_writer.add_token_types(toktypes) - -special_vocab = gguf.SpecialVocab(dir_model, load_merges = True) -special_vocab.add_to_gguf(gguf_writer) - -# TENSORS - -tensor_map = gguf.get_tensor_name_map(ARCH,block_count) - -# tensor info -print("gguf: get tensor metadata") - -if num_parts == 0: - part_names = iter(("pytorch_model.bin",)) -else: - part_names = ( - f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1) - ) - -for part_name in part_names: - if args.vocab_only: - break - print("gguf: loading model part '" + part_name + "'") - model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu") - - for name in model_part.keys(): - data = model_part[name] - - # we don't need these - if name.endswith(".attention.masked_bias") or name.endswith(".attention.bias") or name.endswith(".attention.rotary_emb.inv_freq"): - continue - - old_dtype = data.dtype - - # convert any unsupported data types to float32 - if data.dtype != torch.float16 and data.dtype != torch.float32: - data = data.to(torch.float32) - - data = data.squeeze().numpy() - - # map tensor names - new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias")) - if new_name is None: - print("Can not map tensor '" + name + "'") - sys.exit() - - n_dims = len(data.shape) - data_dtype = data.dtype - - # if f32 desired, convert any float16 to float32 - if ftype == 0 and data_dtype == np.float16: - data = data.astype(np.float32) - - # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 - if ftype == 1 and data_dtype == np.float16 and n_dims == 1: - data = data.astype(np.float32) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: - data = data.astype(np.float16) - - print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype)) - - gguf_writer.add_tensor(new_name, data) - - -print("gguf: write header") -gguf_writer.write_header_to_file() -print("gguf: write metadata") -gguf_writer.write_kv_data_to_file() -if not args.vocab_only: - print("gguf: write tensors") - gguf_writer.write_tensors_to_file() - -gguf_writer.close() - -print(f"gguf: model successfully exported to '{fname_out}'") -print("") diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py new file mode 100755 index 000000000..ae30b2a76 --- /dev/null +++ b/convert-hf-to-gguf.py @@ -0,0 +1,1934 @@ +#!/usr/bin/env python3 + +from __future__ import annotations + +import argparse +import contextlib +import json +import os +import re +import sys +from enum import IntEnum +from pathlib import Path +from typing import TYPE_CHECKING, Any, ContextManager, Iterator, Sequence, cast + +import numpy as np +import torch + +if TYPE_CHECKING: + from torch import Tensor + +if 'NO_LOCAL_GGUF' not in os.environ: + sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) +import gguf + +from convert import HfVocab + + +###### MODEL DEFINITIONS ###### + +class SentencePieceTokenTypes(IntEnum): + NORMAL = 1 + UNKNOWN = 2 + CONTROL = 3 + USER_DEFINED = 4 + UNUSED = 5 + BYTE = 6 + + +class Model: + def __init__(self, dir_model: Path, ftype: int, fname_out: Path, is_big_endian: bool): + self.dir_model = dir_model + self.ftype = ftype + self.fname_out = fname_out + self.is_big_endian = is_big_endian + self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE + self.is_safetensors = self._is_model_safetensors() + self.num_parts = Model.count_model_parts(self.dir_model, ".safetensors" if self.is_safetensors else ".bin") + self.part_names = self._get_part_names() + self.hparams = Model.load_hparams(self.dir_model) + self.model_arch = self._get_model_architecture() + self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=False) + self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"]) + + def find_hparam(self, keys: Sequence[str], optional: bool = False) -> Any: + key = next((k for k in keys if k in self.hparams), None) + if key is not None: + return self.hparams[key] + if optional: + return None + raise KeyError(f"could not find any of: {keys}") + + def set_vocab(self): + self._set_vocab_gpt2() + + def get_tensors(self) -> Iterator[tuple[str, Tensor]]: + for part_name in self.part_names: + print(f"gguf: loading model part '{part_name}'") + ctx: ContextManager[Any] + if self.is_safetensors: + from safetensors import safe_open + ctx = cast(ContextManager[Any], safe_open(self.dir_model / part_name, framework="pt", device="cpu")) + else: + ctx = contextlib.nullcontext(torch.load(str(self.dir_model / part_name), map_location="cpu", mmap=True, weights_only=True)) + + with ctx as model_part: + for name in model_part.keys(): + data = model_part.get_tensor(name) if self.is_safetensors else model_part[name] + yield name, data + + def set_gguf_parameters(self): + self.gguf_writer.add_name(self.dir_model.name) + self.gguf_writer.add_block_count(self.block_count) + + if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx"], optional=True)) is not None: + self.gguf_writer.add_context_length(n_ctx) + + n_embd = self.find_hparam(["hidden_size", "n_embd"]) + self.gguf_writer.add_embedding_length(n_embd) + + if (n_ff := self.find_hparam(["intermediate_size", "n_inner"], optional=True)) is not None: + self.gguf_writer.add_feed_forward_length(n_ff) + + n_head = self.find_hparam(["num_attention_heads", "n_head"]) + self.gguf_writer.add_head_count(n_head) + + if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None: + self.gguf_writer.add_head_count_kv(n_head_kv) + + if (f_rms_eps := self.hparams.get("rms_norm_eps")) is not None: + self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps) + if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon"], optional=True)) is not None: + self.gguf_writer.add_layer_norm_eps(f_norm_eps) + if (n_experts := self.hparams.get("num_local_experts")) is not None: + self.gguf_writer.add_expert_count(n_experts) + if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None: + self.gguf_writer.add_expert_used_count(n_experts_used) + + self.gguf_writer.add_file_type(self.ftype) + + def write_tensors(self): + block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer"))) + tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) + for name, data_torch in self.get_tensors(): + # we don't need these + if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")): + continue + + old_dtype = data_torch.dtype + + # convert any unsupported data types to float32 + if data_torch.dtype not in (torch.float16, torch.float32): + data_torch = data_torch.to(torch.float32) + + data = data_torch.squeeze().numpy() + + # map tensor names + new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) + if new_name is None: + print(f"Can not map tensor {name!r}") + sys.exit() + + n_dims = len(data.shape) + data_dtype = data.dtype + + # if f32 desired, convert any float16 to float32 + if self.ftype == 0 and data_dtype == np.float16: + data = data.astype(np.float32) + + # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 + if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: + data = data.astype(np.float32) + + # if f16 desired, convert any float32 2-dim weight tensors to float16 + if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + data = data.astype(np.float16) + + print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + + self.gguf_writer.add_tensor(new_name, data) + + def write(self): + self.write_tensors() + self.gguf_writer.write_header_to_file() + self.gguf_writer.write_kv_data_to_file() + self.gguf_writer.write_tensors_to_file() + self.gguf_writer.close() + + def write_vocab(self): + self.gguf_writer.write_header_to_file() + self.gguf_writer.write_kv_data_to_file() + self.gguf_writer.close() + + @staticmethod + def count_model_parts(dir_model: Path, prefix: str) -> int: + num_parts = 0 + for filename in os.listdir(dir_model): + if filename.endswith(prefix): + num_parts += 1 + + return num_parts + + @staticmethod + def load_hparams(dir_model): + with open(dir_model / "config.json", "r", encoding="utf-8") as f: + return json.load(f) + + @staticmethod + def from_model_architecture(model_architecture): + if model_architecture == "GPTNeoXForCausalLM": + return GPTNeoXModel + if model_architecture == "BloomForCausalLM": + return BloomModel + if model_architecture == "MPTForCausalLM": + return MPTModel + if model_architecture in ("BaichuanForCausalLM", "BaiChuanForCausalLM"): + return BaichuanModel + if model_architecture in ("FalconForCausalLM", "RWForCausalLM"): + return FalconModel + if model_architecture == "GPTBigCodeForCausalLM": + return StarCoderModel + if model_architecture == "GPTRefactForCausalLM": + return RefactModel + if model_architecture == "PersimmonForCausalLM": + return PersimmonModel + if model_architecture in ("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"): + return StableLMModel + if model_architecture == "QWenLMHeadModel": + return QwenModel + if model_architecture == "Qwen2ForCausalLM": + return Model + if model_architecture == "MixtralForCausalLM": + return MixtralModel + if model_architecture == "GPT2LMHeadModel": + return GPT2Model + if model_architecture == "PhiForCausalLM": + return Phi2Model + if model_architecture == "PlamoForCausalLM": + return PlamoModel + if model_architecture == "CodeShellForCausalLM": + return CodeShellModel + if model_architecture == "OrionForCausalLM": + return OrionModel + if model_architecture == "InternLM2ForCausalLM": + return InternLM2Model + if model_architecture == "MiniCPMForCausalLM": + return MiniCPMModel + if model_architecture == "BertModel": + return BertModel + if model_architecture == "NomicBertModel": + return NomicBertModel + if model_architecture == "GemmaForCausalLM": + return GemmaModel + return Model + + def _is_model_safetensors(self) -> bool: + return Model.count_model_parts(self.dir_model, ".safetensors") > 0 + + def _get_part_names(self): + if self.is_safetensors: + if self.num_parts == 1: # there's only one .safetensors file + return ("model.safetensors",) + return (f"model-{n:05}-of-{self.num_parts:05}.safetensors" for n in range(1, self.num_parts + 1)) + + if self.num_parts == 1: # there's only one .bin file + return ("pytorch_model.bin",) + return (f"pytorch_model-{n:05}-of-{self.num_parts:05}.bin" for n in range(1, self.num_parts + 1)) + + def _get_model_architecture(self) -> gguf.MODEL_ARCH: + arch = self.hparams["architectures"][0] + if arch == "GPTNeoXForCausalLM": + return gguf.MODEL_ARCH.GPTNEOX + if arch == "BloomForCausalLM": + return gguf.MODEL_ARCH.BLOOM + if arch == "MPTForCausalLM": + return gguf.MODEL_ARCH.MPT + if arch in ("BaichuanForCausalLM", "BaiChuanForCausalLM"): + return gguf.MODEL_ARCH.BAICHUAN + if arch in ("FalconForCausalLM", "RWForCausalLM"): + return gguf.MODEL_ARCH.FALCON + if arch == "GPTBigCodeForCausalLM": + return gguf.MODEL_ARCH.STARCODER + if arch == "GPTRefactForCausalLM": + return gguf.MODEL_ARCH.REFACT + if arch == "PersimmonForCausalLM": + return gguf.MODEL_ARCH.PERSIMMON + if arch in ("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"): + return gguf.MODEL_ARCH.STABLELM + if arch == "QWenLMHeadModel": + return gguf.MODEL_ARCH.QWEN + if arch == "Qwen2ForCausalLM": + return gguf.MODEL_ARCH.QWEN2 + if arch == "MixtralForCausalLM": + return gguf.MODEL_ARCH.LLAMA + if arch == "GPT2LMHeadModel": + return gguf.MODEL_ARCH.GPT2 + if arch == "PhiForCausalLM": + return gguf.MODEL_ARCH.PHI2 + if arch == "PlamoForCausalLM": + return gguf.MODEL_ARCH.PLAMO + if arch == "CodeShellForCausalLM": + return gguf.MODEL_ARCH.CODESHELL + if arch == "OrionForCausalLM": + return gguf.MODEL_ARCH.ORION + if arch == "InternLM2ForCausalLM": + return gguf.MODEL_ARCH.INTERNLM2 + if arch == "MiniCPMForCausalLM": + return gguf.MODEL_ARCH.MINICPM + if arch == "BertModel": + return gguf.MODEL_ARCH.BERT + if arch == "NomicBertModel": + return gguf.MODEL_ARCH.NOMIC_BERT + if arch == "GemmaForCausalLM": + return gguf.MODEL_ARCH.GEMMA + + raise NotImplementedError(f'Architecture "{arch}" not supported!') + + def _set_vocab_gpt2(self): + dir_model = self.dir_model + hparams = self.hparams + tokens: list[bytearray] = [] + toktypes: list[int] = [] + + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(dir_model) + vocab_size = hparams.get("vocab_size", len(tokenizer.vocab)) + assert max(tokenizer.vocab.values()) < vocab_size + + reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()} + added_vocab = tokenizer.get_added_vocab() + + for i in range(vocab_size): + if i not in reverse_vocab: + pad_token = f"[PAD{i}]".encode('utf-8') + tokens.append(bytearray(pad_token)) + toktypes.append(gguf.TokenType.USER_DEFINED) + elif reverse_vocab[i] in added_vocab: + tokens.append(reverse_vocab[i]) + if tokenizer.added_tokens_decoder[i].special: + toktypes.append(gguf.TokenType.CONTROL) + else: + toktypes.append(gguf.TokenType.USER_DEFINED) + else: + tokens.append(reverse_vocab[i]) + toktypes.append(gguf.TokenType.NORMAL) + + self.gguf_writer.add_tokenizer_model("gpt2") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(dir_model, load_merges=True) + special_vocab.add_to_gguf(self.gguf_writer) + + def _set_vocab_qwen(self): + dir_model = self.dir_model + hparams = self.hparams + tokens: list[bytearray] = [] + toktypes: list[int] = [] + + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) + vocab_size = hparams["vocab_size"] + assert max(tokenizer.get_vocab().values()) < vocab_size + + merges = [] + vocab = {} + mergeable_ranks = tokenizer.mergeable_ranks + for token, rank in mergeable_ranks.items(): + vocab[QwenModel.token_bytes_to_string(token)] = rank + if len(token) == 1: + continue + merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank) + assert len(merged) == 2 + merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged))) + + # for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined + added_vocab = tokenizer.special_tokens + reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in (vocab | added_vocab).items()} + + for i in range(vocab_size): + if i not in reverse_vocab: + pad_token = f"[PAD{i}]".encode("utf-8") + tokens.append(bytearray(pad_token)) + toktypes.append(gguf.TokenType.USER_DEFINED) + elif reverse_vocab[i] in added_vocab: + tokens.append(reverse_vocab[i]) + toktypes.append(gguf.TokenType.CONTROL) + else: + tokens.append(reverse_vocab[i]) + toktypes.append(gguf.TokenType.NORMAL) + + self.gguf_writer.add_tokenizer_model("gpt2") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(dir_model, load_merges=False) + special_vocab.merges = merges + # only add special tokens when they were not already loaded from config.json + if len(special_vocab.special_token_ids) == 0: + special_vocab._set_special_token("bos", tokenizer.special_tokens["<|endoftext|>"]) + special_vocab._set_special_token("eos", tokenizer.special_tokens["<|endoftext|>"]) + # this one is usually not in config.json anyway + special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"]) + special_vocab.add_to_gguf(self.gguf_writer) + + def _set_vocab_sentencepiece(self): + from sentencepiece import SentencePieceProcessor + + tokenizer_path = self.dir_model / 'tokenizer.model' + + tokens: list[bytes] = [] + scores: list[float] = [] + toktypes: list[int] = [] + + if not tokenizer_path.is_file(): + print(f'Error: Missing {tokenizer_path}', file=sys.stderr) + sys.exit(1) + + tokenizer = SentencePieceProcessor(str(tokenizer_path)) + vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) + + for token_id in range(vocab_size): + piece = tokenizer.id_to_piece(token_id) + text = piece.encode("utf-8") + score = tokenizer.get_score(token_id) + + toktype = SentencePieceTokenTypes.NORMAL + if tokenizer.is_unknown(token_id): + toktype = SentencePieceTokenTypes.UNKNOWN + elif tokenizer.is_control(token_id): + toktype = SentencePieceTokenTypes.CONTROL + elif tokenizer.is_unused(token_id): + toktype = SentencePieceTokenTypes.UNUSED + elif tokenizer.is_byte(token_id): + toktype = SentencePieceTokenTypes.BYTE + + tokens.append(text) + scores.append(score) + toktypes.append(toktype) + + added_tokens_file = self.dir_model / 'added_tokens.json' + if added_tokens_file.is_file(): + with open(added_tokens_file, "r", encoding="utf-8") as f: + added_tokens_json = json.load(f) + + for key in added_tokens_json: + tokens.append(key.encode("utf-8")) + scores.append(-1000.0) + toktypes.append(SentencePieceTokenTypes.USER_DEFINED) + + self.gguf_writer.add_tokenizer_model("llama") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) + special_vocab.add_to_gguf(self.gguf_writer) + + def _set_vocab_hf(self): + path = self.dir_model + added_tokens_path = self.dir_model + vocab = HfVocab( + path, added_tokens_path if added_tokens_path.exists() else None + ) + tokens = [] + scores = [] + toktypes = [] + + for text, score, toktype in vocab.all_tokens(): + tokens.append(text) + scores.append(score) + toktypes.append(toktype) + + assert len(tokens) == vocab.vocab_size + + self.gguf_writer.add_tokenizer_model("llama") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) + special_vocab.add_to_gguf(self.gguf_writer) + + +class GPTNeoXModel(Model): + def set_gguf_parameters(self): + block_count = self.hparams["num_hidden_layers"] + + self.gguf_writer.add_name(self.dir_model.name) + self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) + self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) + self.gguf_writer.add_rope_dimension_count( + int(self.hparams["rotary_pct"] * (self.hparams["hidden_size"] // self.hparams["num_attention_heads"])), + ) + self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) + self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True)) + self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"]) + + +class BloomModel(Model): + def set_gguf_parameters(self): + self.gguf_writer.add_name("Bloom") + n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) + n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads")) + self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed)) + self.gguf_writer.add_embedding_length(n_embed) + self.gguf_writer.add_feed_forward_length(4 * n_embed) + self.gguf_writer.add_block_count(self.hparams["n_layer"]) + self.gguf_writer.add_head_count(n_head) + self.gguf_writer.add_head_count_kv(n_head) + self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) + self.gguf_writer.add_file_type(self.ftype) + + def write_tensors(self): + block_count = self.hparams["n_layer"] + tensors = dict(self.get_tensors()) + tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) + has_lm_head = True + n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads")) + n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) + + for name, data_torch in tensors.items(): + if "lm_head.weight" not in tensors.keys() and "output.weight" not in tensors.keys(): + has_lm_head = False + + name = re.sub(r'transformer\.', '', name) + + old_dtype = data_torch.dtype + + # convert any unsupported data types to float32 + if data_torch.dtype not in (torch.float16, torch.float32): + data_torch = data_torch.to(torch.float32) + + data = data_torch.squeeze().numpy() + + if re.match(r"h\.\d+\.self_attention\.query_key_value\.weight", name): + # Map bloom-style qkv_linear to gpt-style qkv_linear + # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252 # noqa + # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312 # noqa + qkv_weights = data.reshape((n_head, 3, n_embed // n_head, n_embed)) + data = np.concatenate( + ( + qkv_weights[:, 0, :, :].reshape((-1, n_embed)), + qkv_weights[:, 1, :, :].reshape((-1, n_embed)), + qkv_weights[:, 2, :, :].reshape((-1, n_embed)), + ), + axis=0, + ) + print("re-format attention.linear_qkv.weight") + elif re.match(r"h\.\d+\.self_attention\.query_key_value\.bias", name): + qkv_bias = data.reshape((n_head, 3, n_embed // n_head)) + data = np.concatenate( + ( + qkv_bias[:, 0, :].reshape((n_embed,)), + qkv_bias[:, 1, :].reshape((n_embed,)), + qkv_bias[:, 2, :].reshape((n_embed,)), + ), + axis=0, + ) + print("re-format attention.linear_qkv.bias") + + # map tensor names + new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) + if new_name is None: + print(f"Can not map tensor {name!r}") + sys.exit() + + n_dims = len(data.shape) + data_dtype = data.dtype + + # if f32 desired, convert any float16 to float32 + if self.ftype == 0 and data_dtype == np.float16: + data = data.astype(np.float32) + + # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 + if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: + data = data.astype(np.float32) + + # if f16 desired, convert any float32 2-dim weight tensors to float16 + if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + data = data.astype(np.float16) + + print(f"=> {new_name}, shape = {data.shape}, {old_dtype} --> {data.dtype}") + + self.gguf_writer.add_tensor(new_name, data) + + if not has_lm_head and name == "word_embeddings.weight": + self.gguf_writer.add_tensor("output.weight", data) + print(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}") + + +class MPTModel(Model): + def set_gguf_parameters(self): + block_count = self.hparams["n_layers"] + self.gguf_writer.add_name(self.dir_model.name) + self.gguf_writer.add_context_length(self.hparams["max_seq_len"]) + self.gguf_writer.add_embedding_length(self.hparams["d_model"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_feed_forward_length(4 * self.hparams["d_model"]) + self.gguf_writer.add_head_count(self.hparams["n_heads"]) + if kv_n_heads := self.hparams["attn_config"].get("kv_n_heads"): + self.gguf_writer.add_head_count_kv(kv_n_heads) + self.gguf_writer.add_layer_norm_eps(1e-5) + if self.hparams["attn_config"]["clip_qkv"] is not None: + self.gguf_writer.add_clamp_kqv(self.hparams["attn_config"]["clip_qkv"]) + self.gguf_writer.add_max_alibi_bias(self.hparams["attn_config"]["alibi_bias_max"]) + + def write_tensors(self): + block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers")) + tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) + for name, data_torch in self.get_tensors(): + # we don't need these + if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")): + continue + + old_dtype = data_torch.dtype + + # convert any unsupported data types to float32 + if data_torch.dtype not in (torch.float16, torch.float32): + data_torch = data_torch.to(torch.float32) + + data = data_torch.squeeze().numpy() + + # map tensor names + if "scales" in name: + new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias", ".scales")) + if new_name is not None: + new_name = new_name.replace("scales", "act.scales") + else: + new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) + if new_name is None: + print(f"Can not map tensor {name!r}") + sys.exit() + + n_dims = len(data.shape) + data_dtype = data.dtype + + # if f32 desired, convert any float16 to float32 + if self.ftype == 0 and data_dtype == np.float16: + data = data.astype(np.float32) + + # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 + if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: + data = data.astype(np.float32) + + # if f16 desired, convert any float32 2-dim weight tensors to float16 + if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + data = data.astype(np.float16) + + print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + + self.gguf_writer.add_tensor(new_name, data) + + +class OrionModel(Model): + def set_vocab(self): + self._set_vocab_sentencepiece() + + def set_gguf_parameters(self): + block_count = self.hparams["num_hidden_layers"] + head_count = self.hparams["num_attention_heads"] + head_count_kv = self.hparams.get("num_key_value_heads", head_count) + hf_repo = self.hparams.get("_name_or_path", "") + + ctx_length = 0 + if "max_sequence_length" in self.hparams: + ctx_length = self.hparams["max_sequence_length"] + elif "max_position_embeddings" in self.hparams: + ctx_length = self.hparams["max_position_embeddings"] + elif "model_max_length" in self.hparams: + ctx_length = self.hparams["model_max_length"] + else: + print("gguf: can not find ctx length parameter.") + sys.exit() + + self.gguf_writer.add_file_type(self.ftype) + self.gguf_writer.add_name(self.dir_model.name) + self.gguf_writer.add_source_hf_repo(hf_repo) + self.gguf_writer.add_tensor_data_layout("Meta AI original pth") + self.gguf_writer.add_context_length(ctx_length) + self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) + self.gguf_writer.add_head_count(head_count) + self.gguf_writer.add_head_count_kv(head_count_kv) + # note: config provides rms norm but it is actually layer norm + # ref: https://huggingface.co/OrionStarAI/Orion-14B-Chat/blob/276a17221ce42beb45f66fac657a41540e71f4f5/modeling_orion.py#L570-L571 + self.gguf_writer.add_layer_norm_eps(self.hparams["rms_norm_eps"]) + + def write_tensors(self): + # Collect tensors from generator object + model_kv = dict(self.get_tensors()) + block_count = self.hparams["num_hidden_layers"] + tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) + + for name, data_torch in model_kv.items(): + # we don't need these + if name.endswith(".rotary_emb.inv_freq"): + continue + + old_dtype = data_torch.dtype + + # convert any unsupported data types to float32 + if data_torch.dtype not in (torch.float16, torch.float32): + data_torch = data_torch.to(torch.float32) + + data = data_torch.squeeze().numpy() + + # map tensor names + new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) + if new_name is None: + print(f"Can not map tensor {name!r}") + sys.exit() + + n_dims = len(data.shape) + data_dtype = data.dtype + + # if f32 desired, convert any float16 to float32 + if self.ftype == 0 and data_dtype == np.float16: + data = data.astype(np.float32) + + # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 + if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: + data = data.astype(np.float32) + + # if f16 desired, convert any float32 2-dim weight tensors to float16 + if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + data = data.astype(np.float16) + + print(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + self.gguf_writer.add_tensor(new_name, data) + + +class BaichuanModel(Model): + def set_vocab(self): + self._set_vocab_sentencepiece() + + def set_gguf_parameters(self): + block_count = self.hparams["num_hidden_layers"] + head_count = self.hparams["num_attention_heads"] + head_count_kv = self.hparams.get("num_key_value_heads", head_count) + hf_repo = self.hparams.get("_name_or_path", "") + + ctx_length = 0 + if "max_sequence_length" in self.hparams: + ctx_length = self.hparams["max_sequence_length"] + elif "max_position_embeddings" in self.hparams: + ctx_length = self.hparams["max_position_embeddings"] + elif "model_max_length" in self.hparams: + ctx_length = self.hparams["model_max_length"] + else: + print("gguf: can not find ctx length parameter.") + sys.exit() + + self.gguf_writer.add_name(self.dir_model.name) + self.gguf_writer.add_source_hf_repo(hf_repo) + self.gguf_writer.add_tensor_data_layout("Meta AI original pth") + self.gguf_writer.add_context_length(ctx_length) + self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) + self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) + self.gguf_writer.add_head_count(head_count) + self.gguf_writer.add_head_count_kv(head_count_kv) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) + + if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: + if self.hparams["rope_scaling"].get("type") == "linear": + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) + self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) + + def write_tensors(self): + # Collect tensors from generator object + model_kv = dict(self.get_tensors()) + block_count = self.hparams["num_hidden_layers"] + head_count = self.hparams["num_attention_heads"] + tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) + head_count_kv = self.hparams.get("num_key_value_heads", head_count) + + for i in range(block_count): + if (w := model_kv.get(f"model.layers.{i}.self_attn.W_pack.weight")) is not None: + print(f"Unpacking and permuting layer {i}") + model_kv[f"model.layers.{i}.self_attn.q_proj.weight"] = \ + self._reverse_hf_permute_part(w, 0, head_count, head_count) + model_kv[f"model.layers.{i}.self_attn.k_proj.weight"] = \ + self._reverse_hf_permute_part(w, 1, head_count, head_count_kv) + model_kv[f"model.layers.{i}.self_attn.v_proj.weight"] = \ + self._reverse_hf_part(w, 2) + del model_kv[f"model.layers.{i}.self_attn.W_pack.weight"] + + for name, data_torch in model_kv.items(): + # we don't need these + if name.endswith(".rotary_emb.inv_freq"): + continue + + old_dtype = data_torch.dtype + + # convert any unsupported data types to float32 + if data_torch.dtype not in (torch.float16, torch.float32): + data_torch = data_torch.to(torch.float32) + + data = data_torch.squeeze().numpy() + + # map tensor names + new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) + if new_name is None: + print(f"Can not map tensor {name!r}") + sys.exit() + + n_dims = len(data.shape) + data_dtype = data.dtype + + # if f32 desired, convert any float16 to float32 + if self.ftype == 0 and data_dtype == np.float16: + data = data.astype(np.float32) + + # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 + if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: + data = data.astype(np.float32) + + # if f16 desired, convert any float32 2-dim weight tensors to float16 + if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + data = data.astype(np.float16) + + print(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + self.gguf_writer.add_tensor(new_name, data) + + def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: + if n_kv_head is not None and n_head != n_kv_head: + n_head //= n_kv_head + + return ( + weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) + .swapaxes(1, 2) + .reshape(weights.shape) + ) + + def _reverse_hf_permute_part( + self, weights: Tensor, n_part: int, n_head: int, n_head_kv: int | None = None, + ) -> Tensor: + r = weights.shape[0] // 3 + return self._reverse_hf_permute(weights[r * n_part:r * n_part + r, ...], n_head, n_head_kv) + + def _reverse_hf_part(self, weights: Tensor, n_part: int) -> Tensor: + r = weights.shape[0] // 3 + return weights[r * n_part:r * n_part + r, ...] + + +class FalconModel(Model): + def set_gguf_parameters(self): + block_count = self.hparams.get("num_hidden_layers") + if block_count is None: + block_count = self.hparams["n_layer"] # old name + + n_head = self.hparams.get("num_attention_heads") + if n_head is None: + n_head = self.hparams["n_head"] # old name + + n_head_kv = self.hparams.get("num_kv_heads") + if n_head_kv is None: + n_head_kv = self.hparams.get("n_head_kv", 1) # old name + + self.gguf_writer.add_name("Falcon") + self.gguf_writer.add_context_length(2048) # not in config.json + self.gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform + self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) + self.gguf_writer.add_feed_forward_length(4 * self.hparams["hidden_size"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_head_count(n_head) + self.gguf_writer.add_head_count_kv(n_head_kv) + self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) + self.gguf_writer.add_file_type(self.ftype) + + def write_tensors(self): + block_count = self.hparams.get("num_hidden_layers") + if block_count is None: + block_count = self.hparams["n_layer"] # old name + + n_head = self.hparams.get("num_attention_heads") + if n_head is None: + n_head = self.hparams["n_head"] # old name + + n_head_kv = self.hparams.get("num_kv_heads") + if n_head_kv is None: + n_head_kv = self.hparams.get("n_head_kv", 1) # old name + + head_dim = self.hparams["hidden_size"] // n_head + tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) + + for name, data_torch in self.get_tensors(): + old_dtype = data_torch.dtype + + # convert any unsupported data types to float32 + if data_torch.dtype not in (torch.float16, torch.float32): + data_torch = data_torch.to(torch.float32) + + # QKV tensor transform + # The original query_key_value tensor contains n_head_kv "kv groups", + # each consisting of n_head/n_head_kv query weights followed by one key + # and one value weight (shared by all query heads in the kv group). + # This layout makes it a big pain to work with in GGML. + # So we rearrange them here,, so that we have n_head query weights + # followed by n_head_kv key weights followed by n_head_kv value weights, + # in contiguous fashion. + # ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py + + if "query_key_value" in name: + qkv = data_torch.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head) + q = qkv[:, :-2].reshape(n_head * head_dim, head_dim * n_head) + k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head) + v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head) + data_torch = torch.cat((q, k, v)).reshape_as(data_torch) + + data = data_torch.squeeze().numpy() + + # map tensor names + new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) + if new_name is None: + print(f"Can not map tensor {name!r}") + sys.exit() + + n_dims = len(data.shape) + data_dtype = data.dtype + + # if f32 desired, convert any float16 to float32 + if self.ftype == 0 and data_dtype == np.float16: + data = data.astype(np.float32) + + # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 + if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: + data = data.astype(np.float32) + + # if f16 desired, convert any float32 2-dim weight tensors to float16 + if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + data = data.astype(np.float16) + + print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + + self.gguf_writer.add_tensor(new_name, data) + + +class StarCoderModel(Model): + def set_gguf_parameters(self): + block_count = self.hparams["n_layer"] + + self.gguf_writer.add_name("StarCoder") + self.gguf_writer.add_context_length(self.hparams["n_positions"]) + self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) + self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_head_count(self.hparams["n_head"]) + self.gguf_writer.add_head_count_kv(1) + self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) + self.gguf_writer.add_file_type(self.ftype) + + +class RefactModel(Model): + def set_gguf_parameters(self): + hidden_dim = self.hparams["n_embd"] + inner_dim = 4 * hidden_dim + hidden_dim = int(2 * inner_dim / 3) + multiple_of = 256 + ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of) + + block_count = self.hparams["n_layer"] + + self.gguf_writer.add_name("Refact") + # refact uses Alibi. So this is from config.json which might be used by training. + self.gguf_writer.add_context_length(self.hparams["n_positions"]) + self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) + + self.gguf_writer.add_feed_forward_length(ff_dim) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_head_count(self.hparams["n_head"]) + self.gguf_writer.add_head_count_kv(1) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"]) + self.gguf_writer.add_file_type(self.ftype) + + def write_tensors(self): + hidden_dim = self.hparams["n_embd"] + inner_dim = 4 * hidden_dim + hidden_dim = int(2 * inner_dim / 3) + multiple_of = 256 + ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of) + n_head = self.hparams["n_head"] + n_head_kv = 1 + head_dim = self.hparams["n_embd"] // n_head + block_count = self.hparams["n_layer"] + + tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) + + tensors = dict(self.get_tensors()) + for i in range(block_count): + if (w := tensors.get(f"transformer.h.{i}.attn.kv.weight")) is not None: + tensors[f"model.layers.{i}.self_attn.k_proj.weight"] = w[:n_head_kv * head_dim] + tensors[f"model.layers.{i}.self_attn.v_proj.weight"] = w[n_head_kv * head_dim:] + del tensors[f"transformer.h.{i}.attn.kv.weight"] + if (w := tensors.get(f"transformer.h.{i}.attn.q.weight")) is not None: + tensors[f"model.layers.{i}.self_attn.q_proj.weight"] = w + del tensors[f"transformer.h.{i}.attn.q.weight"] + if (w := tensors.get(f"transformer.h.{i}.mlp.gate_up_proj.weight")) is not None: + tensors[f"model.layers.{i}.mlp.gate_proj.weight"] = w[:ff_dim] + tensors[f"model.layers.{i}.mlp.up_proj.weight"] = w[ff_dim:] + del tensors[f"transformer.h.{i}.mlp.gate_up_proj.weight"] + + for name, data_torch in tensors.items(): + old_dtype = data_torch.dtype + + # convert any unsupported data types to float32 + if data_torch.dtype not in (torch.float16, torch.float32): + data_torch = data_torch.to(torch.float32) + + data = data_torch.squeeze().numpy() + + # map tensor names + new_name = tensor_map.get_name(name, try_suffixes=(".weight",)) + if new_name is None: + print(f"Can not map tensor {name!r}") + sys.exit() + + n_dims = len(data.shape) + data_dtype = data.dtype + + # if f32 desired, convert any float16 to float32 + if self.ftype == 0 and data_dtype == np.float16: + data = data.astype(np.float32) + + # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 + if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: + data = data.astype(np.float32) + + # if f16 desired, convert any float32 2-dim weight tensors to float16 + if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + data = data.astype(np.float16) + + print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + + self.gguf_writer.add_tensor(new_name, data) + + +class PersimmonModel(Model): + def set_gguf_parameters(self): + block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers")) + head_count = self.hparams["num_attention_heads"] + head_count_kv = head_count + hidden_size = self.hparams["hidden_size"] + + self.gguf_writer.add_name('persimmon-8b-chat') + self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) + self.gguf_writer.add_embedding_length(hidden_size) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) + + # NOTE: not sure about this change - why does the model not have a rope dimension count when it is smaller + # than the head size? + # ref: https://github.com/ggerganov/llama.cpp/pull/4889 + # self.gguf_writer.add_rope_dimension_count(hidden_size // head_count) + self.gguf_writer.add_rope_dimension_count(hidden_size // head_count // 2) + + self.gguf_writer.add_head_count(head_count) + self.gguf_writer.add_head_count_kv(head_count_kv) + self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"]) + self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"]) + + def set_vocab(self): + self._set_vocab_sentencepiece() + # self.gguf_writer.add_bos_token_id(71013) + # self.gguf_writer.add_eos_token_id(71013) + + def write_tensors(self): + block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers")) + tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) + + for name, data_torch in self.get_tensors(): + if name.endswith(".self_attention.rotary_emb.inv_freq"): + continue + old_dtype = data_torch.dtype + # TODO: FP16 conversion produces garbage outputs. (Q8_0 does not, so..?) + data = data_torch.to(torch.float32).squeeze().numpy() + new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) + if new_name is None: + print(f"Can not map tensor {name!r}") + sys.exit() + n_dims = len(data.shape) + print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + self.gguf_writer.add_tensor(new_name, data) + + +class StableLMModel(Model): + def set_vocab(self): + if (self.dir_model / "tokenizer.json").is_file(): + self._set_vocab_gpt2() + else: + # StableLM 2 1.6B uses a vocab in a similar format to Qwen's vocab + self._set_vocab_qwen() + + def set_gguf_parameters(self): + hparams = self.hparams + block_count = hparams["num_hidden_layers"] + + self.gguf_writer.add_name(self.dir_model.name) + self.gguf_writer.add_context_length(hparams["max_position_embeddings"]) + self.gguf_writer.add_embedding_length(hparams["hidden_size"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) + rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"]) + self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"]))) + self.gguf_writer.add_head_count(hparams["num_attention_heads"]) + self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True) + self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_eps", "norm_eps"])) + + +class MixtralModel(Model): + def set_vocab(self): + self._set_vocab_sentencepiece() + + +class MiniCPMModel(Model): + def set_gguf_parameters(self): + block_count = self.hparams["num_hidden_layers"] + self.gguf_writer.add_name("MiniCPM") + self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) + self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) + self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) + self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) + self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"]) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) + self.gguf_writer.add_file_type(self.ftype) + + def set_vocab(self): + self._set_vocab_hf() + + def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: + if n_kv_head is not None and n_head != n_kv_head: + n_head //= n_kv_head + + return ( + weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) + .swapaxes(1, 2) + .reshape(weights.shape) + ) + + def write_tensors(self): + block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer"))) + tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) + n_head = self.hparams.get("num_attention_heads") + n_kv_head = self.hparams.get("num_key_value_heads") + for name, data_torch in self.get_tensors(): + # we don't need these + if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")): + continue + + old_dtype = data_torch.dtype + + # convert any unsupported data types to float32 + if data_torch.dtype not in (torch.float16, torch.float32): + data_torch = data_torch.to(torch.float32) + + # HF models permute some of the tensors, so we need to undo that + if name.endswith(("q_proj.weight")): + data_torch = self._reverse_hf_permute(data_torch, n_head, n_head) + if name.endswith(("k_proj.weight")): + data_torch = self._reverse_hf_permute(data_torch, n_head, n_kv_head) + + data = data_torch.squeeze().numpy() + + # map tensor names + new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) + if new_name is None: + print(f"Can not map tensor {name!r}") + sys.exit() + + n_dims = len(data.shape) + data_dtype = data.dtype + + # if f32 desired, convert any float16 to float32 + if self.ftype == 0 and data_dtype == np.float16: + data = data.astype(np.float32) + + # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 + if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: + data = data.astype(np.float32) + + # if f16 desired, convert any float32 2-dim weight tensors to float16 + if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + data = data.astype(np.float16) + + print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + + self.gguf_writer.add_tensor(new_name, data) + + +class QwenModel(Model): + @staticmethod + def token_bytes_to_string(b): + from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode + byte_encoder = bytes_to_unicode() + return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')]) + + @staticmethod + def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None) -> list[bytes]: + parts = [bytes([b]) for b in token] + while True: + min_idx = None + min_rank = None + for i, pair in enumerate(zip(parts[:-1], parts[1:])): + rank = mergeable_ranks.get(pair[0] + pair[1]) + if rank is not None and (min_rank is None or rank < min_rank): + min_idx = i + min_rank = rank + if min_rank is None or (max_rank is not None and min_rank >= max_rank): + break + assert min_idx is not None + parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:] + return parts + + def set_vocab(self): + self._set_vocab_qwen() + + def set_gguf_parameters(self): + self.gguf_writer.add_name("Qwen") + self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) + self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"]) + self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) + self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) + self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"]) + self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) + self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"]) + + def write_tensors(self): + block_count = self.hparams["num_hidden_layers"] + model_kv = dict(self.get_tensors()) + tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) + for name, data_torch in model_kv.items(): + # we don't need these + if name.endswith(".rotary_emb.inv_freq"): + continue + + old_dtype = data_torch.dtype + + # convert any unsupported data types to float32 + if data_torch.dtype not in (torch.float16, torch.float32): + data_torch = data_torch.to(torch.float32) + + data = data_torch.squeeze().numpy() + + # map tensor names + new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) + if new_name is None: + print(f"Can not map tensor {name!r}") + sys.exit() + + n_dims = len(data.shape) + data_dtype = data.dtype + + # if f32 desired, convert any float16 to float32 + if self.ftype == 0 and data_dtype == np.float16: + data = data.astype(np.float32) + + # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 + if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: + data = data.astype(np.float32) + + # if f16 desired, convert any float32 2-dim weight tensors to float16 + if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + data = data.astype(np.float16) + + print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + self.gguf_writer.add_tensor(new_name, data) + + +class GPT2Model(Model): + def set_gguf_parameters(self): + self.gguf_writer.add_name(self.dir_model.name) + self.gguf_writer.add_block_count(self.hparams["n_layer"]) + self.gguf_writer.add_context_length(self.hparams["n_ctx"]) + self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) + self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"]) + self.gguf_writer.add_head_count(self.hparams["n_head"]) + self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) + self.gguf_writer.add_file_type(self.ftype) + + def write_tensors(self): + block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer"))) + tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) + + for name, data_torch in self.get_tensors(): + # we don't need these + if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq", ".attn.bias", ".attn.masked_bias")): + continue + + if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_proj.weight")): + data_torch = data_torch.transpose(1, 0) + + old_dtype = data_torch.dtype + + # convert any unsupported data types to float32 + if data_torch.dtype not in (torch.float16, torch.float32): + data_torch = data_torch.to(torch.float32) + + data = data_torch.squeeze().numpy() + + # map tensor names + new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) + if new_name is None: + print(f"Can not map tensor {name!r}") + sys.exit() + + n_dims = len(data.shape) + data_dtype = data.dtype + + # if f32 desired, convert any float16 to float32 + if self.ftype == 0 and data_dtype == np.float16: + data = data.astype(np.float32) + + # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 + if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: + data = data.astype(np.float32) + + # if f16 desired, convert any float32 2-dim weight tensors to float16 + if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + data = data.astype(np.float16) + + print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + + self.gguf_writer.add_tensor(new_name, data) + + # note: GPT2 output is tied to (same as) wte in original model + if new_name == "token_embd.weight": + print(f"output.weight, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + self.gguf_writer.add_tensor("output.weight", data) + + +class Phi2Model(Model): + def set_gguf_parameters(self): + block_count = self.find_hparam(["num_hidden_layers", "n_layer"]) + + rot_pct = self.find_hparam(["partial_rotary_factor"]) + n_embd = self.find_hparam(["hidden_size", "n_embd"]) + n_head = self.find_hparam(["num_attention_heads", "n_head"]) + + self.gguf_writer.add_name("Phi2") + self.gguf_writer.add_context_length(self.find_hparam(["n_positions", "max_position_embeddings"])) + + self.gguf_writer.add_embedding_length(n_embd) + self.gguf_writer.add_feed_forward_length(4 * n_embd) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_head_count(n_head) + self.gguf_writer.add_head_count_kv(n_head) + self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_epsilon", "layer_norm_eps"])) + self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head) + self.gguf_writer.add_file_type(self.ftype) + self.gguf_writer.add_add_bos_token(False) + + +class PlamoModel(Model): + def set_vocab(self): + self._set_vocab_sentencepiece() + + def set_gguf_parameters(self): + hparams = self.hparams + block_count = hparams["num_hidden_layers"] + + self.gguf_writer.add_name("PLaMo") + self.gguf_writer.add_context_length(4096) # not in config.json + self.gguf_writer.add_embedding_length(hparams["hidden_size"]) + self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_head_count(hparams["num_attention_heads"]) + self.gguf_writer.add_head_count_kv(5) # hparams["num_key_value_heads"]) is wrong + self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"]) + + def shuffle_attn_q_weight(self, data_torch): + assert data_torch.size() == (5120, 5120) + data_torch = data_torch.reshape(8, 5, 128, 5120) + data_torch = torch.permute(data_torch, (1, 0, 2, 3)) + data_torch = torch.reshape(data_torch, (5120, 5120)) + return data_torch + + def shuffle_attn_output_weight(self, data_torch): + assert data_torch.size() == (5120, 5120) + data_torch = data_torch.reshape(5120, 8, 5, 128) + data_torch = torch.permute(data_torch, (0, 2, 1, 3)) + data_torch = torch.reshape(data_torch, (5120, 5120)) + return data_torch + + def write_tensors(self): + block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers")) + tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) + + for name, data_torch in self.get_tensors(): + if "self_attn.rotary_emb.inv_freq" in name: + continue + + # map tensor names + new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) + if new_name is None: + print(f"Can not map tensor {name!r}") + sys.exit() + + # shuffle for broadcasting of gqa in ggml_mul_mat + if new_name.endswith("attn_q.weight"): + data_torch = self.shuffle_attn_q_weight(data_torch) + elif new_name.endswith("attn_output.weight"): + data_torch = self.shuffle_attn_output_weight(data_torch) + + old_dtype = data_torch.dtype + + # convert any unsupported data types to float32 + if data_torch.dtype not in (torch.float16, torch.float32): + data_torch = data_torch.to(torch.float32) + + data = data_torch.squeeze().numpy() + + n_dims = len(data.shape) + data_dtype = data.dtype + + # if f32 desired, convert any float16 to float32 + if self.ftype == 0 and data_dtype == np.float16: + data = data.astype(np.float32) + + # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 + if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: + data = data.astype(np.float32) + + # if f16 desired, convert any float32 2-dim weight tensors to float16 + if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + data = data.astype(np.float16) + + print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + + self.gguf_writer.add_tensor(new_name, data) + + +class CodeShellModel(Model): + def set_gguf_parameters(self): + block_count = self.hparams["n_layer"] + + self.gguf_writer.add_name("CodeShell") + self.gguf_writer.add_context_length(self.hparams["n_positions"]) + self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) + self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_head_count(self.hparams["n_head"]) + self.gguf_writer.add_head_count_kv(self.hparams["num_query_groups"]) + self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) + self.gguf_writer.add_file_type(self.ftype) + self.gguf_writer.add_rope_freq_base(10000.0) + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) + self.gguf_writer.add_rope_scaling_factor(1.0) + + def write_tensors(self): + block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer"))) + tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) + tensors = dict(self.get_tensors()) + has_lm_head = "lm_head.weight" in tensors.keys() or "output.weight" in tensors.keys() + for name, data_torch in tensors.items(): + # we don't need these + if name.endswith((".attn.rotary_emb.inv_freq")): + continue + + old_dtype = data_torch.dtype + + # convert any unsupported data types to float32 + if data_torch.dtype not in (torch.float16, torch.float32): + data_torch = data_torch.to(torch.float32) + + data = data_torch.squeeze().numpy() + + # map tensor names + new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) + if new_name is None: + print(f"Can not map tensor {name!r}") + sys.exit() + + n_dims = len(data.shape) + data_dtype = data.dtype + + # if f32 desired, convert any float16 to float32 + if self.ftype == 0 and data_dtype == np.float16: + data = data.astype(np.float32) + + # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 + if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: + data = data.astype(np.float32) + + # if f16 desired, convert any float32 2-dim weight tensors to float16 + if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + data = data.astype(np.float16) + + print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + + self.gguf_writer.add_tensor(new_name, data) + + if not has_lm_head and name == "transformer.wte.weight": + self.gguf_writer.add_tensor("output.weight", data) + print(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}") + + +class InternLM2Model(Model): + def set_vocab(self): + # (TODO): Is there a better way? + # Copy from _set_vocab_sentencepiece, The only difference is that we will treat the character + # \x00 specially and convert it into an emoji character to prevent it from being mistakenly + # recognized as an empty string in C++. + from sentencepiece import SentencePieceProcessor + from sentencepiece import sentencepiece_model_pb2 as model + + tokenizer_path = self.dir_model / 'tokenizer.model' + + tokens: list[bytes] = [] + scores: list[float] = [] + toktypes: list[int] = [] + + if not tokenizer_path.is_file(): + print(f'Error: Missing {tokenizer_path}', file=sys.stderr) + sys.exit(1) + + sentencepiece_model = model.ModelProto() + sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) + add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix + + tokenizer = SentencePieceProcessor(str(tokenizer_path)) + vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) + + for token_id in range(vocab_size): + piece = tokenizer.id_to_piece(token_id) + text = piece.encode("utf-8") + score = tokenizer.get_score(token_id) + if text == b"\x00": + # (TODO): fixme + # Hack here and replace the \x00 characters. + print(f"InternLM2 convert token '{text}' to '🐉'!") + text = "🐉" + + toktype = SentencePieceTokenTypes.NORMAL + if tokenizer.is_unknown(token_id): + toktype = SentencePieceTokenTypes.UNKNOWN + elif tokenizer.is_control(token_id): + toktype = SentencePieceTokenTypes.CONTROL + elif tokenizer.is_unused(token_id): + toktype = SentencePieceTokenTypes.UNUSED + elif tokenizer.is_byte(token_id): + toktype = SentencePieceTokenTypes.BYTE + + tokens.append(text) + scores.append(score) + toktypes.append(toktype) + + added_tokens_file = self.dir_model / 'added_tokens.json' + if added_tokens_file.is_file(): + with open(added_tokens_file, "r", encoding="utf-8") as f: + added_tokens_json = json.load(f) + + for key in added_tokens_json: + tokens.append(key.encode("utf-8")) + scores.append(-1000.0) + toktypes.append(SentencePieceTokenTypes.USER_DEFINED) + + self.gguf_writer.add_tokenizer_model("llama") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + self.gguf_writer.add_add_space_prefix(add_prefix) + + special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) + old_eos = special_vocab.special_token_ids["eos"] + if "chat" in os.path.basename(self.dir_model.absolute()): + # For the chat model, we replace the eos with '<|im_end|>'. + special_vocab.special_token_ids["eos"] = self._try_get_sft_eos(tokenizer) + print(f"Replace eos:{old_eos} with a special token:{special_vocab.special_token_ids['eos']} \ +in chat mode so that the conversation can end normally.") + + special_vocab.add_to_gguf(self.gguf_writer) + + def _try_get_sft_eos(self, tokenizer): + unused_145_list = tokenizer.encode('[UNUSED_TOKEN_145]') + im_end_list = tokenizer.encode('<|im_end|>') + assert (len(unused_145_list) == 1) ^ (len(im_end_list) == 1) + if len(unused_145_list) == 1: + eos_token = unused_145_list[0] + if len(im_end_list) == 1: + eos_token = im_end_list[0] + return eos_token + + def _hf_permute_qk(self, weights, n_head: int, n_head_kv: int): + if n_head_kv is not None and n_head != n_head_kv: + n_head = n_head_kv + return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) + .swapaxes(1, 2) + .reshape(weights.shape)) + + def set_gguf_parameters(self): + self.gguf_writer.add_name("InternLM2") + self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) + self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"]) + self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) + self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) + self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"]) + self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) + self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"]) + + def post_write_tensors(self, tensor_map, name, data_torch): + old_dtype = data_torch.dtype + + # convert any unsupported data types to float32 + if data_torch.dtype not in (torch.float16, torch.float32): + data_torch = data_torch.to(torch.float32) + + data = data_torch.squeeze().numpy() + + # map tensor names + new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) + if new_name is None: + print(f"Can not map tensor {name!r}") + sys.exit() + + n_dims = len(data.shape) + data_dtype = data.dtype + + # if f32 desired, convert any float16 to float32 + if self.ftype == 0 and data_dtype == np.float16: + data = data.astype(np.float32) + + # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 + if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: + data = data.astype(np.float32) + + # if f16 desired, convert any float32 2-dim weight tensors to float16 + if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + data = data.astype(np.float16) + + print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + self.gguf_writer.add_tensor(new_name, data) + + def write_tensors(self): + from einops import rearrange + + num_heads = self.hparams.get("num_attention_heads") + num_kv_heads = self.hparams.get("num_key_value_heads") + hidden_size = self.hparams.get("hidden_size") + q_per_kv = num_heads // num_kv_heads + head_dim = hidden_size // num_heads + num_groups = num_heads // q_per_kv + + block_count = self.hparams["num_hidden_layers"] + model_kv = dict(self.get_tensors()) + tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) + qkv_pattern = r"model\.layers\.(\d+)\.attention\.wqkv" + for name, data_torch in model_kv.items(): + # we don't need these + if name.endswith(".rotary_emb.inv_freq"): + continue + + if re.match(qkv_pattern, name): + bid = re.findall(qkv_pattern, name)[0] + qkv = data_torch + qkv = rearrange(qkv.T, " o (g n i) ->o g n i", g=num_groups, n=q_per_kv + 2, i=head_dim) + q, k, v = qkv[..., : q_per_kv, :], qkv[..., q_per_kv: q_per_kv + 1, :], qkv[..., q_per_kv + 1: q_per_kv + 2, :] + # The model weights of q and k equire additional reshape. + q = self._hf_permute_qk(rearrange(q, " o g n i -> o (g n i)").T, num_heads, num_heads) + k = self._hf_permute_qk(rearrange(k, " o g n i -> o (g n i)").T, num_heads, num_kv_heads) + v = rearrange(v, " o g n i -> o (g n i)").T + self.post_write_tensors(tensor_map, f"model.layers.{bid}.attention.wq.weight", q) + self.post_write_tensors(tensor_map, f"model.layers.{bid}.attention.wk.weight", k) + self.post_write_tensors(tensor_map, f"model.layers.{bid}.attention.wv.weight", v) + else: + self.post_write_tensors(tensor_map, name, data_torch) + + +class BertModel(Model): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.vocab_size = None + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_causal_attention(False) + + # get pooling path + with open(self.dir_model / "modules.json", encoding="utf-8") as f: + modules = json.load(f) + pooling_path = None + for mod in modules: + if mod["type"] == "sentence_transformers.models.Pooling": + pooling_path = mod["path"] + break + + # get pooling type + pooling_type = gguf.PoolingType.NONE + if pooling_path is not None: + with open(self.dir_model / pooling_path / "config.json", encoding="utf-8") as f: + pooling = json.load(f) + if pooling["pooling_mode_mean_tokens"]: + pooling_type = gguf.PoolingType.MEAN + elif pooling["pooling_mode_cls_token"]: + pooling_type = gguf.PoolingType.CLS + else: + raise NotImplementedError("Only MEAN and CLS pooling types supported") + + self.gguf_writer.add_pooling_type(pooling_type.value) + + def set_vocab(self): + path = self.dir_model + added_tokens_path = self.dir_model if self.dir_model.exists() else None + + # use huggingface vocab to get all tokens + vocab = HfVocab(path, added_tokens_path) + tokens, scores, toktypes = zip(*vocab.all_tokens()) + assert len(tokens) == vocab.vocab_size + self.vocab_size = vocab.vocab_size + + # we need this to validate the size of the token_type embeddings + # though currently we are passing all zeros to the token_type embeddings + n_token_types = len(set(toktypes)) + self.gguf_writer.add_token_type_count(n_token_types) + + # convert to phantom space vocab + def phantom(tok, typ): + if tok.startswith(b"[") and tok.endswith(b"]"): + return tok + if tok.startswith(b"##"): + return tok[2:] + return b"\xe2\x96\x81" + tok + tokens = tuple(phantom(t, y) for t, y in zip(tokens, toktypes)) + + # set up bos and eos tokens (cls and sep) + self.gguf_writer.add_bos_token_id(vocab.tokenizer.cls_token_id) + self.gguf_writer.add_eos_token_id(vocab.tokenizer.sep_token_id) + + # add vocab to gguf + self.gguf_writer.add_tokenizer_model("bert") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + + # handle special tokens + special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) + special_vocab.add_to_gguf(self.gguf_writer) + + def write_tensors(self): + tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) + tensors = dict(self.get_tensors()) + for name, data_torch in tensors.items(): + # we are only using BERT for embeddings so we don't need the pooling layer + if name in ("embeddings.position_ids", "pooler.dense.weight", "pooler.dense.bias"): + continue # we don't need these + + # map tensor names + new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) + if new_name is None: + print(f"Can not map tensor {name!r}") + sys.exit() + + data = data_torch.squeeze().numpy() + n_dims = len(data.shape) + new_dtype: type[np.floating[Any]] + + if ( + self.ftype == 1 and name.endswith(".weight") and n_dims == 2 + and name != "embeddings.token_type_embeddings.weight" # not used with get_rows, must be F32 + ): + # if f16 desired, convert any float32 2-dim weight tensors to float16 + new_dtype = np.float16 + else: + # if f32 desired, convert any float16 to float32 + new_dtype = np.float32 + + print(f"{new_name}, n_dims = {n_dims}, {data_torch.dtype} --> {new_dtype}") + + if data.dtype != new_dtype: + data = data.astype(new_dtype) + + self.gguf_writer.add_tensor(new_name, data) + + +class NomicBertModel(BertModel): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # the HF config claims n_ctx=8192, but it uses RoPE scaling + self.hparams["n_ctx"] = 2048 + + # SwigLU activation + assert self.hparams["activation_function"] == "swiglu" + # this doesn't do anything in the HF version + assert self.hparams["causal"] is False + # no bias tensors + assert self.hparams["qkv_proj_bias"] is False + assert self.hparams["mlp_fc1_bias"] is False + assert self.hparams["mlp_fc2_bias"] is False + # norm at end of layer + assert self.hparams["prenorm"] is False + # standard RoPE + assert self.hparams["rotary_emb_fraction"] == 1.0 + assert self.hparams["rotary_emb_interleaved"] is False + assert self.hparams["rotary_emb_scale_base"] is None + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"]) + + def get_tensors(self): + assert self.vocab_size is not None + for name, data in super().get_tensors(): + # Nomic Embed's token embeddings tensor is padded, but llama.cpp wants tensor sizes to match exactly. + if name == 'embeddings.word_embeddings.weight' and data.shape[1] != self.vocab_size: + rounded_vocab_size = (self.vocab_size + 63) // 64 * 64 + assert data.shape == (rounded_vocab_size, self.hparams["n_embd"]) + data = data[:self.vocab_size, :] + yield name, data + + +class GemmaModel(Model): + def set_vocab(self): + self._set_vocab_sentencepiece() + + def set_gguf_parameters(self): + hparams = self.hparams + block_count = hparams["num_hidden_layers"] + + self.gguf_writer.add_name(self.dir_model.name) + self.gguf_writer.add_context_length(hparams["max_position_embeddings"]) + self.gguf_writer.add_embedding_length(hparams["hidden_size"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) + self.gguf_writer.add_head_count(hparams["num_attention_heads"]) + self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"] if "num_key_value_heads" in hparams else hparams["num_attention_heads"]) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) + self.gguf_writer.add_key_length(hparams["head_dim"]) + self.gguf_writer.add_value_length(hparams["head_dim"]) + self.gguf_writer.add_file_type(self.ftype) + + def write_tensors(self): + block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer"))) + tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) + + for name, data_torch in self.get_tensors(): + # ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89 + if name.endswith("norm.weight"): + data_torch = data_torch + 1 + + old_dtype = data_torch.dtype + + # convert any unsupported data types to float32 + if data_torch.dtype not in (torch.float16, torch.float32): + data_torch = data_torch.to(torch.float32) + + data = data_torch.squeeze().numpy() + + # map tensor names + new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) + if new_name is None: + print(f"Can not map tensor {name!r}") + sys.exit() + + n_dims = len(data.shape) + data_dtype = data.dtype + + data = data.astype(np.float32) + + # if f16 desired, convert any float32 2-dim weight tensors to float16 + if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + data = data.astype(np.float16) + + print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + + self.gguf_writer.add_tensor(new_name, data) + + +###### CONVERSION LOGIC ###### + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Convert a huggingface model to a GGML compatible file") + parser.add_argument( + "--vocab-only", action="store_true", + help="extract only the vocab", + ) + parser.add_argument( + "--awq-path", type=Path, default=None, + help="Path to scale awq cache file") + parser.add_argument( + "--outfile", type=Path, + help="path to write to; default: based on input", + ) + parser.add_argument( + "--outtype", type=str, choices=["f32", "f16"], default="f16", + help="output format - use f32 for float32, f16 for float16", + ) + parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine") + parser.add_argument( + "model", type=Path, + help="directory containing model file", + ) + + return parser.parse_args() + + +def main() -> None: + args = parse_args() + + dir_model = args.model + + if args.awq_path: + sys.path.insert(1, str(Path(__file__).parent / 'awq-py')) + from awq.apply_awq import add_scale_weights # type: ignore[import-not-found] + tmp_model_path = args.model / "weighted_model" + dir_model = tmp_model_path + if tmp_model_path.is_dir(): + print(f"{tmp_model_path} exists as a weighted model.") + else: + tmp_model_path.mkdir(parents=True, exist_ok=True) + print("Saving new weighted model ...") + add_scale_weights(str(args.model), str(args.awq_path), str(tmp_model_path)) + print(f"Saved weighted model at {tmp_model_path}.") + + if not dir_model.is_dir(): + print(f'Error: {args.model} is not a directory', file=sys.stderr) + sys.exit(1) + + ftype_map = { + "f32": gguf.GGMLQuantizationType.F32, + "f16": gguf.GGMLQuantizationType.F16, + } + + if args.outfile is not None: + fname_out = args.outfile + else: + # output in the same directory as the model by default + fname_out = dir_model / f'ggml-model-{args.outtype}.gguf' + + print(f"Loading model: {dir_model.name}") + + hparams = Model.load_hparams(dir_model) + + with torch.inference_mode(): + model_class = Model.from_model_architecture(hparams["architectures"][0]) + model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian) + + print("Set model parameters") + model_instance.set_gguf_parameters() + + print("Set model tokenizer") + model_instance.set_vocab() + + if args.vocab_only: + print(f"Exporting model vocab to '{fname_out}'") + model_instance.write_vocab() + else: + print(f"Exporting model to '{fname_out}'") + model_instance.write() + + print(f"Model successfully exported to '{fname_out}'") + + +if __name__ == '__main__': + main() diff --git a/convert-llama-ggml-to-gguf.py b/convert-llama-ggml-to-gguf.py index b5d3e0b3c..b33108062 100755 --- a/convert-llama-ggml-to-gguf.py +++ b/convert-llama-ggml-to-gguf.py @@ -2,7 +2,7 @@ from __future__ import annotations import argparse -import math +import os import struct import sys from enum import IntEnum @@ -10,36 +10,17 @@ from pathlib import Path import numpy as np -import os if 'NO_LOCAL_GGUF' not in os.environ: - sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf')) + sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) import gguf -# Note: Does not support GGML_QKK_64 -QK_K = 256 -# Items here are (block size, type size) -GGML_QUANT_SIZES = { - gguf.GGMLQuantizationType.F32 : (1, 4), - gguf.GGMLQuantizationType.F16 : (1, 2), - gguf.GGMLQuantizationType.Q4_0 : (32, 2 + 16), - gguf.GGMLQuantizationType.Q4_1 : (32, 2 + 2 + 16), - gguf.GGMLQuantizationType.Q5_0 : (32, 2 + 4 + 16), - gguf.GGMLQuantizationType.Q5_1 : (32, 2 + 2 + 4 + 16), - gguf.GGMLQuantizationType.Q8_0 : (32, 2 + 32), - gguf.GGMLQuantizationType.Q8_1 : (32, 4 + 4 + 32), - gguf.GGMLQuantizationType.Q2_K : (256, 2 + 2 + QK_K // 16 + QK_K // 4), - gguf.GGMLQuantizationType.Q3_K : (256, 2 + QK_K // 4 + QK_K // 8 + 12), - gguf.GGMLQuantizationType.Q4_K : (256, 2 + 2 + QK_K // 2 + 12), - gguf.GGMLQuantizationType.Q5_K : (256, 2 + 2 + QK_K // 2 + QK_K // 8 + 12), - gguf.GGMLQuantizationType.Q6_K : (256, 2 + QK_K // 2 + QK_K // 4 + QK_K // 16), - gguf.GGMLQuantizationType.Q8_K : (256, 4 + QK_K + QK_K // 8), -} class GGMLFormat(IntEnum): GGML = 0 GGMF = 1 GGJT = 2 + class GGMLFType(IntEnum): ALL_F32 = 0 MOSTLY_F16 = 1 @@ -59,6 +40,7 @@ class GGMLFType(IntEnum): MOSTLY_Q5_K_M = 17 MOSTLY_Q6_K = 18 + class Hyperparameters: def __init__(self): self.n_vocab = self.n_embd = self.n_mult = self.n_head = 0 @@ -90,6 +72,7 @@ class Hyperparameters: def __str__(self): return f'' + class Vocab: def __init__(self, load_scores = True): self.items = [] @@ -111,6 +94,7 @@ class Vocab: self.items.append((item_text, item_score)) return offset - orig_offset + class Tensor: def __init__(self, use_padding = True): self.name = None @@ -125,7 +109,7 @@ class Tensor: (n_dims, name_len, dtype) = struct.unpack('<3I', data[offset:offset + 12]) assert n_dims >= 0 and n_dims <= 4, f'Invalid tensor dimensions {n_dims}' assert name_len < 4096, 'Absurd tensor name length' - quant = GGML_QUANT_SIZES.get(dtype) + quant = gguf.GGML_QUANT_SIZES.get(dtype) assert quant is not None, 'Unknown tensor type' (blksize, tysize) = quant offset += 12 @@ -144,6 +128,7 @@ class Tensor: # print(n_dims, name_len, dtype, self.dims, self.name, pad) return offset - orig_offset + class GGMLModel: def __init__(self): self.hyperparameters = None @@ -180,8 +165,8 @@ class GGMLModel: if ftype not in (GGMLFType.ALL_F32, GGMLFType.MOSTLY_F16): err = 'Quantizations changed in GGJTv2. Can only convert unquantized GGML files older than GGJTv2.' elif (self.file_format == GGMLFormat.GGJT and self.format_version == 2): - if ftype in ( GGMLFType.MOSTLY_Q4_0, GGMLFType.MOSTLY_Q4_1, - GGMLFType.MOSTLY_Q4_1_SOME_F16, GGMLFType.MOSTLY_Q8_0): + if ftype in (GGMLFType.MOSTLY_Q4_0, GGMLFType.MOSTLY_Q4_1, + GGMLFType.MOSTLY_Q4_1_SOME_F16, GGMLFType.MOSTLY_Q8_0): err = 'Q4 and Q8 quantizations changed in GGJTv3.' if len(err) > 0: raise ValueError(f'{err} Sorry, your {self.file_format.name}v{self.format_version} file of type {ftype.name} is not eligible for conversion.') @@ -208,6 +193,7 @@ class GGMLModel: hp.set_n_ff(self) return offset + class GGMLToGGUF: def __init__(self, ggml_model, data, cfg, params_override = None, vocab_override = None, special_vocab = None): hp = ggml_model.hyperparameters @@ -238,7 +224,7 @@ class GGMLToGGUF: gguf_writer = gguf.GGUFWriter( self.cfg.output, gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA], - use_temp_file = False ) + use_temp_file = False) self.add_params(gguf_writer) self.add_vocab(gguf_writer) if self.special_vocab is not None: @@ -362,7 +348,8 @@ class GGMLToGGUF: mapped_name, data[tensor.start_offset:tensor.start_offset + tensor.len_bytes], raw_shape = tempdims, - raw_dtype = tensor.dtype ) + raw_dtype = tensor.dtype) + def handle_metadata(cfg, hp): import convert @@ -384,38 +371,38 @@ def handle_metadata(cfg, hp): params = convert.Params.loadOriginalParamsJson(fakemodel, orig_config_path) else: raise ValueError('Unable to load metadata') - vocab = convert.load_vocab( - cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir, - cfg.vocabtype ) - # FIXME: Respect cfg.vocab_dir? - svocab = gguf.SpecialVocab(cfg.model_metadata_dir) + vocab_path = Path(cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir) + vocab_factory = convert.VocabFactory(vocab_path) + vocab, special_vocab = vocab_factory.load_vocab(cfg.vocabtype, cfg.model_metadata_dir) convert.check_vocab_size(params, vocab) - return (params, vocab, svocab) + return params, vocab, special_vocab + def handle_args(): parser = argparse.ArgumentParser(description = 'Convert GGML models to GGUF') parser.add_argument('--input', '-i', type = Path, required = True, - help = 'Input GGMLv3 filename') + help = 'Input GGMLv3 filename') parser.add_argument('--output', '-o', type = Path, required = True, - help ='Output GGUF filename') + help ='Output GGUF filename') parser.add_argument('--name', - help = 'Set model name') + help = 'Set model name') parser.add_argument('--desc', - help = 'Set model description') + help = 'Set model description') parser.add_argument('--gqa', type = int, default = 1, - help = 'grouped-query attention factor (use 8 for LLaMA2 70B)') + help = 'grouped-query attention factor (use 8 for LLaMA2 70B)') parser.add_argument('--eps', default = '5.0e-06', - help = 'RMS norm eps: Use 1e-6 for LLaMA1 and OpenLLaMA, use 1e-5 for LLaMA2') + help = 'RMS norm eps: Use 1e-6 for LLaMA1 and OpenLLaMA, use 1e-5 for LLaMA2') parser.add_argument('--context-length', '-c', type=int, default = 2048, - help = 'Default max context length: LLaMA1 is typically 2048, LLaMA2 is typically 4096') + help = 'Default max context length: LLaMA1 is typically 2048, LLaMA2 is typically 4096') parser.add_argument('--model-metadata-dir', '-m', type = Path, - help ='Load HuggingFace/.pth vocab and metadata from the specified directory') + help ='Load HuggingFace/.pth vocab and metadata from the specified directory') parser.add_argument("--vocab-dir", type=Path, - help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir") + help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir") parser.add_argument("--vocabtype", choices=["spm", "bpe"], default="spm", - help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm)") + help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm)") return parser.parse_args() + def main(): cfg = handle_args() print(f'* Using config: {cfg}') @@ -425,7 +412,7 @@ def main(): data = np.memmap(cfg.input, mode = 'r') model = GGMLModel() print('* Scanning GGML input file') - offset = model.load(data, 0) + offset = model.load(data, 0) # noqa print(f'* GGML model hyperparameters: {model.hyperparameters}') vocab_override = None params_override = None @@ -440,12 +427,15 @@ def main(): print('\n=== WARNING === Special tokens may not be converted correctly. Use --model-metadata-dir if possible === WARNING ===\n') if model.file_format == GGMLFormat.GGML: print('! This is a very old GGML file that does not contain vocab scores. Strongly recommend using model metadata!') - converter = GGMLToGGUF(model, data, cfg, + converter = GGMLToGGUF( + model, data, cfg, params_override = params_override, vocab_override = vocab_override, - special_vocab = special_vocab ) + special_vocab = special_vocab + ) converter.save() print(f'* Successful completion. Output saved to: {cfg.output}') + if __name__ == '__main__': main() diff --git a/convert-lora-to-ggml.py b/convert-lora-to-ggml.py index a937410dd..9a9936dec 100755 --- a/convert-lora-to-ggml.py +++ b/convert-lora-to-ggml.py @@ -3,51 +3,21 @@ from __future__ import annotations import json import os -import re import struct import sys +from pathlib import Path from typing import Any, BinaryIO, Sequence import numpy as np import torch +if 'NO_LOCAL_GGUF' not in os.environ: + sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf')) +import gguf + NUMPY_TYPE_TO_FTYPE: dict[str, int] = {"float32": 0, "float16": 1} -HF_SUBLAYER_TO_GGML = { - "self_attn.q_proj": "attn_q", - "self_attn.k_proj": "attn_k", - "self_attn.v_proj": "attn_v", - "self_attn.o_proj": "attn_output", - "mlp.gate_proj": "ffn_gate", - "mlp.down_proj": "ffn_down", - "mlp.up_proj": "ffn_up", - "input_layernorm": "attn_norm", - "post_attention_layernorm": "ffn_norm", -} - - -def translate_tensor_name(t: str) -> str: - match = re.match(r".*layers\.(\d+)\.(\w+\.\w+)\.lora_(A|B)\.weight", t) - if match: - nn = match.group(1) - sub_layer = match.group(2) - lora_type = match.group(3) - - sub_layer_renamed = HF_SUBLAYER_TO_GGML.get(sub_layer) - if sub_layer_renamed is None: - print(f"Error: unrecognized sub-layer {sub_layer} in tensor {t}") - sys.exit(1) - - output_string = ( - f"blk.{nn}.{HF_SUBLAYER_TO_GGML[sub_layer]}.weight.lora{lora_type}" - ) - return output_string - else: - print(f"Error: unrecognized tensor {t}") - sys.exit(1) - - def write_file_header(fout: BinaryIO, params: dict[str, Any]) -> None: fout.write(b"ggla"[::-1]) # magic (ggml lora) fout.write(struct.pack("i", 1)) # file version @@ -61,9 +31,7 @@ def write_file_header(fout: BinaryIO, params: dict[str, Any]) -> None: fout.write(struct.pack("i", int(params["lora_alpha"]))) -def write_tensor_header( - self, name: str, shape: Sequence[int], data_type: np.dtype[Any] -) -> None: +def write_tensor_header(fout: BinaryIO, name: str, shape: Sequence[int], data_type: np.dtype[Any]) -> None: sname = name.encode("utf-8") fout.write( struct.pack( @@ -78,60 +46,103 @@ def write_tensor_header( fout.seek((fout.tell() + 31) & -32) -if len(sys.argv) != 2: - print(f"Usage: python {sys.argv[0]} ") - print( - "Path must contain HuggingFace PEFT LoRA files 'adapter_config.json' and 'adapter_model.bin'" - ) - sys.exit(1) +if __name__ == '__main__': + if len(sys.argv) < 2: + print(f"Usage: python {sys.argv[0]} [arch]") + print( + "Path must contain HuggingFace PEFT LoRA files 'adapter_config.json' and 'adapter_model.bin'" + ) + print(f"Arch must be one of {list(gguf.MODEL_ARCH_NAMES.values())} (default: llama)") + sys.exit(1) -input_json = os.path.join(sys.argv[1], "adapter_config.json") -input_model = os.path.join(sys.argv[1], "adapter_model.bin") -output_path = os.path.join(sys.argv[1], "ggml-adapter-model.bin") + input_json = os.path.join(sys.argv[1], "adapter_config.json") + input_model = os.path.join(sys.argv[1], "adapter_model.bin") + output_path = os.path.join(sys.argv[1], "ggml-adapter-model.bin") -model = torch.load(input_model, map_location="cpu") + if os.path.exists(input_model): + model = torch.load(input_model, map_location="cpu") + else: + input_model = os.path.join(sys.argv[1], "adapter_model.safetensors") + # lazy import load_file only if lora is in safetensors format. + from safetensors.torch import load_file + model = load_file(input_model, device="cpu") -with open(input_json, "r") as f: - params = json.load(f) + arch_name = sys.argv[2] if len(sys.argv) == 3 else "llama" -if params["peft_type"] != "LORA": - print(f"Error: unsupported adapter type {params['peft_type']}, expected LORA") - sys.exit(1) + if arch_name not in gguf.MODEL_ARCH_NAMES.values(): + print(f"Error: unsupported architecture {arch_name}") + sys.exit(1) -if params["fan_in_fan_out"] is True: - print("Error: param fan_in_fan_out is not supported") - sys.exit(1) + arch = list(gguf.MODEL_ARCH_NAMES.keys())[list(gguf.MODEL_ARCH_NAMES.values()).index(arch_name)] + name_map = gguf.TensorNameMap(arch, 200) # 200 layers ought to be enough for anyone -if params["bias"] is not None and params["bias"] != "none": - print("Error: param bias is not supported") - sys.exit(1) + with open(input_json, "r") as f: + params = json.load(f) -# TODO: these seem to be layers that have been trained but without lora. -# doesn't seem widely used but eventually should be supported -if params["modules_to_save"] is not None and len(params["modules_to_save"]) > 0: - print("Error: param modules_to_save is not supported") - sys.exit(1) + if params["peft_type"] != "LORA": + print(f"Error: unsupported adapter type {params['peft_type']}, expected LORA") + sys.exit(1) -with open(output_path, "wb") as fout: - fout.truncate() + if params["fan_in_fan_out"] is True: + print("Error: param fan_in_fan_out is not supported") + sys.exit(1) - write_file_header(fout, params) - for k, v in model.items(): - if k.endswith(".default.weight"): - k = k.replace(".default.weight", ".weight") - if k in ["llama_proj.weight", "llama_proj.bias"]: - continue - if k.endswith("lora_A.weight"): - if v.dtype != torch.float16 and v.dtype != torch.float32: + if params["bias"] is not None and params["bias"] != "none": + print("Error: param bias is not supported") + sys.exit(1) + + # TODO: these seem to be layers that have been trained but without lora. + # doesn't seem widely used but eventually should be supported + if params["modules_to_save"] is not None and len(params["modules_to_save"]) > 0: + print("Error: param modules_to_save is not supported") + sys.exit(1) + + with open(output_path, "wb") as fout: + fout.truncate() + + write_file_header(fout, params) + for k, v in model.items(): + orig_k = k + if k.endswith(".default.weight"): + k = k.replace(".default.weight", ".weight") + if k in ["llama_proj.weight", "llama_proj.bias"]: + continue + if k.endswith("lora_A.weight"): + if v.dtype != torch.float16 and v.dtype != torch.float32: + v = v.float() + v = v.T + else: v = v.float() - v = v.T - else: - v = v.float() - t = v.detach().numpy() - tname = translate_tensor_name(k) - print(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB") - write_tensor_header(fout, tname, t.shape, t.dtype) - t.tofile(fout) + t = v.detach().numpy() -print(f"Converted {input_json} and {input_model} to {output_path}") + prefix = "base_model.model." + if k.startswith(prefix): + k = k[len(prefix) :] + + lora_suffixes = (".lora_A.weight", ".lora_B.weight") + if k.endswith(lora_suffixes): + suffix = k[-len(lora_suffixes[0]):] + k = k[: -len(lora_suffixes[0])] + else: + print(f"Error: unrecognized tensor name {orig_k}") + sys.exit(1) + + tname = name_map.get_name(k) + if tname is None: + print(f"Error: could not map tensor name {orig_k}") + print(" Note: the arch parameter must be specified if the model is not llama") + sys.exit(1) + + if suffix == ".lora_A.weight": + tname += ".weight.loraA" + elif suffix == ".lora_B.weight": + tname += ".weight.loraB" + else: + assert False + + print(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB") + write_tensor_header(fout, tname, t.shape, t.dtype) + t.tofile(fout) + + print(f"Converted {input_json} and {input_model} to {output_path}") diff --git a/convert-mpt-hf-to-gguf.py b/convert-mpt-hf-to-gguf.py deleted file mode 100755 index 19a66820d..000000000 --- a/convert-mpt-hf-to-gguf.py +++ /dev/null @@ -1,218 +0,0 @@ -#!/usr/bin/env python3 -# HF mpt--> gguf conversion - -from __future__ import annotations - -import argparse -import json -import os -import struct -import sys -from pathlib import Path -from typing import Any - -import numpy as np -import torch -from transformers import AutoTokenizer # type: ignore[import] - -if 'NO_LOCAL_GGUF' not in os.environ: - sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf')) -import gguf - - -def count_model_parts(dir_model: Path) -> int: - num_parts = 0 - for filename in os.listdir(dir_model): - if filename.startswith("pytorch_model-"): - num_parts += 1 - - if num_parts > 0: - print("gguf: found " + str(num_parts) + " model parts") - return num_parts - - -def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser(description="Convert an MPT model to a GGML compatible file") - parser.add_argument( - "--vocab-only", action="store_true", - help="extract only the vocab", - ) - parser.add_argument( - "--outfile", type=Path, - help="path to write to; default: based on input", - ) - parser.add_argument( - "model", type=Path, - help="directory containing model file, or model file itself (*.bin)", - ) - parser.add_argument( - "ftype", type=int, choices=[0, 1], default=1, nargs='?', - help="output format - use 0 for float32, 1 for float16", - ) - return parser.parse_args() - -args = parse_args() - -dir_model = args.model -ftype = args.ftype -if not dir_model.is_dir(): - print(f'Error: {args.model} is not a directory', file = sys.stderr) - sys.exit(1) - -# possible tensor data types -# ftype == 0 -> float32 -# ftype == 1 -> float16 - -# map from ftype to string -ftype_str = ["f32", "f16"] - -if args.outfile is not None: - fname_out = args.outfile -else: - # output in the same directory as the model by default - fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf' - -print("gguf: loading model "+dir_model.name) - -with open(dir_model / "config.json", "r", encoding="utf-8") as f: - hparams = json.load(f) - -if hparams["architectures"][0] != "MPTForCausalLM": - print("Model architecture not supported: " + hparams["architectures"][0]) - - sys.exit() - -# get number of model parts -num_parts = count_model_parts(dir_model) - -ARCH=gguf.MODEL_ARCH.MPT -gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH]) - -print("gguf: get model metadata") - -block_count = hparams["n_layers"] - -gguf_writer.add_name(dir_model.name) -gguf_writer.add_context_length(hparams["max_seq_len"]) -gguf_writer.add_embedding_length(hparams["d_model"]) -gguf_writer.add_block_count(block_count) -gguf_writer.add_feed_forward_length(4 * hparams["d_model"]) -gguf_writer.add_head_count(hparams["n_heads"]) -if kv_n_heads := hparams["attn_config"].get("kv_n_heads"): - gguf_writer.add_head_count_kv(kv_n_heads) -gguf_writer.add_layer_norm_eps(1e-05) -if hparams["attn_config"]["clip_qkv"] is not None: - gguf_writer.add_clamp_kqv(hparams["attn_config"]["clip_qkv"]) -gguf_writer.add_max_alibi_bias(hparams["attn_config"]["alibi_bias_max"]) - -# TOKENIZATION - -print("gguf: get tokenizer metadata") - -tokens: list[bytearray] = [] -scores: list[float] = [] -toktypes: list[int] = [] - -# gpt2 tokenizer -gguf_writer.add_tokenizer_model("gpt2") - -print("gguf: get gpt2 tokenizer vocab") - -# MPT token embedding tensors have dimension 50432 (hparams["vocab_size"]), but -# there are only 50254 (len(tokenizer.vocab)) tokens in the vocab, presumably to -# accomodate some "reserved" tokens; this is causing problems down the line in -# llama.cpp, so we pad the vocab with dummy tokens: - -vocab_size = hparams["vocab_size"] - -# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py -tokenizer = AutoTokenizer.from_pretrained(dir_model) - -reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()} - -for i in range(vocab_size): - tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]") - scores.append(0.0) # dummy - toktypes.append(gguf.TokenType.NORMAL) - -gguf_writer.add_token_list(tokens) -gguf_writer.add_token_scores(scores) -gguf_writer.add_token_types(toktypes) - -special_vocab = gguf.SpecialVocab(dir_model, load_merges = True) -special_vocab.add_to_gguf(gguf_writer) - -# TENSORS - -tensor_map = gguf.get_tensor_name_map(ARCH,block_count) - -# tensor info -print("gguf: get tensor metadata") - -if num_parts == 0: - part_names = iter(("pytorch_model.bin",)) -else: - part_names = ( - f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1) - ) - -for part_name in part_names: - if args.vocab_only: - break - print("gguf: loading model part '" + part_name + "'") - model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu") - - for name in model_part.keys(): - data = model_part[name] - - old_dtype = data.dtype - - # convert any unsupported data types to float32 - if data.dtype != torch.float16 and data.dtype != torch.float32: - data = data.to(torch.float32) - - data = data.squeeze().numpy() - - # map tensor names - new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias")) - if new_name is None: - print("Cannot map tensor '" + name + "'") - continue # for the sake of compatibility with some old published models, don't quit - sys.exit() - - n_dims = len(data.shape) - data_dtype = data.dtype - - # if f32 desired, convert any float16 to float32 - if ftype == 0 and data_dtype == np.float16: - data = data.astype(np.float32) - - # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 - if ftype == 1 and data_dtype == np.float16 and n_dims == 1: - data = data.astype(np.float32) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: - data = data.astype(np.float16) - - print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype)) - - gguf_writer.add_tensor(new_name, data) - - # note: MPT output is tied to (same as) wte in original model; - # for easier implementation in llama.cpp it's duplicated in GGUF, though :/ - if new_name == "token_embd.weight": - gguf_writer.add_tensor("output.weight", data) - -print("gguf: write header") -gguf_writer.write_header_to_file() -print("gguf: write metadata") -gguf_writer.write_kv_data_to_file() -if not args.vocab_only: - print("gguf: write tensors") - gguf_writer.write_tensors_to_file() - -gguf_writer.close() - -print(f"gguf: model successfully exported to '{fname_out}'") -print("") diff --git a/convert-persimmon-to-gguf.py b/convert-persimmon-to-gguf.py old mode 100644 new mode 100755 index e022ffe46..def210531 --- a/convert-persimmon-to-gguf.py +++ b/convert-persimmon-to-gguf.py @@ -1,14 +1,18 @@ -import torch -import os -from pprint import pprint -import sys +#!/usr/bin/env python3 import argparse +import os +import sys from pathlib import Path +from pprint import pprint + +import torch from sentencepiece import SentencePieceProcessor + if 'NO_LOCAL_GGUF' not in os.environ: - sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf')) + sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) import gguf + def _flatten_dict(dct, tensors, prefix=None): assert isinstance(dct, dict) for key in dct.keys(): @@ -21,6 +25,7 @@ def _flatten_dict(dct, tensors, prefix=None): raise ValueError(type(dct[key])) return None + def _get_sentencepiece_tokenizer_info(dir_model: Path): tokenizer_path = dir_model / 'adept_vocab.model' print('gguf: getting sentencepiece tokenizer from', tokenizer_path) @@ -54,6 +59,7 @@ def _get_sentencepiece_tokenizer_info(dir_model: Path): pass return tokens, scores, toktypes + def main(): parser = argparse.ArgumentParser(description="Convert a Persimmon model from Adept (e.g. Persimmon 8b chat) to a GGML compatible file") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") @@ -65,7 +71,7 @@ def main(): persimmon_model = torch.load(args.ckpt_path) hparams = persimmon_model['args'] pprint(hparams) - tensors = {} + tensors: dict[str, torch.Tensor] = {} _flatten_dict(persimmon_model['model'], tensors, None) arch = gguf.MODEL_ARCH.PERSIMMON @@ -82,7 +88,8 @@ def main(): gguf_writer.add_embedding_length(hidden_size) gguf_writer.add_block_count(block_count) gguf_writer.add_feed_forward_length(hparams.ffn_hidden_size) - gguf_writer.add_rope_dimension_count(hidden_size // head_count) + # ref: https://github.com/ggerganov/llama.cpp/pull/4889/commits/eea19039fc52ea2dbd1aab45b59ab4e3e29a3443 + gguf_writer.add_rope_dimension_count(hidden_size // head_count // 2) gguf_writer.add_head_count(head_count) gguf_writer.add_head_count_kv(head_count_kv) gguf_writer.add_rope_freq_base(hparams.rotary_emb_base) @@ -125,6 +132,5 @@ def main(): print("") - if __name__ == '__main__': main() diff --git a/convert-refact-hf-to-gguf.py b/convert-refact-hf-to-gguf.py deleted file mode 100755 index bfeabc082..000000000 --- a/convert-refact-hf-to-gguf.py +++ /dev/null @@ -1,263 +0,0 @@ -#!/usr/bin/env python3 -# HF refact--> gguf conversion - -from __future__ import annotations - -import argparse -import json -import os -import sys -from pathlib import Path - -import numpy as np -import torch -from transformers import AutoTokenizer # type: ignore[import] - -if "NO_LOCAL_GGUF" not in os.environ: - sys.path.insert(1, str(Path(__file__).parent / "gguf-py" / "gguf")) -import gguf - -def count_model_parts(dir_model: Path) -> int: - num_parts = 0 - for filename in os.listdir(dir_model): - if filename.startswith("pytorch_model-"): - num_parts += 1 - - if num_parts > 0: - print("gguf: found " + str(num_parts) + " model parts") - return num_parts - - -def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser( - description="Convert a Refact model to a GGML compatible file" - ) - parser.add_argument( - "--vocab-only", - action="store_true", - help="extract only the vocab", - ) - parser.add_argument( - "--outfile", - type=Path, - help="path to write to; default: based on input", - ) - parser.add_argument( - "model", - type=Path, - help="directory containing model file, or model file itself (*.bin)", - ) - parser.add_argument( - "ftype", - type=int, - choices=[0, 1], - default=1, - nargs="?", - help="output format - use 0 for float32, 1 for float16", - ) - return parser.parse_args() - - -args = parse_args() - -dir_model = args.model -ftype = args.ftype -if not dir_model.is_dir(): - print(f"Error: {args.model} is not a directory", file=sys.stderr) - sys.exit(1) - -# possible tensor data types -# ftype == 0 -> float32 -# ftype == 1 -> float16 - -# map from ftype to string -ftype_str = ["f32", "f16"] - -if args.outfile is not None: - fname_out = args.outfile -else: - # output in the same directory as the model by default - fname_out = dir_model / f"ggml-model-{ftype_str[ftype]}.gguf" - -print("gguf: loading model " + dir_model.name) - -with open(dir_model / "config.json", "r", encoding="utf-8") as f: - hparams = json.load(f) - -if hparams["architectures"][0] != "GPTRefactForCausalLM": - print("Model architecture not supported: " + hparams["architectures"][0]) - - sys.exit(1) - -# get number of model parts -num_parts = count_model_parts(dir_model) - -ARCH = gguf.MODEL_ARCH.REFACT -gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH]) - -print("gguf: get model metadata") - -# Get refact feed forward dimension -hidden_dim = hparams["n_embd"] -inner_dim = 4 * hidden_dim -hidden_dim = int(2 * inner_dim / 3) -multiple_of = 256 -ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of) - -block_count = hparams["n_layer"] - -gguf_writer.add_name("Refact") -# refact uses Alibi. So this is from config.json which might be used by training. -gguf_writer.add_context_length(hparams["n_positions"]) -gguf_writer.add_embedding_length(hparams["n_embd"]) - -gguf_writer.add_feed_forward_length(ff_dim) -gguf_writer.add_block_count(block_count) -gguf_writer.add_head_count(hparams["n_head"]) -gguf_writer.add_head_count_kv(1) -gguf_writer.add_layer_norm_rms_eps(hparams["layer_norm_epsilon"]) -gguf_writer.add_file_type(ftype) - -# TOKENIZATION - -print("gguf: get tokenizer metadata") - -tokens: list[bytearray] = [] -scores: list[float] = [] -toktypes: list[int] = [] - -# gpt2 tokenizer -gguf_writer.add_tokenizer_model("gpt2") - -print("gguf: get gpt2 tokenizer vocab") - -# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py -tokenizer = AutoTokenizer.from_pretrained(dir_model) - -# The number of tokens in tokenizer.json can differ from the expected vocab size. -# This causes downstream issues with mismatched tensor sizes when running the inference -vocab_size = hparams.get("vocab_size", len(tokenizer.vocab)) -assert max(tokenizer.vocab.values()) < vocab_size - -reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()} - -for i in range(vocab_size): - tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]") - scores.append(0.0) # dummy - toktypes.append(gguf.TokenType.NORMAL) - -gguf_writer.add_token_list(tokens) -gguf_writer.add_token_scores(scores) -gguf_writer.add_token_types(toktypes) - -special_vocab = gguf.SpecialVocab(dir_model, load_merges=True) -special_vocab.add_to_gguf(gguf_writer) - -# TENSORS - -tensor_map = gguf.get_tensor_name_map(ARCH, block_count) - -# params for qkv transform -n_head = hparams["n_head"] -n_head_kv = 1 - -head_dim = hparams["n_embd"] // n_head - -# tensor info -print("gguf: get tensor metadata") - -if num_parts == 0: - part_names = iter(("pytorch_model.bin",)) -else: - part_names = ( - f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1) - ) -for part_name in part_names: - if args.vocab_only: - break - print("gguf: loading model part '" + part_name + "'") - model_part = torch.load(dir_model / part_name, map_location="cpu") - - for i in range(block_count): - if f"transformer.h.{i}.attn.kv.weight" in model_part: - data = model_part[f"transformer.h.{i}.attn.kv.weight"] - model_part[f"model.layers.{i}.self_attn.k_proj.weight"] = data[ - : n_head_kv * head_dim - ] - model_part[f"model.layers.{i}.self_attn.v_proj.weight"] = data[ - n_head_kv * head_dim : - ] - del model_part[f"transformer.h.{i}.attn.kv.weight"] - if f"transformer.h.{i}.attn.q.weight" in model_part: - model_part[f"model.layers.{i}.self_attn.q_proj.weight"] = model_part[ - f"transformer.h.{i}.attn.q.weight" - ] - del model_part[f"transformer.h.{i}.attn.q.weight"] - if f"transformer.h.{i}.mlp.gate_up_proj.weight" in model_part: - data = model_part[f"transformer.h.{i}.mlp.gate_up_proj.weight"] - model_part[f"model.layers.{i}.mlp.gate_proj.weight"] = data[:ff_dim] - model_part[f"model.layers.{i}.mlp.up_proj.weight"] = data[ff_dim:] - del model_part[f"transformer.h.{i}.mlp.gate_up_proj.weight"] - - for name in model_part.keys(): - data = model_part[name] - - old_dtype = data.dtype - - # convert any unsupported data types to float32 - if data.dtype != torch.float16 and data.dtype != torch.float32: - data = data.to(torch.float32) - - data = data.squeeze().numpy() - - # map tensor names - new_name = tensor_map.get_name(name, try_suffixes=(".weight",)) - if new_name is None: - print("Can not map tensor '" + name + "'") - sys.exit() - - n_dims = len(data.shape) - data_dtype = data.dtype - - # if f32 desired, convert any float16 to float32 - if ftype == 0 and data_dtype == np.float16: - data = data.astype(np.float32) - - # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 - if ftype == 1 and data_dtype == np.float16 and n_dims == 1: - data = data.astype(np.float32) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - if ( - ftype == 1 - and data_dtype == np.float32 - and name.endswith(".weight") - and n_dims == 2 - ): - data = data.astype(np.float16) - - print( - new_name - + ", n_dims = " - + str(n_dims) - + ", " - + str(old_dtype) - + " --> " - + str(data.dtype) - ) - - gguf_writer.add_tensor(new_name, data) - - -print("gguf: write header") -gguf_writer.write_header_to_file() -print("gguf: write metadata") -gguf_writer.write_kv_data_to_file() -if not args.vocab_only: - print("gguf: write tensors") - gguf_writer.write_tensors_to_file() - -gguf_writer.close() - -print(f"gguf: model successfully exported to '{fname_out}'") -print("") diff --git a/convert-starcoder-hf-to-gguf.py b/convert-starcoder-hf-to-gguf.py deleted file mode 100755 index 90fa0c32f..000000000 --- a/convert-starcoder-hf-to-gguf.py +++ /dev/null @@ -1,202 +0,0 @@ -#!/usr/bin/env python3 -# HF starcoder --> gguf conversion - -from __future__ import annotations - -import argparse -import json -import os -import struct -import sys -from pathlib import Path -from typing import Any - -import numpy as np -import torch -from transformers import AutoTokenizer # type: ignore[import] - -if 'NO_LOCAL_GGUF' not in os.environ: - sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf')) -import gguf - - -def count_model_parts(dir_model: Path) -> int: - num_parts = 0 - for filename in os.listdir(dir_model): - if filename.startswith("pytorch_model-"): - num_parts += 1 - - if num_parts > 0: - print("gguf: found " + str(num_parts) + " model parts") - return num_parts - - -def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser(description="Convert a StarCoder model to a GGML compatible file") - parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab") - parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") - parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)") - parser.add_argument("ftype", type=int, help="output format - use 0 for float32, 1 for float16", choices=[0, 1], default = 1) - return parser.parse_args() - -args = parse_args() - -dir_model = args.model -ftype = args.ftype -if not dir_model.is_dir(): - print(f'Error: {args.model} is not a directory', file = sys.stderr) - sys.exit(1) - -# possible tensor data types -# ftype == 0 -> float32 -# ftype == 1 -> float16 - -# map from ftype to string -ftype_str = ["f32", "f16"] - -if args.outfile is not None: - fname_out = args.outfile -else: - # output in the same directory as the model by default - fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf' - -print("gguf: loading model "+dir_model.name) - -with open(dir_model / "config.json", "r", encoding="utf-8") as f: - hparams = json.load(f) - -if hparams["architectures"][0] != "GPTBigCodeForCausalLM": - print("Model architecture not supported: " + hparams["architectures"][0]) - - sys.exit(1) - -# get number of model parts -num_parts = count_model_parts(dir_model) - -ARCH=gguf.MODEL_ARCH.STARCODER -gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH]) - -print("gguf: get model metadata") - -block_count = hparams["n_layer"] - -gguf_writer.add_name("StarCoder") -gguf_writer.add_context_length(hparams["n_positions"]) -gguf_writer.add_embedding_length(hparams["n_embd"]) -gguf_writer.add_feed_forward_length(4 * hparams["n_embd"]) -gguf_writer.add_block_count(block_count) -gguf_writer.add_head_count(hparams["n_head"]) -gguf_writer.add_head_count_kv(1) -gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"]) -gguf_writer.add_file_type(ftype) - -# TOKENIZATION - -print("gguf: get tokenizer metadata") - -tokens: list[bytearray] = [] -scores: list[float] = [] -toktypes: list[int] = [] - -# gpt2 tokenizer -gguf_writer.add_tokenizer_model("gpt2") - -print("gguf: get gpt2 tokenizer vocab") - -# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py -tokenizer = AutoTokenizer.from_pretrained(dir_model) - -# The number of tokens in tokenizer.json can differ from the expected vocab size. -# This causes downstream issues with mismatched tensor sizes when running the inference -vocab_size = hparams.get("vocab_size", len(tokenizer.vocab)) -assert max(tokenizer.vocab.values()) < vocab_size - -reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()} - -for i in range(vocab_size): - tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]") - scores.append(0.0) # dummy - toktypes.append(gguf.TokenType.NORMAL) - -gguf_writer.add_token_list(tokens) -gguf_writer.add_token_scores(scores) -gguf_writer.add_token_types(toktypes) - -special_vocab = gguf.SpecialVocab(dir_model, load_merges = True) -special_vocab.add_to_gguf(gguf_writer) - -# TENSORS - -tensor_map = gguf.get_tensor_name_map(ARCH,block_count) - -# params for qkv transform -n_head = hparams["n_head"] -n_head_kv = hparams["n_head_kv"] if "n_head_kv" in hparams else 1 - -head_dim = hparams["n_embd"] // n_head - -# tensor info -print("gguf: get tensor metadata") - -if num_parts == 0: - part_names = iter(("pytorch_model.bin",)) -else: - part_names = ( - f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1) - ) - -for part_name in part_names: - if args.vocab_only: - break - print("gguf: loading model part '" + part_name + "'") - model_part = torch.load(dir_model / part_name, map_location="cpu") - - for name in model_part.keys(): - data = model_part[name] - - old_dtype = data.dtype - - # convert any unsupported data types to float32 - if data.dtype != torch.float16 and data.dtype != torch.float32: - data = data.to(torch.float32) - - data = data.squeeze().numpy() - - # map tensor names - new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias")) - if new_name is None: - print("Can not map tensor '" + name + "'") - sys.exit() - - n_dims = len(data.shape) - data_dtype = data.dtype - - # if f32 desired, convert any float16 to float32 - if ftype == 0 and data_dtype == np.float16: - data = data.astype(np.float32) - - # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 - if ftype == 1 and data_dtype == np.float16 and n_dims == 1: - data = data.astype(np.float32) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: - data = data.astype(np.float16) - - print(name, "=>", new_name + ", shape = " + str(data.shape) + ", " + str(old_dtype) + " --> " + str(data.dtype)) - - gguf_writer.add_tensor(new_name, data) - - -print("gguf: write header") -gguf_writer.write_header_to_file() -print("gguf: write metadata") -gguf_writer.write_kv_data_to_file() -if not args.vocab_only: - print("gguf: write tensors") - gguf_writer.write_tensors_to_file() - -gguf_writer.close() - -print(f"gguf: model successfully exported to '{fname_out}'") -print("") diff --git a/convert.py b/convert.py index e9b08d344..63a0a5d78 100755 --- a/convert.py +++ b/convert.py @@ -3,15 +3,14 @@ from __future__ import annotations import argparse import concurrent.futures -import copy import enum import faulthandler import functools -import io import itertools import json import math import mmap +import os import pickle import re import signal @@ -23,14 +22,13 @@ from abc import ABCMeta, abstractmethod from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor from dataclasses import dataclass from pathlib import Path -from typing import IO, TYPE_CHECKING, Any, Callable, Generator, Iterable, Literal, Sequence, TypeVar +from typing import IO, TYPE_CHECKING, Any, Callable, Iterable, Literal, TypeVar import numpy as np -from sentencepiece import SentencePieceProcessor # type: ignore[import] +from sentencepiece import SentencePieceProcessor -import os if 'NO_LOCAL_GGUF' not in os.environ: - sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf')) + sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) import gguf if TYPE_CHECKING: @@ -44,10 +42,12 @@ NDArray: TypeAlias = 'np.ndarray[Any, Any]' ARCH = gguf.MODEL_ARCH.LLAMA DEFAULT_CONCURRENCY = 8 + # # data types # + @dataclass(frozen=True) class DataType: name: str @@ -57,14 +57,17 @@ class DataType: def elements_to_bytes(self, n_elements: int) -> int: return n_elements * self.dtype.itemsize + @dataclass(frozen=True) class UnquantizedDataType(DataType): pass -DT_F16 = UnquantizedDataType('F16', dtype = np.dtype(np.float16), valid_conversions = ['F32', 'Q8_0']) -DT_F32 = UnquantizedDataType('F32', dtype = np.dtype(np.float32), valid_conversions = ['F16', 'Q8_0']) -DT_I32 = UnquantizedDataType('I32', dtype = np.dtype(np.int16), valid_conversions = []) -DT_BF16 = UnquantizedDataType('BF16', dtype = np.dtype(np.uint16), valid_conversions = ['F32', 'F16', 'Q8_0']) + +DT_F16 = UnquantizedDataType('F16', dtype = np.dtype(np.float16), valid_conversions = ['F32', 'Q8_0']) +DT_F32 = UnquantizedDataType('F32', dtype = np.dtype(np.float32), valid_conversions = ['F16', 'Q8_0']) +DT_I32 = UnquantizedDataType('I32', dtype = np.dtype(np.int16), valid_conversions = []) +DT_BF16 = UnquantizedDataType('BF16', dtype = np.dtype(np.uint16), valid_conversions = ['F32', 'F16', 'Q8_0']) + @dataclass(frozen=True) class QuantizedDataType(DataType): @@ -79,6 +82,7 @@ class QuantizedDataType(DataType): assert n_elements % self.block_size == 0, f'Invalid number of elements {n_elements} for {self.name} with block size {self.block_size}' return self.quantized_dtype.itemsize * (n_elements // self.block_size) + @dataclass(frozen=True) class Q8_0QuantizedDataType(QuantizedDataType): # Mini Q8_0 quantization in Python! @@ -88,6 +92,7 @@ class Q8_0QuantizedDataType(QuantizedDataType): n_blocks = arr.size // self.block_size blocks = arr.reshape((n_blocks, self.block_size)) # Much faster implementation of block quantization contributed by @Cebtenzzre + def quantize_blocks_q8_0(blocks: NDArray) -> Iterable[tuple[Any, Any]]: d = abs(blocks).max(axis = 1) / np.float32(127) with np.errstate(divide = 'ignore'): @@ -96,10 +101,11 @@ class Q8_0QuantizedDataType(QuantizedDataType): yield from zip(d, qs) return np.fromiter(quantize_blocks_q8_0(blocks), count = n_blocks, dtype = self.quantized_dtype) + DT_Q8_0 = Q8_0QuantizedDataType('Q8_0', - dtype = np.dtype(np.float32), valid_conversions = [], - ggml_type = gguf.GGMLQuantizationType.Q8_0, block_size = 32, - quantized_dtype = np.dtype([('d', ' 1 else DT_F32 + GGML_FILE_TYPE_TO_DATA_TYPE: dict[GGMLFileType, DataType] = { GGMLFileType.AllF32 : DT_F32, GGMLFileType.MostlyF16 : DT_F16, @@ -140,19 +149,25 @@ GGML_FILE_TYPE_TO_DATA_TYPE: dict[GGMLFileType, DataType] = { # hparams loading # + @dataclass class Params: - n_vocab: int - n_embd: int - n_layer: int - n_ctx: int - n_ff: int - n_head: int - n_head_kv: int - f_norm_eps: float + n_vocab: int + n_embd: int + n_layer: int + n_ctx: int + n_ff: int + n_head: int + n_head_kv: int + n_experts: int | None = None + n_experts_used: int | None = None + f_norm_eps: float | None = None + rope_scaling_type: gguf.RopeScalingType | None = None f_rope_freq_base: float | None = None f_rope_scale: float | None = None + n_orig_ctx: int | None = None + rope_finetuned: bool | None = None ftype: GGMLFileType | None = None @@ -166,11 +181,11 @@ class Params: # try transformer naming first if "model.layers.0.self_attn.q_proj.weight" in model: - n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model) + n_layer = next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model) elif "model.layers.0.self_attn.W_pack.weight" in model: # next: try baichuan naming - n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.W_pack.weight" not in model) + n_layer = next(i for i in itertools.count() if f"model.layers.{i}.self_attn.W_pack.weight" not in model) else: - n_layer=next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model) + n_layer = next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model) if n_layer < 1: raise Exception("failed to guess 'n_layer'. This model is unknown or unsupported.\n" @@ -198,20 +213,20 @@ class Params: def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params: config = json.load(open(config_path)) - n_vocab = config["vocab_size"] - n_embd = config["hidden_size"] - n_layer = config["num_hidden_layers"] - n_ff = config["intermediate_size"] - n_head = config["num_attention_heads"] - n_head_kv = config["num_key_value_heads"] if "num_key_value_heads" in config else n_head - f_norm_eps = config["rms_norm_eps"] - f_rope_freq_base = config["rope_theta"] if "rope_theta" in config else None - + rope_scaling_type = f_rope_scale = n_orig_ctx = rope_finetuned = None rope_scaling = config.get("rope_scaling") - if isinstance(rope_scaling, dict) and rope_scaling.get("type") == "linear": - f_rope_scale = config["rope_scaling"].get("factor") - else: - f_rope_scale = None + + if rope_scaling is not None and (typ := rope_scaling.get("type")): + rope_factor = rope_scaling.get("factor") + f_rope_scale = rope_factor + if typ == "linear": + rope_scaling_type = gguf.RopeScalingType.LINEAR + elif typ == "yarn": + rope_scaling_type = gguf.RopeScalingType.YARN + n_orig_ctx = rope_scaling['original_max_position_embeddings'] + rope_finetuned = rope_scaling['finetuned'] + else: + raise NotImplementedError(f'Unknown rope scaling type: {typ}') if "max_sequence_length" in config: n_ctx = config["max_sequence_length"] @@ -221,17 +236,29 @@ class Params: raise Exception("failed to guess 'n_ctx'. This model is unknown or unsupported.\n" "Suggestion: provide 'config.json' of the model in the same directory containing model files.") + n_experts = None + n_experts_used = None + + if "num_local_experts" in config: + n_experts = config["num_local_experts"] + n_experts_used = config["num_experts_per_tok"] + return Params( - n_vocab = n_vocab, - n_embd = n_embd, - n_layer = n_layer, - n_ctx = n_ctx, - n_ff = n_ff, - n_head = n_head, - n_head_kv = n_head_kv, - f_norm_eps = f_norm_eps, - f_rope_freq_base = f_rope_freq_base, - f_rope_scale = f_rope_scale, + n_vocab = config["vocab_size"], + n_embd = config["hidden_size"], + n_layer = config["num_hidden_layers"], + n_ctx = n_ctx, + n_ff = config["intermediate_size"], + n_head = (n_head := config["num_attention_heads"]), + n_head_kv = config.get("num_key_value_heads", n_head), + n_experts = n_experts, + n_experts_used = n_experts_used, + f_norm_eps = config["rms_norm_eps"], + f_rope_freq_base = config.get("rope_theta"), + rope_scaling_type = rope_scaling_type, + f_rope_scale = f_rope_scale, + n_orig_ctx = n_orig_ctx, + rope_finetuned = rope_finetuned, ) # LLaMA v2 70B params.json @@ -240,17 +267,15 @@ class Params: def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params: config = json.load(open(config_path)) - n_vocab = config["vocab_size"] if "vocab_size" in config else -1 - n_embd = config["dim"] - n_layer = config["n_layers"] - n_ff = -1 - n_head = config["n_heads"] - n_head_kv = config["n_kv_heads"] if "n_kv_heads" in config else n_head - f_norm_eps = config["norm_eps"] - f_rope_freq_base = config["rope_theta"] if "rope_theta" in config else None + n_experts = None + n_experts_used = None + f_rope_freq_base = None # hack to determine LLaMA v1 vs v2 vs CodeLlama - if f_rope_freq_base == 1000000: + if config.get("moe"): + # Mixtral + n_ctx = 32768 + elif config.get("rope_theta") == 1000000: # CodeLlama n_ctx = 16384 elif config["norm_eps"] == 1e-05: @@ -260,22 +285,27 @@ class Params: # LLaMA v1 n_ctx = 2048 - if n_vocab == -1: - n_vocab = model["tok_embeddings.weight"].shape[0] - - if n_ff == -1: + if "layers.0.feed_forward.w1.weight" in model: n_ff = model["layers.0.feed_forward.w1.weight"].shape[0] + if config.get("moe"): + n_ff = model["layers.0.feed_forward.experts.0.w1.weight"].shape[0] + n_experts = config["moe"]["num_experts"] + n_experts_used = config["moe"]["num_experts_per_tok"] + f_rope_freq_base = 1e6 + return Params( - n_vocab = n_vocab, - n_embd = n_embd, - n_layer = n_layer, + n_vocab = model["tok_embeddings.weight"].shape[0], + n_embd = config["dim"], + n_layer = config["n_layers"], n_ctx = n_ctx, n_ff = n_ff, - n_head = n_head, - n_head_kv = n_head_kv, - f_norm_eps = f_norm_eps, - f_rope_freq_base = f_rope_freq_base, + n_head = (n_head := config["n_heads"]), + n_head_kv = config.get("n_kv_heads", n_head), + n_experts = n_experts, + n_experts_used = n_experts_used, + f_norm_eps = config["norm_eps"], + f_rope_freq_base = config.get("rope_theta", f_rope_freq_base), ) @staticmethod @@ -304,6 +334,10 @@ class Params: class BpeVocab: def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None: self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read()) + if isinstance(self.bpe_tokenizer.get('model'), dict): + self.vocab = self.bpe_tokenizer["model"]["vocab"] + else: + self.vocab = self.bpe_tokenizer added_tokens: dict[str, int] if fname_added_tokens is not None: # FIXME: Verify that added tokens here _cannot_ overlap with the main vocab. @@ -319,9 +353,9 @@ class BpeVocab: (item['content'], item['id']) for item in tokenizer_json.get('added_tokens', []) # Added tokens here can be duplicates of the main vocabulary. - if item['content'] not in self.bpe_tokenizer ) + if item['content'] not in self.bpe_tokenizer) - vocab_size: int = len(self.bpe_tokenizer) + vocab_size: int = len(self.vocab) expected_ids = list(range(vocab_size, vocab_size + len(added_tokens))) actual_ids = sorted(added_tokens.values()) if expected_ids != actual_ids: @@ -329,6 +363,7 @@ class BpeVocab: raise Exception(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range {vocab_size} - {expected_end_id}; got {actual_ids}") items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1]) + self.added_tokens_dict = added_tokens self.added_tokens_list = [text for (text, idx) in items] self.vocab_size_base: int = vocab_size self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list) @@ -336,11 +371,9 @@ class BpeVocab: self.fname_added_tokens = fname_added_tokens def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: - tokenizer = self.bpe_tokenizer - from transformers.models.gpt2 import tokenization_gpt2 # type: ignore[import] - reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.items()} + reverse_vocab = {id: encoded_tok for encoded_tok, id in self.vocab.items()} - for i, _ in enumerate(tokenizer): + for i, _ in enumerate(self.vocab): yield reverse_vocab[i], 0.0, gguf.TokenType.NORMAL def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: @@ -366,16 +399,20 @@ class SentencePieceVocab: added_tokens = {} vocab_size: int = self.sentencepiece_tokenizer.vocab_size() - expected_ids = list(range(vocab_size, vocab_size + len(added_tokens))) - actual_ids = sorted(added_tokens.values()) - if expected_ids != actual_ids: - raise Exception(f"Expected added token IDs to be sequential and start at {len(added_tokens)}; got {actual_ids}") - items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1]) - self.added_tokens_list = [text for (text, idx) in items] - self.vocab_size_base: int = vocab_size - self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list) - self.fname_tokenizer = fname_tokenizer + new_tokens = {id: piece for piece, id in added_tokens.items() if id >= vocab_size} + expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens))) + actual_new_ids = sorted(new_tokens.keys()) + + if expected_new_ids != actual_new_ids: + raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}") + + # Token pieces that were added to the base vocabulary. + self.added_tokens_dict = added_tokens + self.added_tokens_list = [new_tokens[id] for id in actual_new_ids] + self.vocab_size_base = vocab_size + self.vocab_size = self.vocab_size_base + len(self.added_tokens_list) + self.fname_tokenizer = fname_tokenizer self.fname_added_tokens = fname_added_tokens def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: @@ -414,20 +451,124 @@ class SentencePieceVocab: def __repr__(self) -> str: return f"" -Vocab: TypeAlias = 'BpeVocab | SentencePieceVocab' + +class HfVocab: + def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None = None) -> None: + try: + from transformers import AutoTokenizer + except ImportError as e: + raise ImportError( + "To use HfVocab, please install the `transformers` package. " + "You can install it with `pip install transformers`." + ) from e + + print("fname_tokenizer:", fname_tokenizer) + # Allow the tokenizer to default to slow or fast versions. + # Explicitly set tokenizer to use local paths. + self.tokenizer = AutoTokenizer.from_pretrained( + fname_tokenizer, + cache_dir=fname_tokenizer, + local_files_only=True, + ) + + # Initialize lists and dictionaries for added tokens + self.added_tokens_list = [] + self.added_tokens_dict = dict() + self.added_tokens_ids = set() + + # Process added tokens + for tok, tokidx in sorted( + self.tokenizer.get_added_vocab().items(), key=lambda x: x[1] + ): + # Only consider added tokens that are not in the base vocabulary + if tokidx >= self.tokenizer.vocab_size: + self.added_tokens_list.append(tok) + self.added_tokens_dict[tok] = tokidx + self.added_tokens_ids.add(tokidx) + + # Store special tokens and their IDs + self.specials = { + tok: self.tokenizer.get_vocab()[tok] + for tok in self.tokenizer.all_special_tokens + } + self.special_ids = set(self.tokenizer.all_special_ids) + + # Set vocabulary sizes + self.vocab_size_base = self.tokenizer.vocab_size + self.vocab_size = self.vocab_size_base + len(self.added_tokens_list) + + self.fname_tokenizer = fname_tokenizer + self.fname_added_tokens = fname_added_tokens + + def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: + reverse_vocab = { + id: encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items() + } + + for token_id in range(self.vocab_size_base): + # Skip processing added tokens here + if token_id in self.added_tokens_ids: + continue + + # Convert token text to bytes + token_text = reverse_vocab[token_id].encode("utf-8") + + # Yield token text, score, and type + yield token_text, self.get_token_score(token_id), self.get_token_type( + token_id, token_text, self.special_ids # Reuse already stored special IDs + ) + + def get_token_type(self, token_id: int, token_text: bytes, special_ids: set[int]) -> gguf.TokenType: + # Special case for byte tokens + if re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text): + return gguf.TokenType.BYTE + + # Determine token type based on whether it's a special token + return gguf.TokenType.CONTROL if token_id in special_ids else gguf.TokenType.NORMAL + + def get_token_score(self, token_id: int) -> float: + # Placeholder for actual logic to determine the token's score + # This needs to be implemented based on specific requirements + return -1000.0 # Default score + + def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: + for text in self.added_tokens_list: + if text in self.specials: + toktype = self.get_token_type(self.specials[text], b'', self.special_ids) + score = self.get_token_score(self.specials[text]) + else: + toktype = gguf.TokenType.USER_DEFINED + score = -1000.0 + + yield text.encode("utf-8"), score, toktype + + def has_newline_token(self): + return "<0x0A>" in self.tokenizer.vocab or "\n" in self.tokenizer.vocab + + def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: + yield from self.hf_tokens() + yield from self.added_tokens() + + def __repr__(self) -> str: + return f"" + + +Vocab: TypeAlias = "BpeVocab | SentencePieceVocab | HfVocab" + # # data loading # TODO: reuse (probably move to gguf.py?) # + def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray: - #print( "permute debug " + str(weights.shape[0]) + " x " + str(weights.shape[1]) + " nhead " + str(n_head) + " nheadkv " + str(n_kv_head) ) + # print( "permute debug " + str(weights.shape[0]) + " x " + str(weights.shape[1]) + " nhead " + str(n_head) + " nheadkv " + str(n_kv_head) ) if n_head_kv is not None and n_head != n_head_kv: n_head = n_head_kv return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) - .swapaxes(1, 2) - .reshape(weights.shape)) + .swapaxes(1, 2) + .reshape(weights.shape)) class Tensor(metaclass=ABCMeta): @@ -508,7 +649,7 @@ class LazyTensor: ret = self._load() # Should be okay if it maps to the same numpy type? assert ret.data_type == self.data_type or (self.data_type.dtype == ret.data_type.dtype), \ - (self.data_type, ret.data_type, self.description) + (self.data_type, ret.data_type, self.description) return ret def astype(self, data_type: DataType) -> LazyTensor: @@ -581,14 +722,14 @@ def merge_multifile_models(models_plus: list[ModelPlus]) -> ModelPlus: if any("model.embed_tokens.weight" in mp.model for mp in models_plus): # Transformers models put different tensors in different files, but - # don't split indivdual tensors between files. + # don't split individual tensors between files. model: LazyModel = {} for mp in models_plus: model.update(mp.model) else: model = merge_sharded([mp.model for mp in models_plus]) - return ModelPlus(model, paths, format, vocab) + return ModelPlus(model, paths, format, vocab) # pytype: disable=wrong-arg-types def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_head_kv: int) -> LazyTensor: @@ -596,6 +737,7 @@ def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_head_kv: int) -> LazyTe return lazy_tensor.load().permute(n_head, n_head_kv) return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}, {n_head_kv}) ' + lazy_tensor.description) + def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int, n_head_kv: int) -> LazyTensor: def load() -> Tensor: return lazy_tensor.load().permute_part(n_part, n_head, n_head_kv) @@ -603,6 +745,7 @@ def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int, n_head_ s[0] = s[0] // 3 return LazyTensor(load, s, lazy_tensor.data_type, f'permute({n_head}, {n_head_kv}) ' + lazy_tensor.description) + def part_lazy(lazy_tensor: LazyTensor, n_part: int) -> LazyTensor: def load() -> Tensor: return lazy_tensor.load().part(n_part) @@ -672,7 +815,7 @@ class LazyUnpickler(pickle.Unpickler): return func(*args) CLASSES: dict[tuple[str, str], Any] = { - # getattr used here as a workaround for mypy not being smart enough to detrmine + # getattr used here as a workaround for mypy not being smart enough to determine # the staticmethods have a __func__ attribute. ('torch._tensor', '_rebuild_from_type_v2'): getattr(rebuild_from_type_v2, '__func__'), ('torch._utils', '_rebuild_tensor_v2'): getattr(lazy_rebuild_tensor_v2, '__func__'), @@ -698,6 +841,7 @@ def lazy_load_torch_file(outer_fp: IO[bytes], path: Path) -> ModelPlus: data_base_path=pickle_paths[0][:-4], zip_file=zf) model = unpickler.load() + if 'model' in model: model = model['model'] as_dict = dict(model.items()) return ModelPlus(model=as_dict, paths=[path], format='torch', vocab=None) @@ -751,6 +895,7 @@ def lazy_load_file(path: Path) -> ModelPlus: In = TypeVar('In') Out = TypeVar('Out') + def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], concurrency: int, max_workers: int | None = None, use_processpool_executor: bool = False) -> Iterable[Out]: '''Parallel map, but with backpressure. If the caller doesn't call `next` fast enough, this will stop calling `func` at some point rather than @@ -765,7 +910,7 @@ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], conc executor_class = ProcessPoolExecutor else: executor_class = ThreadPoolExecutor - with executor_class(max_workers = max_workers) as executor: + with executor_class(max_workers=max_workers) as executor: futures: list[concurrent.futures.Future[Out]] = [] done = False for _ in range(concurrency): @@ -785,26 +930,42 @@ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], conc break yield result -def check_vocab_size(params: Params, vocab: Vocab) -> None: - if params.n_vocab != vocab.vocab_size: - assert isinstance(vocab, BpeVocab) or isinstance(vocab, SentencePieceVocab) - if params.n_vocab == vocab.vocab_size_base: - print("Ignoring added_tokens.json since model matches vocab size without it.") - vocab.added_tokens_list = [] - vocab.vocab_size = vocab.vocab_size_base - return - msg = f"Vocab size mismatch (model has {params.n_vocab}, but {vocab.fname_tokenizer}" - if vocab.fname_added_tokens is not None: - msg += f" combined with {vocab.fname_added_tokens}" - msg += f" has {vocab.vocab_size})." - if vocab.vocab_size < params.n_vocab < vocab.vocab_size + 20 and vocab.fname_added_tokens is None: - msg += f" Most likely you are missing added_tokens.json (should be in {vocab.fname_tokenizer.parent})." - raise Exception(msg) + +def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> None: + # Handle special case where the model's vocab size is not set + if params.n_vocab == -1: + raise ValueError( + f"The model's vocab size is set to -1 in params.json. Please update it manually. Maybe {vocab.vocab_size}?" + ) + + # Check for a vocab size mismatch + if params.n_vocab == vocab.vocab_size: + print("Ignoring added_tokens.json since model matches vocab size without it.") + return + + if pad_vocab and params.n_vocab > vocab.vocab_size: + pad_count = params.n_vocab - vocab.vocab_size + print( + f"Padding vocab with {pad_count} token(s) - through " + ) + for i in range(1, pad_count + 1): + vocab.added_tokens_dict[f""] = -1 + vocab.added_tokens_list.append(f"") + vocab.vocab_size = params.n_vocab + return + + msg = f"Vocab size mismatch (model has {params.n_vocab}, but {vocab.fname_tokenizer} has {vocab.vocab_size})." + if vocab.vocab_size < params.n_vocab < vocab.vocab_size + 20: + msg += f" Most likely you are missing added_tokens.json (should be in {vocab.fname_tokenizer.parent})." + if vocab.vocab_size < params.n_vocab: + msg += " Add the --pad-vocab option and try again." + + raise Exception(msg) class OutputFile: - def __init__(self, fname_out: Path) -> None: - self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH]) + def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE) -> None: + self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess) def add_meta_arch(self, params: Params) -> None: name = "LLaMA" @@ -823,33 +984,75 @@ class OutputFile: self.gguf.add_rope_dimension_count(params.n_embd // params.n_head) self.gguf.add_head_count (params.n_head) self.gguf.add_head_count_kv (params.n_head_kv) - self.gguf.add_layer_norm_rms_eps (params.f_norm_eps) + + if params.n_experts: + self.gguf.add_expert_count(params.n_experts) + + if params.n_experts_used: + self.gguf.add_expert_used_count(params.n_experts_used) + + if params.f_norm_eps: + self.gguf.add_layer_norm_rms_eps(params.f_norm_eps) + else: + raise ValueError('f_norm_eps is None') if params.f_rope_freq_base is not None: self.gguf.add_rope_freq_base(params.f_rope_freq_base) - if params.f_rope_scale is not None: - self.gguf.add_rope_scale_linear(params.f_rope_scale) + if params.rope_scaling_type: + assert params.f_rope_scale is not None + self.gguf.add_rope_scaling_type(params.rope_scaling_type) + self.gguf.add_rope_scaling_factor(params.f_rope_scale) + + if params.n_orig_ctx is not None: + self.gguf.add_rope_scaling_orig_ctx_len(params.n_orig_ctx) + + if params.rope_finetuned is not None: + self.gguf.add_rope_scaling_finetuned(params.rope_finetuned) if params.ftype is not None: self.gguf.add_file_type(params.ftype) - def add_meta_vocab(self, vocab: Vocab) -> None: + def handle_tokenizer_model(self, vocab: Vocab) -> str: + # Map the vocab types to the supported tokenizer models + tokenizer_model = { + SentencePieceVocab: "llama", + HfVocab: "llama", + BpeVocab: "gpt2", + }.get(type(vocab)) + + # Block if vocab type is not predefined + if tokenizer_model is None: + raise ValueError("Unknown vocab type: Not supported") + + return tokenizer_model + + def extract_vocabulary_from_model(self, vocab: Vocab) -> tuple[list[bytes], list[float], list[gguf.TokenType]]: tokens = [] scores = [] toktypes = [] + # NOTE: `all_tokens` returns the base vocabulary and added tokens for text, score, toktype in vocab.all_tokens(): tokens.append(text) scores.append(score) toktypes.append(toktype) - if isinstance(vocab, SentencePieceVocab): - self.gguf.add_tokenizer_model("llama") - elif isinstance(vocab, BpeVocab): - self.gguf.add_tokenizer_model("gpt2") - else: - raise ValueError(f'Unknown vocab type: Not BpeVocab or SentencePieceVocab') + assert len(tokens) == vocab.vocab_size + + return tokens, scores, toktypes + + def add_meta_vocab(self, vocab: Vocab) -> None: + # Handle the tokenizer model + tokenizer_model = self.handle_tokenizer_model(vocab) + + # Ensure that tokenizer_model is added to the GGUF model + self.gguf.add_tokenizer_model(tokenizer_model) + + # Extract model vocabulary for model conversion + tokens, scores, toktypes = self.extract_vocabulary_from_model(vocab) + + # Add extracted token information for model conversion self.gguf.add_token_list(tokens) self.gguf.add_token_scores(scores) self.gguf.add_token_types(toktypes) @@ -862,7 +1065,7 @@ class OutputFile: raw_dtype = getattr(tensor.data_type, 'ggml_type', None) data_type = getattr(tensor.data_type, 'quantized_type', None) or tensor.data_type.dtype data_nbytes = tensor.data_type.elements_to_bytes(n_elements) - self.gguf.add_tensor_info(name, tensor.shape, data_type, data_nbytes, raw_dtype = raw_dtype) + self.gguf.add_tensor_info(name, tensor.shape, data_type, data_nbytes, raw_dtype=raw_dtype) def write_meta(self) -> None: self.gguf.write_header_to_file() @@ -875,10 +1078,13 @@ class OutputFile: self.gguf.close() @staticmethod - def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab) -> None: - check_vocab_size(params, vocab) + def write_vocab_only( + fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, + endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False, + ) -> None: + check_vocab_size(params, vocab, pad_vocab = pad_vocab) - of = OutputFile(fname_out) + of = OutputFile(fname_out, endianess=endianess) # meta data of.add_meta_arch(params) @@ -903,10 +1109,14 @@ class OutputFile: return dt.quantize(arr) @staticmethod - def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, concurrency: int = DEFAULT_CONCURRENCY) -> None: - check_vocab_size(params, vocab) + def write_all( + fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, + concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, + pad_vocab: bool = False, + ) -> None: + check_vocab_size(params, vocab, pad_vocab=pad_vocab) - of = OutputFile(fname_out) + of = OutputFile(fname_out, endianess=endianess) # meta data of.add_meta_arch(params) @@ -923,7 +1133,10 @@ class OutputFile: # tensor data ndarrays_inner = bounded_parallel_map(OutputFile.do_item, model.items(), concurrency = concurrency) if ftype == GGMLFileType.MostlyQ8_0: - ndarrays = bounded_parallel_map(OutputFile.maybe_do_quantize, ndarrays_inner, concurrency = concurrency, max_workers = concurrency, use_processpool_executor = True) + ndarrays = bounded_parallel_map( + OutputFile.maybe_do_quantize, ndarrays_inner, concurrency=concurrency, max_workers=concurrency, + use_processpool_executor=True, + ) else: ndarrays = map(OutputFile.maybe_do_quantize, ndarrays_inner) @@ -932,13 +1145,16 @@ class OutputFile: elapsed = time.time() - start size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape) padi = len(str(len(model))) - print(f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}") + print( + f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}" + ) of.gguf.write_tensor_data(ndarray) of.close() + def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileType: - wq_type = model[gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0)+".weight"].data_type + wq_type = model[gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0) + ".weight"].data_type if output_type_str == "f32" or (output_type_str is None and wq_type == DT_F32): return GGMLFileType.AllF32 @@ -951,11 +1167,13 @@ def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileT raise Exception(f"Unexpected combination of types: {name_to_type}") + def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel: return {name: tensor.astype(output_type.type_for_tensor(name, tensor)) for (name, tensor) in model.items()} -def convert_model_names(model: LazyModel, params: Params) -> LazyModel: + +def convert_model_names(model: LazyModel, params: Params, skip_unknown: bool) -> LazyModel: tmap = gguf.TensorNameMap(ARCH, params.n_layer) should_skip: set[gguf.MODEL_TENSOR] = set(gguf.MODEL_TENSOR_SKIP.get(ARCH, [])) @@ -967,7 +1185,7 @@ def convert_model_names(model: LazyModel, params: Params) -> LazyModel: print(f"Permuting layer {i}") tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head, params.n_head) tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head, params.n_head_kv) - #tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"] + # tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"] elif f"model.layers.{i}.self_attn.W_pack.weight" in model: print(f"Unpacking and permuting layer {i}") tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head, params.n_head) @@ -981,7 +1199,11 @@ def convert_model_names(model: LazyModel, params: Params) -> LazyModel: for name, lazy_tensor in model.items(): tensor_type, name_new = tmap.get_type_and_name(name, try_suffixes = (".weight", ".bias")) or (None, None) if name_new is None: - raise Exception(f"Unexpected tensor name: {name}") + if skip_unknown: + print(f"Unexpected tensor name: {name} - skipping") + continue + else: + raise Exception(f"Unexpected tensor name: {name}. Use --skip-unknown to ignore it (e.g. LLaVA)") if tensor_type in should_skip: print(f"skipping tensor {name_new}") @@ -992,6 +1214,7 @@ def convert_model_names(model: LazyModel, params: Params) -> LazyModel: return out + def nth_multifile_path(path: Path, n: int) -> Path | None: '''Given any path belonging to a multi-file model (e.g. foo.bin.1), return the nth path in the model. @@ -1036,7 +1259,8 @@ def load_some_model(path: Path) -> ModelPlus: # Be extra-friendly and accept either a file or a directory: if path.is_dir(): # Check if it's a set of safetensors files first - files = list(path.glob("model-00001-of-*.safetensors")) + globs = ["model-00001-of-*.safetensors", "model.safetensors"] + files = [file for glob in globs for file in path.glob(glob)] if not files: # Try the PyTorch patterns too, with lower priority globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt", "pytorch_model.bin"] @@ -1057,36 +1281,75 @@ def load_some_model(path: Path) -> ModelPlus: return model_plus -def load_vocab(path: Path, vocabtype: str | None) -> Vocab: - # Be extra-friendly and accept either a file or a directory. Also, if it's - # a directory, it might be the model directory, and tokenizer.model might - # be in the parent of that. - if path.is_dir(): - vocab_file = "tokenizer.model" - if vocabtype == 'bpe': - vocab_file = "vocab.json" - path2 = path / vocab_file - # Use `.parent` instead of /.. to handle the symlink case better. - path3 = path.parent / vocab_file - if path2.exists(): - path = path2 - elif path3.exists(): - path = path3 - else: - raise FileNotFoundError( - f"Could not find {vocab_file} in {path} or its parent; " - "if it's in another directory, pass the directory as --vocab-dir") +class VocabFactory: + def __init__(self, path: Path): + self.path = path + self.files: dict[str, Path | None] = { + "tokenizer.model": None, + "vocab.json": None, + "tokenizer.json": None, + } + self._detect_files() - print(f"Loading vocab file '{path}', type '{vocabtype}'") + def _detect_files(self): + for file in self.files.keys(): + file_path = self.path / file + parent_file_path = self.path.parent / file + if file_path.exists(): + self.files[file] = file_path + elif parent_file_path.exists(): + self.files[file] = parent_file_path + print(f"Found vocab files: {self.files}") - added_tokens_path = path.parent / "added_tokens.json" - if vocabtype == "bpe": - return BpeVocab(path, added_tokens_path if added_tokens_path.exists() else None) - elif vocabtype == "spm": - return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None) - else: + def _select_file(self, vocabtype: str | None) -> Path: + if vocabtype in ["spm", "bpe"]: + for file_key in self.files.keys(): + if (file := self.files[file_key]) is not None: + return file + raise FileNotFoundError(f"{vocabtype} vocab not found.") + if vocabtype == "hfft": + # For Hugging Face Fast Tokenizer, return the directory path instead of a specific file + return self.path raise ValueError(f"Unsupported vocabulary type {vocabtype}") + def _create_special_vocab(self, vocab: Vocab, vocabtype: str, model_parent_path: Path) -> gguf.SpecialVocab: + load_merges = vocabtype == "bpe" + n_vocab = vocab.vocab_size if hasattr(vocab, "vocab_size") else None + return gguf.SpecialVocab( + model_parent_path, + load_merges=load_merges, + special_token_types=None, # Predetermined or passed as a parameter + n_vocab=n_vocab, + ) + + def load_vocab(self, vocabtype: str, model_parent_path: Path) -> tuple[Vocab, gguf.SpecialVocab]: + path = self._select_file(vocabtype) + print(f"Loading vocab file '{path}', type '{vocabtype}'") + + added_tokens_path = path.parent / "added_tokens.json" + vocab: Vocab + if vocabtype == "bpe": + vocab = BpeVocab( + path, added_tokens_path if added_tokens_path.exists() else None + ) + elif vocabtype == "spm": + vocab = SentencePieceVocab( + path, added_tokens_path if added_tokens_path.exists() else None + ) + elif vocabtype == "hfft": + vocab = HfVocab( + path, added_tokens_path if added_tokens_path.exists() else None + ) + else: + raise ValueError(f"Unsupported vocabulary type {vocabtype}") + # FIXME: Respect --vocab-dir? + special_vocab = self._create_special_vocab( + vocab, + vocabtype, + model_parent_path, + ) + return vocab, special_vocab + def default_outfile(model_paths: list[Path], file_type: GGMLFileType) -> Path: namestr = { @@ -1112,18 +1375,40 @@ def do_dump_model(model_plus: ModelPlus) -> None: def main(args_in: list[str] | None = None) -> None: + output_choices = ["f32", "f16"] + if np.uint32(1) == np.uint32(1).newbyteorder("<"): + # We currently only support Q8_0 output on little endian systems. + output_choices.append("q8_0") + vocab_types = ["spm", "bpe", "hfft"] parser = argparse.ArgumentParser(description="Convert a LLaMa model to a GGML compatible file") - parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model") - parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file") - parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab") - parser.add_argument("--outtype", choices=["f32", "f16", "q8_0"], help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)") - parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file") - parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") - parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)") - parser.add_argument("--vocabtype", choices=["spm", "bpe"], help="vocab format (default: spm)", default="spm") - parser.add_argument("--ctx", type=int, help="model training context (default: based on input)") - parser.add_argument("--concurrency", type=int, help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default = DEFAULT_CONCURRENCY) + parser.add_argument("--awq-path", type=Path, help="Path to scale awq cache file", default=None) + parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model") + parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file") + parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab") + parser.add_argument("--outtype", choices=output_choices, help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)") + parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file") + parser.add_argument("--vocab-type", choices=vocab_types, help="The vocabulary format used to define the tokenizer model (default: spm)", default="spm") + parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") + parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)") + parser.add_argument("--ctx", type=int, help="model training context (default: based on input)") + parser.add_argument("--concurrency", type=int, help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default=DEFAULT_CONCURRENCY) + parser.add_argument("--big-endian", action="store_true", help="model is executed on big endian machine") + parser.add_argument("--pad-vocab", action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides") + parser.add_argument("--skip-unknown", action="store_true", help="skip unknown tensor names instead of failing") + args = parser.parse_args(args_in) + if args.awq_path: + sys.path.insert(1, str(Path(__file__).parent / 'awq-py')) + from awq.apply_awq import add_scale_weights # type: ignore[import-not-found] + tmp_model_path = args.model / "weighted_model" + if tmp_model_path.is_dir(): + print(f"{tmp_model_path} exists as a weighted model.") + else: + tmp_model_path.mkdir(parents=True, exist_ok=True) + print("Saving new weighted model ...") + add_scale_weights(str(args.model), str(args.awq_path), str(tmp_model_path)) + print(f"Saved weighted model at {tmp_model_path}.") + args.model = tmp_model_path if args.dump_single: model_plus = lazy_load_file(args.model) @@ -1138,6 +1423,9 @@ def main(args_in: list[str] | None = None) -> None: if args.dump: do_dump_model(model_plus) return + endianess = gguf.GGUFEndian.LITTLE + if args.big_endian: + endianess = gguf.GGUFEndian.BIG params = Params.load(model_plus) if params.n_ctx == -1: @@ -1157,27 +1445,28 @@ def main(args_in: list[str] | None = None) -> None: print(f"params = {params}") - vocab: Vocab + model_parent_path = model_plus.paths[0].parent + vocab_path = Path(args.vocab_dir or args.model or model_parent_path) + vocab_factory = VocabFactory(vocab_path) + vocab, special_vocab = vocab_factory.load_vocab(args.vocab_type, model_parent_path) + if args.vocab_only: - assert args.outfile, "need --outfile if using --vocab-only" - # FIXME: Try to respect vocab_dir somehow? - vocab = load_vocab(args.vocab_dir or args.model, args.vocabtype) - special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent, load_merges = args.vocabtype == 'bpe') + if not args.outfile: + raise ValueError("need --outfile if using --vocab-only") outfile = args.outfile - OutputFile.write_vocab_only(outfile, params, vocab, special_vocab) + OutputFile.write_vocab_only(outfile, params, vocab, special_vocab, + endianess=endianess, pad_vocab=args.pad_vocab) print(f"Wrote {outfile}") return if model_plus.vocab is not None and args.vocab_dir is None: vocab = model_plus.vocab - else: - vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent - vocab = load_vocab(vocab_dir, args.vocabtype) - # FIXME: Try to respect vocab_dir somehow? - special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent, load_merges = args.vocabtype == 'bpe') + + print(f"Vocab info: {vocab}") + print(f"Special vocab info: {special_vocab}") model = model_plus.model - model = convert_model_names(model, params) + model = convert_model_names(model, params, args.skip_unknown) ftype = pick_output_type(model, args.outtype) model = convert_to_output_type(model, ftype) outfile = args.outfile or default_outfile(model_plus.paths, ftype) @@ -1185,7 +1474,8 @@ def main(args_in: list[str] | None = None) -> None: params.ftype = ftype print(f"Writing {outfile}, format {ftype}") - OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, concurrency = args.concurrency) + OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, + concurrency=args.concurrency, endianess=endianess, pad_vocab=args.pad_vocab) print(f"Wrote {outfile}") diff --git a/docs/llama-star/idea-arch.key b/docs/llama-star/idea-arch.key new file mode 100755 index 000000000..3e068e707 Binary files /dev/null and b/docs/llama-star/idea-arch.key differ diff --git a/docs/llama-star/idea-arch.pdf b/docs/llama-star/idea-arch.pdf new file mode 100644 index 000000000..4fa92c71d Binary files /dev/null and b/docs/llama-star/idea-arch.pdf differ diff --git a/docs/token_generation_performance_tips.md b/docs/token_generation_performance_tips.md index c9acff7d4..d7e863dff 100644 --- a/docs/token_generation_performance_tips.md +++ b/docs/token_generation_performance_tips.md @@ -17,7 +17,7 @@ llama_model_load_internal: [cublas] total VRAM used: 17223 MB If you see these lines, then the GPU is being used. ## Verifying that the CPU is not oversaturated -llama accepts a `-t N` (or `--threads N`) parameter. It's extremely important that this parameter is not too large. If your token generation is extremely slow, try setting this number to 1. If this significantly improves your token generation speed, then your CPU is being oversaturated and you need to explicitly set this parameter to the number of the physicial CPU cores on your machine (even if you utilize a GPU). If in doubt, start with 1 and double the amount until you hit a performance bottleneck, then scale the number down. +llama accepts a `-t N` (or `--threads N`) parameter. It's extremely important that this parameter is not too large. If your token generation is extremely slow, try setting this number to 1. If this significantly improves your token generation speed, then your CPU is being oversaturated and you need to explicitly set this parameter to the number of the physical CPU cores on your machine (even if you utilize a GPU). If in doubt, start with 1 and double the amount until you hit a performance bottleneck, then scale the number down. # Example of runtime flags effect on inference speed benchmark These runs were tested on the following machine: diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index e16c65f7c..653abc73a 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -12,29 +12,35 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}) if (EMSCRIPTEN) else() - add_subdirectory(main) - add_subdirectory(quantize) - add_subdirectory(quantize-stats) - add_subdirectory(perplexity) - add_subdirectory(embedding) - add_subdirectory(save-load-state) - add_subdirectory(benchmark) add_subdirectory(baby-llama) - add_subdirectory(train-text-from-scratch) - add_subdirectory(finetune) - add_subdirectory(convert-llama2c-to-ggml) - add_subdirectory(simple) add_subdirectory(batched) add_subdirectory(batched-bench) - add_subdirectory(speculative) - add_subdirectory(parallel) - add_subdirectory(embd-input) - add_subdirectory(llava) - add_subdirectory(llama-bench) add_subdirectory(beam-search) - if (LLAMA_METAL) - add_subdirectory(metal) + add_subdirectory(benchmark) + add_subdirectory(convert-llama2c-to-ggml) + add_subdirectory(embedding) + add_subdirectory(finetune) + add_subdirectory(infill) + add_subdirectory(llama-bench) + add_subdirectory(llava) + if (LLAMA_SYCL) + add_subdirectory(sycl) endif() + add_subdirectory(main) + add_subdirectory(tokenize) + add_subdirectory(parallel) + add_subdirectory(perplexity) + add_subdirectory(quantize) + add_subdirectory(quantize-stats) + add_subdirectory(save-load-state) + add_subdirectory(simple) + add_subdirectory(passkey) + add_subdirectory(speculative) + add_subdirectory(lookahead) + add_subdirectory(lookup) + add_subdirectory(gguf) + add_subdirectory(train-text-from-scratch) + add_subdirectory(imatrix) if (LLAMA_BUILD_SERVER) add_subdirectory(server) endif() diff --git a/examples/baby-llama/baby-llama.cpp b/examples/baby-llama/baby-llama.cpp index 8155101d0..bf0125e75 100644 --- a/examples/baby-llama/baby-llama.cpp +++ b/examples/baby-llama/baby-llama.cpp @@ -575,10 +575,7 @@ static struct ggml_tensor * forward( // KQ_scaled = KQ / sqrt(n_embd/n_head) // KQ_scaled shape [n_past + N, N, n_head, 1] - struct ggml_tensor * KQ_scaled = - ggml_scale(ctx0, - KQ, - ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head))); + struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, 1.0f/sqrtf(float(n_embd)/n_head)); // KQ_masked = mask_past(KQ_scaled) // KQ_masked shape [n_past + N, N, n_head, 1] @@ -844,10 +841,7 @@ static struct ggml_tensor * forward_batch( // KQ_scaled = KQ / sqrt(n_embd/n_head) // KQ_scaled shape [n_past + N, N, n_head, n_batch] - struct ggml_tensor * KQ_scaled = - ggml_scale(ctx0, - KQ, - ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head))); + struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, 1.0f/sqrtf(float(n_embd)/n_head)); assert_shape_4d(KQ_scaled, n_past + N, N, n_head, n_batch); // KQ_masked = mask_past(KQ_scaled) @@ -1131,10 +1125,7 @@ static struct ggml_tensor * forward_lora( // KQ_scaled = KQ / sqrt(n_embd/n_head) // KQ_scaled shape [n_past + N, N, n_head, 1] - struct ggml_tensor * KQ_scaled = - ggml_scale(ctx0, - KQ, - ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head))); + struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, 1.0f/sqrtf(float(n_embd)/n_head)); // KQ_masked = mask_past(KQ_scaled) // KQ_masked shape [n_past + N, N, n_head, 1] @@ -1258,9 +1249,9 @@ static struct ggml_tensor * forward_lora( } static void sample_softmax(struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) { - assert(logits->n_dims == 2); - assert(probs->n_dims == 2); - assert(best_samples->n_dims == 1); + assert(ggml_is_matrix(logits)); + assert(ggml_is_matrix(probs)); + assert(ggml_is_vector(best_samples)); assert(logits->ne[1] == best_samples->ne[0]); assert(logits->ne[0] == probs->ne[0]); assert(logits->ne[1] == probs->ne[1]); @@ -1292,9 +1283,9 @@ static void sample_softmax_batch( struct ggml_context * ctx, struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples ) { - GGML_ASSERT(best_samples->n_dims == 2); - GGML_ASSERT(logits->n_dims == 3); - GGML_ASSERT(probs->n_dims == 3); + GGML_ASSERT(ggml_is_matrix(best_samples)); + GGML_ASSERT(ggml_is_3d(logits)); + GGML_ASSERT(ggml_is_3d(probs)); int n_tokens = best_samples->ne[0]; int n_batch = best_samples->ne[1]; int n_vocab = logits->ne[0]; @@ -1334,7 +1325,7 @@ static void print_row(struct ggml_tensor * probs, int i) { } static void print_matrix(struct ggml_tensor * probs) { - assert(probs->n_dims == 2); + assert(ggml_is_matrix(probs)); for (int i = 0; i < probs->ne[1]; ++i) { for (int k = 0; k < probs->ne[0]; ++k) { float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k); @@ -1386,8 +1377,8 @@ static void get_example_targets(int example_id, struct ggml_tensor * tokens_inpu static void get_example_targets_batch( struct ggml_context * ctx, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets ) { - GGML_ASSERT(tokens_input->n_dims == 2); - GGML_ASSERT( targets->n_dims == 3); + GGML_ASSERT(ggml_is_matrix(tokens_input)); + GGML_ASSERT(ggml_is_3d(targets)); int n_tokens = tokens_input->ne[0]; int n_batch = tokens_input->ne[1]; GGML_ASSERT(n_tokens == targets->ne[1]); @@ -1542,27 +1533,28 @@ int main(int argc, char ** argv) { int n_past = 0; - ggml_cgraph gf = {}; + struct ggml_cgraph * gf = NULL; + gf = ggml_new_graph_custom(ctx0, LLAMA_TRAIN_MAX_NODES, true); get_example_targets_batch(ctx0, 64*ex+0, tokens_input, targets); - struct ggml_tensor * logits = forward_batch(&model, &kv_self, ctx0, &gf, tokens_input, n_tokens, n_past, n_batch); + struct ggml_tensor * logits = forward_batch(&model, &kv_self, ctx0, gf, tokens_input, n_tokens, n_past, n_batch); // struct ggml_tensor * e = cross_entropy_loss(ctx0, targets, logits); struct ggml_tensor * e = square_error_loss(ctx0, targets, logits); - ggml_build_forward_expand(&gf, e); - ggml_graph_compute_helper(work_buffer, &gf, /*n_threads*/ 1); + ggml_build_forward_expand(gf, e); + ggml_graph_compute_helper(work_buffer, gf, /*n_threads*/ 1); float error_before_opt = ggml_get_f32_1d(e, 0); - struct ggml_opt_params opt_params_lbfgs = ggml_opt_default_params(GGML_OPT_LBFGS); + struct ggml_opt_params opt_params_lbfgs = ggml_opt_default_params(GGML_OPT_TYPE_LBFGS); opt_params_lbfgs.print_forward_graph = false; opt_params_lbfgs.print_backward_graph = false; opt_params_lbfgs.lbfgs.n_iter = 16; ggml_opt(ctx0, opt_params_lbfgs, e); // - ggml_build_forward_expand(&gf, e); - ggml_graph_compute_helper(work_buffer, &gf, /*n_threads*/ 1); + ggml_build_forward_expand(gf, e); + ggml_graph_compute_helper(work_buffer, gf, /*n_threads*/ 1); float error_after_opt = ggml_get_f32_1d(e, 0); @@ -1609,13 +1601,14 @@ int main(int argc, char ** argv) { }; struct ggml_context * ctx0 = ggml_init(params); - ggml_cgraph gf = {}; + struct ggml_cgraph * gf = NULL; + gf = ggml_new_graph_custom(ctx0, LLAMA_TRAIN_MAX_NODES, true); int n_past = 0; - struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, &gf, tokens_input, sample_ctx, n_past); + struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, gf, tokens_input, sample_ctx, n_past); - ggml_build_forward_expand(&gf, logits); - ggml_graph_compute_helper(work_buffer, &gf, /*n_threads*/ 1); + ggml_build_forward_expand(gf, logits); + ggml_graph_compute_helper(work_buffer, gf, /*n_threads*/ 1); struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx); struct ggml_tensor * probs = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_vocab, sample_ctx); diff --git a/examples/base-translate.sh b/examples/base-translate.sh new file mode 100755 index 000000000..00dedd0df --- /dev/null +++ b/examples/base-translate.sh @@ -0,0 +1,61 @@ +#!/bin/bash +# +# Few-shot translation example. +# Requires a base model (i.e. no fine-tuned or instruct models). +# +# Usage: +# +# cd llama.cpp +# make -j +# +# ./examples/base-translate.sh "" [extra-main-args] +# + +if [ $# -lt 2 ]; then + echo "Usage: ./base-translate.sh \"\" [extra-main-args]" + exit 1 +fi + +eargs="" +if [ $# -gt 2 ]; then + eargs="${@:3}" +fi + +ftmp="__llama.cpp_example_tmp__.txt" +trap "rm -f $ftmp" EXIT + +echo "Translate from English to French: + +=== + +sea otter, peppermint, plush girafe: + +sea otter => loutre de mer +peppermint => menthe poivrée +plush girafe => girafe peluche + +=== + +violin + +violin => violon + +=== + +phone, computer, mouse, keyboard: + +phone => téléphone +computer => ordinateur +mouse => souris +keyboard => clavier + +=== +" > $ftmp + +echo "$2 +" >> $ftmp + +model=$1 + +# generate the most likely continuation until the string "===" is found +./main -m $model -f $ftmp -n 64 --temp 0 --repeat-penalty 1.0 --no-penalize-nl -r "===" $eargs diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp index c552eaa73..b4b8a38e1 100644 --- a/examples/batched-bench/batched-bench.cpp +++ b/examples/batched-bench/batched-bench.cpp @@ -82,13 +82,17 @@ int main(int argc, char ** argv) { // init LLM - llama_backend_init(params.numa); + llama_backend_init(); + llama_numa_init(params.numa); // initialize the model llama_model_params model_params = llama_model_default_params(); + const std::vector t_split(llama_max_devices(), 0.0f); + model_params.n_gpu_layers = n_gpu_layers; + model_params.tensor_split = t_split.data(); llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params); @@ -154,6 +158,10 @@ int main(int argc, char ** argv) { } } + LOG_TEE("\n"); + LOG_TEE("%s: n_kv_max = %d, is_pp_shared = %d, n_gpu_layers = %d, mmq = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, is_pp_shared, n_gpu_layers, mmq, ctx_params.n_threads, ctx_params.n_threads_batch); + LOG_TEE("\n"); + LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s"); LOG_TEE("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------"); @@ -181,7 +189,7 @@ int main(int argc, char ** argv) { const auto t_pp_start = ggml_time_us(); - llama_kv_cache_tokens_rm(ctx, -1, -1); + llama_kv_cache_clear(ctx); if (!decode_helper(ctx, batch, ctx_params.n_batch)) { LOG_TEE("%s: llama_decode() failed\n", __func__); diff --git a/examples/batched.swift/README.md b/examples/batched.swift/README.md index 464c9079c..4c2721fe8 100644 --- a/examples/batched.swift/README.md +++ b/examples/batched.swift/README.md @@ -1,4 +1,4 @@ This is a swift clone of `examples/batched`. $ `make` -$ `./swift MODEL_PATH [PROMPT] [PARALLEL]` +$ `./batched_swift MODEL_PATH [PROMPT] [PARALLEL]` diff --git a/examples/batched.swift/Sources/main.swift b/examples/batched.swift/Sources/main.swift index 772730382..d75c503d5 100644 --- a/examples/batched.swift/Sources/main.swift +++ b/examples/batched.swift/Sources/main.swift @@ -17,7 +17,7 @@ let n_parallel: Int = arguments.count > 3 && Int(arguments[3]) != nil ? Int(argu let n_len: Int = 32 // init LLM -llama_backend_init(false) +llama_backend_init() defer { llama_backend_free() } @@ -153,7 +153,7 @@ while n_cur <= n_len { // const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p); // is it an end of stream? -> mark the stream as finished - if new_token_id == llama_token_eos(context) || n_cur == n_len { + if new_token_id == llama_token_eos(model) || n_cur == n_len { i_batch[i] = -1 // print("") if n_parallel > 1 { @@ -215,9 +215,10 @@ print("decoded \(n_decode) tokens in \(String(format: "%.2f", Double(t_main_end llama_print_timings(context) private func tokenize(text: String, add_bos: Bool) -> [llama_token] { - let n_tokens = text.count + (add_bos ? 1 : 0) + let utf8Count = text.utf8.count + let n_tokens = utf8Count + (add_bos ? 1 : 0) let tokens = UnsafeMutablePointer.allocate(capacity: n_tokens) - let tokenCount = llama_tokenize(model, text, Int32(text.count), tokens, Int32(n_tokens), add_bos, /*special tokens*/ false) + let tokenCount = llama_tokenize(model, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, /*special tokens*/ false) var swiftTokens: [llama_token] = [] for i in 0 ..< tokenCount { swiftTokens.append(tokens[Int(i)]) @@ -230,18 +231,15 @@ private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String var result = [CChar](repeating: 0, count: 8) let nTokens = llama_token_to_piece(model, token, &result, Int32(result.count)) if nTokens < 0 { - if result.count >= -Int(nTokens) { - result.removeLast(-Int(nTokens)) - } else { - result.removeAll() - } + let actualTokensCount = -Int(nTokens) + result = .init(repeating: 0, count: actualTokensCount) let check = llama_token_to_piece( model, token, &result, Int32(result.count) ) - assert(check == nTokens) + assert(check == actualTokensCount) } else { result.removeLast(result.count - Int(nTokens)) } @@ -259,5 +257,4 @@ private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String buffer = [] return bufferString } - return nil } diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp index 155212165..9be7eb56b 100644 --- a/examples/batched/batched.cpp +++ b/examples/batched/batched.cpp @@ -11,12 +11,19 @@ int main(int argc, char ** argv) { gpt_params params; if (argc == 1 || argv[1][0] == '-') { - printf("usage: %s MODEL_PATH [PROMPT] [PARALLEL]\n" , argv[0]); + printf("usage: %s MODEL_PATH [PROMPT] [PARALLEL] [LEN] [NGL]\n" , argv[0]); return 1 ; } + // number of parallel batches int n_parallel = 1; + // total length of the sequences including the prompt + int n_len = 32; + + // number of layers to offload to the GPU + int n_gpu_layers = 0; + if (argc >= 2) { params.model = argv[1]; } @@ -29,22 +36,28 @@ int main(int argc, char ** argv) { n_parallel = std::atoi(argv[3]); } + if (argc >= 5) { + n_len = std::atoi(argv[4]); + } + + if (argc >= 6) { + n_gpu_layers = std::atoi(argv[5]); + } + if (params.prompt.empty()) { params.prompt = "Hello my name is"; } - // total length of the sequences including the prompt - const int n_len = 32; - // init LLM - llama_backend_init(params.numa); + llama_backend_init(); + llama_numa_init(params.numa); // initialize the model llama_model_params model_params = llama_model_default_params(); - // model_params.n_gpu_layers = 99; // offload all layers to the GPU + model_params.n_gpu_layers = n_gpu_layers; llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params); @@ -57,6 +70,7 @@ int main(int argc, char ** argv) { std::vector tokens_list; tokens_list = ::llama_tokenize(model, params.prompt, true); + const int n_kv_req = tokens_list.size() + (n_len - tokens_list.size())*n_parallel; // initialize the context @@ -78,7 +92,7 @@ int main(int argc, char ** argv) { const int n_ctx = llama_n_ctx(ctx); - LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_batch = %d, n_parallel = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req); + LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req); // make sure the KV cache is big enough to hold all the prompt and generated tokens if (n_kv_req > n_ctx) { @@ -175,7 +189,7 @@ int main(int argc, char ** argv) { //const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p); // is it an end of stream? -> mark the stream as finished - if (new_token_id == llama_token_eos(ctx) || n_cur == n_len) { + if (new_token_id == llama_token_eos(model) || n_cur == n_len) { i_batch[i] = -1; LOG_TEE("\n"); if (n_parallel > 1) { diff --git a/examples/beam-search/beam-search.cpp b/examples/beam-search/beam-search.cpp index f078ab8a8..866c6d7a6 100644 --- a/examples/beam-search/beam-search.cpp +++ b/examples/beam-search/beam-search.cpp @@ -47,7 +47,7 @@ struct beam_search_callback_data { // In this case, end-of-beam (eob) is equivalent to end-of-sentence (eos) but this need not always be the same. // For example, eob can be flagged due to maximum token length, stop words, etc. static bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, size_t n_tokens) { - return n_tokens && tokens[n_tokens-1] == llama_token_eos(callback_data.ctx); + return n_tokens && tokens[n_tokens-1] == llama_token_eos(llama_get_model(callback_data.ctx)); } // Function matching type llama_beam_search_callback_fn_t. @@ -119,7 +119,8 @@ int main(int argc, char ** argv) // Init LLM : //--------------------------------- - llama_backend_init(params.numa); + llama_backend_init(); + llama_numa_init(params.numa); llama_model * model; llama_context * ctx; diff --git a/examples/benchmark/CMakeLists.txt b/examples/benchmark/CMakeLists.txt index 14916d831..2bb47bab5 100644 --- a/examples/benchmark/CMakeLists.txt +++ b/examples/benchmark/CMakeLists.txt @@ -1,9 +1,6 @@ set(TARGET benchmark) add_executable(${TARGET} benchmark-matmult.cpp) install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT}) target_include_directories(${TARGET} PRIVATE ../../common) target_compile_features(${TARGET} PRIVATE cxx_std_11) -if(TARGET BUILD_INFO) - add_dependencies(${TARGET} BUILD_INFO) -endif() diff --git a/examples/benchmark/benchmark-matmult.cpp b/examples/benchmark/benchmark-matmult.cpp index f1c382aa9..e89f3de2f 100644 --- a/examples/benchmark/benchmark-matmult.cpp +++ b/examples/benchmark/benchmark-matmult.cpp @@ -1,4 +1,3 @@ -#include "build-info.h" #include "common.h" #include "ggml.h" @@ -130,13 +129,13 @@ int main(int argc, char ** argv) { const ggml_type qtype = GGML_TYPE_Q4_1; size_t ctx_size = 0; - ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); - ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); - ctx_size += sizex*sizez*ggml_type_sizef(GGML_TYPE_F32); - ctx_size += sizex*sizey*ggml_type_sizef(qtype); - ctx_size += sizex*sizey*ggml_type_sizef(qtype); - ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS - ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS + ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); + ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); + ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizez); + ctx_size += ggml_row_size(qtype, sizex*sizey); + ctx_size += ggml_row_size(qtype, sizex*sizey); + ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); // BLAS + ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); // BLAS ctx_size += 1024*1024*16; printf("Allocating Memory of size %zi bytes, %zi MB\n",ctx_size, (ctx_size/1024/1024)); @@ -172,7 +171,8 @@ int main(int argc, char ** argv) { struct ggml_tensor * m11xm2 = ggml_mul_mat(ctx, m11, m2); // printf("Creating compute graph\n"); - struct ggml_cgraph gf = ggml_build_forward(m11xm2); + struct ggml_cgraph * gf = ggml_new_graph(ctx); + ggml_build_forward_expand(gf, m11xm2); printf("n_threads=%i\n", benchmark_params.n_threads); @@ -181,9 +181,9 @@ int main(int argc, char ** argv) { std::vector work_buffer; - ggml_graph_compute_helper(work_buffer, &gf, benchmark_params.n_threads); + ggml_graph_compute_helper(work_buffer, gf, benchmark_params.n_threads); - TENSOR_DUMP(gf.nodes[0]); + TENSOR_DUMP(gf->nodes[0]); printf("\n------ Test 2 - Matrix Mult via %s code\n", ggml_type_name(qtype)); @@ -194,25 +194,27 @@ int main(int argc, char ** argv) { // Set up a the benchmark matrices // printf("Creating new tensor q11 & Running quantize\n"); struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey); - ggml_quantize_chunk(qtype, (const float *) m11->data, q11->data, 0, nelements, hist_cur.data()); + ggml_quantize_chunk(qtype, (const float *) m11->data, q11->data, 0, nelements/m11->ne[0], m11->ne[0], hist_cur.data(), nullptr); // Set up a the compute graph // printf("Creating new tensor q31\n"); struct ggml_tensor * q31 = ggml_mul_mat(ctx, q11, m2); // printf("Creating compute graph\n"); - struct ggml_cgraph gf31 = ggml_build_forward(q31); + struct ggml_cgraph * gf31 = ggml_new_graph(ctx); + ggml_build_forward_expand(gf31, q31); // Set up a second graph computation to make sure we override the CPU cache lines // printf("Creating new tensor q12 & Running quantize\n"); struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey); - ggml_quantize_chunk(qtype, (const float *) m12->data, q12->data, 0, nelements, hist_cur.data()); + ggml_quantize_chunk(qtype, (const float *) m12->data, q12->data, 0, nelements/m12->ne[0], m12->ne[0], hist_cur.data(), nullptr); // printf("Creating new tensor q32\n"); struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2); //printf("Creating compute graph\n"); - struct ggml_cgraph gf32 = ggml_build_forward(q32); + struct ggml_cgraph * gf32 = ggml_new_graph(ctx); + ggml_build_forward_expand(gf32, q32); printf("n_threads=%i\n", benchmark_params.n_threads); const int dimx = sizex; @@ -224,7 +226,7 @@ int main(int argc, char ** argv) { // Let's use the F32 result from above as a reference for the quantized multiplication - float sum_of_F32_reference = tensor_sum_elements(gf.nodes[0]); + float sum_of_F32_reference = tensor_sum_elements(gf->nodes[0]); printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; gigaFLOPS\n"); printf("=====================================================================================\n"); @@ -234,7 +236,7 @@ int main(int argc, char ** argv) { long long int start = ggml_time_us(); //printf("Running ggml_graph_compute\n"); - ggml_graph_compute_helper(work_buffer, &gf31, benchmark_params.n_threads); + ggml_graph_compute_helper(work_buffer, gf31, benchmark_params.n_threads); long long int stop = ggml_time_us(); long long int usec = stop-start; @@ -252,7 +254,7 @@ int main(int argc, char ** argv) { // Check that the matrix multiplication result is in the right ballpark // We cannot use the exact value from the F32 multiplication because the quantizuation will be slightly different - float sum_of_Q4_result = tensor_sum_elements(gf31.nodes[0]); + float sum_of_Q4_result = tensor_sum_elements(gf31->nodes[0]); float delta = std::abs(sum_of_Q4_result - sum_of_F32_reference); float allowed_delta = (sum_of_F32_reference) / 1000 / 1000; // Let's accept an epsilon of 10^-6 @@ -267,7 +269,7 @@ int main(int argc, char ** argv) { } // Running a different graph computation to make sure we override the CPU cache lines - ggml_graph_compute_helper(work_buffer, &gf32, benchmark_params.n_threads); + ggml_graph_compute_helper(work_buffer, gf32, benchmark_params.n_threads); } printf("\n"); printf("Average%78.2f\n",gflops_sum/((double)benchmark_params.n_iterations)); diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp index c291f0adf..8209dcb64 100644 --- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp @@ -325,14 +325,14 @@ struct train_params { }; static void print_params(struct my_llama_hparams * params) { - printf("%s: n_vocab: %d\n", __func__, params->n_vocab); - printf("%s: n_ctx: %d\n", __func__, params->n_ctx); - printf("%s: n_embd: %d\n", __func__, params->n_embd); - printf("%s: n_mult: %d\n", __func__, params->n_mult); - printf("%s: n_head: %d\n", __func__, params->n_head); - printf("%s: n_ff: %d\n", __func__, params->n_ff); - printf("%s: n_layer: %d\n", __func__, params->n_layer); - printf("%s: n_rot: %d\n", __func__, params->n_rot); + printf("%s: n_vocab: %u\n", __func__, params->n_vocab); + printf("%s: n_ctx: %u\n", __func__, params->n_ctx); + printf("%s: n_embd: %u\n", __func__, params->n_embd); + printf("%s: n_mult: %u\n", __func__, params->n_mult); + printf("%s: n_head: %u\n", __func__, params->n_head); + printf("%s: n_ff: %u\n", __func__, params->n_ff); + printf("%s: n_layer: %u\n", __func__, params->n_layer); + printf("%s: n_rot: %u\n", __func__, params->n_rot); } static void init_model(struct my_llama_model * model) { @@ -350,25 +350,25 @@ static void init_model(struct my_llama_model * model) { model->train_tokens = 0; model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); - printf("[%s:GG] Allocating [%d] x [%d] = [%d] float space for model->tok_embeddings\n",__func__,n_embd , n_vocab, n_embd * n_vocab); + printf("[%s:GG] Allocating [%u] x [%u] = [%u] float space for model->tok_embeddings\n",__func__,n_embd , n_vocab, n_embd * n_vocab); model->norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - printf("[%s:GG] Allocating [%d] float space for model->norm\n",__func__,n_embd); + printf("[%s:GG] Allocating [%u] float space for model->norm\n",__func__,n_embd); model->output = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); - printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for model->output\n",__func__,n_embd, n_vocab, n_embd * n_vocab); + printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for model->output\n",__func__,n_embd, n_vocab, n_embd * n_vocab); // printing the per-layer allocations here so we dont print in the for loop. - printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wq for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer); - printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wk for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer); - printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wv for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer); - printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wo for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer); + printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.wq for [%u] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer); + printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.wk for [%u] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer); + printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.wv for [%u] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer); + printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.wo for [%u] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer); - printf("[%s:GG] Allocating [%d] float space for layer.ffn_norm for [%d] layers\n",__func__,n_embd, n_layer); + printf("[%s:GG] Allocating [%u] float space for layer.ffn_norm for [%u] layers\n",__func__,n_embd, n_layer); - printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.w1 for [%d] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer); - printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.w2 for [%d] layers\n",__func__, n_embd, n_ff, n_ff * n_embd, n_layer); - printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.w3 for [%d] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer); + printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.w1 for [%u] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer); + printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.w2 for [%u] layers\n",__func__, n_embd, n_ff, n_ff * n_embd, n_layer); + printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.w3 for [%u] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer); ggml_set_name(model->tok_embeddings, "tok_embeddings.weight"); ggml_set_name(model->norm, "norm.weight"); @@ -427,7 +427,7 @@ static void print_row(struct ggml_tensor * probs, int i) { } static void print_matrix(struct ggml_tensor * probs) { - assert(probs->n_dims == 2); + assert(ggml_is_matrix(probs)); for (int i = 0; i < probs->ne[1]; ++i) { for (int k = 0; k < probs->ne[0]; ++k) { float p = get_f32_2d(probs, k, i); @@ -536,7 +536,7 @@ static bool is_ggml_file(const char * filename) { if (file.size < 4) { return false; } - uint32_t magic = file.read_u32(); + std::string magic = file.read_string(4); return magic == GGUF_MAGIC; } @@ -639,7 +639,7 @@ static void load_vocab(const char *filename, Config *config, struct llama_vocab static void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const float * karpathy_weights) { int ct; - switch (gg_weights->n_dims){ + switch (ggml_n_dims(gg_weights)) { case 1: ct = 0; for (int i0 = 0; i0 < gg_weights->ne[0]; i0++){ diff --git a/examples/embd-input/.gitignore b/examples/embd-input/.gitignore deleted file mode 100644 index 87ef68771..000000000 --- a/examples/embd-input/.gitignore +++ /dev/null @@ -1,4 +0,0 @@ -PandaGPT -MiniGPT-4 -*.pth - diff --git a/examples/embd-input/CMakeLists.txt b/examples/embd-input/CMakeLists.txt deleted file mode 100644 index 5bbb1ea02..000000000 --- a/examples/embd-input/CMakeLists.txt +++ /dev/null @@ -1,17 +0,0 @@ -set(TARGET embdinput) -add_library(${TARGET} embd-input-lib.cpp embd-input.h) -install(TARGETS ${TARGET} LIBRARY) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) -if(TARGET BUILD_INFO) - add_dependencies(${TARGET} BUILD_INFO) -endif() - -set(TARGET embd-input-test) -add_executable(${TARGET} embd-input-test.cpp) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama embdinput ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) -if(TARGET BUILD_INFO) - add_dependencies(${TARGET} BUILD_INFO) -endif() diff --git a/examples/embd-input/README.md b/examples/embd-input/README.md deleted file mode 100644 index 5c4c75ea7..000000000 --- a/examples/embd-input/README.md +++ /dev/null @@ -1,63 +0,0 @@ -### Examples for input embedding directly - -## Requirement -build `libembdinput.so` -run the following comman in main dir (../../). -``` -make -``` - -## [LLaVA](https://github.com/haotian-liu/LLaVA/) example (llava.py) - -1. Obtian LLaVA model (following https://github.com/haotian-liu/LLaVA/ , use https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/). -2. Convert it to ggml format. -3. `llava_projection.pth` is [pytorch_model-00003-of-00003.bin](https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/blob/main/pytorch_model-00003-of-00003.bin). - -``` -import torch - -bin_path = "../LLaVA-13b-delta-v1-1/pytorch_model-00003-of-00003.bin" -pth_path = "./examples/embd-input/llava_projection.pth" - -dic = torch.load(bin_path) -used_key = ["model.mm_projector.weight","model.mm_projector.bias"] -torch.save({k: dic[k] for k in used_key}, pth_path) -``` -4. Check the path of LLaVA model and `llava_projection.pth` in `llava.py`. - - -## [PandaGPT](https://github.com/yxuansu/PandaGPT) example (panda_gpt.py) - -1. Obtian PandaGPT lora model from https://github.com/yxuansu/PandaGPT. Rename the file to `adapter_model.bin`. Use [convert-lora-to-ggml.py](../../convert-lora-to-ggml.py) to convert it to ggml format. -The `adapter_config.json` is -``` -{ - "peft_type": "LORA", - "fan_in_fan_out": false, - "bias": null, - "modules_to_save": null, - "r": 32, - "lora_alpha": 32, - "lora_dropout": 0.1, - "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj"] -} -``` -2. Papare the `vicuna` v0 model. -3. Obtain the [ImageBind](https://dl.fbaipublicfiles.com/imagebind/imagebind_huge.pth) model. -4. Clone the PandaGPT source. -``` -git clone https://github.com/yxuansu/PandaGPT -``` -5. Install the requirement of PandaGPT. -6. Check the path of PandaGPT source, ImageBind model, lora model and vicuna model in panda_gpt.py. - -## [MiniGPT-4](https://github.com/Vision-CAIR/MiniGPT-4/) example (minigpt4.py) - -1. Obtain MiniGPT-4 model from https://github.com/Vision-CAIR/MiniGPT-4/ and put it in `embd-input`. -2. Clone the MiniGPT-4 source. -``` -git clone https://github.com/Vision-CAIR/MiniGPT-4/ -``` -3. Install the requirement of PandaGPT. -4. Papare the `vicuna` v0 model. -5. Check the path of MiniGPT-4 source, MiniGPT-4 model and vicuna model in `minigpt4.py`. diff --git a/examples/embd-input/embd-input-lib.cpp b/examples/embd-input/embd-input-lib.cpp deleted file mode 100644 index 3ce33842c..000000000 --- a/examples/embd-input/embd-input-lib.cpp +++ /dev/null @@ -1,221 +0,0 @@ -#include "build-info.h" -#include "common.h" -#include "embd-input.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static llama_context ** g_ctx; - -extern "C" { - -struct MyModel* create_mymodel(int argc, char ** argv) { - gpt_params params; - - if (!gpt_params_parse(argc, argv, params)) { - return nullptr; - } - - print_build_info(); - - if (params.seed == LLAMA_DEFAULT_SEED) { - params.seed = uint32_t(time(NULL)); - } - fprintf(stderr, "%s: seed = %d\n", __func__, params.seed); - - llama_backend_init(params.numa); - - llama_model * model; - llama_context * ctx; - - g_ctx = &ctx; - - // load the model and apply lora adapter, if any - std::tie(model, ctx) = llama_init_from_gpt_params(params); - if (model == NULL) { - fprintf(stderr, "%s: error: unable to load model\n", __func__); - return nullptr; - } - - // print system information - { - fprintf(stderr, "\n"); - fprintf(stderr, "%s\n", get_system_info(params).c_str()); - } - struct MyModel * ret = new MyModel(); - ret->ctx = ctx; - ret->params = params; - ret->n_past = 0; - // printf("ctx: %d\n", ret->ctx); - return ret; -} - -void free_mymodel(struct MyModel * mymodel) { - llama_context * ctx = mymodel->ctx; - llama_print_timings(ctx); - llama_free(ctx); - delete mymodel; -} - - -bool eval_float(void * model, float * input, int N){ - MyModel * mymodel = (MyModel*)model; - llama_context * ctx = mymodel->ctx; - gpt_params params = mymodel->params; - int n_emb = llama_n_embd(llama_get_model(ctx)); - int n_past = mymodel->n_past; - int n_batch = N; // params.n_batch; - - for (int i = 0; i < (int) N; i += n_batch) { - int n_eval = (int) N - i; - if (n_eval > n_batch) { - n_eval = n_batch; - } - llama_batch batch = { int32_t(n_eval), nullptr, (input+i*n_emb), nullptr, nullptr, nullptr, nullptr, n_past, 1, 0, }; - if (llama_decode(ctx, batch)) { - fprintf(stderr, "%s : failed to eval\n", __func__); - return false; - } - n_past += n_eval; - } - mymodel->n_past = n_past; - return true; -} - -bool eval_tokens(void * model, std::vector tokens) { - MyModel * mymodel = (MyModel* )model; - llama_context * ctx; - ctx = mymodel->ctx; - gpt_params params = mymodel->params; - int n_past = mymodel->n_past; - for (int i = 0; i < (int) tokens.size(); i += params.n_batch) { - int n_eval = (int) tokens.size() - i; - if (n_eval > params.n_batch) { - n_eval = params.n_batch; - } - if (llama_decode(ctx, llama_batch_get_one(&tokens[i], n_eval, n_past, 0))) { - fprintf(stderr, "%s : failed to eval\n", __func__); - return false; - } - n_past += n_eval; - } - mymodel->n_past = n_past; - return true; -} - -bool eval_id(struct MyModel* mymodel, int id) { - std::vector tokens; - tokens.push_back(id); - return eval_tokens(mymodel, tokens); -} - -bool eval_string(struct MyModel * mymodel,const char* str){ - llama_context * ctx = mymodel->ctx; - std::string str2 = str; - std::vector embd_inp = ::llama_tokenize(ctx, str2, true); - eval_tokens(mymodel, embd_inp); - return true; -} - -llama_token sampling_id(struct MyModel* mymodel) { - llama_context* ctx = mymodel->ctx; - gpt_params params = mymodel->params; - llama_sampling_params & sparams = params.sampling_params; - // int n_ctx = llama_n_ctx(ctx); - - // out of user input, sample next token - const float temp = sparams.temp; - const int32_t top_k = sparams.top_k <= 0 ? llama_n_vocab(llama_get_model(ctx)) : sparams.top_k; - const float top_p = sparams.top_p; - const float tfs_z = sparams.tfs_z; - const float typical_p = sparams.typical_p; - // const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n; - // const float repeat_penalty = params.repeat_penalty; - // const float alpha_presence = params.presence_penalty; - // const float alpha_frequency = params.frequency_penalty; - const int mirostat = sparams.mirostat; - const float mirostat_tau = sparams.mirostat_tau; - const float mirostat_eta = sparams.mirostat_eta; - // const bool penalize_nl = params.penalize_nl; - - llama_token id = 0; - { - auto logits = llama_get_logits(ctx); - auto n_vocab = llama_n_vocab(llama_get_model(ctx)); - - // Apply params.logit_bias map - for (auto it = sparams.logit_bias.begin(); it != sparams.logit_bias.end(); it++) { - logits[it->first] += it->second; - } - - std::vector candidates; - candidates.reserve(n_vocab); - for (llama_token token_id = 0; token_id < n_vocab; token_id++) { - candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f}); - } - - llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; - - // TODO: Apply penalties - // float nl_logit = logits[llama_token_nl(ctx)]; - // auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx); - // llama_sample_repetition_penalty(ctx, &candidates_p, - // last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, - // last_n_repeat, repeat_penalty); - // llama_sample_frequency_and_presence_penalties(ctx, &candidates_p, - // last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, - // last_n_repeat, alpha_frequency, alpha_presence); - // if (!penalize_nl) { - // logits[llama_token_nl(ctx)] = nl_logit; - // } - - if (temp <= 0) { - // Greedy sampling - id = llama_sample_token_greedy(ctx, &candidates_p); - } else { - if (mirostat == 1) { - static float mirostat_mu = 2.0f * mirostat_tau; - const int mirostat_m = 100; - llama_sample_temp(ctx, &candidates_p, temp); - id = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu); - } else if (mirostat == 2) { - static float mirostat_mu = 2.0f * mirostat_tau; - llama_sample_temp(ctx, &candidates_p, temp); - id = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu); - } else { - // Temperature sampling - llama_sample_top_k(ctx, &candidates_p, top_k, 1); - llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1); - llama_sample_typical(ctx, &candidates_p, typical_p, 1); - llama_sample_top_p(ctx, &candidates_p, top_p, 1); - llama_sample_temp(ctx, &candidates_p, temp); - id = llama_sample_token(ctx, &candidates_p); - } - } - } - - return id; -} - -const char * sampling(struct MyModel * mymodel) { - llama_context * ctx = mymodel->ctx; - int id = sampling_id(mymodel); - static std::string ret; - if (id == llama_token_eos(ctx)) { - ret = ""; - } else { - ret = llama_token_to_piece(ctx, id); - } - eval_id(mymodel, id); - return ret.c_str(); -} - -} diff --git a/examples/embd-input/embd-input-test.cpp b/examples/embd-input/embd-input-test.cpp deleted file mode 100644 index dc4a0e488..000000000 --- a/examples/embd-input/embd-input-test.cpp +++ /dev/null @@ -1,35 +0,0 @@ -#include "embd-input.h" -#include -#include -#include - -int main(int argc, char** argv) { - - auto mymodel = create_mymodel(argc, argv); - int N = 10; - int max_tgt_len = 500; - int n_embd = llama_n_embd(llama_get_model(mymodel->ctx)); - - // add random float embd to test evaluation - float * data = new float[N*n_embd]; - std::default_random_engine e; - std::uniform_real_distribution u(0,1); - for (int i=0;iparams.prompt.c_str()); - const char* tmp; - for (int i=0; i")==0) break; - printf("%s", tmp); - fflush(stdout); - } - printf("\n"); - free_mymodel(mymodel); - return 0; -} diff --git a/examples/embd-input/embd-input.h b/examples/embd-input/embd-input.h deleted file mode 100644 index eff5e3b84..000000000 --- a/examples/embd-input/embd-input.h +++ /dev/null @@ -1,27 +0,0 @@ -#ifndef _EMBD_INPUT_H_ -#define _EMBD_INPUT_H_ 1 - -#include "common.h" -#include "llama.h" - -extern "C" { - -typedef struct MyModel { - llama_context* ctx; - gpt_params params; - int n_past = 0; -} MyModel; - -struct MyModel* create_mymodel(int argc, char ** argv); - -bool eval_float(void* model, float* input, int N); -bool eval_tokens(void* model, std::vector tokens); -bool eval_id(struct MyModel* mymodel, int id); -bool eval_string(struct MyModel* mymodel, const char* str); -const char * sampling(struct MyModel* mymodel); -llama_token sampling_id(struct MyModel* mymodel); -void free_mymodel(struct MyModel* mymodel); - -} - -#endif diff --git a/examples/embd-input/embd_input.py b/examples/embd-input/embd_input.py deleted file mode 100755 index f146acdc1..000000000 --- a/examples/embd-input/embd_input.py +++ /dev/null @@ -1,72 +0,0 @@ -#!/usr/bin/env python3 -import ctypes -from ctypes import cdll, c_char_p, c_void_p, POINTER, c_float, c_int -import numpy as np -import os - -libc = cdll.LoadLibrary("./libembdinput.so") -libc.sampling.restype=c_char_p -libc.create_mymodel.restype=c_void_p -libc.eval_string.argtypes=[c_void_p, c_char_p] -libc.sampling.argtypes=[c_void_p] -libc.eval_float.argtypes=[c_void_p, POINTER(c_float), c_int] - - -class MyModel: - def __init__(self, args): - argc = len(args) - c_str = [c_char_p(i.encode()) for i in args] - args_c = (c_char_p * argc)(*c_str) - self.model = c_void_p(libc.create_mymodel(argc, args_c)) - self.max_tgt_len = 512 - self.print_string_eval = True - - def __del__(self): - libc.free_mymodel(self.model) - - def eval_float(self, x): - libc.eval_float(self.model, x.astype(np.float32).ctypes.data_as(POINTER(c_float)), x.shape[1]) - - def eval_string(self, x): - libc.eval_string(self.model, x.encode()) # c_char_p(x.encode())) - if self.print_string_eval: - print(x) - - def eval_token(self, x): - libc.eval_id(self.model, x) - - def sampling(self): - s = libc.sampling(self.model) - return s - - def stream_generate(self, end=""): - ret = b"" - end = end.encode() - for _ in range(self.max_tgt_len): - tmp = self.sampling() - ret += tmp - yield tmp - if ret.endswith(end): - break - - def generate_with_print(self, end=""): - ret = b"" - for i in self.stream_generate(end=end): - ret += i - print(i.decode(errors="replace"), end="", flush=True) - print("") - return ret.decode(errors="replace") - - - def generate(self, end=""): - text = b"".join(self.stream_generate(end=end)) - return text.decode(errors="replace") - -if __name__ == "__main__": - model = MyModel(["main", "--model", "../llama.cpp/models/ggml-vic13b-q4_1.bin", "-c", "2048"]) - model.eval_string("""user: what is the color of the flag of UN?""") - x = np.random.random((5120,10))# , dtype=np.float32) - model.eval_float(x) - model.eval_string("""assistant:""") - for i in model.generate(): - print(i.decode(errors="replace"), end="", flush=True) diff --git a/examples/embd-input/llava.py b/examples/embd-input/llava.py deleted file mode 100755 index 06fad55f4..000000000 --- a/examples/embd-input/llava.py +++ /dev/null @@ -1,71 +0,0 @@ -#!/usr/bin/env python3 -import sys -import os -sys.path.insert(0, os.path.dirname(__file__)) -from embd_input import MyModel -import numpy as np -from torch import nn -import torch -from transformers import CLIPVisionModel, CLIPImageProcessor -from PIL import Image - -# model parameters from 'liuhaotian/LLaVA-13b-delta-v1-1' -vision_tower = "openai/clip-vit-large-patch14" -select_hidden_state_layer = -2 -# (vision_config.image_size // vision_config.patch_size) ** 2 -image_token_len = (224//14)**2 - -class Llava: - def __init__(self, args): - self.image_processor = CLIPImageProcessor.from_pretrained(vision_tower) - self.vision_tower = CLIPVisionModel.from_pretrained(vision_tower) - self.mm_projector = nn.Linear(1024, 5120) - self.model = MyModel(["main", *args]) - - def load_projection(self, path): - state = torch.load(path) - self.mm_projector.load_state_dict({ - "weight": state["model.mm_projector.weight"], - "bias": state["model.mm_projector.bias"]}) - - def chat(self, question): - self.model.eval_string("user: ") - self.model.eval_string(question) - self.model.eval_string("\nassistant: ") - return self.model.generate_with_print() - - def chat_with_image(self, image, question): - with torch.no_grad(): - embd_image = self.image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0] - image_forward_out = self.vision_tower(embd_image.unsqueeze(0), output_hidden_states=True) - select_hidden_state = image_forward_out.hidden_states[select_hidden_state_layer] - image_feature = select_hidden_state[:, 1:] - embd_image = self.mm_projector(image_feature) - embd_image = embd_image.cpu().numpy()[0] - self.model.eval_string("user: ") - self.model.eval_token(32003-2) # im_start - self.model.eval_float(embd_image.T) - for i in range(image_token_len-embd_image.shape[0]): - self.model.eval_token(32003-3) # im_patch - self.model.eval_token(32003-1) # im_end - self.model.eval_string(question) - self.model.eval_string("\nassistant: ") - return self.model.generate_with_print() - - -if __name__=="__main__": - # model form liuhaotian/LLaVA-13b-delta-v1-1 - a = Llava(["--model", "./models/ggml-llava-13b-v1.1.bin", "-c", "2048"]) - # Extract from https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/blob/main/pytorch_model-00003-of-00003.bin. - # Also here can use pytorch_model-00003-of-00003.bin directly. - a.load_projection(os.path.join( - os.path.dirname(__file__) , - "llava_projection.pth")) - respose = a.chat_with_image( - Image.open("./media/llama1-logo.png").convert('RGB'), - "what is the text in the picture?") - respose - a.chat("what is the color of it?") - - - diff --git a/examples/embd-input/minigpt4.py b/examples/embd-input/minigpt4.py deleted file mode 100755 index 7b13e4a5c..000000000 --- a/examples/embd-input/minigpt4.py +++ /dev/null @@ -1,129 +0,0 @@ -#!/usr/bin/env python3 -import sys -import os -sys.path.insert(0, os.path.dirname(__file__)) -from embd_input import MyModel -import numpy as np -from torch import nn -import torch -from PIL import Image - -minigpt4_path = os.path.join(os.path.dirname(__file__), "MiniGPT-4") -sys.path.insert(0, minigpt4_path) -from minigpt4.models.blip2 import Blip2Base -from minigpt4.processors.blip_processors import Blip2ImageEvalProcessor - - -class MiniGPT4(Blip2Base): - """ - MiniGPT4 model from https://github.com/Vision-CAIR/MiniGPT-4 - """ - def __init__(self, - args, - vit_model="eva_clip_g", - q_former_model="https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xxl.pth", - img_size=224, - drop_path_rate=0, - use_grad_checkpoint=False, - vit_precision="fp32", - freeze_vit=True, - freeze_qformer=True, - num_query_token=32, - llama_model="", - prompt_path="", - prompt_template="", - max_txt_len=32, - end_sym='\n', - low_resource=False, # use 8 bit and put vit in cpu - device_8bit=0 - ): - super().__init__() - self.img_size = img_size - self.low_resource = low_resource - self.preprocessor = Blip2ImageEvalProcessor(img_size) - - print('Loading VIT') - self.visual_encoder, self.ln_vision = self.init_vision_encoder( - vit_model, img_size, drop_path_rate, use_grad_checkpoint, vit_precision - ) - print('Loading VIT Done') - print('Loading Q-Former') - self.Qformer, self.query_tokens = self.init_Qformer( - num_query_token, self.visual_encoder.num_features - ) - self.Qformer.cls = None - self.Qformer.bert.embeddings.word_embeddings = None - self.Qformer.bert.embeddings.position_embeddings = None - for layer in self.Qformer.bert.encoder.layer: - layer.output = None - layer.intermediate = None - self.load_from_pretrained(url_or_filename=q_former_model) - print('Loading Q-Former Done') - self.llama_proj = nn.Linear( - self.Qformer.config.hidden_size, 5120 # self.llama_model.config.hidden_size - ) - self.max_txt_len = max_txt_len - self.end_sym = end_sym - self.model = MyModel(["main", *args]) - # system prompt - self.model.eval_string("Give the following image: ImageContent. " - "You will be able to see the image once I provide it to you. Please answer my questions." - "###") - - def encode_img(self, image): - image = self.preprocessor(image) - image = image.unsqueeze(0) - device = image.device - if self.low_resource: - self.vit_to_cpu() - image = image.to("cpu") - - with self.maybe_autocast(): - image_embeds = self.ln_vision(self.visual_encoder(image)).to(device) - image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(device) - - query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1) - query_output = self.Qformer.bert( - query_embeds=query_tokens, - encoder_hidden_states=image_embeds, - encoder_attention_mask=image_atts, - return_dict=True, - ) - - inputs_llama = self.llama_proj(query_output.last_hidden_state) - # atts_llama = torch.ones(inputs_llama.size()[:-1], dtype=torch.long).to(image.device) - return inputs_llama - - def load_projection(self, path): - state = torch.load(path)["model"] - self.llama_proj.load_state_dict({ - "weight": state["llama_proj.weight"], - "bias": state["llama_proj.bias"]}) - - def chat(self, question): - self.model.eval_string("Human: ") - self.model.eval_string(question) - self.model.eval_string("\n### Assistant:") - return self.model.generate_with_print(end="###") - - def chat_with_image(self, image, question): - with torch.no_grad(): - embd_image = self.encode_img(image) - embd_image = embd_image.cpu().numpy()[0] - self.model.eval_string("Human: ") - self.model.eval_float(embd_image.T) - self.model.eval_string(" ") - self.model.eval_string(question) - self.model.eval_string("\n### Assistant:") - return self.model.generate_with_print(end="###") - - -if __name__=="__main__": - a = MiniGPT4(["--model", "./models/ggml-vicuna-13b-v0-q4_1.bin", "-c", "2048"]) - a.load_projection(os.path.join( - os.path.dirname(__file__) , - "pretrained_minigpt4.pth")) - respose = a.chat_with_image( - Image.open("./media/llama1-logo.png").convert('RGB'), - "what is the text in the picture?") - a.chat("what is the color of it?") diff --git a/examples/embd-input/panda_gpt.py b/examples/embd-input/panda_gpt.py deleted file mode 100755 index 891ad7cc9..000000000 --- a/examples/embd-input/panda_gpt.py +++ /dev/null @@ -1,99 +0,0 @@ -#!/usr/bin/env python3 -import sys -import os -sys.path.insert(0, os.path.dirname(__file__)) -from embd_input import MyModel -import numpy as np -from torch import nn -import torch - -# use PandaGPT path -panda_gpt_path = os.path.join(os.path.dirname(__file__), "PandaGPT") -imagebind_ckpt_path = "./models/panda_gpt/" - -sys.path.insert(0, os.path.join(panda_gpt_path,"code","model")) -from ImageBind.models import imagebind_model -from ImageBind import data - -ModalityType = imagebind_model.ModalityType -max_tgt_len = 400 - -class PandaGPT: - def __init__(self, args): - self.visual_encoder,_ = imagebind_model.imagebind_huge(pretrained=True, store_path=imagebind_ckpt_path) - self.visual_encoder.eval() - self.llama_proj = nn.Linear(1024, 5120) # self.visual_hidden_size, 5120) - self.max_tgt_len = max_tgt_len - self.model = MyModel(["main", *args]) - self.generated_text = "" - self.device = "cpu" - - def load_projection(self, path): - state = torch.load(path, map_location="cpu") - self.llama_proj.load_state_dict({ - "weight": state["llama_proj.weight"], - "bias": state["llama_proj.bias"]}) - - def eval_inputs(self, inputs): - self.model.eval_string("") - embds = self.extract_multimoal_feature(inputs) - for i in embds: - self.model.eval_float(i.T) - self.model.eval_string(" ") - - def chat(self, question): - return self.chat_with_image(None, question) - - def chat_with_image(self, inputs, question): - if self.generated_text == "": - self.model.eval_string("###") - self.model.eval_string(" Human: ") - if inputs: - self.eval_inputs(inputs) - self.model.eval_string(question) - self.model.eval_string("\n### Assistant:") - ret = self.model.generate_with_print(end="###") - self.generated_text += ret - return ret - - def extract_multimoal_feature(self, inputs): - features = [] - for key in ["image", "audio", "video", "thermal"]: - if key + "_paths" in inputs: - embeds = self.encode_data(key, inputs[key+"_paths"]) - features.append(embeds) - return features - - def encode_data(self, data_type, data_paths): - - type_map = { - "image": ModalityType.VISION, - "audio": ModalityType.AUDIO, - "video": ModalityType.VISION, - "thermal": ModalityType.THERMAL, - } - load_map = { - "image": data.load_and_transform_vision_data, - "audio": data.load_and_transform_audio_data, - "video": data.load_and_transform_video_data, - "thermal": data.load_and_transform_thermal_data - } - - load_function = load_map[data_type] - key = type_map[data_type] - - inputs = {key: load_function(data_paths, self.device)} - with torch.no_grad(): - embeddings = self.visual_encoder(inputs) - embeds = embeddings[key] - embeds = self.llama_proj(embeds).cpu().numpy() - return embeds - - -if __name__=="__main__": - a = PandaGPT(["--model", "./models/ggml-vicuna-13b-v0-q4_1.bin", "-c", "2048", "--lora", "./models/panda_gpt/ggml-adapter-model.bin","--temp", "0"]) - a.load_projection("./models/panda_gpt/adapter_model.bin") - a.chat_with_image( - {"image_paths": ["./media/llama1-logo.png"]}, - "what is the text in the picture? 'llama' or 'lambda'?") - a.chat("what is the color of it?") diff --git a/examples/embedding/CMakeLists.txt b/examples/embedding/CMakeLists.txt index 0c752c7bb..8ffc33868 100644 --- a/examples/embedding/CMakeLists.txt +++ b/examples/embedding/CMakeLists.txt @@ -3,6 +3,3 @@ add_executable(${TARGET} embedding.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) -if(TARGET BUILD_INFO) - add_dependencies(${TARGET} BUILD_INFO) -endif() diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index 14075609e..acff715e9 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -1,4 +1,3 @@ -#include "build-info.h" #include "common.h" #include "llama.h" @@ -8,6 +7,51 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif +static std::vector split_lines(const std::string & s) { + std::string line; + std::vector lines; + std::stringstream ss(s); + while (std::getline(ss, line)) { + lines.push_back(line); + } + return lines; +} + +static void batch_add_seq(llama_batch & batch, const std::vector & tokens, int seq_id) { + for (size_t i = 0; i < tokens.size(); i++) { + llama_batch_add(batch, tokens[i], i, { seq_id }, false); + } +} + +static void normalize(float * vec, float * out, int n) { + float norm = 0; + for (int i = 0; i < n; i++) { + norm += vec[i] * vec[i]; + } + norm = sqrt(norm); + for (int i = 0; i < n; i++) { + out[i] = vec[i] / norm; + } +} + +static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) { + // clear previous kv_cache values (irrelevant for embeddings) + llama_kv_cache_clear(ctx); + + // run model + fprintf(stderr, "%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq); + if (llama_decode(ctx, batch) < 0) { + fprintf(stderr, "%s : failed to decode\n", __func__); + } + + // normalize on copy + for (int k = 0; k < n_seq; k++) { + float * emb = llama_get_embeddings_ith(ctx, k); + float * out = output + k * n_embd; + normalize(emb, out, n_embd); + } +} + int main(int argc, char ** argv) { gpt_params params; @@ -30,7 +74,8 @@ int main(int argc, char ** argv) { params.prompt = gpt_random_prompt(rng); } - llama_backend_init(params.numa); + llama_backend_init(); + llama_numa_init(params.numa); llama_model * model; llama_context * ctx; @@ -56,49 +101,84 @@ int main(int argc, char ** argv) { fprintf(stderr, "%s\n", get_system_info(params).c_str()); } - int n_past = 0; + // split the prompt into lines + std::vector prompts = split_lines(params.prompt); - // tokenize the prompt - auto embd_inp = ::llama_tokenize(ctx, params.prompt, true); + // max batch size + const uint64_t n_batch = params.n_batch; + GGML_ASSERT(params.n_batch == params.n_ctx); + // tokenize the prompts and trim + std::vector> inputs; + for (const auto & prompt : prompts) { + auto inp = ::llama_tokenize(ctx, prompt, true); + if (inp.size() > n_batch) { + inp.resize(n_batch); + } + inputs.push_back(inp); + } + + // tokenization stats if (params.verbose_prompt) { - fprintf(stderr, "\n"); - fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str()); - fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); - for (int i = 0; i < (int) embd_inp.size(); i++) { - fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str()); + for (int i = 0; i < (int) inputs.size(); i++) { + fprintf(stderr, "%s: prompt %d: '%s'\n", __func__, i, prompts[i].c_str()); + fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, inputs[i].size()); + for (int j = 0; j < (int) inputs[i].size(); j++) { + fprintf(stderr, "%6d -> '%s'\n", inputs[i][j], llama_token_to_piece(ctx, inputs[i][j]).c_str()); + } + fprintf(stderr, "\n\n"); } - fprintf(stderr, "\n"); } - if (embd_inp.size() > (size_t)n_ctx) { - fprintf(stderr, "%s: error: prompt is longer than the context window (%zu tokens, n_ctx = %d)\n", - __func__, embd_inp.size(), n_ctx); - return 1; - } - - while (!embd_inp.empty()) { - int n_tokens = std::min(params.n_batch, (int) embd_inp.size()); - if (llama_decode(ctx, llama_batch_get_one(embd_inp.data(), n_tokens, n_past, 0))) { - fprintf(stderr, "%s : failed to eval\n", __func__); - return 1; - } - n_past += n_tokens; - embd_inp.erase(embd_inp.begin(), embd_inp.begin() + n_tokens); - } + // initialize batch + const int n_prompts = prompts.size(); + struct llama_batch batch = llama_batch_init(n_batch, 0, n_prompts); + // allocate output const int n_embd = llama_n_embd(model); - const auto * embeddings = llama_get_embeddings(ctx); + std::vector embeddings(n_prompts * n_embd, 0); + float * emb = embeddings.data(); - for (int i = 0; i < n_embd; i++) { - printf("%f ", embeddings[i]); + // break into batches + int p = 0; // number of prompts processed already + int s = 0; // number of prompts in current batch + for (int k = 0; k < n_prompts; k++) { + // clamp to n_batch tokens + auto & inp = inputs[k]; + const uint64_t n_toks = inp.size(); + + // encode if at capacity + if (batch.n_tokens + n_toks > n_batch) { + float * out = emb + p * n_embd; + batch_decode(ctx, batch, out, s, n_embd); + llama_batch_clear(batch); + p += s; + s = 0; + } + + // add to batch + batch_add_seq(batch, inp, s); + s += 1; } - printf("\n"); + // final batch + float * out = emb + p * n_embd; + batch_decode(ctx, batch, out, s, n_embd); + + // print first 3 embeddings + for (int j = 0; j < std::min(3, n_prompts); j++) { + fprintf(stderr, "embedding %d: ", j); + for (int i = 0; i < n_embd; i++) { + fprintf(stderr, "%f ", emb[j * n_embd + i]); + } + fprintf(stderr, "\n\n"); + } + fprintf(stderr, "\n"); + + // clean up llama_print_timings(ctx); llama_free(ctx); llama_free_model(model); - llama_backend_free(); return 0; diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp index d803cfd5c..08413f57e 100644 --- a/examples/export-lora/export-lora.cpp +++ b/examples/export-lora/export-lora.cpp @@ -7,8 +7,6 @@ #include #include -static const size_t tensor_alignment = 32; - struct lora_info { std::string filename; float scale; @@ -240,14 +238,13 @@ static struct lora_data * load_lora(struct lora_info * info) { } struct ggml_init_params params_ggml; - params_ggml.mem_size = ggml_tensor_overhead() * GGML_MAX_NODES; + params_ggml.mem_size = ggml_tensor_overhead() * GGML_DEFAULT_GRAPH_SIZE; params_ggml.mem_buffer = NULL; params_ggml.no_alloc = true; result->ctx = ggml_init(params_ggml); - uint32_t LLAMA_FILE_MAGIC_LORA = 0x67676C61; // 'ggla' uint32_t magic = file.read_u32(); - if (magic != LLAMA_FILE_MAGIC_LORA) { + if (magic != LLAMA_FILE_MAGIC_GGLA) { die_fmt("unexpected lora header file magic in '%s'", info->filename.c_str()); } uint32_t version = file.read_u32(); @@ -309,7 +306,7 @@ static struct ggml_cgraph * build_graph_lora( ) { struct ggml_tensor * ab = ggml_mul_mat(ctx, lora_a, lora_b); if (scaling != 1.0f) { - ab = ggml_scale(ctx, ab, ggml_new_f32(ctx, scaling)); + ab = ggml_scale(ctx, ab, scaling); } struct ggml_tensor * res = ggml_add_inplace(ctx, tensor, ab); @@ -334,28 +331,18 @@ static bool apply_lora(struct ggml_tensor * tensor, struct lora_data * lora, int float scaling = lora->info.scale * (float)lora->lora_alpha / (float)lora->lora_r; struct ggml_init_params params; - params.mem_size = GGML_OBJECT_SIZE + GGML_GRAPH_SIZE + ggml_tensor_overhead()*4 + GGML_MEM_ALIGN*5; + params.mem_size = GGML_OBJECT_SIZE + ggml_graph_overhead() + ggml_tensor_overhead()*4 + GGML_MEM_ALIGN*5; params.mem_buffer = NULL; params.no_alloc = true; struct ggml_context * ctx = NULL; - struct ggml_allocr * alloc = NULL; - struct ggml_cgraph * gf = NULL; + struct ggml_gallocr * alloc = NULL; + struct ggml_cgraph * gf = NULL; ctx = ggml_init(params); - alloc = ggml_allocr_new_measure(tensor_alignment); + alloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type()); gf = build_graph_lora(ctx, tensor, lora_a, lora_b, scaling); - size_t alloc_size = ggml_allocr_alloc_graph(alloc, gf); - ggml_allocr_free(alloc); - ggml_free(ctx); - static std::vector data_compute; - data_compute.resize(alloc_size + tensor_alignment); - - ctx = ggml_init(params); - alloc = ggml_allocr_new(data_compute.data(), data_compute.size(), tensor_alignment); - gf = build_graph_lora(ctx, tensor, lora_a, lora_b, scaling); - ggml_allocr_alloc_graph(alloc, gf); - ggml_allocr_free(alloc); + ggml_gallocr_alloc_graph(alloc, gf); struct ggml_cplan cplan = ggml_graph_plan(gf, n_threads); static std::vector data_work; @@ -364,6 +351,7 @@ static bool apply_lora(struct ggml_tensor * tensor, struct lora_data * lora, int ggml_graph_compute(gf, &cplan); + ggml_gallocr_free(alloc); ggml_free(ctx); return true; } diff --git a/examples/finetune/README.md b/examples/finetune/README.md index 36e62578c..2fafd505e 100644 --- a/examples/finetune/README.md +++ b/examples/finetune/README.md @@ -21,7 +21,7 @@ wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/s ./bin/main -m open-llama-3b-v2-q8_0.gguf --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin ``` -Finetune output files will be saved every N iterations (config with `--save-every N`). +**Only llama based models are supported!** The output files will be saved every N iterations (config with `--save-every N`). The pattern 'ITERATION' in the output filenames will be replaced with the iteration number and with 'LATEST' for the latest output. So in above example after 10 iterations these files will be written: - chk-lora-open-llama-3b-v2-q8_0-shakespeare-10.gguf @@ -61,7 +61,7 @@ For example to apply 40% of the 'shakespeare' LORA adapter, 80% of the 'bible' L --lora lora-open-llama-3b-v2-q8_0-yet-another-one-LATEST.bin ``` -The scale numbers don't need to add up to one, and you can also use numbers greater than 1 to further increase the influence of an adapter. But making the values to big will sometimes result in worse output. Play around to find good values. +The scale numbers don't need to add up to one, and you can also use numbers greater than 1 to further increase the influence of an adapter. But making the values too big will sometimes result in worse output. Play around to find good values. Gradient checkpointing reduces the memory requirements by ~50% but increases the runtime. If you have enough RAM, you can make finetuning a bit faster by disabling checkpointing with `--no-checkpointing`. @@ -80,9 +80,9 @@ The LORA rank can be configured for each model tensor type separately with these --rank-wk N LORA rank for wk tensor (default 4) --rank-wv N LORA rank for wv tensor (default 4) --rank-wo N LORA rank for wo tensor (default 4) - --rank-w1 N LORA rank for w1 tensor (default 4) - --rank-w2 N LORA rank for w2 tensor (default 4) - --rank-w3 N LORA rank for w3 tensor (default 4) + --rank-ffn_gate N LORA rank for ffn_gate tensor (default 4) + --rank-ffn_down N LORA rank for ffn_down tensor (default 4) + --rank-ffn_up N LORA rank for ffn_up tensor (default 4) ``` The LORA rank of 'norm' tensors should always be 1. diff --git a/examples/finetune/convert-finetune-checkpoint-to-gguf.py b/examples/finetune/convert-finetune-checkpoint-to-gguf.py index c8e14da87..c89090918 100644 --- a/examples/finetune/convert-finetune-checkpoint-to-gguf.py +++ b/examples/finetune/convert-finetune-checkpoint-to-gguf.py @@ -3,9 +3,7 @@ import argparse import gguf -import os import struct -import sys import numpy as np from pathlib import Path diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index 35824cd2d..3da5317b3 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -1,17 +1,12 @@ #include "ggml.h" #include "ggml-alloc.h" +#include "ggml-backend.h" #include "llama.h" #include "common.h" #include "train.h" -#include #include -#include -#include #include -#include #include -#include -#include #include #include @@ -19,8 +14,6 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif -static const size_t tensor_alignment = 32; - struct my_llama_hparams { uint32_t n_vocab = 32000; uint32_t n_ctx = 512; @@ -67,9 +60,9 @@ struct my_llama_layer { struct ggml_tensor * ffn_norm; // ff - struct ggml_tensor * w1; - struct ggml_tensor * w2; - struct ggml_tensor * w3; + struct ggml_tensor * ffn_gate; // w1 + struct ggml_tensor * ffn_down; // w2 + struct ggml_tensor * ffn_up; // w3 }; struct my_llama_model { @@ -92,9 +85,9 @@ struct my_llama_lora_hparams { uint32_t n_rank_wv = 4; uint32_t n_rank_wo = 4; uint32_t n_rank_ffn_norm = 1; - uint32_t n_rank_w1 = 4; - uint32_t n_rank_w2 = 4; - uint32_t n_rank_w3 = 4; + uint32_t n_rank_ffn_gate = 4; + uint32_t n_rank_ffn_down = 4; + uint32_t n_rank_ffn_up = 4; uint32_t n_rank_tok_embeddings = 4; uint32_t n_rank_norm = 1; uint32_t n_rank_output = 4; @@ -124,17 +117,17 @@ struct my_llama_lora_layer { struct ggml_tensor * ffn_norm_b; // ff - struct ggml_tensor * w1_a; - struct ggml_tensor * w1_b; - struct ggml_tensor * w2_a; - struct ggml_tensor * w2_b; - struct ggml_tensor * w3_a; - struct ggml_tensor * w3_b; + struct ggml_tensor * ffn_gate_a; + struct ggml_tensor * ffn_gate_b; + struct ggml_tensor * ffn_down_a; + struct ggml_tensor * ffn_down_b; + struct ggml_tensor * ffn_up_a; + struct ggml_tensor * ffn_up_b; }; struct my_llama_lora { struct ggml_context * ctx = NULL; - std::vector data; + ggml_backend_buffer_t data; my_llama_lora_hparams hparams; @@ -196,13 +189,13 @@ static const char * LLM_TENSOR_FFN_DOWN = "blk.%d.ffn_down"; static const char * LLM_TENSOR_FFN_UP = "blk.%d.ffn_up"; static void print_params(struct my_llama_hparams * params) { - printf("%s: n_vocab: %u\n", __func__, params->n_vocab); - printf("%s: n_ctx: %u\n", __func__, params->n_ctx); - printf("%s: n_embd: %u\n", __func__, params->n_embd); - printf("%s: n_ff: %u\n", __func__, params->n_ff); - printf("%s: n_head: %u\n", __func__, params->n_head); - printf("%s: n_head_kv: %u\n", __func__, params->n_head_kv); - printf("%s: n_layer: %u\n", __func__, params->n_layer); + printf("%s: n_vocab : %u\n", __func__, params->n_vocab); + printf("%s: n_ctx : %u\n", __func__, params->n_ctx); + printf("%s: n_embd : %u\n", __func__, params->n_embd); + printf("%s: n_ff : %u\n", __func__, params->n_ff); + printf("%s: n_head : %u\n", __func__, params->n_head); + printf("%s: n_head_kv : %u\n", __func__, params->n_head_kv); + printf("%s: n_layer : %u\n", __func__, params->n_layer); printf("%s: norm_rms_eps : %f\n", __func__, params->f_norm_rms_eps); printf("%s: rope_freq_base : %f\n", __func__, params->rope_freq_base); printf("%s: rope_freq_scale : %f\n", __func__, params->rope_freq_scale); @@ -215,9 +208,9 @@ static void print_lora_params(struct my_llama_lora_hparams * params) { printf("%s: n_rank_wv : %u\n", __func__, params->n_rank_wv); printf("%s: n_rank_wo : %u\n", __func__, params->n_rank_wo); printf("%s: n_rank_ffn_norm : %u\n", __func__, params->n_rank_ffn_norm); - printf("%s: n_rank_w1 : %u\n", __func__, params->n_rank_w1); - printf("%s: n_rank_w2 : %u\n", __func__, params->n_rank_w2); - printf("%s: n_rank_w3 : %u\n", __func__, params->n_rank_w3); + printf("%s: n_rank_ffn_gate : %u\n", __func__, params->n_rank_ffn_gate); + printf("%s: n_rank_ffn_down : %u\n", __func__, params->n_rank_ffn_down); + printf("%s: n_rank_ffn_up : %u\n", __func__, params->n_rank_ffn_up); printf("%s: n_rank_tok_embeddings : %u\n", __func__, params->n_rank_tok_embeddings); printf("%s: n_rank_norm : %u\n", __func__, params->n_rank_norm); printf("%s: n_rank_output : %u\n", __func__, params->n_rank_output); @@ -269,7 +262,7 @@ static void load_model_hparams_gguf(struct gguf_context * ctx, struct my_llama_h float rope_freq_scale = 1.0f; GGUF_GET_KEY(ctx, hparams->f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS)); GGUF_GET_KEY(ctx, hparams->rope_freq_base, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE)); - GGUF_GET_KEY(ctx, rope_freq_scale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR)); + GGUF_GET_KEY(ctx, rope_freq_scale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR)); if (rope_freq_scale != 1.0f) { hparams->rope_freq_scale = 1.0f / rope_freq_scale; } @@ -326,9 +319,9 @@ static void init_model(struct llama_model * input, struct my_llama_model * model layer.wv = llama_get_model_tensor(input, tni(LLM_TENSOR_ATTN_V, i)); layer.wo = llama_get_model_tensor(input, tni(LLM_TENSOR_ATTN_OUT, i)); layer.ffn_norm = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_NORM, i)); - layer.w1 = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_GATE, i)); - layer.w2 = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_DOWN, i)); - layer.w3 = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_UP, i)); + layer.ffn_gate = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_GATE, i)); + layer.ffn_down = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_DOWN, i)); + layer.ffn_up = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_UP, i)); assert_shape_1d(layer.attention_norm, hparams.n_embd); assert_shape_2d(layer.wq, hparams.n_embd, hparams.n_embd); @@ -336,9 +329,9 @@ static void init_model(struct llama_model * input, struct my_llama_model * model assert_shape_2d(layer.wv, hparams.n_embd, hparams.n_embd_gqa()); assert_shape_2d(layer.wo, hparams.n_embd, hparams.n_embd); assert_shape_1d(layer.ffn_norm, hparams.n_embd); - assert_shape_2d(layer.w1, hparams.n_embd, hparams.n_ff); - assert_shape_2d(layer.w2, hparams.n_ff, hparams.n_embd); - assert_shape_2d(layer.w3, hparams.n_embd, hparams.n_ff); + assert_shape_2d(layer.ffn_gate, hparams.n_embd, hparams.n_ff); + assert_shape_2d(layer.ffn_down, hparams.n_ff, hparams.n_embd); + assert_shape_2d(layer.ffn_up, hparams.n_embd, hparams.n_ff); } } @@ -369,69 +362,12 @@ static void set_param_lora(struct my_llama_lora * lora) { ggml_set_param(ctx, layer.wo_b); ggml_set_param(ctx, layer.ffn_norm_a); ggml_set_param(ctx, layer.ffn_norm_b); - ggml_set_param(ctx, layer.w1_a); - ggml_set_param(ctx, layer.w1_b); - ggml_set_param(ctx, layer.w2_a); - ggml_set_param(ctx, layer.w2_b); - ggml_set_param(ctx, layer.w3_a); - ggml_set_param(ctx, layer.w3_b); - } -} - -static void alloc_lora(struct ggml_allocr * alloc, struct my_llama_lora * lora) { - ggml_allocr_alloc(alloc, lora->tok_embeddings_a); - ggml_allocr_alloc(alloc, lora->tok_embeddings_b); - ggml_allocr_alloc(alloc, lora->norm_a); - ggml_allocr_alloc(alloc, lora->norm_b); - ggml_allocr_alloc(alloc, lora->output_a); - ggml_allocr_alloc(alloc, lora->output_b); - for (uint32_t i = 0; i < lora->layers.size(); ++i) { - auto & layer = lora->layers[i]; - ggml_allocr_alloc(alloc, layer.attention_norm_a); - ggml_allocr_alloc(alloc, layer.attention_norm_b); - ggml_allocr_alloc(alloc, layer.wq_a); - ggml_allocr_alloc(alloc, layer.wq_b); - ggml_allocr_alloc(alloc, layer.wk_a); - ggml_allocr_alloc(alloc, layer.wk_b); - ggml_allocr_alloc(alloc, layer.wv_a); - ggml_allocr_alloc(alloc, layer.wv_b); - ggml_allocr_alloc(alloc, layer.wo_a); - ggml_allocr_alloc(alloc, layer.wo_b); - ggml_allocr_alloc(alloc, layer.ffn_norm_a); - ggml_allocr_alloc(alloc, layer.ffn_norm_b); - ggml_allocr_alloc(alloc, layer.w1_a); - ggml_allocr_alloc(alloc, layer.w1_b); - ggml_allocr_alloc(alloc, layer.w2_a); - ggml_allocr_alloc(alloc, layer.w2_b); - ggml_allocr_alloc(alloc, layer.w3_a); - ggml_allocr_alloc(alloc, layer.w3_b); - } - ggml_allocr_alloc(alloc, lora->tok_embeddings_a->grad); - ggml_allocr_alloc(alloc, lora->tok_embeddings_b->grad); - ggml_allocr_alloc(alloc, lora->norm_a->grad); - ggml_allocr_alloc(alloc, lora->norm_b->grad); - ggml_allocr_alloc(alloc, lora->output_a->grad); - ggml_allocr_alloc(alloc, lora->output_b->grad); - for (uint32_t i = 0; i < lora->layers.size(); ++i) { - auto & layer = lora->layers[i]; - ggml_allocr_alloc(alloc, layer.attention_norm_a->grad); - ggml_allocr_alloc(alloc, layer.attention_norm_b->grad); - ggml_allocr_alloc(alloc, layer.wq_a->grad); - ggml_allocr_alloc(alloc, layer.wq_b->grad); - ggml_allocr_alloc(alloc, layer.wk_a->grad); - ggml_allocr_alloc(alloc, layer.wk_b->grad); - ggml_allocr_alloc(alloc, layer.wv_a->grad); - ggml_allocr_alloc(alloc, layer.wv_b->grad); - ggml_allocr_alloc(alloc, layer.wo_a->grad); - ggml_allocr_alloc(alloc, layer.wo_b->grad); - ggml_allocr_alloc(alloc, layer.ffn_norm_a->grad); - ggml_allocr_alloc(alloc, layer.ffn_norm_b->grad); - ggml_allocr_alloc(alloc, layer.w1_a->grad); - ggml_allocr_alloc(alloc, layer.w1_b->grad); - ggml_allocr_alloc(alloc, layer.w2_a->grad); - ggml_allocr_alloc(alloc, layer.w2_b->grad); - ggml_allocr_alloc(alloc, layer.w3_a->grad); - ggml_allocr_alloc(alloc, layer.w3_b->grad); + ggml_set_param(ctx, layer.ffn_gate_a); + ggml_set_param(ctx, layer.ffn_gate_b); + ggml_set_param(ctx, layer.ffn_down_a); + ggml_set_param(ctx, layer.ffn_down_b); + ggml_set_param(ctx, layer.ffn_up_a); + ggml_set_param(ctx, layer.ffn_up_b); } } @@ -499,12 +435,12 @@ static void init_lora(const struct my_llama_model * model, struct my_llama_lora layer.ffn_norm_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_norm, n_embd); layer.ffn_norm_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_norm, 1); - layer.w1_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w1, n_embd); - layer.w1_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w1, n_ff); - layer.w2_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w2, n_ff); - layer.w2_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w2, n_embd); - layer.w3_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w3, n_embd); - layer.w3_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w3, n_ff); + layer.ffn_gate_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_gate, n_embd); + layer.ffn_gate_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_gate, n_ff); + layer.ffn_down_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_down, n_ff); + layer.ffn_down_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_down, n_embd); + layer.ffn_up_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_up, n_embd); + layer.ffn_up_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_up, n_ff); ggml_set_name(layer.attention_norm_a, tni(LLM_TENSOR_ATTN_NORM, ".weight.lora_a", i)); ggml_set_name(layer.attention_norm_b, tni(LLM_TENSOR_ATTN_NORM, ".weight.lora_b", i)); @@ -518,28 +454,18 @@ static void init_lora(const struct my_llama_model * model, struct my_llama_lora ggml_set_name(layer.wo_b, tni(LLM_TENSOR_ATTN_OUT, ".weight.lora_b", i)); ggml_set_name(layer.ffn_norm_a, tni(LLM_TENSOR_FFN_NORM, ".weight.lora_a", i)); ggml_set_name(layer.ffn_norm_b, tni(LLM_TENSOR_FFN_NORM, ".weight.lora_b", i)); - ggml_set_name(layer.w1_a, tni(LLM_TENSOR_FFN_GATE, ".weight.lora_a", i)); - ggml_set_name(layer.w1_b, tni(LLM_TENSOR_FFN_GATE, ".weight.lora_b", i)); - ggml_set_name(layer.w2_a, tni(LLM_TENSOR_FFN_DOWN, ".weight.lora_a", i)); - ggml_set_name(layer.w2_b, tni(LLM_TENSOR_FFN_DOWN, ".weight.lora_b", i)); - ggml_set_name(layer.w3_a, tni(LLM_TENSOR_FFN_UP, ".weight.lora_a", i)); - ggml_set_name(layer.w3_b, tni(LLM_TENSOR_FFN_UP, ".weight.lora_b", i)); + ggml_set_name(layer.ffn_gate_a, tni(LLM_TENSOR_FFN_GATE, ".weight.lora_a", i)); + ggml_set_name(layer.ffn_gate_b, tni(LLM_TENSOR_FFN_GATE, ".weight.lora_b", i)); + ggml_set_name(layer.ffn_down_a, tni(LLM_TENSOR_FFN_DOWN, ".weight.lora_a", i)); + ggml_set_name(layer.ffn_down_b, tni(LLM_TENSOR_FFN_DOWN, ".weight.lora_b", i)); + ggml_set_name(layer.ffn_up_a, tni(LLM_TENSOR_FFN_UP, ".weight.lora_a", i)); + ggml_set_name(layer.ffn_up_b, tni(LLM_TENSOR_FFN_UP, ".weight.lora_b", i)); } set_param_lora(lora); - // measure data size - size_t size = 0; - for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { - size += GGML_PAD(ggml_nbytes(t), tensor_alignment); - } - - // allocate data - struct ggml_allocr * alloc = NULL; - lora->data.resize(size + tensor_alignment); - alloc = ggml_allocr_new(lora->data.data(), lora->data.size(), tensor_alignment); - alloc_lora(alloc, lora); - ggml_allocr_free(alloc); + // allocate data for lora tensors + lora->data = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_cpu_buffer_type()); } static void randomize_lora(struct my_llama_lora * lora, int seed, float mean, float std, float min, float max) { @@ -548,35 +474,35 @@ static void randomize_lora(struct my_llama_lora * lora, int seed, float mean, fl struct random_normal_distribution * rnd = init_random_normal_distribution(seed, mean, std, min, max); randomize_tensor_normal(lora->tok_embeddings_a, rnd); - randomize_tensor_normal(lora->tok_embeddings_b, rnd); + ggml_set_zero(lora->tok_embeddings_b); randomize_tensor_normal(lora->norm_a, rnd); - randomize_tensor_normal(lora->norm_b, rnd); + ggml_set_zero(lora->norm_b); randomize_tensor_normal(lora->output_a, rnd); - randomize_tensor_normal(lora->output_b, rnd); + ggml_set_zero(lora->output_b); for (uint32_t i = 0; i < n_layer; ++i) { auto & layer = lora->layers[i]; randomize_tensor_normal(layer.attention_norm_a, rnd); - randomize_tensor_normal(layer.attention_norm_b, rnd); + ggml_set_zero(layer.attention_norm_b); randomize_tensor_normal(layer.wq_a, rnd); - randomize_tensor_normal(layer.wq_b, rnd); + ggml_set_zero(layer.wq_b); randomize_tensor_normal(layer.wk_a, rnd); - randomize_tensor_normal(layer.wk_b, rnd); + ggml_set_zero(layer.wk_b); randomize_tensor_normal(layer.wv_a, rnd); - randomize_tensor_normal(layer.wv_b, rnd); + ggml_set_zero(layer.wv_b); randomize_tensor_normal(layer.wo_a, rnd); - randomize_tensor_normal(layer.wo_b, rnd); + ggml_set_zero(layer.wo_b); randomize_tensor_normal(layer.ffn_norm_a, rnd); - randomize_tensor_normal(layer.ffn_norm_b, rnd); + ggml_set_zero(layer.ffn_norm_b); - randomize_tensor_normal(layer.w1_a, rnd); - randomize_tensor_normal(layer.w1_b, rnd); - randomize_tensor_normal(layer.w2_a, rnd); - randomize_tensor_normal(layer.w2_b, rnd); - randomize_tensor_normal(layer.w3_a, rnd); - randomize_tensor_normal(layer.w3_b, rnd); + randomize_tensor_normal(layer.ffn_gate_a, rnd); + ggml_set_zero(layer.ffn_gate_b); + randomize_tensor_normal(layer.ffn_down_a, rnd); + ggml_set_zero(layer.ffn_down_b); + randomize_tensor_normal(layer.ffn_up_a, rnd); + ggml_set_zero(layer.ffn_up_b); } free_random_normal_distribution(rnd); @@ -585,7 +511,7 @@ static void randomize_lora(struct my_llama_lora * lora, int seed, float mean, fl static struct ggml_tensor * llama_build_lora_finetune_graphs( struct my_llama_model * model, struct my_llama_lora * lora, - struct ggml_allocr * alloc, + ggml_gallocr_t alloc, struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, @@ -596,7 +522,8 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs( const int n_tokens, const int n_batch, const bool enable_flash_attn, - const bool enable_checkpointing) { + const bool enable_checkpointing, + const bool measure_only) { ggml_set_scratch(ctx, { 0, 0, nullptr, }); const int n_past = 0; @@ -612,6 +539,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs( const int n_rot = hparams.n_embd_head(); const int n_embd_head = hparams.n_embd_head(); const int n_embd_gqa = hparams.n_embd_gqa(); + const float rms_norm_eps = hparams.f_norm_rms_eps; const float rope_freq_base = hparams.rope_freq_base; const float rope_freq_scale = hparams.rope_freq_scale; @@ -627,13 +555,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs( // KQ_pos - contains the positions struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, N); - ggml_allocr_alloc(alloc, KQ_pos); - if (!ggml_allocr_is_measure(alloc)) { - int * data = (int *) KQ_pos->data; - for (int i = 0; i < N; ++i) { - data[i] = n_past + i; - } - } + ggml_set_input(KQ_pos); // rope has so much parameters that we make a custom function for it auto rope = [ctx, KQ_pos, n_rot, n_ctx, rope_freq_base, rope_freq_scale] @@ -642,8 +564,9 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs( const int rope_mode = 0; return ggml_rope_custom(ctx, - t, KQ_pos, n_rot, rope_mode, n_ctx, - rope_freq_base, rope_freq_scale); + t, KQ_pos, n_rot, rope_mode, n_ctx, 0, + rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f + ); }; set_name(tokens_input, "tokens_input"); @@ -652,7 +575,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs( GGML_ASSERT(tokens_input->type == GGML_TYPE_I32); auto add_to_f32 = [] (struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) { - if (ggml_is_quantized(a->type)) { + if (ggml_is_quantized(a->type) || a->type == GGML_TYPE_F16) { return ggml_add_cast(ctx, a, b, GGML_TYPE_F32); } else if (a->type == GGML_TYPE_F32) { return ggml_add(ctx, a, b); @@ -679,10 +602,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs( checkpoints.push_back(t01); } - struct ggml_tensor * kv_scale = NULL; - if (!enable_flash_attn) { - kv_scale = ggml_new_f32(ctx, 1.0f/sqrtf(float(n_embd)/n_head)); - } + const float kv_scale = 1.0f/sqrtf(float(n_embd)/n_head); for (int il = 0; il < n_layer; ++il) { struct my_llama_layer & layer = model->layers[il]; @@ -690,13 +610,13 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs( struct ggml_tensor * attention_norm = add_to_f32(ctx, layer.attention_norm, ggml_mul_mat(ctx, llayer.attention_norm_a, llayer.attention_norm_b)); struct ggml_tensor * ffn_norm = add_to_f32(ctx, layer.ffn_norm, ggml_mul_mat(ctx, llayer.ffn_norm_a, llayer.ffn_norm_b)); - struct ggml_tensor * wq = add_to_f32(ctx, layer.wq, ggml_mul_mat(ctx, llayer.wq_a, llayer.wq_b)); - struct ggml_tensor * wk = add_to_f32(ctx, layer.wk, ggml_mul_mat(ctx, llayer.wk_a, llayer.wk_b)); - struct ggml_tensor * wv = add_to_f32(ctx, layer.wv, ggml_mul_mat(ctx, llayer.wv_a, llayer.wv_b)); - struct ggml_tensor * wo = add_to_f32(ctx, layer.wo, ggml_mul_mat(ctx, llayer.wo_a, llayer.wo_b)); - struct ggml_tensor * w1 = add_to_f32(ctx, layer.w1, ggml_mul_mat(ctx, llayer.w1_a, llayer.w1_b)); - struct ggml_tensor * w2 = add_to_f32(ctx, layer.w2, ggml_mul_mat(ctx, llayer.w2_a, llayer.w2_b)); - struct ggml_tensor * w3 = add_to_f32(ctx, layer.w3, ggml_mul_mat(ctx, llayer.w3_a, llayer.w3_b)); + struct ggml_tensor * wq = add_to_f32(ctx, layer.wq, ggml_mul_mat(ctx, llayer.wq_a, llayer.wq_b)); + struct ggml_tensor * wk = add_to_f32(ctx, layer.wk, ggml_mul_mat(ctx, llayer.wk_a, llayer.wk_b)); + struct ggml_tensor * wv = add_to_f32(ctx, layer.wv, ggml_mul_mat(ctx, llayer.wv_a, llayer.wv_b)); + struct ggml_tensor * wo = add_to_f32(ctx, layer.wo, ggml_mul_mat(ctx, llayer.wo_a, llayer.wo_b)); + struct ggml_tensor * ffn_gate = add_to_f32(ctx, layer.ffn_gate, ggml_mul_mat(ctx, llayer.ffn_gate_a, llayer.ffn_gate_b)); + struct ggml_tensor * ffn_down = add_to_f32(ctx, layer.ffn_down, ggml_mul_mat(ctx, llayer.ffn_down_a, llayer.ffn_down_b)); + struct ggml_tensor * ffn_up = add_to_f32(ctx, layer.ffn_up, ggml_mul_mat(ctx, llayer.ffn_up_a, llayer.ffn_up_b)); struct ggml_tensor * t02 = ggml_rms_norm (ctx, cur, rms_norm_eps); set_name(t02, "t02"); assert_shape_2d(t02, n_embd, N*n_batch); struct ggml_tensor * t03 = ggml_repeat (ctx, attention_norm, t02); set_name(t03, "t03"); assert_shape_2d(t03, n_embd, N*n_batch); @@ -739,11 +659,11 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs( struct ggml_tensor * t22 = ggml_rms_norm (ctx, t21, rms_norm_eps); set_name(t22, "t22"); assert_shape_2d(t22, n_embd, N*n_batch); struct ggml_tensor * t23 = ggml_repeat (ctx, ffn_norm, t22); set_name(t23, "t23"); assert_shape_2d(t23, n_embd, N*n_batch); struct ggml_tensor * t24 = ggml_mul (ctx, t23, t22); set_name(t24, "t24"); assert_shape_2d(t24, n_embd, N*n_batch); - struct ggml_tensor * t25 = ggml_mul_mat (ctx, w3, t24); set_name(t25, "t25"); assert_shape_2d(t25, n_ff, N*n_batch); - struct ggml_tensor * t26 = ggml_mul_mat (ctx, w1, t24); set_name(t26, "t26"); assert_shape_2d(t26, n_ff, N*n_batch); + struct ggml_tensor * t25 = ggml_mul_mat (ctx, ffn_up, t24); set_name(t25, "t25"); assert_shape_2d(t25, n_ff, N*n_batch); + struct ggml_tensor * t26 = ggml_mul_mat (ctx, ffn_gate, t24); set_name(t26, "t26"); assert_shape_2d(t26, n_ff, N*n_batch); struct ggml_tensor * t27 = ggml_silu (ctx, t26); set_name(t27, "t27"); assert_shape_2d(t27, n_ff, N*n_batch); struct ggml_tensor * t28 = ggml_mul (ctx, t27, t25); set_name(t28, "t28"); assert_shape_2d(t28, n_ff, N*n_batch); - struct ggml_tensor * t29 = ggml_mul_mat (ctx, w2, t28); set_name(t29, "t29"); assert_shape_2d(t29, n_embd, N*n_batch); + struct ggml_tensor * t29 = ggml_mul_mat (ctx, ffn_down, t28); set_name(t29, "t29"); assert_shape_2d(t29, n_embd, N*n_batch); struct ggml_tensor * t30 = ggml_add (ctx, t29, t21); set_name(t30, "t30"); assert_shape_2d(t30, n_embd, N*n_batch); cur = t30; if (enable_checkpointing) { @@ -771,7 +691,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs( if (enable_checkpointing) { ggml_build_backward_gradient_checkpointing(ctx, gf, gb, gb_tmp, checkpoints.data(), (int) checkpoints.size()); } else { - *gb = *gf; + ggml_graph_cpy(gf, gb); ggml_build_backward_expand(ctx, gf, gb, true); } @@ -780,43 +700,55 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs( // make sure some tensors are not reallocated by inserting new temporary nodes depending on them int n_leafs_before = gb->n_leafs; int n_nodes_before = gb->n_nodes; - struct ggml_tensor * one = ggml_new_f32(ctx, 1.0f); + // output tensors - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t35, one)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36, one)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t35, 1.0f)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36, 1.0f)); // input gradient - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36->grad, one)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36->grad, 1.0f)); GGML_ASSERT(t36->grad->data == NULL && t36->grad->view_src == NULL); - ggml_allocr_alloc(alloc, t36->grad); + ggml_set_input(t36->grad); // KQ_pos - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, KQ_pos, one)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, KQ_pos, 1.0f)); // make sure base model tensors data cannot be used in viewable operations - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->tok_embeddings, one)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->norm, one)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->output, one)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->tok_embeddings, 1.0f)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->norm, 1.0f)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->output, 1.0f)); for (int il = 0; il < n_layer; ++il) { struct my_llama_layer & layer = model->layers[il]; - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.attention_norm, one)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_norm, one)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wq, one)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wk, one)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wv, one)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wo, one)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w1, one)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w2, one)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w3, one)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.attention_norm, 1.0f)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_norm, 1.0f)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wq, 1.0f)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wk, 1.0f)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wv, 1.0f)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wo, 1.0f)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_gate, 1.0f)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_down, 1.0f)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_up, 1.0f)); } // allocating checkpoints in one block to reduce memory fragmentation // note: they will be freed in reverse order for (unsigned int i = 0; i < checkpoints.size(); ++i) { if (checkpoints[i]->data == NULL && checkpoints[i]->view_src == NULL) { - ggml_allocr_alloc(alloc, checkpoints[i]); + ggml_set_input(checkpoints[i]); } } - ggml_allocr_alloc_graph(alloc, gb); + if (measure_only) { + ggml_gallocr_reserve(alloc, gb); + } else { + ggml_gallocr_alloc_graph(alloc, gb); + + // set KQ_pos + { + int * data = (int *) KQ_pos->data; + for (int i = 0; i < N; ++i) { + data[i] = n_past + i; + } + } + } // remove the additional nodes and leafs for (int i = n_leafs_before; i < gb->n_leafs; ++i) { @@ -866,9 +798,9 @@ static void load_llama_lora_gguf(struct gguf_context * fctx, struct ggml_context GGUF_GET_KEY(fctx, lora->hparams.n_rank_wv, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_ATTN_V); GGUF_GET_KEY(fctx, lora->hparams.n_rank_wo, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_ATTN_OUT); GGUF_GET_KEY(fctx, lora->hparams.n_rank_ffn_norm, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_NORM); - GGUF_GET_KEY(fctx, lora->hparams.n_rank_w1, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_GATE); - GGUF_GET_KEY(fctx, lora->hparams.n_rank_w2, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_DOWN); - GGUF_GET_KEY(fctx, lora->hparams.n_rank_w3, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_UP); + GGUF_GET_KEY(fctx, lora->hparams.n_rank_ffn_gate, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_GATE); + GGUF_GET_KEY(fctx, lora->hparams.n_rank_ffn_down, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_DOWN); + GGUF_GET_KEY(fctx, lora->hparams.n_rank_ffn_up, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_UP); init_lora(model, lora); @@ -893,12 +825,12 @@ static void load_llama_lora_gguf(struct gguf_context * fctx, struct ggml_context copy_tensor_by_name(layer.wo_b, f_ggml_ctx, ggml_get_name(layer.wo_b)); copy_tensor_by_name(layer.ffn_norm_a, f_ggml_ctx, ggml_get_name(layer.ffn_norm_a)); copy_tensor_by_name(layer.ffn_norm_b, f_ggml_ctx, ggml_get_name(layer.ffn_norm_b)); - copy_tensor_by_name(layer.w1_a, f_ggml_ctx, ggml_get_name(layer.w1_a)); - copy_tensor_by_name(layer.w1_b, f_ggml_ctx, ggml_get_name(layer.w1_b)); - copy_tensor_by_name(layer.w2_a, f_ggml_ctx, ggml_get_name(layer.w2_a)); - copy_tensor_by_name(layer.w2_b, f_ggml_ctx, ggml_get_name(layer.w2_b)); - copy_tensor_by_name(layer.w3_a, f_ggml_ctx, ggml_get_name(layer.w3_a)); - copy_tensor_by_name(layer.w3_b, f_ggml_ctx, ggml_get_name(layer.w3_b)); + copy_tensor_by_name(layer.ffn_gate_a, f_ggml_ctx, ggml_get_name(layer.ffn_gate_a)); + copy_tensor_by_name(layer.ffn_gate_b, f_ggml_ctx, ggml_get_name(layer.ffn_gate_b)); + copy_tensor_by_name(layer.ffn_down_a, f_ggml_ctx, ggml_get_name(layer.ffn_down_a)); + copy_tensor_by_name(layer.ffn_down_b, f_ggml_ctx, ggml_get_name(layer.ffn_down_b)); + copy_tensor_by_name(layer.ffn_up_a, f_ggml_ctx, ggml_get_name(layer.ffn_up_a)); + copy_tensor_by_name(layer.ffn_up_b, f_ggml_ctx, ggml_get_name(layer.ffn_up_b)); } } @@ -936,9 +868,9 @@ static void save_llama_lora_gguf(struct gguf_context * fctx, struct my_llama_mod gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_ATTN_V, lora->hparams.n_rank_wv); gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_ATTN_OUT, lora->hparams.n_rank_wo); gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_NORM, lora->hparams.n_rank_ffn_norm); - gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_GATE, lora->hparams.n_rank_w1); - gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_DOWN, lora->hparams.n_rank_w2); - gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_UP, lora->hparams.n_rank_w3); + gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_GATE, lora->hparams.n_rank_ffn_gate); + gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_DOWN, lora->hparams.n_rank_ffn_down); + gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_UP, lora->hparams.n_rank_ffn_up); gguf_add_tensor(fctx, lora->tok_embeddings_a); gguf_add_tensor(fctx, lora->tok_embeddings_b); @@ -962,12 +894,12 @@ static void save_llama_lora_gguf(struct gguf_context * fctx, struct my_llama_mod gguf_add_tensor(fctx, layer.wo_b); gguf_add_tensor(fctx, layer.ffn_norm_a); gguf_add_tensor(fctx, layer.ffn_norm_b); - gguf_add_tensor(fctx, layer.w1_a); - gguf_add_tensor(fctx, layer.w1_b); - gguf_add_tensor(fctx, layer.w2_a); - gguf_add_tensor(fctx, layer.w2_b); - gguf_add_tensor(fctx, layer.w3_a); - gguf_add_tensor(fctx, layer.w3_b); + gguf_add_tensor(fctx, layer.ffn_gate_a); + gguf_add_tensor(fctx, layer.ffn_gate_b); + gguf_add_tensor(fctx, layer.ffn_down_a); + gguf_add_tensor(fctx, layer.ffn_down_b); + gguf_add_tensor(fctx, layer.ffn_up_a); + gguf_add_tensor(fctx, layer.ffn_up_b); } } @@ -1109,7 +1041,7 @@ static void write_tensor(struct llama_file * file, struct ggml_tensor * tensor, name = ggml_get_name(tensor); } uint32_t name_len = strlen(name); - uint32_t nd = tensor->n_dims; + uint32_t nd = ggml_n_dims(tensor); uint32_t ne[4] = { (uint32_t)tensor->ne[0], (uint32_t)tensor->ne[1], (uint32_t)tensor->ne[2], @@ -1145,9 +1077,8 @@ static void save_as_llama_lora(const char * filename, struct my_llama_lora * lor return tn_buf.data(); }; - uint32_t LLAMA_FILE_MAGIC_LORA = 0x67676C61; // 'ggla' // write_magic - file.write_u32(LLAMA_FILE_MAGIC_LORA); // magic + file.write_u32(LLAMA_FILE_MAGIC_GGLA); // magic file.write_u32(1); // version // write_hparams file.write_u32(lora->hparams.lora_r); @@ -1173,12 +1104,12 @@ static void save_as_llama_lora(const char * filename, struct my_llama_lora * lor write_tensor(&file, layer.wo_b, tni(LLM_TENSOR_ATTN_OUT, i, ".weight.loraB")); write_tensor(&file, layer.ffn_norm_a, tni(LLM_TENSOR_FFN_NORM, i, ".weight.loraA")); write_tensor(&file, layer.ffn_norm_b, tni(LLM_TENSOR_FFN_NORM, i, ".weight.loraB")); - write_tensor(&file, layer.w1_a, tni(LLM_TENSOR_FFN_GATE, i, ".weight.loraA")); - write_tensor(&file, layer.w1_b, tni(LLM_TENSOR_FFN_GATE, i, ".weight.loraB")); - write_tensor(&file, layer.w2_a, tni(LLM_TENSOR_FFN_DOWN, i, ".weight.loraA")); - write_tensor(&file, layer.w2_b, tni(LLM_TENSOR_FFN_DOWN, i, ".weight.loraB")); - write_tensor(&file, layer.w3_a, tni(LLM_TENSOR_FFN_UP, i, ".weight.loraA")); - write_tensor(&file, layer.w3_b, tni(LLM_TENSOR_FFN_UP, i, ".weight.loraB")); + write_tensor(&file, layer.ffn_gate_a, tni(LLM_TENSOR_FFN_GATE, i, ".weight.loraA")); + write_tensor(&file, layer.ffn_gate_b, tni(LLM_TENSOR_FFN_GATE, i, ".weight.loraB")); + write_tensor(&file, layer.ffn_down_a, tni(LLM_TENSOR_FFN_DOWN, i, ".weight.loraA")); + write_tensor(&file, layer.ffn_down_b, tni(LLM_TENSOR_FFN_DOWN, i, ".weight.loraB")); + write_tensor(&file, layer.ffn_up_a, tni(LLM_TENSOR_FFN_UP, i, ".weight.loraA")); + write_tensor(&file, layer.ffn_up_b, tni(LLM_TENSOR_FFN_UP, i, ".weight.loraB")); } } @@ -1208,9 +1139,9 @@ struct train_params { uint32_t n_rank_wv; uint32_t n_rank_wo; uint32_t n_rank_ffn_norm; - uint32_t n_rank_w1; - uint32_t n_rank_w2; - uint32_t n_rank_w3; + uint32_t n_rank_ffn_gate; + uint32_t n_rank_ffn_down; + uint32_t n_rank_ffn_up; uint32_t n_rank_tok_embeddings; uint32_t n_rank_norm; uint32_t n_rank_output; @@ -1221,9 +1152,9 @@ struct train_params { bool custom_n_rank_wv; bool custom_n_rank_wo; bool custom_n_rank_ffn_norm; - bool custom_n_rank_w1; - bool custom_n_rank_w2; - bool custom_n_rank_w3; + bool custom_n_rank_ffn_gate; + bool custom_n_rank_ffn_down; + bool custom_n_rank_ffn_up; bool custom_n_rank_tok_embeddings; bool custom_n_rank_norm; bool custom_n_rank_output; @@ -1255,9 +1186,9 @@ static struct train_params get_default_train_params() { params.n_rank_wv = 4; params.n_rank_wo = 4; params.n_rank_ffn_norm = 1; - params.n_rank_w1 = 4; - params.n_rank_w2 = 4; - params.n_rank_w3 = 4; + params.n_rank_ffn_gate = 4; + params.n_rank_ffn_down = 4; + params.n_rank_ffn_up = 4; params.n_rank_tok_embeddings = 4; params.n_rank_norm = 1; params.n_rank_output = 4; @@ -1268,9 +1199,9 @@ static struct train_params get_default_train_params() { params.custom_n_rank_wv = false; params.custom_n_rank_wo = false; params.custom_n_rank_ffn_norm = false; - params.custom_n_rank_w1 = false; - params.custom_n_rank_w2 = false; - params.custom_n_rank_w3 = false; + params.custom_n_rank_ffn_gate = false; + params.custom_n_rank_ffn_down = false; + params.custom_n_rank_ffn_up = false; params.custom_n_rank_tok_embeddings = false; params.custom_n_rank_norm = false; params.custom_n_rank_output = false; @@ -1301,9 +1232,9 @@ static void train_print_usage(int argc, char ** argv, const struct train_params fprintf(stderr, " --rank-wk N LORA rank for wk tensor, overrides default rank.\n"); fprintf(stderr, " --rank-wv N LORA rank for wv tensor, overrides default rank.\n"); fprintf(stderr, " --rank-wo N LORA rank for wo tensor, overrides default rank.\n"); - fprintf(stderr, " --rank-w1 N LORA rank for w1 tensor, overrides default rank.\n"); - fprintf(stderr, " --rank-w2 N LORA rank for w2 tensor, overrides default rank.\n"); - fprintf(stderr, " --rank-w3 N LORA rank for w3 tensor, overrides default rank.\n"); + fprintf(stderr, " --rank-ffn_gate N LORA rank for ffn_gate tensor, overrides default rank.\n"); + fprintf(stderr, " --rank-ffn_down N LORA rank for ffn_down tensor, overrides default rank.\n"); + fprintf(stderr, " --rank-ffn_up N LORA rank for ffn_up tensor, overrides default rank.\n"); print_common_train_usage(argc, argv, ¶ms->common); } @@ -1438,27 +1369,27 @@ static bool train_params_parse(int argc, char ** argv, struct train_params * par } params->n_rank_wo = std::stoi(argv[i]); params->custom_n_rank_wo = true; - } else if (arg == "--rank-w1") { + } else if (arg == "--rank-ffn_gate") { if (++i >= argc) { invalid_param = true; break; } - params->n_rank_w1 = std::stoi(argv[i]); - params->custom_n_rank_w1 = true; - } else if (arg == "--rank-w2") { + params->n_rank_ffn_gate = std::stoi(argv[i]); + params->custom_n_rank_ffn_gate = true; + } else if (arg == "--rank-ffn_down") { if (++i >= argc) { invalid_param = true; break; } - params->n_rank_w2 = std::stoi(argv[i]); - params->custom_n_rank_w2 = true; - } else if (arg == "--rank-w3") { + params->n_rank_ffn_down = std::stoi(argv[i]); + params->custom_n_rank_ffn_down = true; + } else if (arg == "--rank-ffn_up") { if (++i >= argc) { invalid_param = true; break; } - params->n_rank_w3 = std::stoi(argv[i]); - params->custom_n_rank_w3 = true; + params->n_rank_ffn_up = std::stoi(argv[i]); + params->custom_n_rank_ffn_up = true; } else { fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); train_print_usage(argc, argv, &default_params); @@ -1521,12 +1452,12 @@ static int64_t get_parameter_count(struct my_llama_lora* lora) { nx += ggml_nelements(layer.wo_b); nx += ggml_nelements(layer.ffn_norm_a); nx += ggml_nelements(layer.ffn_norm_b); - nx += ggml_nelements(layer.w1_a); - nx += ggml_nelements(layer.w1_b); - nx += ggml_nelements(layer.w2_a); - nx += ggml_nelements(layer.w2_b); - nx += ggml_nelements(layer.w3_a); - nx += ggml_nelements(layer.w3_b); + nx += ggml_nelements(layer.ffn_gate_a); + nx += ggml_nelements(layer.ffn_gate_b); + nx += ggml_nelements(layer.ffn_down_a); + nx += ggml_nelements(layer.ffn_down_b); + nx += ggml_nelements(layer.ffn_up_a); + nx += ggml_nelements(layer.ffn_up_b); } return nx; } @@ -1545,6 +1476,7 @@ int main(int argc, char ** argv) { srand(params.common.seed); struct llama_model_params llama_mparams = llama_model_default_params(); + llama_mparams.n_gpu_layers = params.common.n_gpu_layers; llama_mparams.vocab_only = false; printf("%s: model base = '%s'\n", __func__, params.fn_model_base); @@ -1579,9 +1511,9 @@ int main(int argc, char ** argv) { uint32_t n_rank_wv = params.custom_n_rank_wv ? params.n_rank_wv : params.lora_r; uint32_t n_rank_wo = params.custom_n_rank_wo ? params.n_rank_wo : params.lora_r; uint32_t n_rank_ffn_norm = params.custom_n_rank_ffn_norm ? params.n_rank_ffn_norm : 1; - uint32_t n_rank_w1 = params.custom_n_rank_w1 ? params.n_rank_w1 : params.lora_r; - uint32_t n_rank_w2 = params.custom_n_rank_w2 ? params.n_rank_w2 : params.lora_r; - uint32_t n_rank_w3 = params.custom_n_rank_w3 ? params.n_rank_w3 : params.lora_r; + uint32_t n_rank_ffn_gate = params.custom_n_rank_ffn_gate ? params.n_rank_ffn_gate : params.lora_r; + uint32_t n_rank_ffn_down = params.custom_n_rank_ffn_down ? params.n_rank_ffn_down : params.lora_r; + uint32_t n_rank_ffn_up = params.custom_n_rank_ffn_up ? params.n_rank_ffn_up : params.lora_r; uint32_t n_rank_tok_embeddings = params.custom_n_rank_tok_embeddings ? params.n_rank_tok_embeddings : params.lora_r; uint32_t n_rank_norm = params.custom_n_rank_norm ? params.n_rank_norm : 1; uint32_t n_rank_output = params.custom_n_rank_output ? params.n_rank_output : params.lora_r; @@ -1591,17 +1523,18 @@ int main(int argc, char ** argv) { lora.hparams.n_rank_wv = n_rank_wv; lora.hparams.n_rank_wo = n_rank_wo; lora.hparams.n_rank_ffn_norm = n_rank_ffn_norm; - lora.hparams.n_rank_w1 = n_rank_w1; - lora.hparams.n_rank_w2 = n_rank_w2; - lora.hparams.n_rank_w3 = n_rank_w3; + lora.hparams.n_rank_ffn_gate = n_rank_ffn_gate; + lora.hparams.n_rank_ffn_down = n_rank_ffn_down; + lora.hparams.n_rank_ffn_up = n_rank_ffn_up; lora.hparams.n_rank_tok_embeddings = n_rank_tok_embeddings; lora.hparams.n_rank_norm = n_rank_norm; lora.hparams.n_rank_output = n_rank_output; // set opt params from command line - opt->params = ggml_opt_default_params(GGML_OPT_ADAM); + opt->params = ggml_opt_default_params(GGML_OPT_TYPE_ADAM); opt->params.print_forward_graph = false; opt->params.print_backward_graph = false; + opt->params.graph_size = LLAMA_TRAIN_MAX_NODES; opt->params.n_threads = params.common.n_threads; opt->params.past = params.common.opt_past; opt->params.delta = params.common.opt_delta; @@ -1617,8 +1550,6 @@ int main(int argc, char ** argv) { opt->params.adam.gclip = params.common.adam_gclip; opt->params.adam.eps_f = params.common.adam_eps_f; - ggml_allocr * alloc = NULL; - printf("%s: init model\n", __func__); bool existed = load_checkpoint_lora_file(params.common.fn_checkpoint_in, &model, &lora, train); @@ -1635,9 +1566,9 @@ int main(int argc, char ** argv) { || (lora.hparams.n_rank_wv != n_rank_wv) || (lora.hparams.n_rank_wo != n_rank_wo) || (lora.hparams.n_rank_ffn_norm != n_rank_ffn_norm) - || (lora.hparams.n_rank_w1 != n_rank_w1) - || (lora.hparams.n_rank_w2 != n_rank_w2) - || (lora.hparams.n_rank_w3 != n_rank_w3) + || (lora.hparams.n_rank_ffn_gate != n_rank_ffn_gate) + || (lora.hparams.n_rank_ffn_down != n_rank_ffn_down) + || (lora.hparams.n_rank_ffn_up != n_rank_ffn_up) || (lora.hparams.n_rank_tok_embeddings != n_rank_tok_embeddings) || (lora.hparams.n_rank_norm != n_rank_norm) || (lora.hparams.n_rank_output != n_rank_output) @@ -1671,7 +1602,7 @@ int main(int argc, char ** argv) { printf("%s: seen train_samples %llu\n", __func__, (long long unsigned) train->train_samples); printf("%s: seen train_tokens %llu\n", __func__, (long long unsigned) train->train_tokens); printf("%s: completed train_epochs %llu\n", __func__, (long long unsigned) train->train_epochs); - printf("%s: lora_size = %zu bytes (%.1f MB)\n", __func__, (ggml_used_mem(lora.ctx) + lora.data.size()), (float) (ggml_used_mem(lora.ctx) + lora.data.size()) / (1024.0f*1024.0f)); + printf("%s: lora_size = %zu bytes (%.1f MB)\n", __func__, (ggml_used_mem(lora.ctx) + ggml_backend_buffer_get_size(lora.data)), (float) (ggml_used_mem(lora.ctx) + ggml_backend_buffer_get_size(lora.data)) / (1024.0f*1024.0f)); if (params.only_write_lora) { save_train_files_data save_data; @@ -1698,10 +1629,6 @@ int main(int argc, char ** argv) { int n_vocab = model.hparams.n_vocab; int n_batch = params.common.n_batch; - - std::vector mem_input_data; - std::vector mem_compute_data; - // context for input tensors without their data struct ggml_init_params ctx_input_params = { ggml_tensor_overhead() * 2, // mem_size @@ -1714,25 +1641,16 @@ int main(int argc, char ** argv) { struct ggml_tensor * tokens_input = ggml_new_tensor_2d(ctx_input, GGML_TYPE_I32, n_tokens, n_batch); struct ggml_tensor * target_probs = ggml_new_tensor_3d(ctx_input, GGML_TYPE_F32, n_vocab, n_tokens, n_batch); + // allocate input tensors // measure required memory for input tensors - size_t max_input_size = GGML_PAD(ggml_nbytes(tokens_input), tensor_alignment) + - GGML_PAD(ggml_nbytes(target_probs), tensor_alignment) + - tensor_alignment; + ggml_backend_buffer_t input_data = ggml_backend_alloc_ctx_tensors_from_buft(ctx_input, ggml_backend_cpu_buffer_type()); + size_t max_input_size = ggml_backend_buffer_get_size(input_data); printf("%s: input_size = %zu bytes (%.1f MB)\n", __func__, max_input_size, (float) max_input_size / (1024.0f*1024.0f)); - // allocate input tensors - mem_input_data.resize(max_input_size); - alloc = ggml_allocr_new(mem_input_data.data(), mem_input_data.size(), tensor_alignment); - ggml_allocr_alloc(alloc, tokens_input); - ggml_allocr_alloc(alloc, target_probs); - ggml_allocr_free(alloc); - // context for compute tensors without their data - size_t estimated_compute_size_wo_data = ( - ggml_tensor_overhead()*GGML_MAX_NODES*2 - + (GGML_OBJECT_SIZE+GGML_GRAPH_SIZE)*( - params.common.use_checkpointing ? 3 : 2 - ) + const size_t estimated_compute_size_wo_data = ( + 2*LLAMA_TRAIN_MAX_NODES*ggml_tensor_overhead() + + (params.common.use_checkpointing ? 3 : 2)*(GGML_OBJECT_SIZE+ggml_graph_overhead_custom(LLAMA_TRAIN_MAX_NODES, true)) ); struct ggml_init_params ctx_compute_params = { estimated_compute_size_wo_data, // mem_size @@ -1754,12 +1672,12 @@ int main(int argc, char ** argv) { // find best evaluation order for (unsigned order = 0; order < (unsigned) GGML_CGRAPH_EVAL_ORDER_COUNT; ++order) { ctx_compute = ggml_init(ctx_compute_params); - alloc = ggml_allocr_new_measure(tensor_alignment); - gf = ggml_new_graph(ctx_compute); + ggml_gallocr_t alloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type()); + gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true); gf->order = (enum ggml_cgraph_eval_order) order; - gb = ggml_new_graph(ctx_compute); + gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true); gb_tmp = params.common.use_checkpointing - ? ggml_new_graph(ctx_compute) + ? ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true) : NULL; loss = llama_build_lora_finetune_graphs( &model, &lora, alloc, ctx_compute, @@ -1767,14 +1685,15 @@ int main(int argc, char ** argv) { &logits, tokens_input, target_probs, n_tokens, n_batch, params.common.use_flash, - params.common.use_checkpointing + params.common.use_checkpointing, + true ); - size_t max_compute_size = ggml_allocr_max_size(alloc) + tensor_alignment; + size_t max_compute_size = ggml_gallocr_get_buffer_size(alloc, 0); // FIXME: this will still allocate the buffer if (max_compute_size < best_compute_size) { best_compute_size = max_compute_size; best_order = gf->order; } - ggml_allocr_free(alloc); + ggml_gallocr_free(alloc); ggml_free(ctx_compute); } size_t max_compute_size = best_compute_size; @@ -1785,14 +1704,13 @@ int main(int argc, char ** argv) { "invalid"); // allocate compute tensors - mem_compute_data.resize(max_compute_size); ctx_compute = ggml_init(ctx_compute_params); - alloc = ggml_allocr_new(mem_compute_data.data(), mem_compute_data.size(), tensor_alignment); - gf = ggml_new_graph(ctx_compute); + ggml_gallocr_t alloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type()); + gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true); gf->order = best_order; - gb = ggml_new_graph(ctx_compute); + gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true); gb_tmp = params.common.use_checkpointing - ? ggml_new_graph(ctx_compute) + ? ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true) : NULL; loss = llama_build_lora_finetune_graphs( &model, &lora, alloc, ctx_compute, @@ -1800,15 +1718,17 @@ int main(int argc, char ** argv) { &logits, tokens_input, target_probs, n_tokens, n_batch, params.common.use_flash, - params.common.use_checkpointing + params.common.use_checkpointing, + false ); - ggml_allocr_free(alloc); // tokenize data std::vector train_tokens; std::vector train_samples_begin; std::vector train_samples_size; - printf("%s: tokenize training data\n", __func__); + printf("%s: tokenize training data from %s\n", __func__, params.common.fn_train_data); + printf("%s: sample-start: %s\n", __func__, params.common.sample_start.c_str()); + printf("%s: include-sample-start: %s\n", __func__, params.common.include_sample_start ? "true" : "false"); tokenize_file(lctx, params.common.fn_train_data, params.common.sample_start, @@ -1915,6 +1835,8 @@ int main(int argc, char ** argv) { ggml_free(ctx_work); ggml_free(ctx_compute); ggml_free(ctx_input); + ggml_gallocr_free(alloc); + int64_t t1 = ggml_time_ms(); printf("%s: total training time: ", __func__); diff --git a/examples/finetune/finetune.sh b/examples/finetune/finetune.sh new file mode 100644 index 000000000..079bfa113 --- /dev/null +++ b/examples/finetune/finetune.sh @@ -0,0 +1,34 @@ +#!/bin/bash +cd `dirname $0` +cd ../.. + +EXE="./finetune" + +if [[ ! $LLAMA_MODEL_DIR ]]; then LLAMA_MODEL_DIR="./models"; fi +if [[ ! $LLAMA_TRAINING_DIR ]]; then LLAMA_TRAINING_DIR="."; fi + +# MODEL="$LLAMA_MODEL_DIR/openllama-3b-v2-q8_0.gguf" # This is the model the readme uses. +MODEL="$LLAMA_MODEL_DIR/openllama-3b-v2.gguf" # An f16 model. Note in this case with "-g", you get an f32-format .BIN file that isn't yet supported if you use it with "main --lora" with GPU inferencing. + +while getopts "dg" opt; do + case $opt in + d) + DEBUGGER="gdb --args" + ;; + g) + EXE="./build/bin/Release/finetune" + GPUARG="--gpu-layers 25" + ;; + esac +done + +$DEBUGGER $EXE \ + --model-base $MODEL \ + $GPUARG \ + --checkpoint-in chk-ol3b-shakespeare-LATEST.gguf \ + --checkpoint-out chk-ol3b-shakespeare-ITERATION.gguf \ + --lora-out lora-ol3b-shakespeare-ITERATION.bin \ + --train-data "$LLAMA_TRAINING_DIR\shakespeare.txt" \ + --save-every 10 \ + --threads 10 --adam-iter 30 --batch 4 --ctx 64 \ + --use-checkpointing diff --git a/examples/gguf/CMakeLists.txt b/examples/gguf/CMakeLists.txt index 7d1806af3..6481f087b 100644 --- a/examples/gguf/CMakeLists.txt +++ b/examples/gguf/CMakeLists.txt @@ -1,5 +1,5 @@ set(TARGET gguf) add_executable(${TARGET} gguf.cpp) install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/gguf/gguf.cpp b/examples/gguf/gguf.cpp index 9ab63a293..e67be4fb2 100644 --- a/examples/gguf/gguf.cpp +++ b/examples/gguf/gguf.cpp @@ -1,5 +1,4 @@ #include "ggml.h" -#include "llama.h" #include #include @@ -195,7 +194,7 @@ static bool gguf_ex_read_1(const std::string & fname) { struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name); - printf("%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n", __func__, i, cur->n_dims, cur->name, cur->data); + printf("%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n", __func__, i, ggml_n_dims(cur), cur->name, cur->data); // print first 10 elements const float * data = (const float *) cur->data; diff --git a/examples/gptneox-wip/cmpnct_gpt2bpe.hpp b/examples/gptneox-wip/cmpnct_gpt2bpe.hpp deleted file mode 100644 index 9d433f4b1..000000000 --- a/examples/gptneox-wip/cmpnct_gpt2bpe.hpp +++ /dev/null @@ -1,1133 +0,0 @@ -#ifndef CMPNCT_GPT2BPE -#define CMPNCT_GPT2BPE - -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -// Unicode GPT2 Byte Pair Encoding Tokenizer -// Adapted from https://github.com/cmp-nct/ggllm.cpp [MIT License] -// Removed loading of merges from HF json and parts made for a specific vocab - - -//----------------- -// Unicode library (from cmpnct_unicode.cpp) -//----------------- - -// Minimal library for high performance handling and categorization of UTF8 strings and characters -// Using std::string - -enum CNCTCharType { - DIGIT, // a numerical char in any language - LETTER, // a letter in any language - WHITESPACE, // any form of whitespace - ACCENT_MARK, // letter modifiers like ´ in é - PUNCTUATION, // punctuation including brackets - SYMBOL, // math, currency, other symbols - CONTROL, // control characters - MIXED, // a mix of the above - UNIDENTIFIED // something more exotic like emoji or separators -}; - -struct CNCTUnicode; - -struct CNCTString { - std::string str; - size_t utf8_chars; - - CNCTCharType char_type=UNIDENTIFIED; - bool is_sequential=false; - - size_t seq_offset_bytes=0; - size_t seq_offset_utf8_chars=0; - - bool operator==(const std::string &other) const; - bool operator==(const char other) const; - bool operator==(const CNCTString &other) const; - CNCTString &operator+=(const std::string &other); - CNCTString &operator+=(const char other); - friend CNCTString operator+(CNCTString lhs, const std::string &rhs); - friend CNCTString operator+(CNCTString lhs, const char rhs); - CNCTString& operator+=(const CNCTString& other); - friend CNCTString operator+(CNCTString lhs, const CNCTString& rhs); -}; - -struct CNCTUnicode { - static bool check_code_range(int c, const std::vector>& ranges); - static CNCTCharType get_code_type(int c); - static CNCTCharType get_code_type(const std::string &utf8_char); - static int utf8_len(const char c); - static int strlen_utf8(std::string src); - static std::vector split_utf8(const std::string &src); - static std::vector split_utf8_enhanced(const std::string &src); - static CNCTCharType string_identify(const std::string& str); - static bool string_test(const std::string& str, CNCTCharType chartype); -}; - -static const std::vector> digit_ranges = { -{0x30, 0x39}, {0xB2, 0xB3}, {0xB9, 0xB9}, {0x660, 0x669}, {0x6F0, 0x6F9}, {0x7C0, 0x7C9}, {0x966, 0x96F}, {0x9E6, 0x9EF}, {0xA66, 0xA6F}, {0xAE6, 0xAEF}, {0xB66, 0xB6F}, {0xBE6, 0xBEF}, {0xC66, 0xC6F}, -{0xCE6, 0xCEF}, {0xD66, 0xD6F}, {0xDE6, 0xDEF}, {0xE50, 0xE59}, {0xED0, 0xED9}, {0xF20, 0xF29}, {0x1040, 0x1049}, {0x1090, 0x1099}, {0x1369, 0x1371}, {0x17E0, 0x17E9}, {0x1810, 0x1819}, {0x1946, 0x194F}, -{0x19D0, 0x19DA}, {0x1A80, 0x1A89}, {0x1A90, 0x1A99}, {0x1B50, 0x1B59}, {0x1BB0, 0x1BB9}, {0x1C40, 0x1C49}, {0x1C50, 0x1C59}, {0x2070, 0x2070}, {0x2074, 0x2079}, {0x2080, 0x2089}, {0x2460, 0x2468}, -{0x2474, 0x247C}, {0x2488, 0x2490}, {0x24EA, 0x24EA}, {0x24F5, 0x24FD}, {0x24FF, 0x24FF}, {0x2776, 0x277E}, {0x2780, 0x2788}, {0x278A, 0x2792}, {0xA620, 0xA629}, {0xA8D0, 0xA8D9}, {0xA900, 0xA909}, -{0xA9D0, 0xA9D9}, {0xA9F0, 0xA9F9}, {0xAA50, 0xAA59}, {0xABF0, 0xABF9}, {0xFF10, 0xFF19}, {0x104A0, 0x104A9}, {0x10A40, 0x10A43}, {0x10D30, 0x10D39}, {0x10E60, 0x10E68}, {0x11052, 0x1105A}, -{0x11066, 0x1106F}, {0x110F0, 0x110F9}, {0x11136, 0x1113F}, {0x111D0, 0x111D9}, {0x112F0, 0x112F9}, {0x11450, 0x11459}, {0x114D0, 0x114D9}, {0x11650, 0x11659}, {0x116C0, 0x116C9}, {0x11730, 0x11739}, -{0x118E0, 0x118E9}, {0x11950, 0x11959}, {0x11C50, 0x11C59}, {0x11D50, 0x11D59}, {0x11DA0, 0x11DA9}, {0x16A60, 0x16A69}, {0x16B50, 0x16B59}, {0x1D7CE, 0x1D7FF}, {0x1E140, 0x1E149}, {0x1E2F0, 0x1E2F9}, -{0x1E950, 0x1E959}, {0x1F100, 0x1F10A}, {0x1FBF0, 0x1FBF9}, -}; - -static const std::vector> letter_ranges = { -{0x41, 0x5A}, {0x61, 0x7A}, {0xAA, 0xAA}, {0xB5, 0xB5}, {0xBA, 0xBA}, {0xC0, 0xD6}, {0xD8, 0xF6}, {0xF8, 0x2C1}, {0x2C6, 0x2D1}, {0x2E0, 0x2E4}, {0x2EC, 0x2EC}, {0x2EE, 0x2EE}, {0x370, 0x374}, -{0x376, 0x377}, {0x37A, 0x37D}, {0x37F, 0x37F}, {0x386, 0x386}, {0x388, 0x38A}, {0x38C, 0x38C}, {0x38E, 0x3A1}, {0x3A3, 0x3F5}, {0x3F7, 0x481}, {0x48A, 0x52F}, {0x531, 0x556}, {0x559, 0x559}, -{0x560, 0x588}, {0x5D0, 0x5EA}, {0x5EF, 0x5F2}, {0x620, 0x64A}, {0x66E, 0x66F}, {0x671, 0x6D3}, {0x6D5, 0x6D5}, {0x6E5, 0x6E6}, {0x6EE, 0x6EF}, {0x6FA, 0x6FC}, {0x6FF, 0x6FF}, {0x710, 0x710}, -{0x712, 0x72F}, {0x74D, 0x7A5}, {0x7B1, 0x7B1}, {0x7CA, 0x7EA}, {0x7F4, 0x7F5}, {0x7FA, 0x7FA}, {0x800, 0x815}, {0x81A, 0x81A}, {0x824, 0x824}, {0x828, 0x828}, {0x840, 0x858}, {0x860, 0x86A}, -{0x8A0, 0x8B4}, {0x8B6, 0x8C7}, {0x904, 0x939}, {0x93D, 0x93D}, {0x950, 0x950}, {0x958, 0x961}, {0x971, 0x980}, {0x985, 0x98C}, {0x98F, 0x990}, {0x993, 0x9A8}, {0x9AA, 0x9B0}, {0x9B2, 0x9B2}, -{0x9B6, 0x9B9}, {0x9BD, 0x9BD}, {0x9CE, 0x9CE}, {0x9DC, 0x9DD}, {0x9DF, 0x9E1}, {0x9F0, 0x9F1}, {0x9FC, 0x9FC}, {0xA05, 0xA0A}, {0xA0F, 0xA10}, {0xA13, 0xA28}, {0xA2A, 0xA30}, {0xA32, 0xA33}, -{0xA35, 0xA36}, {0xA38, 0xA39}, {0xA59, 0xA5C}, {0xA5E, 0xA5E}, {0xA72, 0xA74}, {0xA85, 0xA8D}, {0xA8F, 0xA91}, {0xA93, 0xAA8}, {0xAAA, 0xAB0}, {0xAB2, 0xAB3}, {0xAB5, 0xAB9}, {0xABD, 0xABD}, -{0xAD0, 0xAD0}, {0xAE0, 0xAE1}, {0xAF9, 0xAF9}, {0xB05, 0xB0C}, {0xB0F, 0xB10}, {0xB13, 0xB28}, {0xB2A, 0xB30}, {0xB32, 0xB33}, {0xB35, 0xB39}, {0xB3D, 0xB3D}, {0xB5C, 0xB5D}, {0xB5F, 0xB61}, -{0xB71, 0xB71}, {0xB83, 0xB83}, {0xB85, 0xB8A}, {0xB8E, 0xB90}, {0xB92, 0xB95}, {0xB99, 0xB9A}, {0xB9C, 0xB9C}, {0xB9E, 0xB9F}, {0xBA3, 0xBA4}, {0xBA8, 0xBAA}, {0xBAE, 0xBB9}, {0xBD0, 0xBD0}, -{0xC05, 0xC0C}, {0xC0E, 0xC10}, {0xC12, 0xC28}, {0xC2A, 0xC39}, {0xC3D, 0xC3D}, {0xC58, 0xC5A}, {0xC60, 0xC61}, {0xC80, 0xC80}, {0xC85, 0xC8C}, {0xC8E, 0xC90}, {0xC92, 0xCA8}, {0xCAA, 0xCB3}, -{0xCB5, 0xCB9}, {0xCBD, 0xCBD}, {0xCDE, 0xCDE}, {0xCE0, 0xCE1}, {0xCF1, 0xCF2}, {0xD04, 0xD0C}, {0xD0E, 0xD10}, {0xD12, 0xD3A}, {0xD3D, 0xD3D}, {0xD4E, 0xD4E}, {0xD54, 0xD56}, {0xD5F, 0xD61}, -{0xD7A, 0xD7F}, {0xD85, 0xD96}, {0xD9A, 0xDB1}, {0xDB3, 0xDBB}, {0xDBD, 0xDBD}, {0xDC0, 0xDC6}, {0xE01, 0xE30}, {0xE32, 0xE33}, {0xE40, 0xE46}, {0xE81, 0xE82}, {0xE84, 0xE84}, {0xE86, 0xE8A}, -{0xE8C, 0xEA3}, {0xEA5, 0xEA5}, {0xEA7, 0xEB0}, {0xEB2, 0xEB3}, {0xEBD, 0xEBD}, {0xEC0, 0xEC4}, {0xEC6, 0xEC6}, {0xEDC, 0xEDF}, {0xF00, 0xF00}, {0xF40, 0xF47}, {0xF49, 0xF6C}, {0xF88, 0xF8C}, -{0x1000, 0x102A}, {0x103F, 0x103F}, {0x1050, 0x1055}, {0x105A, 0x105D}, {0x1061, 0x1061}, {0x1065, 0x1066}, {0x106E, 0x1070}, {0x1075, 0x1081}, {0x108E, 0x108E}, {0x10A0, 0x10C5}, {0x10C7, 0x10C7}, -{0x10CD, 0x10CD}, {0x10D0, 0x10FA}, {0x10FC, 0x1248}, {0x124A, 0x124D}, {0x1250, 0x1256}, {0x1258, 0x1258}, {0x125A, 0x125D}, {0x1260, 0x1288}, {0x128A, 0x128D}, {0x1290, 0x12B0}, {0x12B2, 0x12B5}, -{0x12B8, 0x12BE}, {0x12C0, 0x12C0}, {0x12C2, 0x12C5}, {0x12C8, 0x12D6}, {0x12D8, 0x1310}, {0x1312, 0x1315}, {0x1318, 0x135A}, {0x1380, 0x138F}, {0x13A0, 0x13F5}, {0x13F8, 0x13FD}, {0x1401, 0x166C}, -{0x166F, 0x167F}, {0x1681, 0x169A}, {0x16A0, 0x16EA}, {0x16F1, 0x16F8}, {0x1700, 0x170C}, {0x170E, 0x1711}, {0x1720, 0x1731}, {0x1740, 0x1751}, {0x1760, 0x176C}, {0x176E, 0x1770}, {0x1780, 0x17B3}, -{0x17D7, 0x17D7}, {0x17DC, 0x17DC}, {0x1820, 0x1878}, {0x1880, 0x1884}, {0x1887, 0x18A8}, {0x18AA, 0x18AA}, {0x18B0, 0x18F5}, {0x1900, 0x191E}, {0x1950, 0x196D}, {0x1970, 0x1974}, {0x1980, 0x19AB}, -{0x19B0, 0x19C9}, {0x1A00, 0x1A16}, {0x1A20, 0x1A54}, {0x1AA7, 0x1AA7}, {0x1B05, 0x1B33}, {0x1B45, 0x1B4B}, {0x1B83, 0x1BA0}, {0x1BAE, 0x1BAF}, {0x1BBA, 0x1BE5}, {0x1C00, 0x1C23}, {0x1C4D, 0x1C4F}, -{0x1C5A, 0x1C7D}, {0x1C80, 0x1C88}, {0x1C90, 0x1CBA}, {0x1CBD, 0x1CBF}, {0x1CE9, 0x1CEC}, {0x1CEE, 0x1CF3}, {0x1CF5, 0x1CF6}, {0x1CFA, 0x1CFA}, {0x1D00, 0x1DBF}, {0x1E00, 0x1F15}, {0x1F18, 0x1F1D}, -{0x1F20, 0x1F45}, {0x1F48, 0x1F4D}, {0x1F50, 0x1F57}, {0x1F59, 0x1F59}, {0x1F5B, 0x1F5B}, {0x1F5D, 0x1F5D}, {0x1F5F, 0x1F7D}, {0x1F80, 0x1FB4}, {0x1FB6, 0x1FBC}, {0x1FBE, 0x1FBE}, {0x1FC2, 0x1FC4}, -{0x1FC6, 0x1FCC}, {0x1FD0, 0x1FD3}, {0x1FD6, 0x1FDB}, {0x1FE0, 0x1FEC}, {0x1FF2, 0x1FF4}, {0x1FF6, 0x1FFC}, {0x2071, 0x2071}, {0x207F, 0x207F}, {0x2090, 0x209C}, {0x2102, 0x2102}, {0x2107, 0x2107}, -{0x210A, 0x2113}, {0x2115, 0x2115}, {0x2119, 0x211D}, {0x2124, 0x2124}, {0x2126, 0x2126}, {0x2128, 0x2128}, {0x212A, 0x212D}, {0x212F, 0x2139}, {0x213C, 0x213F}, {0x2145, 0x2149}, {0x214E, 0x214E}, -{0x2183, 0x2184}, {0x2C00, 0x2C2E}, {0x2C30, 0x2C5E}, {0x2C60, 0x2CE4}, {0x2CEB, 0x2CEE}, {0x2CF2, 0x2CF3}, {0x2D00, 0x2D25}, {0x2D27, 0x2D27}, {0x2D2D, 0x2D2D}, {0x2D30, 0x2D67}, {0x2D6F, 0x2D6F}, -{0x2D80, 0x2D96}, {0x2DA0, 0x2DA6}, {0x2DA8, 0x2DAE}, {0x2DB0, 0x2DB6}, {0x2DB8, 0x2DBE}, {0x2DC0, 0x2DC6}, {0x2DC8, 0x2DCE}, {0x2DD0, 0x2DD6}, {0x2DD8, 0x2DDE}, {0x2E2F, 0x2E2F}, {0x3005, 0x3006}, -{0x3031, 0x3035}, {0x303B, 0x303C}, {0x3041, 0x3096}, {0x309D, 0x309F}, {0x30A1, 0x30FA}, {0x30FC, 0x30FF}, {0x3105, 0x312F}, {0x3131, 0x318E}, {0x31A0, 0x31BF}, {0x31F0, 0x31FF}, {0x3400, 0x4DBF}, -{0x4E00, 0x9FFC}, {0xA000, 0xA48C}, {0xA4D0, 0xA4FD}, {0xA500, 0xA60C}, {0xA610, 0xA61F}, {0xA62A, 0xA62B}, {0xA640, 0xA66E}, {0xA67F, 0xA69D}, {0xA6A0, 0xA6E5}, {0xA717, 0xA71F}, {0xA722, 0xA788}, -{0xA78B, 0xA7BF}, {0xA7C2, 0xA7CA}, {0xA7F5, 0xA801}, {0xA803, 0xA805}, {0xA807, 0xA80A}, {0xA80C, 0xA822}, {0xA840, 0xA873}, {0xA882, 0xA8B3}, {0xA8F2, 0xA8F7}, {0xA8FB, 0xA8FB}, {0xA8FD, 0xA8FE}, -{0xA90A, 0xA925}, {0xA930, 0xA946}, {0xA960, 0xA97C}, {0xA984, 0xA9B2}, {0xA9CF, 0xA9CF}, {0xA9E0, 0xA9E4}, {0xA9E6, 0xA9EF}, {0xA9FA, 0xA9FE}, {0xAA00, 0xAA28}, {0xAA40, 0xAA42}, {0xAA44, 0xAA4B}, -{0xAA60, 0xAA76}, {0xAA7A, 0xAA7A}, {0xAA7E, 0xAAAF}, {0xAAB1, 0xAAB1}, {0xAAB5, 0xAAB6}, {0xAAB9, 0xAABD}, {0xAAC0, 0xAAC0}, {0xAAC2, 0xAAC2}, {0xAADB, 0xAADD}, {0xAAE0, 0xAAEA}, {0xAAF2, 0xAAF4}, -{0xAB01, 0xAB06}, {0xAB09, 0xAB0E}, {0xAB11, 0xAB16}, {0xAB20, 0xAB26}, {0xAB28, 0xAB2E}, {0xAB30, 0xAB5A}, {0xAB5C, 0xAB69}, {0xAB70, 0xABE2}, {0xAC00, 0xD7A3}, {0xD7B0, 0xD7C6}, {0xD7CB, 0xD7FB}, -{0xF900, 0xFA6D}, {0xFA70, 0xFAD9}, {0xFB00, 0xFB06}, {0xFB13, 0xFB17}, {0xFB1D, 0xFB1D}, {0xFB1F, 0xFB28}, {0xFB2A, 0xFB36}, {0xFB38, 0xFB3C}, {0xFB3E, 0xFB3E}, {0xFB40, 0xFB41}, {0xFB43, 0xFB44}, -{0xFB46, 0xFBB1}, {0xFBD3, 0xFD3D}, {0xFD50, 0xFD8F}, {0xFD92, 0xFDC7}, {0xFDF0, 0xFDFB}, {0xFE70, 0xFE74}, {0xFE76, 0xFEFC}, {0xFF21, 0xFF3A}, {0xFF41, 0xFF5A}, {0xFF66, 0xFFBE}, {0xFFC2, 0xFFC7}, -{0xFFCA, 0xFFCF}, {0xFFD2, 0xFFD7}, {0xFFDA, 0xFFDC}, {0x10000, 0x1000B}, {0x1000D, 0x10026}, {0x10028, 0x1003A}, {0x1003C, 0x1003D}, {0x1003F, 0x1004D}, {0x10050, 0x1005D}, {0x10080, 0x100FA}, -{0x10280, 0x1029C}, {0x102A0, 0x102D0}, {0x10300, 0x1031F}, {0x1032D, 0x10340}, {0x10342, 0x10349}, {0x10350, 0x10375}, {0x10380, 0x1039D}, {0x103A0, 0x103C3}, {0x103C8, 0x103CF}, {0x10400, 0x1049D}, -{0x104B0, 0x104D3}, {0x104D8, 0x104FB}, {0x10500, 0x10527}, {0x10530, 0x10563}, {0x10600, 0x10736}, {0x10740, 0x10755}, {0x10760, 0x10767}, {0x10800, 0x10805}, {0x10808, 0x10808}, {0x1080A, 0x10835}, -{0x10837, 0x10838}, {0x1083C, 0x1083C}, {0x1083F, 0x10855}, {0x10860, 0x10876}, {0x10880, 0x1089E}, {0x108E0, 0x108F2}, {0x108F4, 0x108F5}, {0x10900, 0x10915}, {0x10920, 0x10939}, {0x10980, 0x109B7}, -{0x109BE, 0x109BF}, {0x10A00, 0x10A00}, {0x10A10, 0x10A13}, {0x10A15, 0x10A17}, {0x10A19, 0x10A35}, {0x10A60, 0x10A7C}, {0x10A80, 0x10A9C}, {0x10AC0, 0x10AC7}, {0x10AC9, 0x10AE4}, {0x10B00, 0x10B35}, -{0x10B40, 0x10B55}, {0x10B60, 0x10B72}, {0x10B80, 0x10B91}, {0x10C00, 0x10C48}, {0x10C80, 0x10CB2}, {0x10CC0, 0x10CF2}, {0x10D00, 0x10D23}, {0x10E80, 0x10EA9}, {0x10EB0, 0x10EB1}, {0x10F00, 0x10F1C}, -{0x10F27, 0x10F27}, {0x10F30, 0x10F45}, {0x10FB0, 0x10FC4}, {0x10FE0, 0x10FF6}, {0x11003, 0x11037}, {0x11083, 0x110AF}, {0x110D0, 0x110E8}, {0x11103, 0x11126}, {0x11144, 0x11144}, {0x11147, 0x11147}, -{0x11150, 0x11172}, {0x11176, 0x11176}, {0x11183, 0x111B2}, {0x111C1, 0x111C4}, {0x111DA, 0x111DA}, {0x111DC, 0x111DC}, {0x11200, 0x11211}, {0x11213, 0x1122B}, {0x11280, 0x11286}, {0x11288, 0x11288}, -{0x1128A, 0x1128D}, {0x1128F, 0x1129D}, {0x1129F, 0x112A8}, {0x112B0, 0x112DE}, {0x11305, 0x1130C}, {0x1130F, 0x11310}, {0x11313, 0x11328}, {0x1132A, 0x11330}, {0x11332, 0x11333}, {0x11335, 0x11339}, -{0x1133D, 0x1133D}, {0x11350, 0x11350}, {0x1135D, 0x11361}, {0x11400, 0x11434}, {0x11447, 0x1144A}, {0x1145F, 0x11461}, {0x11480, 0x114AF}, {0x114C4, 0x114C5}, {0x114C7, 0x114C7}, {0x11580, 0x115AE}, -{0x115D8, 0x115DB}, {0x11600, 0x1162F}, {0x11644, 0x11644}, {0x11680, 0x116AA}, {0x116B8, 0x116B8}, {0x11700, 0x1171A}, {0x11800, 0x1182B}, {0x118A0, 0x118DF}, {0x118FF, 0x11906}, {0x11909, 0x11909}, -{0x1190C, 0x11913}, {0x11915, 0x11916}, {0x11918, 0x1192F}, {0x1193F, 0x1193F}, {0x11941, 0x11941}, {0x119A0, 0x119A7}, {0x119AA, 0x119D0}, {0x119E1, 0x119E1}, {0x119E3, 0x119E3}, {0x11A00, 0x11A00}, -{0x11A0B, 0x11A32}, {0x11A3A, 0x11A3A}, {0x11A50, 0x11A50}, {0x11A5C, 0x11A89}, {0x11A9D, 0x11A9D}, {0x11AC0, 0x11AF8}, {0x11C00, 0x11C08}, {0x11C0A, 0x11C2E}, {0x11C40, 0x11C40}, {0x11C72, 0x11C8F}, -{0x11D00, 0x11D06}, {0x11D08, 0x11D09}, {0x11D0B, 0x11D30}, {0x11D46, 0x11D46}, {0x11D60, 0x11D65}, {0x11D67, 0x11D68}, {0x11D6A, 0x11D89}, {0x11D98, 0x11D98}, {0x11EE0, 0x11EF2}, {0x11FB0, 0x11FB0}, -{0x12000, 0x12399}, {0x12480, 0x12543}, {0x13000, 0x1342E}, {0x14400, 0x14646}, {0x16800, 0x16A38}, {0x16A40, 0x16A5E}, {0x16AD0, 0x16AED}, {0x16B00, 0x16B2F}, {0x16B40, 0x16B43}, {0x16B63, 0x16B77}, -{0x16B7D, 0x16B8F}, {0x16E40, 0x16E7F}, {0x16F00, 0x16F4A}, {0x16F50, 0x16F50}, {0x16F93, 0x16F9F}, {0x16FE0, 0x16FE1}, {0x16FE3, 0x16FE3}, {0x17000, 0x187F7}, {0x18800, 0x18CD5}, {0x18D00, 0x18D08}, -{0x1B000, 0x1B11E}, {0x1B150, 0x1B152}, {0x1B164, 0x1B167}, {0x1B170, 0x1B2FB}, {0x1BC00, 0x1BC6A}, {0x1BC70, 0x1BC7C}, {0x1BC80, 0x1BC88}, {0x1BC90, 0x1BC99}, {0x1D400, 0x1D454}, {0x1D456, 0x1D49C}, -{0x1D49E, 0x1D49F}, {0x1D4A2, 0x1D4A2}, {0x1D4A5, 0x1D4A6}, {0x1D4A9, 0x1D4AC}, {0x1D4AE, 0x1D4B9}, {0x1D4BB, 0x1D4BB}, {0x1D4BD, 0x1D4C3}, {0x1D4C5, 0x1D505}, {0x1D507, 0x1D50A}, {0x1D50D, 0x1D514}, -{0x1D516, 0x1D51C}, {0x1D51E, 0x1D539}, {0x1D53B, 0x1D53E}, {0x1D540, 0x1D544}, {0x1D546, 0x1D546}, {0x1D54A, 0x1D550}, {0x1D552, 0x1D6A5}, {0x1D6A8, 0x1D6C0}, {0x1D6C2, 0x1D6DA}, {0x1D6DC, 0x1D6FA}, -{0x1D6FC, 0x1D714}, {0x1D716, 0x1D734}, {0x1D736, 0x1D74E}, {0x1D750, 0x1D76E}, {0x1D770, 0x1D788}, {0x1D78A, 0x1D7A8}, {0x1D7AA, 0x1D7C2}, {0x1D7C4, 0x1D7CB}, {0x1E100, 0x1E12C}, {0x1E137, 0x1E13D}, -{0x1E14E, 0x1E14E}, {0x1E2C0, 0x1E2EB}, {0x1E800, 0x1E8C4}, {0x1E900, 0x1E943}, {0x1E94B, 0x1E94B}, {0x1EE00, 0x1EE03}, {0x1EE05, 0x1EE1F}, {0x1EE21, 0x1EE22}, {0x1EE24, 0x1EE24}, {0x1EE27, 0x1EE27}, -{0x1EE29, 0x1EE32}, {0x1EE34, 0x1EE37}, {0x1EE39, 0x1EE39}, {0x1EE3B, 0x1EE3B}, {0x1EE42, 0x1EE42}, {0x1EE47, 0x1EE47}, {0x1EE49, 0x1EE49}, {0x1EE4B, 0x1EE4B}, {0x1EE4D, 0x1EE4F}, {0x1EE51, 0x1EE52}, -{0x1EE54, 0x1EE54}, {0x1EE57, 0x1EE57}, {0x1EE59, 0x1EE59}, {0x1EE5B, 0x1EE5B}, {0x1EE5D, 0x1EE5D}, {0x1EE5F, 0x1EE5F}, {0x1EE61, 0x1EE62}, {0x1EE64, 0x1EE64}, {0x1EE67, 0x1EE6A}, {0x1EE6C, 0x1EE72}, -{0x1EE74, 0x1EE77}, {0x1EE79, 0x1EE7C}, {0x1EE7E, 0x1EE7E}, {0x1EE80, 0x1EE89}, {0x1EE8B, 0x1EE9B}, {0x1EEA1, 0x1EEA3}, {0x1EEA5, 0x1EEA9}, {0x1EEAB, 0x1EEBB}, {0x20000, 0x2A6DD}, {0x2A700, 0x2B734}, -{0x2B740, 0x2B81D}, {0x2B820, 0x2CEA1}, {0x2CEB0, 0x2EBE0}, {0x2F800, 0x2FA1D}, {0x30000, 0x3134A}, -}; - -static const std::vector> whitespace_ranges = { -{0x9, 0xD}, {0x1C, 0x20}, {0x85, 0x85}, {0xA0, 0xA0}, {0x1680, 0x1680}, {0x2000, 0x200A}, {0x2028, 0x2029}, {0x202F, 0x202F}, {0x205F, 0x205F}, {0x3000, 0x3000}, -}; - -static const std::vector> accent_mark_ranges = { -{0x300, 0x36F}, {0x483, 0x489}, {0x591, 0x5BD}, {0x5BF, 0x5BF}, {0x5C1, 0x5C2}, {0x5C4, 0x5C5}, {0x5C7, 0x5C7}, {0x610, 0x61A}, {0x64B, 0x65F}, {0x670, 0x670}, {0x6D6, 0x6DC}, {0x6DF, 0x6E4}, -{0x6E7, 0x6E8}, {0x6EA, 0x6ED}, {0x711, 0x711}, {0x730, 0x74A}, {0x7A6, 0x7B0}, {0x7EB, 0x7F3}, {0x7FD, 0x7FD}, {0x816, 0x819}, {0x81B, 0x823}, {0x825, 0x827}, {0x829, 0x82D}, {0x859, 0x85B}, -{0x8D3, 0x8E1}, {0x8E3, 0x903}, {0x93A, 0x93C}, {0x93E, 0x94F}, {0x951, 0x957}, {0x962, 0x963}, {0x981, 0x983}, {0x9BC, 0x9BC}, {0x9BE, 0x9C4}, {0x9C7, 0x9C8}, {0x9CB, 0x9CD}, {0x9D7, 0x9D7}, -{0x9E2, 0x9E3}, {0x9FE, 0x9FE}, {0xA01, 0xA03}, {0xA3C, 0xA3C}, {0xA3E, 0xA42}, {0xA47, 0xA48}, {0xA4B, 0xA4D}, {0xA51, 0xA51}, {0xA70, 0xA71}, {0xA75, 0xA75}, {0xA81, 0xA83}, {0xABC, 0xABC}, -{0xABE, 0xAC5}, {0xAC7, 0xAC9}, {0xACB, 0xACD}, {0xAE2, 0xAE3}, {0xAFA, 0xAFF}, {0xB01, 0xB03}, {0xB3C, 0xB3C}, {0xB3E, 0xB44}, {0xB47, 0xB48}, {0xB4B, 0xB4D}, {0xB55, 0xB57}, {0xB62, 0xB63}, -{0xB82, 0xB82}, {0xBBE, 0xBC2}, {0xBC6, 0xBC8}, {0xBCA, 0xBCD}, {0xBD7, 0xBD7}, {0xC00, 0xC04}, {0xC3E, 0xC44}, {0xC46, 0xC48}, {0xC4A, 0xC4D}, {0xC55, 0xC56}, {0xC62, 0xC63}, {0xC81, 0xC83}, -{0xCBC, 0xCBC}, {0xCBE, 0xCC4}, {0xCC6, 0xCC8}, {0xCCA, 0xCCD}, {0xCD5, 0xCD6}, {0xCE2, 0xCE3}, {0xD00, 0xD03}, {0xD3B, 0xD3C}, {0xD3E, 0xD44}, {0xD46, 0xD48}, {0xD4A, 0xD4D}, {0xD57, 0xD57}, -{0xD62, 0xD63}, {0xD81, 0xD83}, {0xDCA, 0xDCA}, {0xDCF, 0xDD4}, {0xDD6, 0xDD6}, {0xDD8, 0xDDF}, {0xDF2, 0xDF3}, {0xE31, 0xE31}, {0xE34, 0xE3A}, {0xE47, 0xE4E}, {0xEB1, 0xEB1}, {0xEB4, 0xEBC}, -{0xEC8, 0xECD}, {0xF18, 0xF19}, {0xF35, 0xF35}, {0xF37, 0xF37}, {0xF39, 0xF39}, {0xF3E, 0xF3F}, {0xF71, 0xF84}, {0xF86, 0xF87}, {0xF8D, 0xF97}, {0xF99, 0xFBC}, {0xFC6, 0xFC6}, {0x102B, 0x103E}, -{0x1056, 0x1059}, {0x105E, 0x1060}, {0x1062, 0x1064}, {0x1067, 0x106D}, {0x1071, 0x1074}, {0x1082, 0x108D}, {0x108F, 0x108F}, {0x109A, 0x109D}, {0x135D, 0x135F}, {0x1712, 0x1714}, {0x1732, 0x1734}, -{0x1752, 0x1753}, {0x1772, 0x1773}, {0x17B4, 0x17D3}, {0x17DD, 0x17DD}, {0x180B, 0x180D}, {0x1885, 0x1886}, {0x18A9, 0x18A9}, {0x1920, 0x192B}, {0x1930, 0x193B}, {0x1A17, 0x1A1B}, {0x1A55, 0x1A5E}, -{0x1A60, 0x1A7C}, {0x1A7F, 0x1A7F}, {0x1AB0, 0x1AC0}, {0x1B00, 0x1B04}, {0x1B34, 0x1B44}, {0x1B6B, 0x1B73}, {0x1B80, 0x1B82}, {0x1BA1, 0x1BAD}, {0x1BE6, 0x1BF3}, {0x1C24, 0x1C37}, {0x1CD0, 0x1CD2}, -{0x1CD4, 0x1CE8}, {0x1CED, 0x1CED}, {0x1CF4, 0x1CF4}, {0x1CF7, 0x1CF9}, {0x1DC0, 0x1DF9}, {0x1DFB, 0x1DFF}, {0x20D0, 0x20F0}, {0x2CEF, 0x2CF1}, {0x2D7F, 0x2D7F}, {0x2DE0, 0x2DFF}, {0x302A, 0x302F}, -{0x3099, 0x309A}, {0xA66F, 0xA672}, {0xA674, 0xA67D}, {0xA69E, 0xA69F}, {0xA6F0, 0xA6F1}, {0xA802, 0xA802}, {0xA806, 0xA806}, {0xA80B, 0xA80B}, {0xA823, 0xA827}, {0xA82C, 0xA82C}, {0xA880, 0xA881}, -{0xA8B4, 0xA8C5}, {0xA8E0, 0xA8F1}, {0xA8FF, 0xA8FF}, {0xA926, 0xA92D}, {0xA947, 0xA953}, {0xA980, 0xA983}, {0xA9B3, 0xA9C0}, {0xA9E5, 0xA9E5}, {0xAA29, 0xAA36}, {0xAA43, 0xAA43}, {0xAA4C, 0xAA4D}, -{0xAA7B, 0xAA7D}, {0xAAB0, 0xAAB0}, {0xAAB2, 0xAAB4}, {0xAAB7, 0xAAB8}, {0xAABE, 0xAABF}, {0xAAC1, 0xAAC1}, {0xAAEB, 0xAAEF}, {0xAAF5, 0xAAF6}, {0xABE3, 0xABEA}, {0xABEC, 0xABED}, {0xFB1E, 0xFB1E}, -{0xFE00, 0xFE0F}, {0xFE20, 0xFE2F}, {0x101FD, 0x101FD}, {0x102E0, 0x102E0}, {0x10376, 0x1037A}, {0x10A01, 0x10A03}, {0x10A05, 0x10A06}, {0x10A0C, 0x10A0F}, {0x10A38, 0x10A3A}, {0x10A3F, 0x10A3F}, -{0x10AE5, 0x10AE6}, {0x10D24, 0x10D27}, {0x10EAB, 0x10EAC}, {0x10F46, 0x10F50}, {0x11000, 0x11002}, {0x11038, 0x11046}, {0x1107F, 0x11082}, {0x110B0, 0x110BA}, {0x11100, 0x11102}, {0x11127, 0x11134}, -{0x11145, 0x11146}, {0x11173, 0x11173}, {0x11180, 0x11182}, {0x111B3, 0x111C0}, {0x111C9, 0x111CC}, {0x111CE, 0x111CF}, {0x1122C, 0x11237}, {0x1123E, 0x1123E}, {0x112DF, 0x112EA}, {0x11300, 0x11303}, -{0x1133B, 0x1133C}, {0x1133E, 0x11344}, {0x11347, 0x11348}, {0x1134B, 0x1134D}, {0x11357, 0x11357}, {0x11362, 0x11363}, {0x11366, 0x1136C}, {0x11370, 0x11374}, {0x11435, 0x11446}, {0x1145E, 0x1145E}, -{0x114B0, 0x114C3}, {0x115AF, 0x115B5}, {0x115B8, 0x115C0}, {0x115DC, 0x115DD}, {0x11630, 0x11640}, {0x116AB, 0x116B7}, {0x1171D, 0x1172B}, {0x1182C, 0x1183A}, {0x11930, 0x11935}, {0x11937, 0x11938}, -{0x1193B, 0x1193E}, {0x11940, 0x11940}, {0x11942, 0x11943}, {0x119D1, 0x119D7}, {0x119DA, 0x119E0}, {0x119E4, 0x119E4}, {0x11A01, 0x11A0A}, {0x11A33, 0x11A39}, {0x11A3B, 0x11A3E}, {0x11A47, 0x11A47}, -{0x11A51, 0x11A5B}, {0x11A8A, 0x11A99}, {0x11C2F, 0x11C36}, {0x11C38, 0x11C3F}, {0x11C92, 0x11CA7}, {0x11CA9, 0x11CB6}, {0x11D31, 0x11D36}, {0x11D3A, 0x11D3A}, {0x11D3C, 0x11D3D}, {0x11D3F, 0x11D45}, -{0x11D47, 0x11D47}, {0x11D8A, 0x11D8E}, {0x11D90, 0x11D91}, {0x11D93, 0x11D97}, {0x11EF3, 0x11EF6}, {0x16AF0, 0x16AF4}, {0x16B30, 0x16B36}, {0x16F4F, 0x16F4F}, {0x16F51, 0x16F87}, {0x16F8F, 0x16F92}, -{0x16FE4, 0x16FE4}, {0x16FF0, 0x16FF1}, {0x1BC9D, 0x1BC9E}, {0x1D165, 0x1D169}, {0x1D16D, 0x1D172}, {0x1D17B, 0x1D182}, {0x1D185, 0x1D18B}, {0x1D1AA, 0x1D1AD}, {0x1D242, 0x1D244}, {0x1DA00, 0x1DA36}, -{0x1DA3B, 0x1DA6C}, {0x1DA75, 0x1DA75}, {0x1DA84, 0x1DA84}, {0x1DA9B, 0x1DA9F}, {0x1DAA1, 0x1DAAF}, {0x1E000, 0x1E006}, {0x1E008, 0x1E018}, {0x1E01B, 0x1E021}, {0x1E023, 0x1E024}, {0x1E026, 0x1E02A}, -{0x1E130, 0x1E136}, {0x1E2EC, 0x1E2EF}, {0x1E8D0, 0x1E8D6}, {0x1E944, 0x1E94A}, {0xE0100, 0xE01EF}, -}; - -static const std::vector> punctuation_ranges = { -{0x21, 0x23}, {0x25, 0x2A}, {0x2C, 0x2F}, {0x3A, 0x3B}, {0x3F, 0x40}, {0x5B, 0x5D}, {0x5F, 0x5F}, {0x7B, 0x7B}, {0x7D, 0x7D}, {0xA1, 0xA1}, {0xA7, 0xA7}, {0xAB, 0xAB}, {0xB6, 0xB7}, {0xBB, 0xBB}, -{0xBF, 0xBF}, {0x37E, 0x37E}, {0x387, 0x387}, {0x55A, 0x55F}, {0x589, 0x58A}, {0x5BE, 0x5BE}, {0x5C0, 0x5C0}, {0x5C3, 0x5C3}, {0x5C6, 0x5C6}, {0x5F3, 0x5F4}, {0x609, 0x60A}, {0x60C, 0x60D}, -{0x61B, 0x61B}, {0x61E, 0x61F}, {0x66A, 0x66D}, {0x6D4, 0x6D4}, {0x700, 0x70D}, {0x7F7, 0x7F9}, {0x830, 0x83E}, {0x85E, 0x85E}, {0x964, 0x965}, {0x970, 0x970}, {0x9FD, 0x9FD}, {0xA76, 0xA76}, -{0xAF0, 0xAF0}, {0xC77, 0xC77}, {0xC84, 0xC84}, {0xDF4, 0xDF4}, {0xE4F, 0xE4F}, {0xE5A, 0xE5B}, {0xF04, 0xF12}, {0xF14, 0xF14}, {0xF3A, 0xF3D}, {0xF85, 0xF85}, {0xFD0, 0xFD4}, {0xFD9, 0xFDA}, -{0x104A, 0x104F}, {0x10FB, 0x10FB}, {0x1360, 0x1368}, {0x1400, 0x1400}, {0x166E, 0x166E}, {0x169B, 0x169C}, {0x16EB, 0x16ED}, {0x1735, 0x1736}, {0x17D4, 0x17D6}, {0x17D8, 0x17DA}, {0x1800, 0x180A}, -{0x1944, 0x1945}, {0x1A1E, 0x1A1F}, {0x1AA0, 0x1AA6}, {0x1AA8, 0x1AAD}, {0x1B5A, 0x1B60}, {0x1BFC, 0x1BFF}, {0x1C3B, 0x1C3F}, {0x1C7E, 0x1C7F}, {0x1CC0, 0x1CC7}, {0x1CD3, 0x1CD3}, {0x2010, 0x2027}, -{0x2030, 0x2043}, {0x2045, 0x2051}, {0x2053, 0x205E}, {0x207D, 0x207E}, {0x208D, 0x208E}, {0x2308, 0x230B}, {0x2329, 0x232A}, {0x2768, 0x2775}, {0x27C5, 0x27C6}, {0x27E6, 0x27EF}, {0x2983, 0x2998}, -{0x29D8, 0x29DB}, {0x29FC, 0x29FD}, {0x2CF9, 0x2CFC}, {0x2CFE, 0x2CFF}, {0x2D70, 0x2D70}, {0x2E00, 0x2E2E}, {0x2E30, 0x2E4F}, {0x2E52, 0x2E52}, {0x3001, 0x3003}, {0x3008, 0x3011}, {0x3014, 0x301F}, -{0x3030, 0x3030}, {0x303D, 0x303D}, {0x30A0, 0x30A0}, {0x30FB, 0x30FB}, {0xA4FE, 0xA4FF}, {0xA60D, 0xA60F}, {0xA673, 0xA673}, {0xA67E, 0xA67E}, {0xA6F2, 0xA6F7}, {0xA874, 0xA877}, {0xA8CE, 0xA8CF}, -{0xA8F8, 0xA8FA}, {0xA8FC, 0xA8FC}, {0xA92E, 0xA92F}, {0xA95F, 0xA95F}, {0xA9C1, 0xA9CD}, {0xA9DE, 0xA9DF}, {0xAA5C, 0xAA5F}, {0xAADE, 0xAADF}, {0xAAF0, 0xAAF1}, {0xABEB, 0xABEB}, {0xFD3E, 0xFD3F}, -{0xFE10, 0xFE19}, {0xFE30, 0xFE52}, {0xFE54, 0xFE61}, {0xFE63, 0xFE63}, {0xFE68, 0xFE68}, {0xFE6A, 0xFE6B}, {0xFF01, 0xFF03}, {0xFF05, 0xFF0A}, {0xFF0C, 0xFF0F}, {0xFF1A, 0xFF1B}, {0xFF1F, 0xFF20}, -{0xFF3B, 0xFF3D}, {0xFF3F, 0xFF3F}, {0xFF5B, 0xFF5B}, {0xFF5D, 0xFF5D}, {0xFF5F, 0xFF65}, {0x10100, 0x10102}, {0x1039F, 0x1039F}, {0x103D0, 0x103D0}, {0x1056F, 0x1056F}, {0x10857, 0x10857}, -{0x1091F, 0x1091F}, {0x1093F, 0x1093F}, {0x10A50, 0x10A58}, {0x10A7F, 0x10A7F}, {0x10AF0, 0x10AF6}, {0x10B39, 0x10B3F}, {0x10B99, 0x10B9C}, {0x10EAD, 0x10EAD}, {0x10F55, 0x10F59}, {0x11047, 0x1104D}, -{0x110BB, 0x110BC}, {0x110BE, 0x110C1}, {0x11140, 0x11143}, {0x11174, 0x11175}, {0x111C5, 0x111C8}, {0x111CD, 0x111CD}, {0x111DB, 0x111DB}, {0x111DD, 0x111DF}, {0x11238, 0x1123D}, {0x112A9, 0x112A9}, -{0x1144B, 0x1144F}, {0x1145A, 0x1145B}, {0x1145D, 0x1145D}, {0x114C6, 0x114C6}, {0x115C1, 0x115D7}, {0x11641, 0x11643}, {0x11660, 0x1166C}, {0x1173C, 0x1173E}, {0x1183B, 0x1183B}, {0x11944, 0x11946}, -{0x119E2, 0x119E2}, {0x11A3F, 0x11A46}, {0x11A9A, 0x11A9C}, {0x11A9E, 0x11AA2}, {0x11C41, 0x11C45}, {0x11C70, 0x11C71}, {0x11EF7, 0x11EF8}, {0x11FFF, 0x11FFF}, {0x12470, 0x12474}, {0x16A6E, 0x16A6F}, -{0x16AF5, 0x16AF5}, {0x16B37, 0x16B3B}, {0x16B44, 0x16B44}, {0x16E97, 0x16E9A}, {0x16FE2, 0x16FE2}, {0x1BC9F, 0x1BC9F}, {0x1DA87, 0x1DA8B}, {0x1E95E, 0x1E95F}, -}; - -static const std::vector> symbol_ranges = { -{0x24, 0x24}, {0x2B, 0x2B}, {0x3C, 0x3E}, {0x5E, 0x5E}, {0x60, 0x60}, {0x7C, 0x7C}, {0x7E, 0x7E}, {0xA2, 0xA6}, {0xA8, 0xA9}, {0xAC, 0xAC}, {0xAE, 0xB1}, {0xB4, 0xB4}, {0xB8, 0xB8}, {0xD7, 0xD7}, -{0xF7, 0xF7}, {0x2C2, 0x2C5}, {0x2D2, 0x2DF}, {0x2E5, 0x2EB}, {0x2ED, 0x2ED}, {0x2EF, 0x2FF}, {0x375, 0x375}, {0x384, 0x385}, {0x3F6, 0x3F6}, {0x482, 0x482}, {0x58D, 0x58F}, {0x606, 0x608}, -{0x60B, 0x60B}, {0x60E, 0x60F}, {0x6DE, 0x6DE}, {0x6E9, 0x6E9}, {0x6FD, 0x6FE}, {0x7F6, 0x7F6}, {0x7FE, 0x7FF}, {0x9F2, 0x9F3}, {0x9FA, 0x9FB}, {0xAF1, 0xAF1}, {0xB70, 0xB70}, {0xBF3, 0xBFA}, -{0xC7F, 0xC7F}, {0xD4F, 0xD4F}, {0xD79, 0xD79}, {0xE3F, 0xE3F}, {0xF01, 0xF03}, {0xF13, 0xF13}, {0xF15, 0xF17}, {0xF1A, 0xF1F}, {0xF34, 0xF34}, {0xF36, 0xF36}, {0xF38, 0xF38}, {0xFBE, 0xFC5}, -{0xFC7, 0xFCC}, {0xFCE, 0xFCF}, {0xFD5, 0xFD8}, {0x109E, 0x109F}, {0x1390, 0x1399}, {0x166D, 0x166D}, {0x17DB, 0x17DB}, {0x1940, 0x1940}, {0x19DE, 0x19FF}, {0x1B61, 0x1B6A}, {0x1B74, 0x1B7C}, -{0x1FBD, 0x1FBD}, {0x1FBF, 0x1FC1}, {0x1FCD, 0x1FCF}, {0x1FDD, 0x1FDF}, {0x1FED, 0x1FEF}, {0x1FFD, 0x1FFE}, {0x2044, 0x2044}, {0x2052, 0x2052}, {0x207A, 0x207C}, {0x208A, 0x208C}, {0x20A0, 0x20BF}, -{0x2100, 0x2101}, {0x2103, 0x2106}, {0x2108, 0x2109}, {0x2114, 0x2114}, {0x2116, 0x2118}, {0x211E, 0x2123}, {0x2125, 0x2125}, {0x2127, 0x2127}, {0x2129, 0x2129}, {0x212E, 0x212E}, {0x213A, 0x213B}, -{0x2140, 0x2144}, {0x214A, 0x214D}, {0x214F, 0x214F}, {0x218A, 0x218B}, {0x2190, 0x2307}, {0x230C, 0x2328}, {0x232B, 0x2426}, {0x2440, 0x244A}, {0x249C, 0x24E9}, {0x2500, 0x2767}, {0x2794, 0x27C4}, -{0x27C7, 0x27E5}, {0x27F0, 0x2982}, {0x2999, 0x29D7}, {0x29DC, 0x29FB}, {0x29FE, 0x2B73}, {0x2B76, 0x2B95}, {0x2B97, 0x2BFF}, {0x2CE5, 0x2CEA}, {0x2E50, 0x2E51}, {0x2E80, 0x2E99}, {0x2E9B, 0x2EF3}, -{0x2F00, 0x2FD5}, {0x2FF0, 0x2FFB}, {0x3004, 0x3004}, {0x3012, 0x3013}, {0x3020, 0x3020}, {0x3036, 0x3037}, {0x303E, 0x303F}, {0x309B, 0x309C}, {0x3190, 0x3191}, {0x3196, 0x319F}, {0x31C0, 0x31E3}, -{0x3200, 0x321E}, {0x322A, 0x3247}, {0x3250, 0x3250}, {0x3260, 0x327F}, {0x328A, 0x32B0}, {0x32C0, 0x33FF}, {0x4DC0, 0x4DFF}, {0xA490, 0xA4C6}, {0xA700, 0xA716}, {0xA720, 0xA721}, {0xA789, 0xA78A}, -{0xA828, 0xA82B}, {0xA836, 0xA839}, {0xAA77, 0xAA79}, {0xAB5B, 0xAB5B}, {0xAB6A, 0xAB6B}, {0xFB29, 0xFB29}, {0xFBB2, 0xFBC1}, {0xFDFC, 0xFDFD}, {0xFE62, 0xFE62}, {0xFE64, 0xFE66}, {0xFE69, 0xFE69}, -{0xFF04, 0xFF04}, {0xFF0B, 0xFF0B}, {0xFF1C, 0xFF1E}, {0xFF3E, 0xFF3E}, {0xFF40, 0xFF40}, {0xFF5C, 0xFF5C}, {0xFF5E, 0xFF5E}, {0xFFE0, 0xFFE6}, {0xFFE8, 0xFFEE}, {0xFFFC, 0xFFFD}, {0x10137, 0x1013F}, -{0x10179, 0x10189}, {0x1018C, 0x1018E}, {0x10190, 0x1019C}, {0x101A0, 0x101A0}, {0x101D0, 0x101FC}, {0x10877, 0x10878}, {0x10AC8, 0x10AC8}, {0x1173F, 0x1173F}, {0x11FD5, 0x11FF1}, {0x16B3C, 0x16B3F}, -{0x16B45, 0x16B45}, {0x1BC9C, 0x1BC9C}, {0x1D000, 0x1D0F5}, {0x1D100, 0x1D126}, {0x1D129, 0x1D164}, {0x1D16A, 0x1D16C}, {0x1D183, 0x1D184}, {0x1D18C, 0x1D1A9}, {0x1D1AE, 0x1D1E8}, {0x1D200, 0x1D241}, -{0x1D245, 0x1D245}, {0x1D300, 0x1D356}, {0x1D6C1, 0x1D6C1}, {0x1D6DB, 0x1D6DB}, {0x1D6FB, 0x1D6FB}, {0x1D715, 0x1D715}, {0x1D735, 0x1D735}, {0x1D74F, 0x1D74F}, {0x1D76F, 0x1D76F}, {0x1D789, 0x1D789}, -{0x1D7A9, 0x1D7A9}, {0x1D7C3, 0x1D7C3}, {0x1D800, 0x1D9FF}, {0x1DA37, 0x1DA3A}, {0x1DA6D, 0x1DA74}, {0x1DA76, 0x1DA83}, {0x1DA85, 0x1DA86}, {0x1E14F, 0x1E14F}, {0x1E2FF, 0x1E2FF}, {0x1ECAC, 0x1ECAC}, -{0x1ECB0, 0x1ECB0}, {0x1ED2E, 0x1ED2E}, {0x1EEF0, 0x1EEF1}, {0x1F000, 0x1F02B}, {0x1F030, 0x1F093}, {0x1F0A0, 0x1F0AE}, {0x1F0B1, 0x1F0BF}, {0x1F0C1, 0x1F0CF}, {0x1F0D1, 0x1F0F5}, {0x1F10D, 0x1F1AD}, -{0x1F1E6, 0x1F202}, {0x1F210, 0x1F23B}, {0x1F240, 0x1F248}, {0x1F250, 0x1F251}, {0x1F260, 0x1F265}, {0x1F300, 0x1F6D7}, {0x1F6E0, 0x1F6EC}, {0x1F6F0, 0x1F6FC}, {0x1F700, 0x1F773}, {0x1F780, 0x1F7D8}, -{0x1F7E0, 0x1F7EB}, {0x1F800, 0x1F80B}, {0x1F810, 0x1F847}, {0x1F850, 0x1F859}, {0x1F860, 0x1F887}, {0x1F890, 0x1F8AD}, {0x1F8B0, 0x1F8B1}, {0x1F900, 0x1F978}, {0x1F97A, 0x1F9CB}, {0x1F9CD, 0x1FA53}, -{0x1FA60, 0x1FA6D}, {0x1FA70, 0x1FA74}, {0x1FA78, 0x1FA7A}, {0x1FA80, 0x1FA86}, {0x1FA90, 0x1FAA8}, {0x1FAB0, 0x1FAB6}, {0x1FAC0, 0x1FAC2}, {0x1FAD0, 0x1FAD6}, {0x1FB00, 0x1FB92}, {0x1FB94, 0x1FBCA}, -}; - -static const std::vector> control_ranges = { -{0x0, 0x8}, {0xE, 0x1B}, {0x7F, 0x84}, {0x86, 0x9F}, {0xAD, 0xAD}, {0x378, 0x379}, {0x380, 0x383}, {0x38B, 0x38B}, {0x38D, 0x38D}, {0x3A2, 0x3A2}, {0x530, 0x530}, {0x557, 0x558}, {0x58B, 0x58C}, -{0x590, 0x590}, {0x5C8, 0x5CF}, {0x5EB, 0x5EE}, {0x5F5, 0x605}, {0x61C, 0x61D}, {0x6DD, 0x6DD}, {0x70E, 0x70F}, {0x74B, 0x74C}, {0x7B2, 0x7BF}, {0x7FB, 0x7FC}, {0x82E, 0x82F}, {0x83F, 0x83F}, -{0x85C, 0x85D}, {0x85F, 0x85F}, {0x86B, 0x89F}, {0x8B5, 0x8B5}, {0x8C8, 0x8D2}, {0x8E2, 0x8E2}, {0x984, 0x984}, {0x98D, 0x98E}, {0x991, 0x992}, {0x9A9, 0x9A9}, {0x9B1, 0x9B1}, {0x9B3, 0x9B5}, -{0x9BA, 0x9BB}, {0x9C5, 0x9C6}, {0x9C9, 0x9CA}, {0x9CF, 0x9D6}, {0x9D8, 0x9DB}, {0x9DE, 0x9DE}, {0x9E4, 0x9E5}, {0x9FF, 0xA00}, {0xA04, 0xA04}, {0xA0B, 0xA0E}, {0xA11, 0xA12}, {0xA29, 0xA29}, -{0xA31, 0xA31}, {0xA34, 0xA34}, {0xA37, 0xA37}, {0xA3A, 0xA3B}, {0xA3D, 0xA3D}, {0xA43, 0xA46}, {0xA49, 0xA4A}, {0xA4E, 0xA50}, {0xA52, 0xA58}, {0xA5D, 0xA5D}, {0xA5F, 0xA65}, {0xA77, 0xA80}, -{0xA84, 0xA84}, {0xA8E, 0xA8E}, {0xA92, 0xA92}, {0xAA9, 0xAA9}, {0xAB1, 0xAB1}, {0xAB4, 0xAB4}, {0xABA, 0xABB}, {0xAC6, 0xAC6}, {0xACA, 0xACA}, {0xACE, 0xACF}, {0xAD1, 0xADF}, {0xAE4, 0xAE5}, -{0xAF2, 0xAF8}, {0xB00, 0xB00}, {0xB04, 0xB04}, {0xB0D, 0xB0E}, {0xB11, 0xB12}, {0xB29, 0xB29}, {0xB31, 0xB31}, {0xB34, 0xB34}, {0xB3A, 0xB3B}, {0xB45, 0xB46}, {0xB49, 0xB4A}, {0xB4E, 0xB54}, -{0xB58, 0xB5B}, {0xB5E, 0xB5E}, {0xB64, 0xB65}, {0xB78, 0xB81}, {0xB84, 0xB84}, {0xB8B, 0xB8D}, {0xB91, 0xB91}, {0xB96, 0xB98}, {0xB9B, 0xB9B}, {0xB9D, 0xB9D}, {0xBA0, 0xBA2}, {0xBA5, 0xBA7}, -{0xBAB, 0xBAD}, {0xBBA, 0xBBD}, {0xBC3, 0xBC5}, {0xBC9, 0xBC9}, {0xBCE, 0xBCF}, {0xBD1, 0xBD6}, {0xBD8, 0xBE5}, {0xBFB, 0xBFF}, {0xC0D, 0xC0D}, {0xC11, 0xC11}, {0xC29, 0xC29}, {0xC3A, 0xC3C}, -{0xC45, 0xC45}, {0xC49, 0xC49}, {0xC4E, 0xC54}, {0xC57, 0xC57}, {0xC5B, 0xC5F}, {0xC64, 0xC65}, {0xC70, 0xC76}, {0xC8D, 0xC8D}, {0xC91, 0xC91}, {0xCA9, 0xCA9}, {0xCB4, 0xCB4}, {0xCBA, 0xCBB}, -{0xCC5, 0xCC5}, {0xCC9, 0xCC9}, {0xCCE, 0xCD4}, {0xCD7, 0xCDD}, {0xCDF, 0xCDF}, {0xCE4, 0xCE5}, {0xCF0, 0xCF0}, {0xCF3, 0xCFF}, {0xD0D, 0xD0D}, {0xD11, 0xD11}, {0xD45, 0xD45}, {0xD49, 0xD49}, -{0xD50, 0xD53}, {0xD64, 0xD65}, {0xD80, 0xD80}, {0xD84, 0xD84}, {0xD97, 0xD99}, {0xDB2, 0xDB2}, {0xDBC, 0xDBC}, {0xDBE, 0xDBF}, {0xDC7, 0xDC9}, {0xDCB, 0xDCE}, {0xDD5, 0xDD5}, {0xDD7, 0xDD7}, -{0xDE0, 0xDE5}, {0xDF0, 0xDF1}, {0xDF5, 0xE00}, {0xE3B, 0xE3E}, {0xE5C, 0xE80}, {0xE83, 0xE83}, {0xE85, 0xE85}, {0xE8B, 0xE8B}, {0xEA4, 0xEA4}, {0xEA6, 0xEA6}, {0xEBE, 0xEBF}, {0xEC5, 0xEC5}, -{0xEC7, 0xEC7}, {0xECE, 0xECF}, {0xEDA, 0xEDB}, {0xEE0, 0xEFF}, {0xF48, 0xF48}, {0xF6D, 0xF70}, {0xF98, 0xF98}, {0xFBD, 0xFBD}, {0xFCD, 0xFCD}, {0xFDB, 0xFFF}, {0x10C6, 0x10C6}, {0x10C8, 0x10CC}, -{0x10CE, 0x10CF}, {0x1249, 0x1249}, {0x124E, 0x124F}, {0x1257, 0x1257}, {0x1259, 0x1259}, {0x125E, 0x125F}, {0x1289, 0x1289}, {0x128E, 0x128F}, {0x12B1, 0x12B1}, {0x12B6, 0x12B7}, {0x12BF, 0x12BF}, -{0x12C1, 0x12C1}, {0x12C6, 0x12C7}, {0x12D7, 0x12D7}, {0x1311, 0x1311}, {0x1316, 0x1317}, {0x135B, 0x135C}, {0x137D, 0x137F}, {0x139A, 0x139F}, {0x13F6, 0x13F7}, {0x13FE, 0x13FF}, {0x169D, 0x169F}, -{0x16F9, 0x16FF}, {0x170D, 0x170D}, {0x1715, 0x171F}, {0x1737, 0x173F}, {0x1754, 0x175F}, {0x176D, 0x176D}, {0x1771, 0x1771}, {0x1774, 0x177F}, {0x17DE, 0x17DF}, {0x17EA, 0x17EF}, {0x17FA, 0x17FF}, -{0x180E, 0x180F}, {0x181A, 0x181F}, {0x1879, 0x187F}, {0x18AB, 0x18AF}, {0x18F6, 0x18FF}, {0x191F, 0x191F}, {0x192C, 0x192F}, {0x193C, 0x193F}, {0x1941, 0x1943}, {0x196E, 0x196F}, {0x1975, 0x197F}, -{0x19AC, 0x19AF}, {0x19CA, 0x19CF}, {0x19DB, 0x19DD}, {0x1A1C, 0x1A1D}, {0x1A5F, 0x1A5F}, {0x1A7D, 0x1A7E}, {0x1A8A, 0x1A8F}, {0x1A9A, 0x1A9F}, {0x1AAE, 0x1AAF}, {0x1AC1, 0x1AFF}, {0x1B4C, 0x1B4F}, -{0x1B7D, 0x1B7F}, {0x1BF4, 0x1BFB}, {0x1C38, 0x1C3A}, {0x1C4A, 0x1C4C}, {0x1C89, 0x1C8F}, {0x1CBB, 0x1CBC}, {0x1CC8, 0x1CCF}, {0x1CFB, 0x1CFF}, {0x1DFA, 0x1DFA}, {0x1F16, 0x1F17}, {0x1F1E, 0x1F1F}, -{0x1F46, 0x1F47}, {0x1F4E, 0x1F4F}, {0x1F58, 0x1F58}, {0x1F5A, 0x1F5A}, {0x1F5C, 0x1F5C}, {0x1F5E, 0x1F5E}, {0x1F7E, 0x1F7F}, {0x1FB5, 0x1FB5}, {0x1FC5, 0x1FC5}, {0x1FD4, 0x1FD5}, {0x1FDC, 0x1FDC}, -{0x1FF0, 0x1FF1}, {0x1FF5, 0x1FF5}, {0x1FFF, 0x1FFF}, {0x200B, 0x200F}, {0x202A, 0x202E}, {0x2060, 0x206F}, {0x2072, 0x2073}, {0x208F, 0x208F}, {0x209D, 0x209F}, {0x20C0, 0x20CF}, {0x20F1, 0x20FF}, -{0x218C, 0x218F}, {0x2427, 0x243F}, {0x244B, 0x245F}, {0x2B74, 0x2B75}, {0x2B96, 0x2B96}, {0x2C2F, 0x2C2F}, {0x2C5F, 0x2C5F}, {0x2CF4, 0x2CF8}, {0x2D26, 0x2D26}, {0x2D28, 0x2D2C}, {0x2D2E, 0x2D2F}, -{0x2D68, 0x2D6E}, {0x2D71, 0x2D7E}, {0x2D97, 0x2D9F}, {0x2DA7, 0x2DA7}, {0x2DAF, 0x2DAF}, {0x2DB7, 0x2DB7}, {0x2DBF, 0x2DBF}, {0x2DC7, 0x2DC7}, {0x2DCF, 0x2DCF}, {0x2DD7, 0x2DD7}, {0x2DDF, 0x2DDF}, -{0x2E53, 0x2E7F}, {0x2E9A, 0x2E9A}, {0x2EF4, 0x2EFF}, {0x2FD6, 0x2FEF}, {0x2FFC, 0x2FFF}, {0x3040, 0x3040}, {0x3097, 0x3098}, {0x3100, 0x3104}, {0x3130, 0x3130}, {0x318F, 0x318F}, {0x31E4, 0x31EF}, -{0x321F, 0x321F}, {0x9FFD, 0x9FFF}, {0xA48D, 0xA48F}, {0xA4C7, 0xA4CF}, {0xA62C, 0xA63F}, {0xA6F8, 0xA6FF}, {0xA7C0, 0xA7C1}, {0xA7CB, 0xA7F4}, {0xA82D, 0xA82F}, {0xA83A, 0xA83F}, {0xA878, 0xA87F}, -{0xA8C6, 0xA8CD}, {0xA8DA, 0xA8DF}, {0xA954, 0xA95E}, {0xA97D, 0xA97F}, {0xA9CE, 0xA9CE}, {0xA9DA, 0xA9DD}, {0xA9FF, 0xA9FF}, {0xAA37, 0xAA3F}, {0xAA4E, 0xAA4F}, {0xAA5A, 0xAA5B}, {0xAAC3, 0xAADA}, -{0xAAF7, 0xAB00}, {0xAB07, 0xAB08}, {0xAB0F, 0xAB10}, {0xAB17, 0xAB1F}, {0xAB27, 0xAB27}, {0xAB2F, 0xAB2F}, {0xAB6C, 0xAB6F}, {0xABEE, 0xABEF}, {0xABFA, 0xABFF}, {0xD7A4, 0xD7AF}, {0xD7C7, 0xD7CA}, -{0xD7FC, 0xF8FF}, {0xFA6E, 0xFA6F}, {0xFADA, 0xFAFF}, {0xFB07, 0xFB12}, {0xFB18, 0xFB1C}, {0xFB37, 0xFB37}, {0xFB3D, 0xFB3D}, {0xFB3F, 0xFB3F}, {0xFB42, 0xFB42}, {0xFB45, 0xFB45}, {0xFBC2, 0xFBD2}, -{0xFD40, 0xFD4F}, {0xFD90, 0xFD91}, {0xFDC8, 0xFDEF}, {0xFDFE, 0xFDFF}, {0xFE1A, 0xFE1F}, {0xFE53, 0xFE53}, {0xFE67, 0xFE67}, {0xFE6C, 0xFE6F}, {0xFE75, 0xFE75}, {0xFEFD, 0xFF00}, {0xFFBF, 0xFFC1}, -{0xFFC8, 0xFFC9}, {0xFFD0, 0xFFD1}, {0xFFD8, 0xFFD9}, {0xFFDD, 0xFFDF}, {0xFFE7, 0xFFE7}, {0xFFEF, 0xFFFB}, {0xFFFE, 0xFFFF}, {0x1000C, 0x1000C}, {0x10027, 0x10027}, {0x1003B, 0x1003B}, -{0x1003E, 0x1003E}, {0x1004E, 0x1004F}, {0x1005E, 0x1007F}, {0x100FB, 0x100FF}, {0x10103, 0x10106}, {0x10134, 0x10136}, {0x1018F, 0x1018F}, {0x1019D, 0x1019F}, {0x101A1, 0x101CF}, {0x101FE, 0x1027F}, -{0x1029D, 0x1029F}, {0x102D1, 0x102DF}, {0x102FC, 0x102FF}, {0x10324, 0x1032C}, {0x1034B, 0x1034F}, {0x1037B, 0x1037F}, {0x1039E, 0x1039E}, {0x103C4, 0x103C7}, {0x103D6, 0x103FF}, {0x1049E, 0x1049F}, -{0x104AA, 0x104AF}, {0x104D4, 0x104D7}, {0x104FC, 0x104FF}, {0x10528, 0x1052F}, {0x10564, 0x1056E}, {0x10570, 0x105FF}, {0x10737, 0x1073F}, {0x10756, 0x1075F}, {0x10768, 0x107FF}, {0x10806, 0x10807}, -{0x10809, 0x10809}, {0x10836, 0x10836}, {0x10839, 0x1083B}, {0x1083D, 0x1083E}, {0x10856, 0x10856}, {0x1089F, 0x108A6}, {0x108B0, 0x108DF}, {0x108F3, 0x108F3}, {0x108F6, 0x108FA}, {0x1091C, 0x1091E}, -{0x1093A, 0x1093E}, {0x10940, 0x1097F}, {0x109B8, 0x109BB}, {0x109D0, 0x109D1}, {0x10A04, 0x10A04}, {0x10A07, 0x10A0B}, {0x10A14, 0x10A14}, {0x10A18, 0x10A18}, {0x10A36, 0x10A37}, {0x10A3B, 0x10A3E}, -{0x10A49, 0x10A4F}, {0x10A59, 0x10A5F}, {0x10AA0, 0x10ABF}, {0x10AE7, 0x10AEA}, {0x10AF7, 0x10AFF}, {0x10B36, 0x10B38}, {0x10B56, 0x10B57}, {0x10B73, 0x10B77}, {0x10B92, 0x10B98}, {0x10B9D, 0x10BA8}, -{0x10BB0, 0x10BFF}, {0x10C49, 0x10C7F}, {0x10CB3, 0x10CBF}, {0x10CF3, 0x10CF9}, {0x10D28, 0x10D2F}, {0x10D3A, 0x10E5F}, {0x10E7F, 0x10E7F}, {0x10EAA, 0x10EAA}, {0x10EAE, 0x10EAF}, {0x10EB2, 0x10EFF}, -{0x10F28, 0x10F2F}, {0x10F5A, 0x10FAF}, {0x10FCC, 0x10FDF}, {0x10FF7, 0x10FFF}, {0x1104E, 0x11051}, {0x11070, 0x1107E}, {0x110BD, 0x110BD}, {0x110C2, 0x110CF}, {0x110E9, 0x110EF}, {0x110FA, 0x110FF}, -{0x11135, 0x11135}, {0x11148, 0x1114F}, {0x11177, 0x1117F}, {0x111E0, 0x111E0}, {0x111F5, 0x111FF}, {0x11212, 0x11212}, {0x1123F, 0x1127F}, {0x11287, 0x11287}, {0x11289, 0x11289}, {0x1128E, 0x1128E}, -{0x1129E, 0x1129E}, {0x112AA, 0x112AF}, {0x112EB, 0x112EF}, {0x112FA, 0x112FF}, {0x11304, 0x11304}, {0x1130D, 0x1130E}, {0x11311, 0x11312}, {0x11329, 0x11329}, {0x11331, 0x11331}, {0x11334, 0x11334}, -{0x1133A, 0x1133A}, {0x11345, 0x11346}, {0x11349, 0x1134A}, {0x1134E, 0x1134F}, {0x11351, 0x11356}, {0x11358, 0x1135C}, {0x11364, 0x11365}, {0x1136D, 0x1136F}, {0x11375, 0x113FF}, {0x1145C, 0x1145C}, -{0x11462, 0x1147F}, {0x114C8, 0x114CF}, {0x114DA, 0x1157F}, {0x115B6, 0x115B7}, {0x115DE, 0x115FF}, {0x11645, 0x1164F}, {0x1165A, 0x1165F}, {0x1166D, 0x1167F}, {0x116B9, 0x116BF}, {0x116CA, 0x116FF}, -{0x1171B, 0x1171C}, {0x1172C, 0x1172F}, {0x11740, 0x117FF}, {0x1183C, 0x1189F}, {0x118F3, 0x118FE}, {0x11907, 0x11908}, {0x1190A, 0x1190B}, {0x11914, 0x11914}, {0x11917, 0x11917}, {0x11936, 0x11936}, -{0x11939, 0x1193A}, {0x11947, 0x1194F}, {0x1195A, 0x1199F}, {0x119A8, 0x119A9}, {0x119D8, 0x119D9}, {0x119E5, 0x119FF}, {0x11A48, 0x11A4F}, {0x11AA3, 0x11ABF}, {0x11AF9, 0x11BFF}, {0x11C09, 0x11C09}, -{0x11C37, 0x11C37}, {0x11C46, 0x11C4F}, {0x11C6D, 0x11C6F}, {0x11C90, 0x11C91}, {0x11CA8, 0x11CA8}, {0x11CB7, 0x11CFF}, {0x11D07, 0x11D07}, {0x11D0A, 0x11D0A}, {0x11D37, 0x11D39}, {0x11D3B, 0x11D3B}, -{0x11D3E, 0x11D3E}, {0x11D48, 0x11D4F}, {0x11D5A, 0x11D5F}, {0x11D66, 0x11D66}, {0x11D69, 0x11D69}, {0x11D8F, 0x11D8F}, {0x11D92, 0x11D92}, {0x11D99, 0x11D9F}, {0x11DAA, 0x11EDF}, {0x11EF9, 0x11FAF}, -{0x11FB1, 0x11FBF}, {0x11FF2, 0x11FFE}, {0x1239A, 0x123FF}, {0x1246F, 0x1246F}, {0x12475, 0x1247F}, {0x12544, 0x12FFF}, {0x1342F, 0x143FF}, {0x14647, 0x167FF}, {0x16A39, 0x16A3F}, {0x16A5F, 0x16A5F}, -{0x16A6A, 0x16A6D}, {0x16A70, 0x16ACF}, {0x16AEE, 0x16AEF}, {0x16AF6, 0x16AFF}, {0x16B46, 0x16B4F}, {0x16B5A, 0x16B5A}, {0x16B62, 0x16B62}, {0x16B78, 0x16B7C}, {0x16B90, 0x16E3F}, {0x16E9B, 0x16EFF}, -{0x16F4B, 0x16F4E}, {0x16F88, 0x16F8E}, {0x16FA0, 0x16FDF}, {0x16FE5, 0x16FEF}, {0x16FF2, 0x16FFF}, {0x187F8, 0x187FF}, {0x18CD6, 0x18CFF}, {0x18D09, 0x1AFFF}, {0x1B11F, 0x1B14F}, {0x1B153, 0x1B163}, -{0x1B168, 0x1B16F}, {0x1B2FC, 0x1BBFF}, {0x1BC6B, 0x1BC6F}, {0x1BC7D, 0x1BC7F}, {0x1BC89, 0x1BC8F}, {0x1BC9A, 0x1BC9B}, {0x1BCA0, 0x1CFFF}, {0x1D0F6, 0x1D0FF}, {0x1D127, 0x1D128}, {0x1D173, 0x1D17A}, -{0x1D1E9, 0x1D1FF}, {0x1D246, 0x1D2DF}, {0x1D2F4, 0x1D2FF}, {0x1D357, 0x1D35F}, {0x1D379, 0x1D3FF}, {0x1D455, 0x1D455}, {0x1D49D, 0x1D49D}, {0x1D4A0, 0x1D4A1}, {0x1D4A3, 0x1D4A4}, {0x1D4A7, 0x1D4A8}, -{0x1D4AD, 0x1D4AD}, {0x1D4BA, 0x1D4BA}, {0x1D4BC, 0x1D4BC}, {0x1D4C4, 0x1D4C4}, {0x1D506, 0x1D506}, {0x1D50B, 0x1D50C}, {0x1D515, 0x1D515}, {0x1D51D, 0x1D51D}, {0x1D53A, 0x1D53A}, {0x1D53F, 0x1D53F}, -{0x1D545, 0x1D545}, {0x1D547, 0x1D549}, {0x1D551, 0x1D551}, {0x1D6A6, 0x1D6A7}, {0x1D7CC, 0x1D7CD}, {0x1DA8C, 0x1DA9A}, {0x1DAA0, 0x1DAA0}, {0x1DAB0, 0x1DFFF}, {0x1E007, 0x1E007}, {0x1E019, 0x1E01A}, -{0x1E022, 0x1E022}, {0x1E025, 0x1E025}, {0x1E02B, 0x1E0FF}, {0x1E12D, 0x1E12F}, {0x1E13E, 0x1E13F}, {0x1E14A, 0x1E14D}, {0x1E150, 0x1E2BF}, {0x1E2FA, 0x1E2FE}, {0x1E300, 0x1E7FF}, {0x1E8C5, 0x1E8C6}, -{0x1E8D7, 0x1E8FF}, {0x1E94C, 0x1E94F}, {0x1E95A, 0x1E95D}, {0x1E960, 0x1EC70}, {0x1ECB5, 0x1ED00}, {0x1ED3E, 0x1EDFF}, {0x1EE04, 0x1EE04}, {0x1EE20, 0x1EE20}, {0x1EE23, 0x1EE23}, {0x1EE25, 0x1EE26}, -{0x1EE28, 0x1EE28}, {0x1EE33, 0x1EE33}, {0x1EE38, 0x1EE38}, {0x1EE3A, 0x1EE3A}, {0x1EE3C, 0x1EE41}, {0x1EE43, 0x1EE46}, {0x1EE48, 0x1EE48}, {0x1EE4A, 0x1EE4A}, {0x1EE4C, 0x1EE4C}, {0x1EE50, 0x1EE50}, -{0x1EE53, 0x1EE53}, {0x1EE55, 0x1EE56}, {0x1EE58, 0x1EE58}, {0x1EE5A, 0x1EE5A}, {0x1EE5C, 0x1EE5C}, {0x1EE5E, 0x1EE5E}, {0x1EE60, 0x1EE60}, {0x1EE63, 0x1EE63}, {0x1EE65, 0x1EE66}, {0x1EE6B, 0x1EE6B}, -{0x1EE73, 0x1EE73}, {0x1EE78, 0x1EE78}, {0x1EE7D, 0x1EE7D}, {0x1EE7F, 0x1EE7F}, {0x1EE8A, 0x1EE8A}, {0x1EE9C, 0x1EEA0}, {0x1EEA4, 0x1EEA4}, {0x1EEAA, 0x1EEAA}, {0x1EEBC, 0x1EEEF}, {0x1EEF2, 0x1EFFF}, -{0x1F02C, 0x1F02F}, {0x1F094, 0x1F09F}, {0x1F0AF, 0x1F0B0}, {0x1F0C0, 0x1F0C0}, {0x1F0D0, 0x1F0D0}, {0x1F0F6, 0x1F0FF}, {0x1F1AE, 0x1F1E5}, {0x1F203, 0x1F20F}, {0x1F23C, 0x1F23F}, {0x1F249, 0x1F24F}, -{0x1F252, 0x1F25F}, {0x1F266, 0x1F2FF}, {0x1F6D8, 0x1F6DF}, {0x1F6ED, 0x1F6EF}, {0x1F6FD, 0x1F6FF}, {0x1F774, 0x1F77F}, {0x1F7D9, 0x1F7DF}, {0x1F7EC, 0x1F7FF}, {0x1F80C, 0x1F80F}, {0x1F848, 0x1F84F}, -{0x1F85A, 0x1F85F}, {0x1F888, 0x1F88F}, {0x1F8AE, 0x1F8AF}, {0x1F8B2, 0x1F8FF}, {0x1F979, 0x1F979}, {0x1F9CC, 0x1F9CC}, {0x1FA54, 0x1FA5F}, {0x1FA6E, 0x1FA6F}, {0x1FA75, 0x1FA77}, {0x1FA7B, 0x1FA7F}, -{0x1FA87, 0x1FA8F}, {0x1FAA9, 0x1FAAF}, {0x1FAB7, 0x1FABF}, {0x1FAC3, 0x1FACF}, {0x1FAD7, 0x1FAFF}, {0x1FB93, 0x1FB93}, {0x1FBCB, 0x1FBEF}, {0x1FBFA, 0x1FFFF}, {0x2A6DE, 0x2A6FF}, {0x2B735, 0x2B73F}, -{0x2B81E, 0x2B81F}, {0x2CEA2, 0x2CEAF}, {0x2EBE1, 0x2F7FF}, {0x2FA1E, 0x2FFFF}, {0x3134B, 0xE00FF}, {0xE01F0, 0x10FFFF}, -}; - -//String -bool CNCTString::operator==(const std::string& other) const { - return str.compare(other) == 0; -} -bool CNCTString::operator==(const char other) const { - return str.compare(std::string(1, other)) == 0; -} -bool CNCTString::operator==(const CNCTString& other) const { - return str.compare(other.str) == 0; -} -// + operators -CNCTString& CNCTString::operator+=(const std::string& other) { - str += other; - int new_len = CNCTUnicode::strlen_utf8(other); - utf8_chars += new_len; - char_type = CNCTUnicode::string_identify(str); - seq_offset_bytes += other.size(); - seq_offset_utf8_chars += new_len; - return *this; -} - -CNCTString& CNCTString::operator+=(const char other) { - std::string str = std::string(1, other); - *this += str; - return *this; -} - -CNCTString& CNCTString::operator+=(const CNCTString& other) { - str += other.str; - utf8_chars += other.utf8_chars; - char_type = CNCTUnicode::string_identify(str); - seq_offset_bytes += other.str.size(); - seq_offset_utf8_chars += other.utf8_chars; - return *this; -} - -struct CRCompare { - bool operator()(const std::pair& p, int i) { - return p.second < i; - } - bool operator()(int i, const std::pair& p) { - return i < p.first; - } -}; - -// binary search for code range -bool CNCTUnicode::check_code_range(int c, const std::vector> &ranges) { - auto it = std::upper_bound(ranges.begin(), ranges.end(), c, CRCompare()); - if (it != ranges.begin()) { - --it; - } - return c >= it->first && c <= it->second; -} - -// these are binary searches, it takes only a few operations -CNCTCharType CNCTUnicode::get_code_type(int c) { - if (check_code_range(c, letter_ranges)) { - return LETTER; - } - if (check_code_range(c, digit_ranges)) { - return DIGIT; - } - if (check_code_range(c, whitespace_ranges)) { - return WHITESPACE; - } - if (check_code_range(c, punctuation_ranges)) { - return PUNCTUATION; - } - if (check_code_range(c, symbol_ranges)) { - return SYMBOL; - } - if (check_code_range(c, accent_mark_ranges)) { - return ACCENT_MARK; - } - if (check_code_range(c, control_ranges)) { - return CONTROL; - } - return UNIDENTIFIED; -} - -static int utf8_to_unicode(const std::string& utf8_char) { - int c = 0; - int len = (int)utf8_char.size(); - if (len == 1) { - c = utf8_char[0]; - } else if (len == 2) { - c = ((utf8_char[0] & 0x1F) << 6) | (utf8_char[1] & 0x3F); - } else if (len == 3) { - c = ((utf8_char[0] & 0x0F) << 12) | ((utf8_char[1] & 0x3F) << 6) | (utf8_char[2] & 0x3F); - } else if (len == 4) { - c = ((utf8_char[0] & 0x07) << 18) | ((utf8_char[1] & 0x3F) << 12) | ((utf8_char[2] & 0x3F) << 6) | (utf8_char[3] & 0x3F); - } - return c; -} - -CNCTCharType CNCTUnicode::get_code_type(const std::string &utf8_char) { - return get_code_type(utf8_to_unicode(utf8_char)); -} - -int CNCTUnicode::utf8_len(const char c) -{ - if ((c & 0x80) == 0) { - return 1; // ASCII character - } - if ((c & 0xE0) == 0xC0) { - return 2; // 2-byte character - } - if ((c & 0xF0) == 0xE0) { - return 3; // 3-byte character - } - if ((c & 0xF0) == 0xF0) { - return 4; // 4-byte character - } - return 1; // not valid utf8 - // static const uint8_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 }; - // return lookup[static_cast(c) >> 4]; -} - -int CNCTUnicode::strlen_utf8(const std::string src) { - int len = 0; - for (std::string::const_iterator it = src.begin(); it != src.end(); ++it) { - int char_len = utf8_len(*it); - if (char_len > 1) { - it += char_len - 1; - } - len += 1; - } - return len; -} - -// split a string into unicode strings -std::vector CNCTUnicode::split_utf8(const std::string &src) { - std::vector result; - for (std::string::const_iterator it = src.begin(); it != src.end(); ++it) { - int char_len = utf8_len(*it); - std::string str(it, it + char_len); - result.push_back(str); - if (char_len > 1) { - it += char_len - 1; - } - } - return result; -} - -// split a string into unicode strings (CNCTString) with sequence information -std::vector CNCTUnicode::split_utf8_enhanced(const std::string &src) { - std::vector result; - int seq_offset_bytes=0; - int seq_offset_utf8_chars=0; - for (std::string::const_iterator it = src.begin(); it != src.end(); ++it) { - int char_len = utf8_len(*it); - std::string str(it, it + char_len); - CNCTString cnct_str; - cnct_str.seq_offset_bytes = seq_offset_bytes; - cnct_str.seq_offset_utf8_chars = seq_offset_utf8_chars; - cnct_str.str = str; - cnct_str.utf8_chars = 1; - cnct_str.char_type = get_code_type(str); - #if 0 - switch (cnct_str.char_type) - { - case DIGIT: - printf("%s = DIGIT\n", str.c_str()); - break; - case LETTER: - printf("%s = LETTER\n", str.c_str()); - break; - case WHITESPACE: - printf("%s = WHITESPACE\n", str.c_str()); - break; - case PUNCTUATION: - printf("%s = PUNCTUATION\n", str.c_str()); - break; - case UNIDENTIFIED: - printf("%s = UNIDENTIFIED\n", str.c_str()); - break; - case SYMBOL: - printf("%s = SYMBOL\n", str.c_str()); - break; - case CONTROL: - printf("%s = CONTROL\n", str.c_str()); - break; - } - #endif - - result.push_back(cnct_str); - seq_offset_bytes += char_len; - seq_offset_utf8_chars += 1; - if (char_len > 1) { - it += char_len - 1; - } - - } - return result; -} - -// return the type of the string -CNCTCharType CNCTUnicode::string_identify(const std::string &str) { - CNCTCharType result = UNIDENTIFIED; - std::string::const_iterator it = str.begin(); - while (it != str.end()) { - int len = utf8_len(*it); - int c = 0; - for (int i = 0; i < len && it != str.end(); ++i, ++it) { - c = (c << 8) | static_cast(*it); - } - switch (get_code_type(c)) { - case DIGIT: - if (result == UNIDENTIFIED) { - result = DIGIT; - } else if (result != DIGIT) { - return MIXED; - } - break; - case LETTER: - if (result == UNIDENTIFIED) { - result = LETTER; - } else if (result != LETTER) { - return MIXED; - } - break; - case WHITESPACE: - if (result == UNIDENTIFIED) { - result = WHITESPACE; - } else if (result != WHITESPACE) { - return MIXED; - } - break; - case PUNCTUATION: - if (result == UNIDENTIFIED) { - result = PUNCTUATION; - } else if (result != PUNCTUATION) { - return MIXED; - } - break; - default: - return MIXED; - break; - } - } - return result; -} - -// verify the content of a string -bool CNCTUnicode::string_test(const std::string &str, CNCTCharType chartype) -{ - std::string::const_iterator it = str.begin(); - while (it != str.end()) { - int len = utf8_len(*it); - int c = 0; - for (int i = 0; i < len && it != str.end(); ++i, ++it) { - c = (c << 8) | static_cast(*it); - } - if (get_code_type(c) != chartype) { - return false; - } - } - return true; -} - -//----------------- -// llama.cpp GPT2 vocab (from libfalcon.cpp) -//----------------- - -std::string replaceAll(std::string str, const std::string& from, const std::string& to) { - size_t start_pos = 0; - while((start_pos = str.find(from, start_pos)) != std::string::npos) { - str.replace(start_pos, from.length(), to); - start_pos += to.length(); // Handles case where 'to' is a substring of 'from' - } - return str; -} - -struct TrieNode { - std::map map; - int32_t Id = -1; -}; - -struct Trie { - TrieNode *root; - - Trie() : root(new TrieNode()) {} - - ~Trie() { - if(root) - deleteTrie(root); - } - - // Move constructor - Trie(Trie&& other) noexcept : root(other.root) { - other.root = nullptr; - } - - // Move assignment operator - Trie& operator=(Trie&& other) noexcept { - if (this != &other) { - if(root) - deleteTrie(root); - root = other.root; - other.root = nullptr; - } - return *this; - } - - void insert(const std::string &token, int32_t Id) { - TrieNode* current = root; - for(auto ch : token) { - if(current->map.find(ch) == current->map.end()) { - current->map[ch] = new TrieNode(); - } - current = current->map[ch]; - } - current->Id = Id; - } - - void reset() { - deleteTrie(root); - root = new TrieNode(); - } - -private: - void deleteTrie(TrieNode* node) { - for(auto &it: node->map) { - deleteTrie(it.second); - } - delete node; - } - -}; - -struct gpt2bpe_vocab { - using id = int32_t; - using token = std::string; - - std::map max_token_length; // max length, for each 2byte prefix - std::map, int> bpe_ranks; - std::vector> bpe_merges; - - id special_bos_id = -1; - id special_eos_id = -1; - id special_unk_id = -1; - id special_sep_id = -1; - id special_pad_id = -1; - - id linefeed_id = -1; - - std::unordered_map token_to_id; - std::unordered_map id_to_token; - - Trie trie; // highspeed access to tokens by prefix tree - - // populate trie from map - void populate_trie_from_map() { - trie.reset(); - for (const auto& pair : token_to_id) { - trie.insert(pair.first, pair.second); - if (pair.first.size() >= 2) { - std::string prefix = pair.first.substr(0, 2); - max_token_length[prefix] = std::max(max_token_length[prefix], (uint32_t)pair.first.size()); - } - } - } - // populate token ranks map - int populate_bpe_ranks(std::vector> bpe_merges_) { - for (int i = 0; i < (int)bpe_merges_.size(); i++) { - bpe_ranks.emplace(bpe_merges_[i], i); - } - bpe_merges = bpe_merges_; - return bpe_merges_.size(); - } - - // Trim whitespace characters from the beginning and end of the string - void trim(std::string& str) { - // Remove whitespace characters from the beginning of the string - str.erase(str.begin(), std::find_if(str.begin(), str.end(), [](int ch) { - return !std::isspace(ch); - })); - - // Remove whitespace characters from the end of the string - str.erase(std::find_if(str.rbegin(), str.rend(), [](int ch) { - return !std::isspace(ch); - }).base(), str.end()); - } - - // get max token length available for a prefix of 2 bytes (string at least 2 bytes long) - int get_max_token_length(const std::string& string) const { - if (string.size() < 2) { - return -1; - } - std::string prefix = string.substr(0, 2); - if (max_token_length.find(prefix) == max_token_length.end()) { - return 0; - } - return max_token_length.at(prefix); - } - - // function to find if two tokens match in bpe_rank, return rank or -1 - int find_bpe_rank(const std::string& token1, const std::string& token2) const { - std::string left_token = token1; - std::string right_token = token2; - left_token = replaceAll(left_token, " ", "Ġ"); - left_token = replaceAll(left_token, "\n", "Ċ"); - right_token = replaceAll(right_token, " ", "Ġ"); - right_token = replaceAll(right_token, "\n", "Ċ"); - - auto it = bpe_ranks.find(std::make_pair(left_token, right_token)); - if (it == bpe_ranks.end()) { - return -1; - } - return it->second; - } - - std::pair find_longest_match(const std::string& snippet) const { - TrieNode* current = trie.root; - gpt2bpe_vocab::id last_matched_id = -1; - std::string last_matched_token = ""; - std::string current_token = ""; - for (auto ch : snippet) { - if (current->map.find(ch) == current->map.end()) { - break; - } - current = current->map[ch]; - current_token += ch; - if (current->Id != -1) { - last_matched_id = current->Id; - last_matched_token = current_token; - } - } - return {last_matched_id, last_matched_token}; - } - -}; - - -// -// tokenizer - bpe type, gpt2 tokenization compatible -// - -struct ggllm_bpe_symbol { - using index = int; - index prev; - index next; - const char * text; - size_t n; -}; - -static_assert(std::is_trivially_copyable::value, "ggllm_bpe_symbol is not trivially copyable"); - -struct ggllm_bpe_bigram { - struct comparator { - bool operator()(ggllm_bpe_bigram & l, ggllm_bpe_bigram & r) { - return l.rank > r.rank || (l.rank == r.rank && l.left > r.left); - } - }; - - using queue_storage = std::vector; - using queue = std::priority_queue; - ggllm_bpe_symbol::index left; - ggllm_bpe_symbol::index right; - std::string text; - int rank; - size_t size; -}; - -struct gpt2bpe_tokenizer { - gpt2bpe_tokenizer(const gpt2bpe_vocab & vocab, bool g2ws_): vocab_(vocab) { flag_g2ws = g2ws_; } - - void tokenize(const std::string & text, std::vector & output) { - int final_prev_index = -1; - // auto start = ggml_time_us(); - auto word_collection = bpe_gpt2_preprocess(text); - // auto end = ggml_time_us(); - // fprintf(stderr, "%s: preprocessing took %0.3f ms\n", __func__, (end - start) / 1000.0); - - symbols_final.clear(); - - for (auto & word : word_collection) { - work_queue_ = ggllm_bpe_bigram::queue(); - symbols_.clear(); - - int index = 0; - size_t offset = 0; - - while (offset < word.size()) { - ggllm_bpe_symbol sym; - size_t char_len = std::min(word.size() - offset, (size_t) CNCTUnicode::utf8_len(word[offset])); - sym.text = word.c_str() + offset; - sym.n = 1; - sym.n = char_len; - offset += sym.n; - sym.prev = index - 1; - sym.next = offset == word.size() ? -1 : index + 1; - index++; - symbols_.emplace_back(sym); - } - for (size_t i = 1; i < symbols_.size(); ++i) { - add_new_bigram(i - 1, i); - } - - // build token(s) - while (!work_queue_.empty()) { - auto bigram = work_queue_.top(); - work_queue_.pop(); - - auto & left_symbol = symbols_[bigram.left]; - auto & right_symbol = symbols_[bigram.right]; - - if (left_symbol.n == 0 || right_symbol.n == 0) { - continue; - } - std::string left_token = std::string(left_symbol.text, left_symbol.n); - std::string right_token = std::string(right_symbol.text, right_symbol.n); - if (left_token + right_token != bigram.text) { - continue; // Skip this bigram if it's outdated - } - - // merge the right sym into the left one - left_symbol.n += right_symbol.n; - right_symbol.n = 0; - - // remove the right sym from the chain - left_symbol.next = right_symbol.next; - if (right_symbol.next >= 0) { - symbols_[right_symbol.next].prev = bigram.left; - } - - add_new_bigram(left_symbol.prev, bigram.left); // left side of current symbol - add_new_bigram(bigram.left, left_symbol.next); // right side of current symbol - } - - // add the fnished tokens to the final list keeping correct order for next and prev - for (auto & sym : symbols_) { - if (sym.n > 0) { - sym.prev = final_prev_index; - sym.next = -1; - if (final_prev_index != -1) { - symbols_final[final_prev_index].next = symbols_final.size(); - } - symbols_final.emplace_back(sym); - final_prev_index = symbols_final.size() - 1; - } - } - } - - symbols_ = symbols_final; - if (symbols_.size()) - for (int i = 0; i != -1; i = symbols_[i].next) { - auto & symbol = symbols_[i]; - if (symbol.n == 0) { - continue; - } - std::string str = std::string(symbol.text, symbol.n); - std::string str_decoded = decode_token(str); - auto token = vocab_.token_to_id.find(str_decoded); - - if (token == vocab_.token_to_id.end()) { - for (auto j = str_decoded.begin(); j != str_decoded.end(); ++j) { - std::string byte_str(1, *j); - auto token_multibyte = vocab_.token_to_id.find(byte_str); - if (token_multibyte == vocab_.token_to_id.end()) { - fprintf(stderr,"ERROR: byte not found in vocab: '%s'\n", byte_str.c_str()); - } - output.push_back((*token_multibyte).second); - } - } else { - output.push_back((*token).second); - } - } - } - -private: - void add_new_bigram(int left, int right) { - if (left == -1 || right == -1) return; - - std::string left_token = std::string(symbols_[left].text, symbols_[left].n); - std::string right_token = std::string(symbols_[right].text, symbols_[right].n); - - int rank_found = -1; - rank_found = vocab_.find_bpe_rank(left_token, right_token); - - if (rank_found < 0) { - return; - } - - ggllm_bpe_bigram bigram; - bigram.left = left; - bigram.right = right; - bigram.rank = rank_found; - bigram.size = left_token.size() + right_token.size(); - bigram.text = left_token + right_token; - work_queue_.push(bigram); - } - - std::unordered_map bytes_to_unicode() { - static std::unordered_map hex_map = { - { 0x21, "\x21" }, { 0x22, "\x22" }, { 0x23, "\x23" }, { 0x24, "\x24" }, { 0x25, "\x25" }, { 0x26, "\x26" }, { 0x27, "\x27" }, { 0x28, "\x28" }, { 0x29, "\x29" }, { 0x2A, "\x2A" }, - { 0x2B, "\x2B" }, { 0x2C, "\x2C" }, { 0x2D, "\x2D" }, { 0x2E, "\x2E" }, { 0x2F, "\x2F" }, { 0x30, "\x30" }, { 0x31, "\x31" }, { 0x32, "\x32" }, { 0x33, "\x33" }, { 0x34, "\x34" }, - { 0x35, "\x35" }, { 0x36, "\x36" }, { 0x37, "\x37" }, { 0x38, "\x38" }, { 0x39, "\x39" }, { 0x3A, "\x3A" }, { 0x3B, "\x3B" }, { 0x3C, "\x3C" }, { 0x3D, "\x3D" }, { 0x3E, "\x3E" }, - { 0x3F, "\x3F" }, { 0x40, "\x40" }, { 0x41, "\x41" }, { 0x42, "\x42" }, { 0x43, "\x43" }, { 0x44, "\x44" }, { 0x45, "\x45" }, { 0x46, "\x46" }, { 0x47, "\x47" }, { 0x48, "\x48" }, - { 0x49, "\x49" }, { 0x4A, "\x4A" }, { 0x4B, "\x4B" }, { 0x4C, "\x4C" }, { 0x4D, "\x4D" }, { 0x4E, "\x4E" }, { 0x4F, "\x4F" }, { 0x50, "\x50" }, { 0x51, "\x51" }, { 0x52, "\x52" }, - { 0x53, "\x53" }, { 0x54, "\x54" }, { 0x55, "\x55" }, { 0x56, "\x56" }, { 0x57, "\x57" }, { 0x58, "\x58" }, { 0x59, "\x59" }, { 0x5A, "\x5A" }, { 0x5B, "\x5B" }, { 0x5C, "\x5C" }, - { 0x5D, "\x5D" }, { 0x5E, "\x5E" }, { 0x5F, "\x5F" }, { 0x60, "\x60" }, { 0x61, "\x61" }, { 0x62, "\x62" }, { 0x63, "\x63" }, { 0x64, "\x64" }, { 0x65, "\x65" }, { 0x66, "\x66" }, - { 0x67, "\x67" }, { 0x68, "\x68" }, { 0x69, "\x69" }, { 0x6A, "\x6A" }, { 0x6B, "\x6B" }, { 0x6C, "\x6C" }, { 0x6D, "\x6D" }, { 0x6E, "\x6E" }, { 0x6F, "\x6F" }, { 0x70, "\x70" }, - { 0x71, "\x71" }, { 0x72, "\x72" }, { 0x73, "\x73" }, { 0x74, "\x74" }, { 0x75, "\x75" }, { 0x76, "\x76" }, { 0x77, "\x77" }, { 0x78, "\x78" }, { 0x79, "\x79" }, { 0x7A, "\x7A" }, - { 0x7B, "\x7B" }, { 0x7C, "\x7C" }, { 0x7D, "\x7D" }, { 0x7E, "\x7E" }, { 0xA1, "\xC2\xA1" }, { 0xA2, "\xC2\xA2" }, { 0xA3, "\xC2\xA3" }, { 0xA4, "\xC2\xA4" }, { 0xA5, "\xC2\xA5" }, - { 0xA6, "\xC2\xA6" }, { 0xA7, "\xC2\xA7" }, { 0xA8, "\xC2\xA8" }, { 0xA9, "\xC2\xA9" }, { 0xAA, "\xC2\xAA" }, { 0xAB, "\xC2\xAB" }, { 0xAC, "\xC2\xAC" }, { 0xAE, "\xC2\xAE" }, - { 0xAF, "\xC2\xAF" }, { 0xB0, "\xC2\xB0" }, { 0xB1, "\xC2\xB1" }, { 0xB2, "\xC2\xB2" }, { 0xB3, "\xC2\xB3" }, { 0xB4, "\xC2\xB4" }, { 0xB5, "\xC2\xB5" }, { 0xB6, "\xC2\xB6" }, - { 0xB7, "\xC2\xB7" }, { 0xB8, "\xC2\xB8" }, { 0xB9, "\xC2\xB9" }, { 0xBA, "\xC2\xBA" }, { 0xBB, "\xC2\xBB" }, { 0xBC, "\xC2\xBC" }, { 0xBD, "\xC2\xBD" }, { 0xBE, "\xC2\xBE" }, - { 0xBF, "\xC2\xBF" }, { 0xC0, "\xC3\x80" }, { 0xC1, "\xC3\x81" }, { 0xC2, "\xC3\x82" }, { 0xC3, "\xC3\x83" }, { 0xC4, "\xC3\x84" }, { 0xC5, "\xC3\x85" }, { 0xC6, "\xC3\x86" }, - { 0xC7, "\xC3\x87" }, { 0xC8, "\xC3\x88" }, { 0xC9, "\xC3\x89" }, { 0xCA, "\xC3\x8A" }, { 0xCB, "\xC3\x8B" }, { 0xCC, "\xC3\x8C" }, { 0xCD, "\xC3\x8D" }, { 0xCE, "\xC3\x8E" }, - { 0xCF, "\xC3\x8F" }, { 0xD0, "\xC3\x90" }, { 0xD1, "\xC3\x91" }, { 0xD2, "\xC3\x92" }, { 0xD3, "\xC3\x93" }, { 0xD4, "\xC3\x94" }, { 0xD5, "\xC3\x95" }, { 0xD6, "\xC3\x96" }, - { 0xD7, "\xC3\x97" }, { 0xD8, "\xC3\x98" }, { 0xD9, "\xC3\x99" }, { 0xDA, "\xC3\x9A" }, { 0xDB, "\xC3\x9B" }, { 0xDC, "\xC3\x9C" }, { 0xDD, "\xC3\x9D" }, { 0xDE, "\xC3\x9E" }, - { 0xDF, "\xC3\x9F" }, { 0xE0, "\xC3\xA0" }, { 0xE1, "\xC3\xA1" }, { 0xE2, "\xC3\xA2" }, { 0xE3, "\xC3\xA3" }, { 0xE4, "\xC3\xA4" }, { 0xE5, "\xC3\xA5" }, { 0xE6, "\xC3\xA6" }, - { 0xE7, "\xC3\xA7" }, { 0xE8, "\xC3\xA8" }, { 0xE9, "\xC3\xA9" }, { 0xEA, "\xC3\xAA" }, { 0xEB, "\xC3\xAB" }, { 0xEC, "\xC3\xAC" }, { 0xED, "\xC3\xAD" }, { 0xEE, "\xC3\xAE" }, - { 0xEF, "\xC3\xAF" }, { 0xF0, "\xC3\xB0" }, { 0xF1, "\xC3\xB1" }, { 0xF2, "\xC3\xB2" }, { 0xF3, "\xC3\xB3" }, { 0xF4, "\xC3\xB4" }, { 0xF5, "\xC3\xB5" }, { 0xF6, "\xC3\xB6" }, - { 0xF7, "\xC3\xB7" }, { 0xF8, "\xC3\xB8" }, { 0xF9, "\xC3\xB9" }, { 0xFA, "\xC3\xBA" }, { 0xFB, "\xC3\xBB" }, { 0xFC, "\xC3\xBC" }, { 0xFD, "\xC3\xBD" }, { 0xFE, "\xC3\xBE" }, - { 0xFF, "\xC3\xBF" }, { 0x00, "\xC4\x80" }, { 0x01, "\xC4\x81" }, { 0x02, "\xC4\x82" }, { 0x03, "\xC4\x83" }, { 0x04, "\xC4\x84" }, { 0x05, "\xC4\x85" }, { 0x06, "\xC4\x86" }, - { 0x07, "\xC4\x87" }, { 0x08, "\xC4\x88" }, { 0x09, "\xC4\x89" }, { 0x0A, "\xC4\x8A" }, { 0x0B, "\xC4\x8B" }, { 0x0C, "\xC4\x8C" }, { 0x0D, "\xC4\x8D" }, { 0x0E, "\xC4\x8E" }, - { 0x0F, "\xC4\x8F" }, { 0x10, "\xC4\x90" }, { 0x11, "\xC4\x91" }, { 0x12, "\xC4\x92" }, { 0x13, "\xC4\x93" }, { 0x14, "\xC4\x94" }, { 0x15, "\xC4\x95" }, { 0x16, "\xC4\x96" }, - { 0x17, "\xC4\x97" }, { 0x18, "\xC4\x98" }, { 0x19, "\xC4\x99" }, { 0x1A, "\xC4\x9A" }, { 0x1B, "\xC4\x9B" }, { 0x1C, "\xC4\x9C" }, { 0x1D, "\xC4\x9D" }, { 0x1E, "\xC4\x9E" }, - { 0x1F, "\xC4\x9F" }, { 0x20, "\xC4\xA0" }, { 0x7F, "\xC4\xA1" }, { 0x80, "\xC4\xA2" }, { 0x81, "\xC4\xA3" }, { 0x82, "\xC4\xA4" }, { 0x83, "\xC4\xA5" }, { 0x84, "\xC4\xA6" }, - { 0x85, "\xC4\xA7" }, { 0x86, "\xC4\xA8" }, { 0x87, "\xC4\xA9" }, { 0x88, "\xC4\xAA" }, { 0x89, "\xC4\xAB" }, { 0x8A, "\xC4\xAC" }, { 0x8B, "\xC4\xAD" }, { 0x8C, "\xC4\xAE" }, - { 0x8D, "\xC4\xAF" }, { 0x8E, "\xC4\xB0" }, { 0x8F, "\xC4\xB1" }, { 0x90, "\xC4\xB2" }, { 0x91, "\xC4\xB3" }, { 0x92, "\xC4\xB4" }, { 0x93, "\xC4\xB5" }, { 0x94, "\xC4\xB6" }, - { 0x95, "\xC4\xB7" }, { 0x96, "\xC4\xB8" }, { 0x97, "\xC4\xB9" }, { 0x98, "\xC4\xBA" }, { 0x99, "\xC4\xBB" }, { 0x9A, "\xC4\xBC" }, { 0x9B, "\xC4\xBD" }, { 0x9C, "\xC4\xBE" }, - { 0x9D, "\xC4\xBF" }, { 0x9E, "\xC5\x80" }, { 0x9F, "\xC5\x81" }, { 0xA0, "\xC5\x82" }, { 0xAD, "\xC5\x83" } - }; - return hex_map; - } - - std::unordered_map unicode_to_bytes() { - static std::unordered_map hex_map = { - { "\x21", 0x21 }, { "\x22", 0x22 }, { "\x23", 0x23 }, { "\x24", 0x24 }, { "\x25", 0x25 }, { "\x26", 0x26 }, { "\x27", 0x27 }, { "\x28", 0x28 }, { "\x29", 0x29 }, { "\x2A", 0x2A }, - { "\x2B", 0x2B }, { "\x2C", 0x2C }, { "\x2D", 0x2D }, { "\x2E", 0x2E }, { "\x2F", 0x2F }, { "\x30", 0x30 }, { "\x31", 0x31 }, { "\x32", 0x32 }, { "\x33", 0x33 }, { "\x34", 0x34 }, - { "\x35", 0x35 }, { "\x36", 0x36 }, { "\x37", 0x37 }, { "\x38", 0x38 }, { "\x39", 0x39 }, { "\x3A", 0x3A }, { "\x3B", 0x3B }, { "\x3C", 0x3C }, { "\x3D", 0x3D }, { "\x3E", 0x3E }, - { "\x3F", 0x3F }, { "\x40", 0x40 }, { "\x41", 0x41 }, { "\x42", 0x42 }, { "\x43", 0x43 }, { "\x44", 0x44 }, { "\x45", 0x45 }, { "\x46", 0x46 }, { "\x47", 0x47 }, { "\x48", 0x48 }, - { "\x49", 0x49 }, { "\x4A", 0x4A }, { "\x4B", 0x4B }, { "\x4C", 0x4C }, { "\x4D", 0x4D }, { "\x4E", 0x4E }, { "\x4F", 0x4F }, { "\x50", 0x50 }, { "\x51", 0x51 }, { "\x52", 0x52 }, - { "\x53", 0x53 }, { "\x54", 0x54 }, { "\x55", 0x55 }, { "\x56", 0x56 }, { "\x57", 0x57 }, { "\x58", 0x58 }, { "\x59", 0x59 }, { "\x5A", 0x5A }, { "\x5B", 0x5B }, { "\x5C", 0x5C }, - { "\x5D", 0x5D }, { "\x5E", 0x5E }, { "\x5F", 0x5F }, { "\x60", 0x60 }, { "\x61", 0x61 }, { "\x62", 0x62 }, { "\x63", 0x63 }, { "\x64", 0x64 }, { "\x65", 0x65 }, { "\x66", 0x66 }, - { "\x67", 0x67 }, { "\x68", 0x68 }, { "\x69", 0x69 }, { "\x6A", 0x6A }, { "\x6B", 0x6B }, { "\x6C", 0x6C }, { "\x6D", 0x6D }, { "\x6E", 0x6E }, { "\x6F", 0x6F }, { "\x70", 0x70 }, - { "\x71", 0x71 }, { "\x72", 0x72 }, { "\x73", 0x73 }, { "\x74", 0x74 }, { "\x75", 0x75 }, { "\x76", 0x76 }, { "\x77", 0x77 }, { "\x78", 0x78 }, { "\x79", 0x79 }, { "\x7A", 0x7A }, - { "\x7B", 0x7B }, { "\x7C", 0x7C }, { "\x7D", 0x7D }, { "\x7E", 0x7E }, { "\xC2\xA1", 0xA1 }, { "\xC2\xA2", 0xA2 }, { "\xC2\xA3", 0xA3 }, { "\xC2\xA4", 0xA4 }, { "\xC2\xA5", 0xA5 }, - { "\xC2\xA6", 0xA6 }, { "\xC2\xA7", 0xA7 }, { "\xC2\xA8", 0xA8 }, { "\xC2\xA9", 0xA9 }, { "\xC2\xAA", 0xAA }, { "\xC2\xAB", 0xAB }, { "\xC2\xAC", 0xAC }, { "\xC2\xAE", 0xAE }, - { "\xC2\xAF", 0xAF }, { "\xC2\xB0", 0xB0 }, { "\xC2\xB1", 0xB1 }, { "\xC2\xB2", 0xB2 }, { "\xC2\xB3", 0xB3 }, { "\xC2\xB4", 0xB4 }, { "\xC2\xB5", 0xB5 }, { "\xC2\xB6", 0xB6 }, - { "\xC2\xB7", 0xB7 }, { "\xC2\xB8", 0xB8 }, { "\xC2\xB9", 0xB9 }, { "\xC2\xBA", 0xBA }, { "\xC2\xBB", 0xBB }, { "\xC2\xBC", 0xBC }, { "\xC2\xBD", 0xBD }, { "\xC2\xBE", 0xBE }, - { "\xC2\xBF", 0xBF }, { "\xC3\x80", 0xC0 }, { "\xC3\x81", 0xC1 }, { "\xC3\x82", 0xC2 }, { "\xC3\x83", 0xC3 }, { "\xC3\x84", 0xC4 }, { "\xC3\x85", 0xC5 }, { "\xC3\x86", 0xC6 }, - { "\xC3\x87", 0xC7 }, { "\xC3\x88", 0xC8 }, { "\xC3\x89", 0xC9 }, { "\xC3\x8A", 0xCA }, { "\xC3\x8B", 0xCB }, { "\xC3\x8C", 0xCC }, { "\xC3\x8D", 0xCD }, { "\xC3\x8E", 0xCE }, - { "\xC3\x8F", 0xCF }, { "\xC3\x90", 0xD0 }, { "\xC3\x91", 0xD1 }, { "\xC3\x92", 0xD2 }, { "\xC3\x93", 0xD3 }, { "\xC3\x94", 0xD4 }, { "\xC3\x95", 0xD5 }, { "\xC3\x96", 0xD6 }, - { "\xC3\x97", 0xD7 }, { "\xC3\x98", 0xD8 }, { "\xC3\x99", 0xD9 }, { "\xC3\x9A", 0xDA }, { "\xC3\x9B", 0xDB }, { "\xC3\x9C", 0xDC }, { "\xC3\x9D", 0xDD }, { "\xC3\x9E", 0xDE }, - { "\xC3\x9F", 0xDF }, { "\xC3\xA0", 0xE0 }, { "\xC3\xA1", 0xE1 }, { "\xC3\xA2", 0xE2 }, { "\xC3\xA3", 0xE3 }, { "\xC3\xA4", 0xE4 }, { "\xC3\xA5", 0xE5 }, { "\xC3\xA6", 0xE6 }, - { "\xC3\xA7", 0xE7 }, { "\xC3\xA8", 0xE8 }, { "\xC3\xA9", 0xE9 }, { "\xC3\xAA", 0xEA }, { "\xC3\xAB", 0xEB }, { "\xC3\xAC", 0xEC }, { "\xC3\xAD", 0xED }, { "\xC3\xAE", 0xEE }, - { "\xC3\xAF", 0xEF }, { "\xC3\xB0", 0xF0 }, { "\xC3\xB1", 0xF1 }, { "\xC3\xB2", 0xF2 }, { "\xC3\xB3", 0xF3 }, { "\xC3\xB4", 0xF4 }, { "\xC3\xB5", 0xF5 }, { "\xC3\xB6", 0xF6 }, - { "\xC3\xB7", 0xF7 }, { "\xC3\xB8", 0xF8 }, { "\xC3\xB9", 0xF9 }, { "\xC3\xBA", 0xFA }, { "\xC3\xBB", 0xFB }, { "\xC3\xBC", 0xFC }, { "\xC3\xBD", 0xFD }, { "\xC3\xBE", 0xFE }, - { "\xC3\xBF", 0xFF }, { "\xC4\x80", 0x00 }, { "\xC4\x81", 0x01 }, { "\xC4\x82", 0x02 }, { "\xC4\x83", 0x03 }, { "\xC4\x84", 0x04 }, { "\xC4\x85", 0x05 }, { "\xC4\x86", 0x06 }, - { "\xC4\x87", 0x07 }, { "\xC4\x88", 0x08 }, { "\xC4\x89", 0x09 }, { "\xC4\x8A", 0x0A }, { "\xC4\x8B", 0x0B }, { "\xC4\x8C", 0x0C }, { "\xC4\x8D", 0x0D }, { "\xC4\x8E", 0x0E }, - { "\xC4\x8F", 0x0F }, { "\xC4\x90", 0x10 }, { "\xC4\x91", 0x11 }, { "\xC4\x92", 0x12 }, { "\xC4\x93", 0x13 }, { "\xC4\x94", 0x14 }, { "\xC4\x95", 0x15 }, { "\xC4\x96", 0x16 }, - { "\xC4\x97", 0x17 }, { "\xC4\x98", 0x18 }, { "\xC4\x99", 0x19 }, { "\xC4\x9A", 0x1A }, { "\xC4\x9B", 0x1B }, { "\xC4\x9C", 0x1C }, { "\xC4\x9D", 0x1D }, { "\xC4\x9E", 0x1E }, - { "\xC4\x9F", 0x1F }, { "\xC4\xA0", 0x20 }, { "\xC4\xA1", 0x7F }, { "\xC4\xA2", 0x80 }, { "\xC4\xA3", 0x81 }, { "\xC4\xA4", 0x82 }, { "\xC4\xA5", 0x83 }, { "\xC4\xA6", 0x84 }, - { "\xC4\xA7", 0x85 }, { "\xC4\xA8", 0x86 }, { "\xC4\xA9", 0x87 }, { "\xC4\xAA", 0x88 }, { "\xC4\xAB", 0x89 }, { "\xC4\xAC", 0x8A }, { "\xC4\xAD", 0x8B }, { "\xC4\xAE", 0x8C }, - { "\xC4\xAF", 0x8D }, { "\xC4\xB0", 0x8E }, { "\xC4\xB1", 0x8F }, { "\xC4\xB2", 0x90 }, { "\xC4\xB3", 0x91 }, { "\xC4\xB4", 0x92 }, { "\xC4\xB5", 0x93 }, { "\xC4\xB6", 0x94 }, - { "\xC4\xB7", 0x95 }, { "\xC4\xB8", 0x96 }, { "\xC4\xB9", 0x97 }, { "\xC4\xBA", 0x98 }, { "\xC4\xBB", 0x99 }, { "\xC4\xBC", 0x9A }, { "\xC4\xBD", 0x9B }, { "\xC4\xBE", 0x9C }, - { "\xC4\xBF", 0x9D }, { "\xC5\x80", 0x9E }, { "\xC5\x81", 0x9F }, { "\xC5\x82", 0xA0 }, { "\xC5\x83", 0xAD } - }; - return hex_map; - } - - // len must be available - bool inline str_is_equal(const char* str1, const char* str2, size_t len) { - for (size_t i = 0; i < len; ++i) { - if (str1[i] != str2[i]) { - return false; - } - } - return true; - } - - std::vector bpe_gpt2_preprocess(const std::string& text) { - static std::unordered_map< unsigned char, std::string> byte_encoder = bytes_to_unicode(); - std::vector bpe_words; - std::vector bpe_encoded_words; - - std::string token=""; - const char *raw_text_p = text.c_str(); - // GPT2 system regex: 's|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+ - bool collecting_numeric = false; - bool collecting_letter = false; - bool collecting_special = false; - bool collecting_whitespace_lookahead = false; - bool collecting=false; - - std::vector text_utf; - text_utf.reserve(text.size()); - bpe_words.reserve(text.size()); - bpe_encoded_words.reserve(text.size()); - - text_utf = CNCTUnicode::split_utf8_enhanced(text); - - for (int i = 0; i < (int)text_utf.size(); i++) { - const CNCTString &utf_char = text_utf[i]; - bool split_condition = false; - const char *text_pos = raw_text_p + utf_char.seq_offset_bytes; - int bytes_remain = strlen(text_pos); - // forward backward lookups - const CNCTString &utf_char_next = (i+1 < (int)text_utf.size()) ? text_utf[i+1] : CNCTString(); - const CNCTString &utf_char_next_next = (i+2 < (int)text_utf.size()) ? text_utf[i+2] : CNCTString(); - // const CNCTString &utf_char_prev = (i > 0) ? text_utf[i-1] : CNCTString(); - - // handling contractions - if (!split_condition && bytes_remain >= 2) { - // 's|'t|'m|'d - if (utf_char == '\'' && (utf_char_next == 's' || utf_char_next == 't' || utf_char_next == 'm' || utf_char_next == 'd')) { - split_condition = true; - } - if (split_condition) { - if (token.size()) { - bpe_words.emplace_back(token); // push previous content as token - } - token = utf_char.str + utf_char_next.str; - bpe_words.emplace_back(token); - token=""; - i++; - continue; - } - } - if (!split_condition && bytes_remain >= 3) { - // 're|'ve|'ll - if (utf_char == '\'' && ( - (utf_char_next == 'r' || utf_char_next_next == 'e') || - (utf_char_next == 'v' || utf_char_next_next == 'e') || - (utf_char_next == 'l' || utf_char_next_next == 'l')) - ) { - split_condition = true; - } - if (split_condition) { - // current token + next token can be defined - if (token.size()) { - bpe_words.emplace_back(token); // push previous content as token - } - token = utf_char.str + utf_char_next.str + utf_char_next_next.str; - bpe_words.emplace_back(token); // the contraction - token=""; - i+=2; - continue; - } - } - - if (!split_condition && !collecting) { - if (utf_char.char_type == CNCTCharType::LETTER || (!token.size() && utf_char==" " && utf_char_next.char_type == CNCTCharType::LETTER)) { - collecting_letter = true; - collecting = true; - } else if (utf_char.char_type == CNCTCharType::DIGIT || (!token.size() && utf_char==" " && utf_char_next.char_type == CNCTCharType::DIGIT)) { - collecting_numeric = true; - collecting = true; - } else if ( - ((utf_char.char_type != CNCTCharType::LETTER && utf_char.char_type != CNCTCharType::DIGIT) && (utf_char.char_type != CNCTCharType::WHITESPACE)) || - (!token.size() && utf_char==" " && utf_char_next.char_type != CNCTCharType::LETTER && utf_char_next.char_type != CNCTCharType::DIGIT && utf_char_next.char_type != CNCTCharType::WHITESPACE) - ) { - collecting_special = true; - collecting = true; - } else if (utf_char.char_type == CNCTCharType::WHITESPACE && utf_char_next.char_type == CNCTCharType::WHITESPACE) { - collecting_whitespace_lookahead = true; - collecting = true; - } else if (utf_char.char_type == CNCTCharType::WHITESPACE) { - split_condition = true; - } - } else if (!split_condition && collecting) { - if (collecting_letter && utf_char.char_type != CNCTCharType::LETTER) { - split_condition = true; - } else if (collecting_numeric && utf_char.char_type != CNCTCharType::DIGIT) { - split_condition = true; - } else if (collecting_special && (utf_char.char_type == CNCTCharType::LETTER || utf_char.char_type == CNCTCharType::DIGIT || utf_char.char_type == CNCTCharType::WHITESPACE)) { - split_condition = true; - } else if (collecting_whitespace_lookahead && utf_char_next.char_type != CNCTCharType::WHITESPACE) { - split_condition = true; - } - } - - if(utf_char_next.str.size() == 0) { - split_condition = true; // final - token += utf_char.str; - } - - if (split_condition) { - if (token.size()) { - bpe_words.emplace_back(token); - } - token = utf_char.str; - collecting = false; - collecting_letter = false; - collecting_numeric = false; - collecting_special = false; - collecting_whitespace_lookahead = false; - } else { - token += utf_char.str; - } - } - - for (std::string& word : bpe_words) { - std::string encoded_token=""; - for (char& c : word) { - encoded_token += byte_encoder[c]; - } - bpe_encoded_words.emplace_back(encoded_token); - } - - return bpe_encoded_words; - } - - // decoder (for one token) - std::string decode_token(const std::string& token) { - static std::unordered_map< std::string, unsigned char> byte_decoder = unicode_to_bytes(); - std::string decoded_token=""; - auto unicode_seqeunces = CNCTUnicode::split_utf8(token); - for (auto& unicode_sequence : unicode_seqeunces) { - decoded_token += byte_decoder[unicode_sequence]; - } - - return decoded_token; - } - - const gpt2bpe_vocab & vocab_; - std::vector symbols_; - std::vector symbols_final; - ggllm_bpe_bigram::queue work_queue_; - bool flag_g2ws=false; -}; - -static std::vector gpt2bpe_tokenize(const gpt2bpe_vocab & vocab, const std::string & text, bool bos, bool g2ws ) { - gpt2bpe_tokenizer tokenizer(vocab, g2ws); - std::vector output; - - if (text.empty()) { - return output; - } - - if (bos && vocab.special_bos_id != -1) { - output.push_back(vocab.special_bos_id); - } - - tokenizer.tokenize(text, output); - return output; -} - -#endif // CMPNCT_GPT2BPE diff --git a/examples/gptneox-wip/falcon-main.cpp b/examples/gptneox-wip/falcon-main.cpp deleted file mode 100644 index e9197f6b5..000000000 --- a/examples/gptneox-wip/falcon-main.cpp +++ /dev/null @@ -1,1111 +0,0 @@ -#include "ggml.h" -#include "cmpnct_gpt2bpe.hpp" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#if defined(_MSC_VER) -#pragma warning(disable: 4244 4267) // possible loss of data -#endif - -// default hparams -struct falcon_hparams { - size_t n_merges = 0; - size_t n_vocab = 0; - uint32_t n_ctx = 0; - uint32_t n_embd = 0; - uint32_t n_head = 0; - uint32_t n_head_kv = 1; // Needs to be 1 for 7B model - uint32_t n_ff = 0; - uint32_t n_block = 0; - float norm_eps = 1e-5; -}; -struct falcon_block { - // normalization - struct ggml_tensor* input_layernorm; - struct ggml_tensor* input_layernorm_b; - struct ggml_tensor* attention_norm; // Falcon-40B only - struct ggml_tensor* attention_norm_b; // Falcon-40B only - - // attention - struct ggml_tensor* query_key_value; - struct ggml_tensor* wo; - - // ff - struct ggml_tensor* ffn_up; - struct ggml_tensor* ffn_down; -}; - -struct falcon_model { - falcon_hparams hparams; - - struct ggml_tensor* tok_embeddings; - struct ggml_tensor* output_norm; - struct ggml_tensor* output_norm_b; - struct ggml_tensor* lm_head; - - std::vector blocks; - - // key + value memory - struct ggml_tensor* memory_k; - struct ggml_tensor* memory_v; - - struct gguf_context * ggufctx; - struct ggml_context * ctx; - struct ggml_context * kvctx; - - std::map tensors; -}; - -struct gpt_params { - int32_t seed = -1; // RNG seed - int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency()); - uint32_t n_predict = 200; // new tokens to predict - uint32_t n_batch = 512; // batch size for prompt processing - - // sampling parameters - int32_t top_k = 40; - float top_p = 1.0f; - float temp = 0.8f; - int32_t repeat_last_n = 64; - float repeat_penalty = 1.02f; - - std::string model = ""; // model path - std::string prompt = ""; - - std::string token_test = ""; - bool interactive = false; - int32_t interactive_port = -1; - int32_t n_gpu_layers = 0; -}; - -void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { - fprintf(stderr, "usage: %s [options]\n", argv[0]); - fprintf(stderr, "\n"); - fprintf(stderr, "options:\n"); - fprintf(stderr, " -h, --help show this help message and exit\n"); - fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1)\n"); - fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); - fprintf(stderr, " -ngl N, --gpu-layers N number of layers to offload to GPU on supported models (default: %d)\n", params.n_gpu_layers); - fprintf(stderr, " -p PROMPT, --prompt PROMPT\n"); - fprintf(stderr, " prompt to start generation with (default: random)\n"); - fprintf(stderr, " -f FNAME, --file FNAME\n"); - fprintf(stderr, " load prompt from a file\n"); - fprintf(stderr, " -tt TOKEN_TEST, --token_test TOKEN_TEST\n"); - fprintf(stderr, " test tokenization\n"); - fprintf(stderr, " -n N, --n_predict N number of tokens to predict (default: %d)\n", params.n_predict); - fprintf(stderr, " --top_k N top-k sampling, 0 = n_vocab (default: %d)\n", params.top_k); - fprintf(stderr, " --top_p N top-p sampling (default: %.1f)\n", params.top_p); - fprintf(stderr, " --temp N temperature (default: %.1f)\n", params.temp); - fprintf(stderr, " --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled)\n", params.repeat_last_n); - fprintf(stderr, " --repeat-penalty N penalize repeat sequence of tokens (default: %.2f, 1.0 = disabled)\n", (double)params.repeat_penalty); - fprintf(stderr, " -b N, --batch_size N batch size for prompt processing (default: %d)\n", params.n_batch); - fprintf(stderr, " -m FNAME, --model FNAME\n"); - fprintf(stderr, " model path (default: %s)\n", params.model.c_str()); - fprintf(stderr, "\n"); -} - -// Function to check if the next argument exists -std::string get_next_arg(int& i, int argc, char** argv, const std::string& flag, gpt_params& params) { - if (i + 1 < argc && argv[i + 1][0] != '-') { - return argv[++i]; - } else { - fprintf(stderr, "error: %s requires one argument.\n", flag.c_str()); - gpt_print_usage(argc, argv, params); - exit(0); - } -} - -bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { - for (int i = 1; i < argc; i++) { - std::string arg = argv[i]; - - if (arg == "-s" || arg == "--seed") { - params.seed = std::stoi(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "-t" || arg == "--threads") { - params.n_threads = std::stoi(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "-ngl" || arg == "--gpu-layers" || arg == "--n-gpu-layers") { - params.n_gpu_layers = std::stoi(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "-p" || arg == "--prompt") { - params.prompt = get_next_arg(i, argc, argv, arg, params); - } else if (arg == "-n" || arg == "--n_predict") { - params.n_predict = std::stoi(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "--top_k") { - params.top_k = std::stoi(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "--top_p") { - params.top_p = std::stof(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "--temp") { - params.temp = std::stof(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "--repeat-last-n") { - params.repeat_last_n = std::stoi(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "--repeat-penalty") { - params.repeat_penalty = std::stof(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "-b" || arg == "--batch_size") { - params.n_batch= std::stoi(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "-m" || arg == "--model") { - params.model = get_next_arg(i, argc, argv, arg, params); - } else if (arg == "-i" || arg == "--interactive") { - params.interactive = true; - } else if (arg == "-ip" || arg == "--interactive-port") { - params.interactive = true; - params.interactive_port = std::stoi(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "-h" || arg == "--help") { - gpt_print_usage(argc, argv, params); - exit(0); - } else if (arg == "-f" || arg == "--file") { - get_next_arg(i, argc, argv, arg, params); - std::ifstream file(argv[i]); - if (!file) { - fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); - break; - } - std::copy(std::istreambuf_iterator(file), std::istreambuf_iterator(), back_inserter(params.prompt)); - if (params.prompt.back() == '\n') { - params.prompt.pop_back(); - } - } else if (arg == "-tt" || arg == "--token_test") { - params.token_test = get_next_arg(i, argc, argv, arg, params); - } - else { - fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); - gpt_print_usage(argc, argv, params); - exit(0); - } - } - - return true; -} - -gpt2bpe_vocab::id sample_top_k_top_p_repeat( - const gpt2bpe_vocab & vocab, - const float * logits, - const int32_t * last_n_tokens_data, - size_t last_n_tokens_data_size, - int top_k, - double top_p, - double temp, - int repeat_last_n, - float repeat_penalty, - std::mt19937 & rng) { - - int n_logits = vocab.id_to_token.size(); - - const auto * plogits = logits; - - const auto last_n_tokens = std::vector(last_n_tokens_data, last_n_tokens_data + last_n_tokens_data_size); - - if (temp <= 0) { - // select the token with the highest logit directly - float max_logit = plogits[0]; - gpt2bpe_vocab::id max_id = 0; - - for (int i = 1; i < n_logits; ++i) { - if (plogits[i] > max_logit) { - max_logit = plogits[i]; - max_id = i; - } - } - return max_id; - } - - - std::vector> logits_id; - logits_id.reserve(n_logits); - - { - const float scale = 1.0f/temp; - for (int i = 0; i < n_logits; ++i) { - // repetition penalty from ctrl paper (https://arxiv.org/abs/1909.05858) - // credit https://github.com/facebookresearch/llama/compare/main...shawwn:llama:main - if (repeat_last_n > 0 && std::find(last_n_tokens.end()-repeat_last_n, last_n_tokens.end(), i) != last_n_tokens.end()) { - // if score < 0 then repetition penalty has to multiplied to reduce the previous token probability - if (plogits[i] < 0.0f) { - logits_id.push_back(std::make_pair(plogits[i]*scale*repeat_penalty, i)); - } else { - logits_id.push_back(std::make_pair(plogits[i]*scale/repeat_penalty, i)); - } - } else { - logits_id.push_back(std::make_pair(plogits[i]*scale, i)); - } - } - } - - // find the top K tokens - std::partial_sort( - logits_id.begin(), - logits_id.begin() + top_k, logits_id.end(), - [](const std::pair & a, const std::pair & b) { - return a.first > b.first; - }); - - logits_id.resize(top_k); - - double maxl = -INFINITY; - for (const auto & kv : logits_id) { - maxl = std::max(maxl, kv.first); - } - - // compute probs for the top K tokens - std::vector probs; - probs.reserve(logits_id.size()); - - double sum = 0.0; - for (const auto & kv : logits_id) { - double p = exp(kv.first - maxl); - probs.push_back(p); - sum += p; - } - - // normalize the probs - for (auto & p : probs) { - p /= sum; - } - - if (top_p < 1.0f) { - double cumsum = 0.0f; - for (int i = 0; i < top_k; i++) { - cumsum += probs[i]; - if (cumsum >= top_p) { - top_k = i + 1; - probs.resize(top_k); - logits_id.resize(top_k); - break; - } - } - - cumsum = 1.0/cumsum; - for (int i = 0; i < (int) probs.size(); i++) { - probs[i] *= cumsum; - } - } - -// printf("\n"); -// for (int i = 0; i < (int) probs.size(); i++) { -// for (int i = 0; i < 10; i++) { -// printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), probs[i]); -// } - - std::discrete_distribution<> dist(probs.begin(), probs.end()); - int idx = dist(rng); - - return logits_id[idx].second; - -} - -struct ggml_tensor * get_tensor_ex( struct ggml_context * ctx, std::string name){ - - struct ggml_tensor * cur = ggml_get_tensor(ctx, name.c_str()); - if( cur == NULL ) { - printf("%s: tensor '%s' not found!\n", __func__, name.c_str()); - } else { -// printf("%s: n_dims = %d, name = '%s'\n", __func__, cur->n_dims, cur->name); - } - - return cur; -} - -// load the model's weights from a file -bool falcon_model_load(const std::string & fname, falcon_model & model, gpt2bpe_vocab & vocab) { - printf("%s: loading model from '%s'..\n", __func__, fname.c_str()); - - model.ctx = NULL; - - struct gguf_init_params ggufparams = { - /*.no_alloc = */ false, - /*.ctx = */ &model.ctx, - }; - - auto & ggufctx = model.ggufctx; - - ggufctx = gguf_init_from_file(fname.c_str(), ggufparams); - - if (!ggufctx) { - fprintf(stderr, "%s: gguf_init_from_file() failed\n", __func__); - return false; - } - - printf("%s: gguf version = %d\n", __func__, gguf_get_version(ggufctx)); - printf("%s: gguf alignment = %zu\n", __func__, gguf_get_alignment(ggufctx)); - printf("%s: gguf data offset = %zu\n", __func__, gguf_get_data_offset(ggufctx)); - - // print all kv - #if 0 - { - const int n_kv = gguf_get_n_kv(ggufctx); - - printf("%s: n_kv: %d\n", __func__, n_kv); - - for (int i = 0; i < n_kv; ++i) { - const char * key = gguf_get_key(ggufctx, i); - - printf("%s: kv[%d]: key = %s\n", __func__, i, key); - } - } - #endif - - // print some standard metadata - { - int keyidx; - - keyidx = gguf_find_key(ggufctx, "general.name"); - if (keyidx != -1) { printf("%s: model name = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } - keyidx = gguf_find_key(ggufctx, "general.description"); - if (keyidx != -1) { printf("%s: model description = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } - keyidx = gguf_find_key(ggufctx, "general.author"); - if (keyidx != -1) { printf("%s: model author = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } - keyidx = gguf_find_key(ggufctx, "general.license"); - if (keyidx != -1) { printf("%s: model license = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } - keyidx = gguf_find_key(ggufctx, "general.architecture"); - if (keyidx != -1) { printf("%s: model architecture = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } - keyidx = gguf_find_key(ggufctx, "general.file_type"); - if (keyidx != -1) { printf("%s: model file type = %" PRIu32 "\n", __func__, gguf_get_val_u32(ggufctx, keyidx)); } - keyidx = gguf_find_key(ggufctx, "gptneox.tensor_data_layout"); - if (keyidx != -1) { printf("%s: model data layout = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } - keyidx = gguf_find_key(ggufctx, "general.source.huggingface.repository"); - if (keyidx != -1) { printf("%s: model source HF repo = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } - } - - // check required metadata - { - int keyidx; - - // check model architecture kv - keyidx = gguf_find_key(ggufctx, "general.architecture"); - if (keyidx != -1) { - if ( strcmp(gguf_get_val_str(ggufctx, keyidx), "falcon") != 0) { - printf("%s: model architecture not supported!\n", __func__); - return false; - } - } else { - printf("%s: gguf model architecture not found!\n", __func__); - return false; - } - - // check model tensor data layout kv - keyidx = gguf_find_key(ggufctx, "falcon.tensor_data_layout"); - if (keyidx != -1) { - if ( strcmp(gguf_get_val_str(ggufctx, keyidx), "jploski") != 0) { - printf("%s: model tensor data layout not supported!\n", __func__); - return false; - } - } else { - printf("%s: gguf model tensor data layout not found!\n", __func__); - return false; - } - - } - - // load hparams - { - auto & hparams = model.hparams; - - bool ok = true; - int keyidx; - - if (ok) { keyidx = gguf_find_key(ggufctx, "falcon.context_length"); - if (keyidx != -1) { hparams.n_ctx = gguf_get_val_u32(ggufctx, keyidx); } else { ok = false; } } - - if (ok) { keyidx = gguf_find_key(ggufctx, "falcon.embedding_length"); - if (keyidx != -1) { hparams.n_embd = gguf_get_val_u32(ggufctx, keyidx); } else { ok = false; } } - - if (ok) { keyidx = gguf_find_key(ggufctx, "falcon.attention.head_count"); - if (keyidx != -1) { hparams.n_head = gguf_get_val_u32(ggufctx, keyidx); } else { ok = false; } } - - if (ok) { keyidx = gguf_find_key(ggufctx, "falcon.feed_forward_length"); - if (keyidx != -1) { hparams.n_ff = gguf_get_val_u32(ggufctx, keyidx); } else { ok = false; } } - - if (ok) { keyidx = gguf_find_key(ggufctx, "falcon.block_count"); - if (keyidx != -1) { hparams.n_block = gguf_get_val_u32(ggufctx, keyidx); } else { ok = false; } } - - if (ok) { keyidx = gguf_find_key(ggufctx, "falcon.attention.layer_norm_epsilon"); - if (keyidx != -1) { hparams.norm_eps= gguf_get_val_f32(ggufctx, keyidx); } else { ok = false; } } - - if (!ok) { - fprintf(stderr, "%s: required hparam missing!\n", __func__); - return false; - } - - keyidx = gguf_find_key(ggufctx, "falcon.attention.head_count_kv"); - if (keyidx != -1) { hparams.n_head_kv = gguf_get_val_u32(ggufctx, keyidx); } - - - printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); - printf("%s: n_embd = %d\n", __func__, hparams.n_embd); - printf("%s: n_head = %d\n", __func__, hparams.n_head); - printf("%s: n_head_kv = %d\n", __func__, hparams.n_head_kv); - printf("%s: n_block = %d\n", __func__, hparams.n_block); - printf("%s: norm_eps = %g\n", __func__, hparams.norm_eps); - - } - - // load vocab - { - auto & hparams = model.hparams; - - int keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.model"); - - if (keyidx != -1) { - if ( strcmp(gguf_get_val_str(ggufctx, keyidx), "gpt2") != 0) { - printf("%s: tokenizer model not supported!\n", __func__); - return false; - } - } else { - printf("%s: tokenizer model not found!\n", __func__); - return false; - } - - - int tokens_keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.tokens"); - - if (tokens_keyidx == -1) { - printf("%s: gpt2 tokenizer vocab not found!\n", __func__); - return false; - } - - int merges_keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.merges"); - - if (merges_keyidx == -1) { - printf("%s: gpt2 tokenizer merges not found!\n", __func__); - return false; - } - - hparams.n_vocab = gguf_get_arr_n(ggufctx,tokens_keyidx); - hparams.n_merges = gguf_get_arr_n(ggufctx,merges_keyidx); - - printf("%s: gpt2 tokenizer vocab = %zu\n", __func__, hparams.n_vocab); - printf("%s: gpt2 tokenizer merges = %zu\n", __func__, hparams.n_merges); - - for (size_t i = 0; i < hparams.n_vocab; i++) { - std::string word = gguf_get_arr_str(ggufctx, tokens_keyidx, i); - -// printf("token %d = '%s'\n",i,word.c_str() ); - - vocab.token_to_id[word] = i; - vocab.id_to_token[i] = word; - - if( vocab.id_to_token[i] == "\n" ) { - vocab.linefeed_id = i; - } - } - - std::vector> bpe_merges; - - for (size_t i = 0; i < hparams.n_merges; i++) { - - std::string word = gguf_get_arr_str(ggufctx, merges_keyidx, i); - - // Split the merges - std::string first, second; - size_t pos = word.find(' ', 1); // Start the search from the second character - if (pos != std::string::npos) { - first = word.substr(0, pos); - second = word.substr(pos + 1); - } - - bpe_merges.push_back(std::make_pair(first, second)); - } - - vocab.populate_bpe_ranks(bpe_merges); - - - keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.bos_token_id"); if( keyidx != -1 ) { vocab.special_bos_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); } - keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.eos_token_id"); if( keyidx != -1 ) { vocab.special_eos_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); } - keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.unknown_token_id"); if( keyidx != -1 ) { vocab.special_unk_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); } - keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.separator_token_id"); if( keyidx != -1 ) { vocab.special_sep_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); } - keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.padding_token_id"); if( keyidx != -1 ) { vocab.special_pad_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); } - - if( vocab.special_bos_id != -1 ) { printf("%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].c_str() ); } - if( vocab.special_eos_id != -1 ) { printf("%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].c_str() ); } - if( vocab.special_unk_id != -1 ) { printf("%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].c_str() ); } - if( vocab.special_sep_id != -1 ) { printf("%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].c_str() ); } - if( vocab.special_pad_id != -1 ) { printf("%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].c_str() ); } - if( vocab.linefeed_id != -1 ) { printf("%s: LF token = %d\n", __func__, vocab.linefeed_id ); } - - } - - - auto & ctx = model.ctx; - size_t ctx_size = ggml_get_mem_size(ctx); - - printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0)); - - // print tensor info - #if 0 - { - const int n_tensors = gguf_get_n_tensors(ggufctx); - - printf("%s: n_tensors: %d\n", __func__, n_tensors); - - for (int i = 0; i < n_tensors; ++i) { - const char * name = gguf_get_tensor_name (ggufctx, i); - const size_t offset = gguf_get_tensor_offset(ggufctx, i); - - printf("%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset); - } - } - #endif - - // prepare memory for the weights - { - - auto & hparams = model.hparams; - - const int n_block = hparams.n_block; - - model.blocks.resize(n_block); - - model.tok_embeddings = ggml_get_tensor(ctx, "token_embd.weight"); - - model.output_norm = ggml_get_tensor(ctx, "output_norm.weight"); - model.output_norm_b = ggml_get_tensor(ctx, "output_norm.bias"); - model.lm_head = ggml_get_tensor(ctx, "output.weight"); - - // map by name - model.tensors["token_embd.weight"] = model.tok_embeddings; - model.tensors["output_norm.weight"] = model.output_norm; - model.tensors["output_norm.bias"] = model.output_norm_b; - model.tensors["output.weight"] = model.lm_head; - - for (int i = 0; i < n_block; ++i) { - - auto& block = model.blocks[i]; - std::string blocknamestart = "blk." + std::to_string(i) + "."; - - block.input_layernorm = get_tensor_ex(ctx, blocknamestart + "attn_norm.weight" ); - block.input_layernorm_b = get_tensor_ex(ctx, blocknamestart + "attn_norm.bias" ); - - if ( hparams.n_head_kv == 8 ) { // Falcon-40B - block.attention_norm = get_tensor_ex(ctx, blocknamestart + "attn_norm_2.weight" ); - block.attention_norm_b = get_tensor_ex(ctx, blocknamestart + "attn_norm_2.bias" ); - } - - // query_key_value shape for config.multi_query == True: - block.query_key_value = get_tensor_ex(ctx, blocknamestart + "attn_qkv.weight" ); - block.wo = get_tensor_ex(ctx, blocknamestart + "attn_output.weight" ); - - block.ffn_up = get_tensor_ex(ctx, blocknamestart + "ffn_up.weight" ); - block.ffn_down = get_tensor_ex(ctx, blocknamestart + "ffn_down.weight" ); - - // map by name - if ( hparams.n_head_kv == 8 ) { // Falcon-40B - // Falcon-40B: - model.tensors[blocknamestart + "attn_norm.weight"] = block.input_layernorm; - model.tensors[blocknamestart + "attn_norm.bias"] = block.input_layernorm_b; - model.tensors[blocknamestart + "attn_norm_2.weight"] = block.attention_norm; - model.tensors[blocknamestart + "attn_norm_2.bias"] = block.attention_norm_b; - } else { - // Falcon-7B: - model.tensors[blocknamestart + "attn_norm.weight"] = block.input_layernorm; - model.tensors[blocknamestart + "attn_norm.bias"] = block.input_layernorm_b; - } - - model.tensors[blocknamestart + "attn_qkv.weight"] = block.query_key_value; - model.tensors[blocknamestart + "attn_output.weight"] = block.wo; - - model.tensors[blocknamestart + "ffn_up.weight"] = block.ffn_up; - model.tensors[blocknamestart + "ffn_down.weight"] = block.ffn_down; - } - } - - // key + value memory - { - const auto & kvctx = model.kvctx; - const auto & hparams = model.hparams; - - const int n_block = hparams.n_block; - const int n_ctx = hparams.n_ctx; - const int n_embd = hparams.n_embd; - - const int64_t n_mem = n_block*n_ctx; - const int64_t n_elements = n_embd*n_mem; - - // create the ggml context - { - struct ggml_init_params params = { - /*.mem_size =*/ size_t(n_elements*4+ggml_tensor_overhead()*2), - /*.mem_buffer =*/ NULL, - /*.no_alloc =*/ false, - }; - - model.kvctx = ggml_init(params); - if (!model.kvctx) { - fprintf(stderr, "%s: kv ggml_init() failed\n", __func__); - return false; - } - - } - - - model.memory_k = ggml_new_tensor_1d(kvctx, GGML_TYPE_F16, n_elements); - model.memory_v = ggml_new_tensor_1d(kvctx, GGML_TYPE_F16, n_elements); - - const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v); - - printf("%s: memory_size = %8.2f MB, n_mem = %" PRId64 "\n", __func__, memory_size/1024.0/1024.0, n_mem); - } - - return true; -} - - -// evaluate the transformer -// -// - model: the model -// - n_threads: number of threads to use -// - n_past: the context size so far -// - embd_inp: the embeddings of the tokens in the context -// - embd_w: the predicted logits for the next token -// -bool falcon_eval( - const falcon_model & model, - const int n_threads, - const int n_past, - const std::vector & embd_inp, - std::vector & embd_w, - size_t & mem_per_token) { - - - const int N = embd_inp.size(); - - const auto & hparams = model.hparams; - - const int n_embd = hparams.n_embd; - const int n_block = hparams.n_block; - const int n_ctx = hparams.n_ctx; - const int n_head = hparams.n_head; - const int n_head_kv = hparams.n_head_kv; - const int n_vocab = hparams.n_vocab; - const size_t head_dim = n_embd / n_head; - - static size_t buf_size = 256u*1024*1024; - static void * buf = malloc(buf_size); - - // use 2 scratch buffers - // TODO: very hacky solution - reimplement in a more elegant way - static size_t scr0_size = 256u*1024*1024; - static void * scr0 = malloc(scr0_size); - - static size_t scr1_size = 256u*1024*1024; - static void * scr1 = malloc(scr1_size); - - if (mem_per_token > 0 && mem_per_token*N > buf_size) { - const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead - //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new); - - // reallocate - buf_size = buf_size_new; - buf = realloc(buf, buf_size); - if (buf == nullptr) { - fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size); - return false; - } - } - - struct ggml_init_params params = { - /*.mem_size =*/ buf_size, - /*.mem_buffer =*/ buf, - /*.no_alloc =*/ false, - }; - - struct ggml_context * ctx0 = ggml_init(params); - struct ggml_cgraph gf = {}; -// gf.n_threads = n_threads; - - struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); - memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd)); - - // wte - struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd); -// struct ggml_tensor* repeat_dummy = ggml_new_tensor_3d(ctx0, inpL->type, head_dim, N + n_past, n_head); - - ggml_type wtype = GGML_TYPE_F32; - const int sizeof_wtype = ggml_type_sizef(wtype); - - for (int il = 0; il < n_block; ++il) { - struct ggml_tensor * cur; - struct ggml_tensor * layernorm_output; - - ggml_set_scratch(ctx0, { 0, scr0_size, scr0, }); - - // self-attention - { - layernorm_output = ggml_norm(ctx0, inpL); - - layernorm_output = ggml_add(ctx0, - ggml_mul(ctx0, - ggml_repeat(ctx0, model.blocks[il].input_layernorm, layernorm_output), - layernorm_output), - ggml_repeat(ctx0, model.blocks[il].input_layernorm_b, layernorm_output)); - - if ( hparams.n_head_kv == 8 ) { // Falcon-40B - cur = ggml_norm(ctx0, inpL); - - cur = ggml_add(ctx0, - ggml_mul(ctx0, - ggml_repeat(ctx0, model.blocks[il].attention_norm, cur), - cur), - ggml_repeat(ctx0, model.blocks[il].attention_norm_b, cur)); - } - else { // Falcon 7B - cur = layernorm_output; - } - - // compute QKV - - cur = ggml_mul_mat(ctx0, model.blocks[il].query_key_value, cur); - - // Note that the strides for Kcur, Vcur are set up so that the - // resulting views are misaligned with the tensor's storage - // (by applying the K/V offset we shift the tensor's original - // view to stick out behind the viewed QKV tensor's allocated - // memory, so to say). This is ok because no actual accesses - // happen to that out-of-range memory, but it can require some - // trickery when trying to accurately dump these views for - // debugging. - - struct ggml_tensor * Qcur = ggml_view_3d( - ctx0, cur, head_dim, n_head, N, - head_dim * sizeof_wtype, - head_dim * (n_head + 2 * n_head_kv) * sizeof_wtype, - 0); - - struct ggml_tensor * Kcur = ggml_view_3d( - ctx0, cur, head_dim, n_head_kv, N, - head_dim * sizeof_wtype, - head_dim * (n_head + 2 * n_head_kv) * sizeof_wtype, - head_dim * n_head * sizeof_wtype); - - struct ggml_tensor * Vcur = ggml_view_3d( - ctx0, cur, head_dim, n_head_kv, N, - head_dim * sizeof_wtype, - head_dim * (n_head + 2 * n_head_kv) * sizeof_wtype, - head_dim * (n_head + n_head_kv) * sizeof_wtype); - - // using mode = 2 for neox mode - Qcur = ggml_rope_inplace(ctx0, Qcur, n_past, head_dim, 2, 0); - Kcur = ggml_rope_inplace(ctx0, Kcur, n_past, head_dim, 2, 0); - - // store key and value to memory - { - struct ggml_tensor* k = ggml_view_1d( - ctx0, model.memory_k, N * n_head_kv * head_dim, - (ggml_element_size(model.memory_k) * n_head_kv * head_dim) * - (il * n_ctx + n_past)); - struct ggml_tensor* v = ggml_view_1d( - ctx0, model.memory_v, N * n_head_kv * head_dim, - (ggml_element_size(model.memory_v) * n_head_kv * head_dim) * - (il * n_ctx + n_past)); - - ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v)); - } - - struct ggml_tensor * K = ggml_permute( - ctx0, - ggml_reshape_3d( - ctx0, - ggml_view_1d(ctx0, model.memory_k, (n_past + N) * n_head_kv * head_dim, - il * n_ctx * - ggml_element_size(model.memory_k) * - n_head_kv * - head_dim), - head_dim, n_head_kv, n_past + N), - 0, 2, 1, 3); - - // K * Q - -// K = ggml_cont(ctx0, ggml_repeat2(ctx0, K, repeat_dummy)); - - struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - - // KQ_scaled = KQ / sqrt(n_embd/n_head) - struct ggml_tensor * KQ_scaled = - ggml_scale_inplace(ctx0, - KQ, - ggml_new_f32(ctx0, 1.0f/sqrt(float(head_dim))) - ); - - // KQ_masked = mask_past(KQ_scaled) - struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); - - // KQ = soft_max(KQ_masked) - struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); - - // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() - struct ggml_tensor* V = ggml_permute( - ctx0, - ggml_reshape_3d( - ctx0, - ggml_view_1d(ctx0, model.memory_v, (n_past + N) * n_head_kv * head_dim, - il * n_ctx * - ggml_element_size(model.memory_v) * - n_head_kv * - head_dim), - head_dim, n_head_kv, n_past + N), - 0, 2, 1, 3); - -// V = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_repeat2(ctx0, V, repeat_dummy))); - V = ggml_cont(ctx0, ggml_transpose(ctx0, V)); - - // KQV = transpose(V) * KQ_soft_max - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - - // KQV_merged = KQV.permute(0, 2, 1, 3) - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - - // cur = KQV_merged.contiguous().view(n_embd, N) - cur = ggml_cpy(ctx0, - KQV_merged, - ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); - - // projection - { - cur = ggml_mul_mat(ctx0, - model.blocks[il].wo, - cur); - } - } - - ggml_set_scratch(ctx0, { 0, scr1_size, scr1, }); - - struct ggml_tensor* inpFF = layernorm_output; - struct ggml_tensor* attn_out = ggml_cpy( - ctx0, cur, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); - - { - cur = ggml_mul_mat(ctx0, model.blocks[il].ffn_up, inpFF); - cur = ggml_gelu(ctx0, cur); - cur = ggml_mul_mat(ctx0, model.blocks[il].ffn_down, cur); - } - - cur = ggml_add(ctx0, cur, attn_out); - cur = ggml_add(ctx0, cur, inpL); - // input for next layer - inpL = cur; - } - - ggml_set_scratch(ctx0, { 0, scr0_size, scr0, }); - - // norm - { - inpL = ggml_norm(ctx0, inpL); - - // inpL = ln_f_g*inpL + ln_f_b - inpL = ggml_add(ctx0, - ggml_mul(ctx0, - ggml_repeat(ctx0, model.output_norm, inpL), - inpL), - ggml_repeat(ctx0, model.output_norm_b, inpL)); - } - - ggml_set_scratch(ctx0, { 0, 0, nullptr, }); - - // lm_head - { - inpL = ggml_mul_mat(ctx0, model.lm_head, inpL); - - //inpL = ggml_add(ctx0, - // ggml_repeat(ctx0, model.lmh_b, inpL), - // inpL); - } - - // logits -> probs - //inpL = ggml_soft_max_inplace(ctx0, inpL); - - // run the computation - ggml_build_forward_expand(&gf, inpL); -// ggml_graph_compute (ctx0, &gf); - ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); - - //if (n_past%100 == 0) { - // ggml_graph_print (&gf); - // ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot"); - //} - - // return result for just the last token - embd_w.resize(n_vocab); - memcpy(embd_w.data(), (float *)ggml_get_data(inpL) + (n_vocab * (N - 1)), sizeof(float) * n_vocab); - - if (mem_per_token == 0) { - mem_per_token = ggml_used_mem(ctx0)/N; - } - //printf("used_mem = %zu\n", ggml_used_mem(ctx0)); - - ggml_free(ctx0); - - return true; -} - -int main(int argc, char ** argv) { - ggml_time_init(); - - const int64_t t_main_start_us = ggml_time_us(); - - gpt_params params; - - if (!gpt_params_parse(argc, argv, params)) { - return 1; - } - - int64_t t_load_us = 0; - - gpt2bpe_vocab vocab; - falcon_model model; - - // load the model - { - const int64_t t_start_us = ggml_time_us(); - - if (!falcon_model_load(params.model, model, vocab)) { - fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str()); - return 1; - } - - t_load_us = ggml_time_us() - t_start_us; - - } - - if (params.seed < 0) { - params.seed = time(NULL); - } - - if (params.top_k == 0) { - params.top_k = model.hparams.n_vocab; - } - - printf("%s: seed = %d\n", __func__, params.seed); - printf("%s: temp = %.3f\n", __func__, params.temp); - printf("%s: top_k = %d\n", __func__, params.top_k); - printf("%s: top_p = %.3f\n", __func__, params.top_p); - printf("%s: repeat_last_n = %d\n", __func__, params.repeat_last_n); - printf("%s: repeat_penalty = %.3f\n", __func__, params.repeat_penalty); - - std::mt19937 rng(params.seed); - - if (params.prompt.empty()) { - params.prompt = "Once upon"; - } - - std::vector last_n_tokens(model.hparams.n_ctx); - std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0); - - int n_past = 0; - - int64_t t_sample_us = 0; - int64_t t_predict_us = 0; - - std::vector logits; - - // tokenize the prompt - std::vector embd_inp = gpt2bpe_tokenize(vocab, params.prompt,false, false); - - params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size()); - - printf("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); -// for (size_t i = 0; i < embd_inp.size(); i++) { -// printf("%s: token[%zu] = %6d, %s\n", __func__, i, embd_inp[i], vocab.id_to_token[embd_inp[i]].c_str()); -// } - - if( model.hparams.n_ctx < params.n_predict+embd_inp.size() ) { - params.n_predict = model.hparams.n_ctx-embd_inp.size(); - } - - printf("%s: n_predict = %d\n", __func__, params.n_predict); - printf("\n"); - - std::vector embd; - - // determine the required inference memory per token: - size_t mem_per_token = 0; - falcon_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token); - - for (size_t i = embd.size(); i < embd_inp.size() + params.n_predict; i++) { - // predict - if (embd.size() > 0) { - const int64_t t_start_us = ggml_time_us(); - - if (!falcon_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) { - printf("Failed to predict\n"); - return 1; - } - - t_predict_us += ggml_time_us() - t_start_us; - } - - n_past += embd.size(); - embd.clear(); - - if (i >= embd_inp.size()) { - // sample next token - const int top_k = params.top_k; - const float top_p = params.top_p; - const float temp = params.temp; - const int repeat_last_n = params.repeat_last_n; - const float repeat_penalty = params.repeat_penalty; - - const int n_vocab = model.hparams.n_vocab; - - gpt2bpe_vocab::id id = 0; - - { - const int64_t t_start_sample_us = ggml_time_us(); - - id = sample_top_k_top_p_repeat(vocab, logits.data() + (logits.size() - n_vocab), last_n_tokens.data(), last_n_tokens.size(), top_k, top_p, temp, repeat_last_n, repeat_penalty, rng); - - last_n_tokens.erase(last_n_tokens.begin()); - last_n_tokens.push_back(id); - - t_sample_us += ggml_time_us() - t_start_sample_us; - } - - // add it to the context - embd.push_back(id); - } else { - // if here, it means we are still processing the input prompt - for (size_t k = i; k < embd_inp.size(); k++) { - embd.push_back(embd_inp[k]); - if (embd.size() > params.n_batch) { - break; - } - } - i += embd.size() - 1; - } - - // display text - for (auto id : embd) { - printf("%s", vocab.id_to_token[id].c_str() ); - } - fflush(stdout); - - // end of text token - if (vocab.special_eos_id != -1 && embd.back() == vocab.special_eos_id) { - break; - } - } - - // report timing - { - const int64_t t_main_end_us = ggml_time_us(); - - printf("\n\n"); - printf("%s: mem per token = %8zu bytes\n", __func__, mem_per_token); - printf("%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f); - printf("%s: sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f); - printf("%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past); - printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f); - } - - ggml_free(model.ctx); - - return 0; -} diff --git a/examples/gptneox-wip/gptneox-main.cpp b/examples/gptneox-wip/gptneox-main.cpp deleted file mode 100644 index b76bafaa8..000000000 --- a/examples/gptneox-wip/gptneox-main.cpp +++ /dev/null @@ -1,1083 +0,0 @@ -#include "ggml.h" -#include "cmpnct_gpt2bpe.hpp" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#if defined(_MSC_VER) -#pragma warning(disable: 4244 4267) // possible loss of data -#endif - -// default hparams -struct gpt_neox_hparams { - size_t n_merges = 0; - size_t n_vocab = 0; - uint32_t n_ctx = 0; - uint32_t n_embd = 0; - uint32_t n_head = 0; - uint32_t n_block = 0; - uint32_t n_rot = 0; // rotary_pct * (n_embd / n_head) - bool par_res = true; - float norm_eps = 1e-5; -}; - -struct gpt_neox_block { - // pre normalization - struct ggml_tensor * ln_1_g; - struct ggml_tensor * ln_1_b; - - // attention - struct ggml_tensor * c_attn_attn_w; - struct ggml_tensor * c_attn_attn_b; - - struct ggml_tensor * c_attn_proj_w; - struct ggml_tensor * c_attn_proj_b; - - // post normalization - struct ggml_tensor * ln_2_g; - struct ggml_tensor * ln_2_b; - - // ff - struct ggml_tensor * c_mlp_fc_w; - struct ggml_tensor * c_mlp_fc_b; - - struct ggml_tensor * c_mlp_proj_w; - struct ggml_tensor * c_mlp_proj_b; -}; - -struct gpt_neox_model { - gpt_neox_hparams hparams; - - // normalization - struct ggml_tensor * ln_f_g; - struct ggml_tensor * ln_f_b; - - struct ggml_tensor * wte; // position embedding - - struct ggml_tensor * lmh_g; // language model head - - std::vector blocks; - - // key + value memory - struct ggml_tensor * memory_k; - struct ggml_tensor * memory_v; - - // - struct gguf_context * ggufctx; - struct ggml_context * ctx; - struct ggml_context * kvctx; - - std::map tensors; -}; - -struct gpt_params { - int32_t seed = -1; // RNG seed - int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency()); - uint32_t n_predict = 200; // new tokens to predict - uint32_t n_batch = 512; // batch size for prompt processing - - // sampling parameters - int32_t top_k = 40; - float top_p = 1.0f; - float temp = 0.8f; - int32_t repeat_last_n = 64; - float repeat_penalty = 1.02f; - - std::string model = ""; // model path - std::string prompt = ""; - - std::string token_test = ""; - bool interactive = false; - int32_t interactive_port = -1; - int32_t n_gpu_layers = 0; -}; - -void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { - fprintf(stderr, "usage: %s [options]\n", argv[0]); - fprintf(stderr, "\n"); - fprintf(stderr, "options:\n"); - fprintf(stderr, " -h, --help show this help message and exit\n"); - fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1)\n"); - fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); - fprintf(stderr, " -ngl N, --gpu-layers N number of layers to offload to GPU on supported models (default: %d)\n", params.n_gpu_layers); - fprintf(stderr, " -p PROMPT, --prompt PROMPT\n"); - fprintf(stderr, " prompt to start generation with (default: random)\n"); - fprintf(stderr, " -f FNAME, --file FNAME\n"); - fprintf(stderr, " load prompt from a file\n"); - fprintf(stderr, " -tt TOKEN_TEST, --token_test TOKEN_TEST\n"); - fprintf(stderr, " test tokenization\n"); - fprintf(stderr, " -n N, --n_predict N number of tokens to predict (default: %d)\n", params.n_predict); - fprintf(stderr, " --top_k N top-k sampling, 0 = n_vocab (default: %d)\n", params.top_k); - fprintf(stderr, " --top_p N top-p sampling (default: %.1f)\n", params.top_p); - fprintf(stderr, " --temp N temperature (default: %.1f)\n", params.temp); - fprintf(stderr, " --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled)\n", params.repeat_last_n); - fprintf(stderr, " --repeat-penalty N penalize repeat sequence of tokens (default: %.2f, 1.0 = disabled)\n", (double)params.repeat_penalty); - fprintf(stderr, " -b N, --batch_size N batch size for prompt processing (default: %d)\n", params.n_batch); - fprintf(stderr, " -m FNAME, --model FNAME\n"); - fprintf(stderr, " model path (default: %s)\n", params.model.c_str()); - fprintf(stderr, "\n"); -} - -// Function to check if the next argument exists -std::string get_next_arg(int& i, int argc, char** argv, const std::string& flag, gpt_params& params) { - if (i + 1 < argc && argv[i + 1][0] != '-') { - return argv[++i]; - } else { - fprintf(stderr, "error: %s requires one argument.\n", flag.c_str()); - gpt_print_usage(argc, argv, params); - exit(0); - } -} - -bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { - for (int i = 1; i < argc; i++) { - std::string arg = argv[i]; - - if (arg == "-s" || arg == "--seed") { - params.seed = std::stoi(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "-t" || arg == "--threads") { - params.n_threads = std::stoi(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "-ngl" || arg == "--gpu-layers" || arg == "--n-gpu-layers") { - params.n_gpu_layers = std::stoi(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "-p" || arg == "--prompt") { - params.prompt = get_next_arg(i, argc, argv, arg, params); - } else if (arg == "-n" || arg == "--n_predict") { - params.n_predict = std::stoi(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "--top_k") { - params.top_k = std::stoi(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "--top_p") { - params.top_p = std::stof(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "--temp") { - params.temp = std::stof(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "--repeat-last-n") { - params.repeat_last_n = std::stoi(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "--repeat-penalty") { - params.repeat_penalty = std::stof(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "-b" || arg == "--batch_size") { - params.n_batch= std::stoi(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "-m" || arg == "--model") { - params.model = get_next_arg(i, argc, argv, arg, params); - } else if (arg == "-i" || arg == "--interactive") { - params.interactive = true; - } else if (arg == "-ip" || arg == "--interactive-port") { - params.interactive = true; - params.interactive_port = std::stoi(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "-h" || arg == "--help") { - gpt_print_usage(argc, argv, params); - exit(0); - } else if (arg == "-f" || arg == "--file") { - get_next_arg(i, argc, argv, arg, params); - std::ifstream file(argv[i]); - if (!file) { - fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); - break; - } - std::copy(std::istreambuf_iterator(file), std::istreambuf_iterator(), back_inserter(params.prompt)); - if (params.prompt.back() == '\n') { - params.prompt.pop_back(); - } - } else if (arg == "-tt" || arg == "--token_test") { - params.token_test = get_next_arg(i, argc, argv, arg, params); - } - else { - fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); - gpt_print_usage(argc, argv, params); - exit(0); - } - } - - return true; -} - -gpt2bpe_vocab::id sample_top_k_top_p_repeat( - const gpt2bpe_vocab & vocab, - const float * logits, - const int32_t * last_n_tokens_data, - size_t last_n_tokens_data_size, - int top_k, - double top_p, - double temp, - int repeat_last_n, - float repeat_penalty, - std::mt19937 & rng) { - - int n_logits = vocab.id_to_token.size(); - - const auto * plogits = logits; - - const auto last_n_tokens = std::vector(last_n_tokens_data, last_n_tokens_data + last_n_tokens_data_size); - - if (temp <= 0) { - // select the token with the highest logit directly - float max_logit = plogits[0]; - gpt2bpe_vocab::id max_id = 0; - - for (int i = 1; i < n_logits; ++i) { - if (plogits[i] > max_logit) { - max_logit = plogits[i]; - max_id = i; - } - } - return max_id; - } - - - std::vector> logits_id; - logits_id.reserve(n_logits); - - { - const float scale = 1.0f/temp; - for (int i = 0; i < n_logits; ++i) { - // repetition penalty from ctrl paper (https://arxiv.org/abs/1909.05858) - // credit https://github.com/facebookresearch/llama/compare/main...shawwn:llama:main - if (repeat_last_n > 0 && std::find(last_n_tokens.end()-repeat_last_n, last_n_tokens.end(), i) != last_n_tokens.end()) { - // if score < 0 then repetition penalty has to multiplied to reduce the previous token probability - if (plogits[i] < 0.0f) { - logits_id.push_back(std::make_pair(plogits[i]*scale*repeat_penalty, i)); - } else { - logits_id.push_back(std::make_pair(plogits[i]*scale/repeat_penalty, i)); - } - } else { - logits_id.push_back(std::make_pair(plogits[i]*scale, i)); - } - } - } - - // find the top K tokens - std::partial_sort( - logits_id.begin(), - logits_id.begin() + top_k, logits_id.end(), - [](const std::pair & a, const std::pair & b) { - return a.first > b.first; - }); - - logits_id.resize(top_k); - - double maxl = -INFINITY; - for (const auto & kv : logits_id) { - maxl = std::max(maxl, kv.first); - } - - // compute probs for the top K tokens - std::vector probs; - probs.reserve(logits_id.size()); - - double sum = 0.0; - for (const auto & kv : logits_id) { - double p = exp(kv.first - maxl); - probs.push_back(p); - sum += p; - } - - // normalize the probs - for (auto & p : probs) { - p /= sum; - } - - if (top_p < 1.0f) { - double cumsum = 0.0f; - for (int i = 0; i < top_k; i++) { - cumsum += probs[i]; - if (cumsum >= top_p) { - top_k = i + 1; - probs.resize(top_k); - logits_id.resize(top_k); - break; - } - } - - cumsum = 1.0/cumsum; - for (int i = 0; i < (int) probs.size(); i++) { - probs[i] *= cumsum; - } - } - -// printf("\n"); -// for (int i = 0; i < (int) probs.size(); i++) { -// for (int i = 0; i < 10; i++) { -// printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), probs[i]); -// } - - std::discrete_distribution<> dist(probs.begin(), probs.end()); - int idx = dist(rng); - - return logits_id[idx].second; - -} - -struct ggml_tensor * get_tensor_ex( struct ggml_context * ctx, std::string name){ - - struct ggml_tensor * cur = ggml_get_tensor(ctx, name.c_str()); - if( cur == NULL ) { - printf("%s: tensor '%s' not found!\n", __func__, name.c_str()); - } else { -// printf("%s: n_dims = %d, name = '%s'\n", __func__, cur->n_dims, cur->name); - } - - return cur; -} - -// load the model's weights from a file -bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2bpe_vocab & vocab) { - printf("%s: loading model from '%s'..\n", __func__, fname.c_str()); - - model.ctx = NULL; - - struct gguf_init_params ggufparams = { - /*.no_alloc = */ false, - /*.ctx = */ &model.ctx, - }; - - auto & ggufctx = model.ggufctx; - - ggufctx = gguf_init_from_file(fname.c_str(), ggufparams); - - if (!ggufctx) { - fprintf(stderr, "%s: gguf_init_from_file() failed\n", __func__); - return false; - } - - printf("%s: gguf version = %d\n", __func__, gguf_get_version(ggufctx)); - printf("%s: gguf alignment = %zu\n", __func__, gguf_get_alignment(ggufctx)); - printf("%s: gguf data offset = %zu\n", __func__, gguf_get_data_offset(ggufctx)); - - // print all kv - #if 0 - { - const int n_kv = gguf_get_n_kv(ggufctx); - - printf("%s: n_kv: %d\n", __func__, n_kv); - - for (int i = 0; i < n_kv; ++i) { - const char * key = gguf_get_key(ggufctx, i); - - printf("%s: kv[%d]: key = %s\n", __func__, i, key); - } - } - #endif - - // print some standard metadata - { - int keyidx; - - keyidx = gguf_find_key(ggufctx, "general.name"); - if (keyidx != -1) { printf("%s: model name = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } - keyidx = gguf_find_key(ggufctx, "general.description"); - if (keyidx != -1) { printf("%s: model description = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } - keyidx = gguf_find_key(ggufctx, "general.author"); - if (keyidx != -1) { printf("%s: model author = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } - keyidx = gguf_find_key(ggufctx, "general.license"); - if (keyidx != -1) { printf("%s: model license = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } - keyidx = gguf_find_key(ggufctx, "general.architecture"); - if (keyidx != -1) { printf("%s: model architecture = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } - keyidx = gguf_find_key(ggufctx, "general.file_type"); - if (keyidx != -1) { printf("%s: model file type = %" PRIu32 "\n", __func__, gguf_get_val_u32(ggufctx, keyidx)); } - keyidx = gguf_find_key(ggufctx, "gptneox.tensor_data_layout"); - if (keyidx != -1) { printf("%s: model data layout = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } - keyidx = gguf_find_key(ggufctx, "general.source.huggingface.repository"); - if (keyidx != -1) { printf("%s: model source HF repo = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } - } - - // check required metadata - { - int keyidx; - - // check model architecture kv - keyidx = gguf_find_key(ggufctx, "general.architecture"); - if (keyidx != -1) { - if ( strcmp(gguf_get_val_str(ggufctx, keyidx), "gptneox") != 0) { - printf("%s: model architecture not supported!\n", __func__); - return false; - } - } else { - printf("%s: gguf model architecture not found!\n", __func__); - return false; - } - - } - - // load hparams - { - auto & hparams = model.hparams; - - bool ok = true; - int keyidx; - - if (ok) { keyidx = gguf_find_key(ggufctx, "gptneox.context_length"); - if (keyidx != -1) { hparams.n_ctx = gguf_get_val_u32(ggufctx, keyidx); } else { ok = false; } } - - if (ok) { keyidx = gguf_find_key(ggufctx, "gptneox.embedding_length"); - if (keyidx != -1) { hparams.n_embd = gguf_get_val_u32(ggufctx, keyidx); } else { ok = false; } } - - if (ok) { keyidx = gguf_find_key(ggufctx, "gptneox.attention.head_count"); - if (keyidx != -1) { hparams.n_head = gguf_get_val_u32(ggufctx, keyidx); } else { ok = false; } } - - if (ok) { keyidx = gguf_find_key(ggufctx, "gptneox.block_count"); - if (keyidx != -1) { hparams.n_block = gguf_get_val_u32(ggufctx, keyidx); } else { ok = false; } } - - if (ok) { keyidx = gguf_find_key(ggufctx, "gptneox.rope.dimension_count"); - if (keyidx != -1) { hparams.n_rot = gguf_get_val_u32(ggufctx, keyidx); } else { ok = false; } } - - if (ok) { keyidx = gguf_find_key(ggufctx, "gptneox.use_parallel_residual"); - if (keyidx != -1) { hparams.par_res = gguf_get_val_bool(ggufctx, keyidx); } else { ok = false; } } - - if (ok) { keyidx = gguf_find_key(ggufctx, "gptneox.attention.layer_norm_epsilon"); - if (keyidx != -1) { hparams.norm_eps= gguf_get_val_f32(ggufctx, keyidx); } else { ok = false; } } - - if (!ok) { - fprintf(stderr, "%s: required hparam missing!\n", __func__); - return false; - } - - printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); - printf("%s: n_embd = %d\n", __func__, hparams.n_embd); - printf("%s: n_head = %d\n", __func__, hparams.n_head); - printf("%s: n_block = %d\n", __func__, hparams.n_block); - printf("%s: n_rot = %d\n", __func__, hparams.n_rot); - printf("%s: par_res = %d\n", __func__, hparams.par_res); - printf("%s: norm_eps = %g\n", __func__, hparams.norm_eps); - - } - - // load vocab - { - auto & hparams = model.hparams; - - int keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.model"); - - if (keyidx != -1) { - if ( strcmp(gguf_get_val_str(ggufctx, keyidx), "gpt2") != 0) { - printf("%s: tokenizer model not supported!\n", __func__); - return false; - } - } else { - printf("%s: tokenizer model not found!\n", __func__); - return false; - } - - - int tokens_keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.tokens"); - - if (tokens_keyidx == -1) { - printf("%s: gpt2 tokenizer vocab not found!\n", __func__); - return false; - } - - int merges_keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.merges"); - - if (merges_keyidx == -1) { - printf("%s: gpt2 tokenizer merges not found!\n", __func__); - return false; - } - - hparams.n_vocab = gguf_get_arr_n(ggufctx,tokens_keyidx); - hparams.n_merges = gguf_get_arr_n(ggufctx,merges_keyidx); - - printf("%s: gpt2 tokenizer vocab = %zu\n", __func__, hparams.n_vocab); - printf("%s: gpt2 tokenizer merges = %zu\n", __func__, hparams.n_merges); - - for (size_t i = 0; i < hparams.n_vocab; i++) { - std::string word = gguf_get_arr_str(ggufctx, tokens_keyidx, i); - -// printf("token %d = '%s'\n",i,word.c_str() ); - - vocab.token_to_id[word] = i; - vocab.id_to_token[i] = word; - - if( vocab.id_to_token[i] == "\n" ) { - vocab.linefeed_id = i; - } - } - - std::vector> bpe_merges; - - for (size_t i = 0; i < hparams.n_merges; i++) { - - std::string word = gguf_get_arr_str(ggufctx, merges_keyidx, i); - - // Split the merges - std::string first, second; - size_t pos = word.find(' ', 1); // Start the search from the second character - if (pos != std::string::npos) { - first = word.substr(0, pos); - second = word.substr(pos + 1); - } - - bpe_merges.push_back(std::make_pair(first, second)); - } - - vocab.populate_bpe_ranks(bpe_merges); - - - keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.bos_token_id"); if( keyidx != -1 ) { vocab.special_bos_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); } - keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.eos_token_id"); if( keyidx != -1 ) { vocab.special_eos_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); } - keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.unknown_token_id"); if( keyidx != -1 ) { vocab.special_unk_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); } - keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.separator_token_id"); if( keyidx != -1 ) { vocab.special_sep_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); } - keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.padding_token_id"); if( keyidx != -1 ) { vocab.special_pad_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); } - - if( vocab.special_bos_id != -1 ) { printf("%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].c_str() ); } - if( vocab.special_eos_id != -1 ) { printf("%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].c_str() ); } - if( vocab.special_unk_id != -1 ) { printf("%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].c_str() ); } - if( vocab.special_sep_id != -1 ) { printf("%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].c_str() ); } - if( vocab.special_pad_id != -1 ) { printf("%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].c_str() ); } - if( vocab.linefeed_id != -1 ) { printf("%s: LF token = %d\n", __func__, vocab.linefeed_id ); } - } - - - auto & ctx = model.ctx; - size_t ctx_size = ggml_get_mem_size(ctx); - - printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0)); - - // print tensor info - #if 0 - { - const int n_tensors = gguf_get_n_tensors(ggufctx); - - printf("%s: n_tensors: %d\n", __func__, n_tensors); - - for (int i = 0; i < n_tensors; ++i) { - const char * name = gguf_get_tensor_name (ggufctx, i); - const size_t offset = gguf_get_tensor_offset(ggufctx, i); - - printf("%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset); - } - } - #endif - - // prepare memory for the weights - { - const int n_block = model.hparams.n_block; - - model.blocks.resize(n_block); - - model.wte = ggml_get_tensor(ctx, "token_embd.weight"); - model.ln_f_g = ggml_get_tensor(ctx, "output_norm.weight"); - model.ln_f_b = ggml_get_tensor(ctx, "output_norm.bias"); - model.lmh_g = ggml_get_tensor(ctx, "output.weight"); - - // map by name - model.tensors["token_embd.weight"] = model.wte; - model.tensors["output_norm.weight"] = model.ln_f_g; - model.tensors["output_norm.bias"] = model.ln_f_b; - model.tensors["output.weight"] = model.lmh_g; - - for (int i = 0; i < n_block; ++i) { - auto & block = model.blocks[i]; - - std::string blocknamestart = "blk." + std::to_string(i) + "."; - - block.ln_1_g = get_tensor_ex(ctx, blocknamestart + "attn_norm.weight" ); - block.ln_1_b = get_tensor_ex(ctx, blocknamestart + "attn_norm.bias" ); - - block.c_attn_attn_w = get_tensor_ex(ctx, blocknamestart + "attn_qkv.weight" ); - block.c_attn_attn_b = get_tensor_ex(ctx ,blocknamestart + "attn_qkv.bias" ); - - block.c_attn_proj_w = get_tensor_ex(ctx, blocknamestart + "attn_output.weight" ); - block.c_attn_proj_b = get_tensor_ex(ctx, blocknamestart + "attn_output.bias" ); - - block.ln_2_g = get_tensor_ex(ctx, blocknamestart + "ffn_norm.weight" ); - block.ln_2_b = get_tensor_ex(ctx, blocknamestart + "ffn_norm.bias"); - - block.c_mlp_fc_w = get_tensor_ex(ctx, blocknamestart + "ffn_up.weight" ); - block.c_mlp_fc_b = get_tensor_ex(ctx, blocknamestart + "ffn_up.bias" ); - - block.c_mlp_proj_w = get_tensor_ex(ctx, blocknamestart + "ffn_down.weight" ); - block.c_mlp_proj_b = get_tensor_ex(ctx, blocknamestart + "ffn_down.bias" ); - - // map by name - model.tensors[blocknamestart + "attn_norm.weight"] = block.ln_1_g; - model.tensors[blocknamestart + "attn_norm.bias"] = block.ln_1_b; - - model.tensors[blocknamestart + "attn_qkv.weight"] = block.c_attn_attn_w; - model.tensors[blocknamestart + "attn_qkv.bias"] = block.c_attn_attn_b; - - model.tensors[blocknamestart + "attn_output.weight"] = block.c_attn_proj_w; - model.tensors[blocknamestart + "attn_output.bias"] = block.c_attn_proj_b; - - model.tensors[blocknamestart + "ffn_norm.weight"] = block.ln_2_g; - model.tensors[blocknamestart + "ffn_norm.bias"] = block.ln_2_b; - - model.tensors[blocknamestart + "ffn_up.weight"] = block.c_mlp_fc_w; - model.tensors[blocknamestart + "ffn_up.bias"] = block.c_mlp_fc_b; - - model.tensors[blocknamestart + "ffn_down.weight"] = block.c_mlp_proj_w; - model.tensors[blocknamestart + "ffn_down.bias"] = block.c_mlp_proj_b; - } - } - - // key + value memory - { - const auto & kvctx = model.kvctx; - const auto & hparams = model.hparams; - - const int n_embd = hparams.n_embd; - const int n_block = hparams.n_block; - const int n_ctx = hparams.n_ctx; - - const int64_t n_mem = n_block*n_ctx; - const int64_t n_elements = n_embd*n_mem; - - // create the ggml context - { - struct ggml_init_params params = { - /*.mem_size =*/ size_t(n_elements*4+ggml_tensor_overhead()*2), - /*.mem_buffer =*/ NULL, - /*.no_alloc =*/ false, - }; - - model.kvctx = ggml_init(params); - if (!model.kvctx) { - fprintf(stderr, "%s: kv ggml_init() failed\n", __func__); - return false; - } - - } - - - model.memory_k = ggml_new_tensor_1d(kvctx, GGML_TYPE_F16, n_elements); - model.memory_v = ggml_new_tensor_1d(kvctx, GGML_TYPE_F16, n_elements); - - const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v); - - printf("%s: memory_size = %8.2f MB, n_mem = %" PRId64 "\n", __func__, memory_size/1024.0/1024.0, n_mem); - } - - return true; -} - - -// feed-forward network -ggml_tensor * gpt_neox_ff( - const gpt_neox_block &block, - ggml_context * ctx0, - ggml_tensor * inp, - const gpt_neox_hparams &hparams) { - - ggml_tensor * cur = ggml_norm(ctx0, inp, hparams.norm_eps); - - cur = ggml_add(ctx0, ggml_mul(ctx0, ggml_repeat(ctx0, block.ln_2_g, cur), cur), ggml_repeat(ctx0, block.ln_2_b, cur)); - cur = ggml_mul_mat(ctx0, block.c_mlp_fc_w, cur); - cur = ggml_add(ctx0, ggml_repeat(ctx0, block.c_mlp_fc_b, cur), cur); - - // GELU activation - cur = ggml_gelu(ctx0, cur); - - // projection - // cur = proj_w*cur + proj_b - cur = ggml_mul_mat(ctx0, block.c_mlp_proj_w, cur); - - cur = ggml_add(ctx0, ggml_repeat(ctx0, block.c_mlp_proj_b, cur), cur); - return cur; -} - -// evaluate the transformer -// -// - model: the model -// - n_threads: number of threads to use -// - n_past: the context size so far -// - embd_inp: the embeddings of the tokens in the context -// - embd_w: the predicted logits for the next token -// -bool gpt_neox_eval( - const gpt_neox_model & model, - const int n_threads, - const int n_past, - const std::vector & embd_inp, - std::vector & embd_w, - size_t & mem_per_token) { - const int N = embd_inp.size(); - - const auto & hparams = model.hparams; - - const int n_embd = hparams.n_embd; - const int n_block = hparams.n_block; - const int n_ctx = hparams.n_ctx; - const int n_head = hparams.n_head; - const int n_vocab = hparams.n_vocab; - const int n_rot = hparams.n_rot; - - static size_t buf_size = 256u*1024*1024; - static void * buf = malloc(buf_size); - - // use 2 scratch buffers - // TODO: very hacky solution - reimplement in a more elegant way - static size_t scr0_size = 256u*1024*1024; - static void * scr0 = malloc(scr0_size); - - static size_t scr1_size = 256u*1024*1024; - static void * scr1 = malloc(scr1_size); - - if (mem_per_token > 0 && mem_per_token*N > buf_size) { - const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead - //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new); - - // reallocate - buf_size = buf_size_new; - buf = realloc(buf, buf_size); - if (buf == nullptr) { - fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size); - return false; - } - } - - struct ggml_init_params params = { - /*.mem_size =*/ buf_size, - /*.mem_buffer =*/ buf, - /*.no_alloc =*/ false, - }; - - struct ggml_context * ctx0 = ggml_init(params); - struct ggml_cgraph gf = {}; - - struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); - memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd)); - - - // wte - struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.wte, embd); - - for (int il = 0; il < n_block; ++il) { - struct ggml_tensor * cur; - - ggml_set_scratch(ctx0, { 0, scr0_size, scr0, }); - - // self-attention - { - { - cur = ggml_norm(ctx0, inpL, hparams.norm_eps); - - cur = ggml_add(ctx0, - ggml_mul(ctx0, ggml_repeat(ctx0, model.blocks[il].ln_1_g, cur), cur), - ggml_repeat(ctx0, model.blocks[il].ln_1_b, cur)); - } - - // compute QKV - { - - cur = ggml_mul_mat(ctx0, model.blocks[il].c_attn_attn_w, cur); - cur = ggml_add(ctx0, ggml_repeat(ctx0, model.blocks[il].c_attn_attn_b, cur), cur); - } - - struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd/n_head, n_head, N, cur->nb[1]/n_head, cur->nb[1], 0*sizeof(float)*n_embd/n_head)); - struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd/n_head, n_head, N, cur->nb[1]/n_head, cur->nb[1], 1*sizeof(float)*n_embd/n_head)); - struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd/n_head, n_head, N, cur->nb[1]/n_head, cur->nb[1], 2*sizeof(float)*n_embd/n_head)); - - // using mode = 2 for GPT-NeoX mode - Qcur = ggml_rope_inplace(ctx0, Qcur, n_past, n_rot, 2, 0); - Kcur = ggml_rope_inplace(ctx0, Kcur, n_past, n_rot, 2, 0); - - // store key and value to memory - { - Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd, N)); - - struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past)); - struct ggml_tensor * v = ggml_view_2d(ctx0, model.memory_v, N, n_embd, - ( n_ctx)*ggml_element_size(model.memory_v), - (il*n_ctx)*ggml_element_size(model.memory_v)*n_embd + n_past*ggml_element_size(model.memory_v)); - - ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v)); - } - - // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) - struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - - // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3) - struct ggml_tensor * K = - ggml_permute(ctx0, - ggml_reshape_3d(ctx0, - ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd), - n_embd/n_head, n_head, n_past + N), - 0, 2, 1, 3); - - // K * Q - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - - // KQ_scaled = KQ / sqrt(n_embd/n_head) - struct ggml_tensor * KQ_scaled = - ggml_scale_inplace(ctx0, - KQ, - ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head)) - ); - - // KQ_masked = mask_past(KQ_scaled) - struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); - - // KQ = soft_max(KQ_masked) - struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); - - // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() - struct ggml_tensor * V = - ggml_view_3d(ctx0, model.memory_v, - n_past + N, n_embd/n_head, n_head, - n_ctx*ggml_element_size(model.memory_v), - n_ctx*ggml_element_size(model.memory_v)*n_embd/n_head, - il*n_ctx*ggml_element_size(model.memory_v)*n_embd); - - // KQV = transpose(V) * KQ_soft_max - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - - // KQV_merged = KQV.permute(0, 2, 1, 3) - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - - // cur = KQV_merged.contiguous().view(n_embd, N) - cur = ggml_cpy(ctx0, KQV_merged, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); - - // projection - { - cur = ggml_mul_mat(ctx0, model.blocks[il].c_attn_proj_w, cur); - cur = ggml_add(ctx0, ggml_repeat(ctx0, model.blocks[il].c_attn_proj_b, cur), cur); - } - } - - ggml_set_scratch(ctx0, { 0, scr1_size, scr1, }); - - if (hparams.par_res == 0) { - struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpL); - - cur = gpt_neox_ff(model.blocks[il], ctx0, inpFF, hparams); - - // input for next layer - inpL = ggml_add(ctx0, cur, inpFF); - } else { - struct ggml_tensor * inpFF = cur; - - // this is independent of the self-attention result, so it could be done in parallel to the self-attention - // note here we pass inpL instead of cur - cur = gpt_neox_ff(model.blocks[il], ctx0, inpL, hparams); - - // layer input + FF - cur = ggml_add(ctx0, cur, inpFF); - - // input for next layer - inpL = ggml_add(ctx0, cur, inpL); - } - } - - ggml_set_scratch(ctx0, { 0, scr0_size, scr0, }); - - // norm - { - inpL = ggml_norm(ctx0, inpL, hparams.norm_eps); - - // inpL = ln_f_g*inpL + ln_f_b - inpL = ggml_add(ctx0, - ggml_mul(ctx0, - ggml_repeat(ctx0, model.ln_f_g, inpL), - inpL), - ggml_repeat(ctx0, model.ln_f_b, inpL)); - } - - ggml_set_scratch(ctx0, { 0, 0, nullptr, }); - - // lm_head - { - inpL = ggml_mul_mat(ctx0, model.lmh_g, inpL); - - //inpL = ggml_add(ctx0, - // ggml_repeat(ctx0, model.lmh_b, inpL), - // inpL); - } - - // logits -> probs - //inpL = ggml_soft_max_inplace(ctx0, inpL); - - // run the computation - ggml_build_forward_expand(&gf, inpL); - ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); - - //if (n_past%100 == 0) { - // ggml_graph_print (&gf); - // ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot"); - //} - - //embd_w.resize(n_vocab*N); - //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N); - - // return result for just the last token - embd_w.resize(n_vocab); - memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab); - - if (mem_per_token == 0) { - mem_per_token = ggml_used_mem(ctx0)/N; - } - //printf("used_mem = %zu\n", ggml_used_mem(ctx0)); - - ggml_free(ctx0); - - return true; -} - -int main(int argc, char ** argv) { - ggml_time_init(); - - const int64_t t_main_start_us = ggml_time_us(); - - gpt_params params; - - if (!gpt_params_parse(argc, argv, params)) { - return 1; - } - - int64_t t_load_us = 0; - - gpt2bpe_vocab vocab; - gpt_neox_model model; - - // load the model - { - const int64_t t_start_us = ggml_time_us(); - - if (!gpt_neox_model_load(params.model, model, vocab)) { - fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str()); - return 1; - } - - t_load_us = ggml_time_us() - t_start_us; - - } - - if (params.seed < 0) { - params.seed = time(NULL); - } - - if (params.top_k == 0) { - params.top_k = model.hparams.n_vocab; - } - - printf("%s: seed = %d\n", __func__, params.seed); - printf("%s: temp = %.3f\n", __func__, params.temp); - printf("%s: top_k = %d\n", __func__, params.top_k); - printf("%s: top_p = %.3f\n", __func__, params.top_p); - printf("%s: repeat_last_n = %d\n", __func__, params.repeat_last_n); - printf("%s: repeat_penalty = %.3f\n", __func__, params.repeat_penalty); - - std::mt19937 rng(params.seed); - - if (params.prompt.empty()) { - params.prompt = "Once upon"; - } - - std::vector last_n_tokens(model.hparams.n_ctx); - std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0); - - int n_past = 0; - - int64_t t_sample_us = 0; - int64_t t_predict_us = 0; - - std::vector logits; - - // tokenize the prompt - std::vector embd_inp = gpt2bpe_tokenize(vocab, params.prompt,false, false); - - params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size()); - - printf("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); -// for (size_t i = 0; i < embd_inp.size(); i++) { -// printf("%s: token[%zu] = %6d, %s\n", __func__, i, embd_inp[i], vocab.id_to_token[embd_inp[i]].c_str()); -// } - - if( model.hparams.n_ctx < params.n_predict+embd_inp.size() ) { - params.n_predict = model.hparams.n_ctx-embd_inp.size(); - } - - printf("%s: n_predict = %d\n", __func__, params.n_predict); - printf("\n"); - - std::vector embd; - - // determine the required inference memory per token: - size_t mem_per_token = 0; - gpt_neox_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token); - - for (size_t i = embd.size(); i < embd_inp.size() + params.n_predict; i++) { - // predict - if (embd.size() > 0) { - const int64_t t_start_us = ggml_time_us(); - - if (!gpt_neox_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) { - printf("Failed to predict\n"); - return 1; - } - - t_predict_us += ggml_time_us() - t_start_us; - } - - n_past += embd.size(); - embd.clear(); - - if (i >= embd_inp.size()) { - // sample next token - const int top_k = params.top_k; - const float top_p = params.top_p; - const float temp = params.temp; - const int repeat_last_n = params.repeat_last_n; - const float repeat_penalty = params.repeat_penalty; - - const int n_vocab = model.hparams.n_vocab; - - gpt2bpe_vocab::id id = 0; - - { - const int64_t t_start_sample_us = ggml_time_us(); - - id = sample_top_k_top_p_repeat(vocab, logits.data() + (logits.size() - n_vocab), last_n_tokens.data(), last_n_tokens.size(), top_k, top_p, temp, repeat_last_n, repeat_penalty, rng); - - last_n_tokens.erase(last_n_tokens.begin()); - last_n_tokens.push_back(id); - - t_sample_us += ggml_time_us() - t_start_sample_us; - } - - // add it to the context - embd.push_back(id); - } else { - // if here, it means we are still processing the input prompt - for (size_t k = i; k < embd_inp.size(); k++) { - embd.push_back(embd_inp[k]); - if (embd.size() > params.n_batch) { - break; - } - } - i += embd.size() - 1; - } - - // display text - for (auto id : embd) { - printf("%s", vocab.id_to_token[id].c_str() ); - } - fflush(stdout); - - // end of text token - if (vocab.special_eos_id != -1 && embd.back() == vocab.special_eos_id) { - break; - } - } - - // report timing - { - const int64_t t_main_end_us = ggml_time_us(); - - printf("\n\n"); - printf("%s: mem per token = %8zu bytes\n", __func__, mem_per_token); - printf("%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f); - printf("%s: sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f); - printf("%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past); - printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f); - } - - ggml_free(model.ctx); - - return 0; -} diff --git a/examples/imatrix/CMakeLists.txt b/examples/imatrix/CMakeLists.txt new file mode 100644 index 000000000..d688a1620 --- /dev/null +++ b/examples/imatrix/CMakeLists.txt @@ -0,0 +1,5 @@ +set(TARGET imatrix) +add_executable(${TARGET} imatrix.cpp) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/imatrix/README.md b/examples/imatrix/README.md new file mode 100644 index 000000000..578e8fc27 --- /dev/null +++ b/examples/imatrix/README.md @@ -0,0 +1,32 @@ +# llama.cpp/examples/imatrix + +Compute an importance matrix for a model and given text dataset. Can be used during quantization to enchance the quality of the quantum models. +More information is available here: https://github.com/ggerganov/llama.cpp/pull/4861 + +## Usage + +``` +./imatrix -m -f [-o ] [--verbosity ] + [-ofreq num_chunks] [-ow <0 or 1>] [other common params] +``` + +Here `-m` with a model name and `-f` with a file containing training data (such as e.g. `wiki.train.raw`) are mandatory. +The parameters in square brackets are optional and have the following meaning: +* `-o` (or `--output-file`) specifies the name of the file where the computed data will be stored. If missing `imatrix.dat` is used. +* `--verbosity` specifies the verbosity level. If set to `0`, no output other than the perplexity of the processed chunks will be generated. If set to `1`, each time the results are saved a message is written to `stderr`. If `>=2`, a message is output each time data is collected for any tensor. Default verbosity level is `1`. +* `-ofreq` (or `--output-frequency`) specifies how often the so far computed result is saved to disk. Default is 10 (i.e., every 10 chunks) +* `-ow` (or `--output-weight`) specifies if data will be collected for the `output.weight` tensor. My experience is that it is better to not utilize the importance matrix when quantizing `output.weight`, so this is set to `false` by default. + +For faster computation, make sure to use GPU offloading via the `-ngl` argument + +## Example + +```bash +LLAMA_CUBLAS=1 make -j + +# generate importance matrix (imatrix.dat) +./imatrix -m ggml-model-f16.gguf -f train-data.txt -ngl 99 + +# use the imatrix to perform a Q4_K_M quantization +./quantize --imatrix imatrix.dat ggml-model-f16.gguf ./ggml-model-q4_k_m.gguf q4_k_m +``` diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp new file mode 100644 index 000000000..f21bc48f3 --- /dev/null +++ b/examples/imatrix/imatrix.cpp @@ -0,0 +1,622 @@ +#include "common.h" +#include "llama.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + +struct Stats { + std::vector values; + int ncall = 0; +}; + +struct StatParams { + std::string ofile = "imatrix.dat"; + int n_output_frequency = 10; + int verbosity = 1; + int keep_every = 0; + bool collect_output_weight = false; +}; + +class IMatrixCollector { +public: + IMatrixCollector() = default; + void set_parameters(StatParams&& params) { m_params = std::move(params); } + bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data); + void save_imatrix() const; + bool load_imatrix(const char * file_name, bool add); + static bool load_imatrix(const char * file_name, std::unordered_map& imatrix); +private: + std::unordered_map m_stats; + StatParams m_params; + std::mutex m_mutex; + int m_last_call = 0; + std::vector m_src1_data; + std::vector m_ids; // the expert ids from ggml_mul_mat_id + // + void save_imatrix(const char * file_name) const; + void keep_imatrix(int ncall) const; +}; + +bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) { + GGML_UNUSED(user_data); + + const struct ggml_tensor * src0 = t->src[0]; + const struct ggml_tensor * src1 = t->src[1]; + + // when ask is true, the scheduler wants to know if we are interested in data from this tensor + // if we return true, a follow-up call will be made with ask=false in which we can do the actual collection + if (ask) { + if (t->op == GGML_OP_MUL_MAT_ID) return true; // collect all indirect matrix multiplications + if (t->op != GGML_OP_MUL_MAT) return false; + if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return false; + if (!(strncmp(src0->name, "blk.", 4) == 0 || (m_params.collect_output_weight && strcmp(src0->name, "output.weight") == 0))) return false; + return true; + } + + std::lock_guard lock(m_mutex); + + // copy the data from the GPU memory if needed + const bool is_host = ggml_backend_buffer_is_host(src1->buffer); + + if (!is_host) { + m_src1_data.resize(ggml_nelements(src1)); + ggml_backend_tensor_get(src1, m_src1_data.data(), 0, ggml_nbytes(src1)); + } + + const float * data = is_host ? (const float *) src1->data : m_src1_data.data(); + + if (t->op == GGML_OP_MUL_MAT_ID) { + const int idx = ((int32_t *) t->op_params)[0]; + const int n_as = ((int32_t *) t->op_params)[1]; + + // the top-k selected expert ids are stored in the src0 tensor + // for simplicity, always copy src0 to host, because it is small + // take into account that src0 is not contiguous! + GGML_ASSERT(src0->ne[1] == src1->ne[1]); + GGML_ASSERT(n_as*ggml_nrows(src0)*sizeof(int) == GGML_PAD(ggml_nbytes(src0), n_as*sizeof(int))); + m_ids.resize(ggml_nbytes(src0)/sizeof(int)); + ggml_backend_tensor_get(src0, m_ids.data(), 0, ggml_nbytes(src0)); + + // loop over all possible experts, regardless if they are used or not in the batch + // this is necessary to guarantee equal number of "ncall" for each tensor + for (int ex = 0; ex < n_as; ++ex) { + src0 = t->src[2 + ex]; + auto& e = m_stats[src0->name]; + if (e.values.empty()) { + e.values.resize(src1->ne[0], 0); + } + else if (e.values.size() != (size_t)src1->ne[0]) { + fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", src0->name, (int)e.values.size(), (int)src1->ne[0]); + exit(1); //GGML_ASSERT(false); + } + // NOTE: since we select top-k experts, the number of calls for the expert tensors will be k times larger + // using the following line, we can correct for that if needed + //if (idx == t->src[0]->ne[0] - 1) ++e.ncall; + ++e.ncall; + if (m_params.verbosity > 1) { + printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, src0->name, ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type); + } + for (int row = 0; row < (int)src1->ne[1]; ++row) { + const int excur = m_ids[row*n_as + idx]; + GGML_ASSERT(excur >= 0 && excur < n_as); // sanity check + if (excur != ex) continue; + const float * x = data + row * src1->ne[0]; + for (int j = 0; j < (int)src1->ne[0]; ++j) { + e.values[j] += x[j]*x[j]; + } + } + if (e.ncall > m_last_call) { + m_last_call = e.ncall; + if (m_last_call % m_params.n_output_frequency == 0) { + save_imatrix(); + } + if (m_params.keep_every > 0 && m_last_call%m_params.keep_every == 0) { + keep_imatrix(m_last_call); + } + } + } + } else { + auto& e = m_stats[src0->name]; + if (e.values.empty()) { + e.values.resize(src1->ne[0], 0); + } + else if (e.values.size() != (size_t)src1->ne[0]) { + fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", src0->name, (int)e.values.size(), (int)src1->ne[0]); + exit(1); //GGML_ASSERT(false); + } + ++e.ncall; + if (m_params.verbosity > 1) { + printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, src0->name, ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type); + } + for (int row = 0; row < (int)src1->ne[1]; ++row) { + const float * x = data + row * src1->ne[0]; + for (int j = 0; j < (int)src1->ne[0]; ++j) { + e.values[j] += x[j]*x[j]; + } + } + if (e.ncall > m_last_call) { + m_last_call = e.ncall; + if (m_last_call % m_params.n_output_frequency == 0) { + save_imatrix(); + } + if (m_params.keep_every > 0 && m_last_call%m_params.keep_every == 0) { + keep_imatrix(m_last_call); + } + } + } + + return true; +} + +void IMatrixCollector::save_imatrix() const { + save_imatrix(m_params.ofile.empty() ? "imatrix.dat" : m_params.ofile.c_str()); +} + +void IMatrixCollector::keep_imatrix(int ncall) const { + auto file_name = m_params.ofile; + if (file_name.empty()) file_name = "imatrix.dat"; + file_name += ".at_"; + file_name += std::to_string(ncall); + save_imatrix(file_name.c_str()); +} + +void IMatrixCollector::save_imatrix(const char * fname) const { + std::ofstream out(fname, std::ios::binary); + int n_entries = m_stats.size(); + out.write((const char*)&n_entries, sizeof(n_entries)); + for (auto& p : m_stats) { + int len = p.first.size(); + out.write((const char*)&len, sizeof(len)); + out.write(p.first.c_str(), len); + out.write((const char*)&p.second.ncall, sizeof(p.second.ncall)); + int nval = p.second.values.size(); + out.write((const char*)&nval, sizeof(nval)); + if (nval > 0) out.write((const char*)p.second.values.data(), nval*sizeof(float)); + } + if (m_params.verbosity > 0) { + fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n",__func__,m_last_call,fname); + } +} + +bool IMatrixCollector::load_imatrix(const char * imatrix_file, std::unordered_map& imatrix_data) { + std::ifstream in(imatrix_file, std::ios::binary); + if (!in) { + printf("%s: failed to open %s\n",__func__,imatrix_file); + return false; + } + int n_entries; + in.read((char*)&n_entries, sizeof(n_entries)); + if (in.fail() || n_entries < 1) { + printf("%s: no data in file %s\n", __func__, imatrix_file); + return false; + } + for (int i = 0; i < n_entries; ++i) { + int len; in.read((char *)&len, sizeof(len)); + std::vector name_as_vec(len+1); + in.read((char *)name_as_vec.data(), len); + if (in.fail()) { + printf("%s: failed reading name for entry %d from %s\n",__func__,i+1,imatrix_file); + return false; + } + name_as_vec[len] = 0; + std::string name{name_as_vec.data()}; + auto& e = imatrix_data[std::move(name)]; + int ncall; + in.read((char*)&ncall, sizeof(ncall)); + int nval; + in.read((char *)&nval, sizeof(nval)); + if (in.fail() || nval < 1) { + printf("%s: failed reading number of values for entry %d\n",__func__,i); + imatrix_data = {}; + return false; + } + e.values.resize(nval); + in.read((char*)e.values.data(), nval*sizeof(float)); + if (in.fail()) { + printf("%s: failed reading data for entry %d\n",__func__,i); + imatrix_data = {}; + return false; + } + e.ncall = ncall; + } + return true; +} + +bool IMatrixCollector::load_imatrix(const char * file_name, bool add) { + if (!add) { + m_stats.clear(); + } + return load_imatrix(file_name, m_stats); +} + +static IMatrixCollector g_collector; + +static bool ik_collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) { + return g_collector.collect_imatrix(t, ask, user_data); +} + + +struct results_log_softmax { + double log_softmax; + float logit; + float prob; +}; + +static std::vector softmax(const std::vector& logits) { + std::vector probs(logits.size()); + float max_logit = logits[0]; + for (float v : logits) { + max_logit = std::max(max_logit, v); + } + double sum_exp = 0.0; + for (size_t i = 0; i < logits.size(); i++) { + // Subtract the maximum logit value from the current logit value for numerical stability + const float logit = logits[i] - max_logit; + const float exp_logit = expf(logit); + sum_exp += exp_logit; + probs[i] = exp_logit; + } + for (size_t i = 0; i < probs.size(); i++) { + probs[i] /= sum_exp; + } + return probs; +} + +static results_log_softmax log_softmax(int n_vocab, const float * logits, int tok) { + float max_logit = logits[0]; + for (int i = 1; i < n_vocab; ++i) { + max_logit = std::max(max_logit, logits[i]); + } + double sum_exp = 0.0; + for (int i = 0; i < n_vocab; ++i) { + sum_exp += expf(logits[i] - max_logit); + } + return {logits[tok] - max_logit - log(sum_exp), logits[tok], expf(logits[tok] - max_logit) / (float) sum_exp}; +} + +static void process_logits( + int n_vocab, const float * logits, const int * tokens, int n_token, std::vector & workers, + double & nll, double & nll2, float * logit_history, float * prob_history +) { + std::mutex mutex; + int counter = 0; + auto compute = [&mutex, &counter, &nll, &nll2, logit_history, prob_history, n_vocab, logits, tokens, n_token] () { + double local_nll = 0; + double local_nll2 = 0; + while (true) { + std::unique_lock lock(mutex); + int i = counter++; + if (i >= n_token) { + nll += local_nll; nll2 += local_nll2; + break; + } + lock.unlock(); + const results_log_softmax results = log_softmax(n_vocab, logits + i*n_vocab, tokens[i+1]); + const double v = -results.log_softmax; + local_nll += v; + local_nll2 += v*v; + + logit_history[i] = results.logit; + prob_history[i] = results.prob; + } + }; + for (auto & w : workers) { + w = std::thread(compute); + } + compute(); + for (auto & w : workers) { + w.join(); + } +} + +static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool compute_ppl, int from_chunk) { + + const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx)); + const int n_ctx = llama_n_ctx(ctx); + + auto tim1 = std::chrono::high_resolution_clock::now(); + fprintf(stderr, "%s: tokenizing the input ..\n", __func__); + + std::vector tokens = ::llama_tokenize(ctx, params.prompt, add_bos); + + auto tim2 = std::chrono::high_resolution_clock::now(); + fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast(tim2-tim1).count()); + + if (from_chunk > 0) { + if (size_t((from_chunk + 2)*n_ctx) >= tokens.size()) { + fprintf(stderr, "%s: there will be not enough tokens left after removing %d chunks\n", __func__, from_chunk); + return false; + } + fprintf(stderr, "%s: removing initial %d chunks (%d tokens)\n", __func__, from_chunk, from_chunk*n_ctx); + tokens.erase(tokens.begin(), tokens.begin() + from_chunk*n_ctx); + } + + if (int(tokens.size()) < 2*n_ctx) { + fprintf(stderr, "%s: you need at least %d tokens for a context of %d tokens\n",__func__,2*n_ctx, + n_ctx); + fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size()); + return false; + } + + std::vector logit_history; + std::vector prob_history; + + if (compute_ppl) { + logit_history.resize(tokens.size()); + prob_history.resize(tokens.size()); + } + + const int n_chunk_max = tokens.size() / n_ctx; + + const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max); + const int n_vocab = llama_n_vocab(llama_get_model(ctx)); + const int n_batch = params.n_batch; + + int count = 0; + double nll = 0.0; + double nll2 = 0.0; + + fprintf(stderr, "%s: computing over %d chunks with batch_size %d\n", __func__, n_chunk, n_batch); + + std::vector workers(std::thread::hardware_concurrency() - 1); + + const int num_batches = (n_ctx + n_batch - 1) / n_batch; + + std::vector logits; + if (compute_ppl && num_batches > 1) { + logits.reserve((size_t)n_ctx * n_vocab); + } + + for (int i = 0; i < n_chunk; ++i) { + const int start = i * n_ctx; + const int end = start + n_ctx; + + std::vector logits; + + const auto t_start = std::chrono::high_resolution_clock::now(); + + // clear the KV cache + llama_kv_cache_clear(ctx); + + for (int j = 0; j < num_batches; ++j) { + const int batch_start = start + j * n_batch; + const int batch_size = std::min(end - batch_start, n_batch); + + // save original token and restore it after eval + const auto token_org = tokens[batch_start]; + + // add BOS token for the first batch of each chunk + if (add_bos && j == 0) { + tokens[batch_start] = llama_token_bos(llama_get_model(ctx)); + } + + if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) { + fprintf(stderr, "%s : failed to eval\n", __func__); + return false; + } + + // restore the original token in case it was set to BOS + tokens[batch_start] = token_org; + + if (compute_ppl && num_batches > 1) { + const auto * batch_logits = llama_get_logits(ctx); + logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab); + } + } + + const auto t_end = std::chrono::high_resolution_clock::now(); + + if (i == 0) { + const float t_total = std::chrono::duration(t_end - t_start).count(); + fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total); + int total_seconds = (int)(t_total * n_chunk); + if (total_seconds >= 60*60) { + fprintf(stderr, "%d hours ", total_seconds / (60*60)); + total_seconds = total_seconds % (60*60); + } + fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0); + } + + if (compute_ppl) { + const int first = n_ctx/2; + const auto all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx); + process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first, + workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first); + count += n_ctx - first - 1; + + printf("[%d]%.4lf,", i + 1, std::exp(nll / count)); + fflush(stdout); + + logits.clear(); + } + } + printf("\n"); + + if (compute_ppl) { + nll2 /= count; + nll /= count; + const double ppl = exp(nll); + nll2 -= nll * nll; + if (nll2 > 0) { + nll2 = sqrt(nll2/(count-1)); + printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl); + } else { + printf("Unexpected negative standard deviation of log(prob)\n"); + } + } + + return true; +} + +int main(int argc, char ** argv) { + + StatParams sparams; + std::string prev_result_file; + std::string combine_files; + bool compute_ppl = true; + int from_chunk = 0; + std::vector args; + args.push_back(argv[0]); + int iarg = 1; + for (; iarg < argc-1; ++iarg) { + std::string arg{argv[iarg]}; + if (arg == "-o" || arg == "--output-file") { + sparams.ofile = argv[++iarg]; + } + else if (arg == "-ofreq" || arg == "--output-frequency") { + sparams.n_output_frequency = std::stoi(argv[++iarg]); + } + else if (arg == "-ow" || arg == "--output-weight") { + sparams.collect_output_weight = std::stoi(argv[++iarg]); + } + else if (arg == "--verbosity") { + sparams.verbosity = std::stoi(argv[++iarg]); + } else if (arg == "--no-ppl") { + compute_ppl = false; + } else if (arg == "--keep-imatrix") { + sparams.keep_every = std::stoi(argv[++iarg]); + } else if (arg == "--continue-from") { + prev_result_file = argv[++iarg]; + } else if (arg == "--combine") { + combine_files = argv[++iarg]; + } + else if (arg == "--from-chunk") { + from_chunk = std::stoi(argv[++iarg]); + } else { + args.push_back(argv[iarg]); + } + } + if (iarg < argc) { + std::string arg{argv[iarg]}; + if (arg == "--no-ppl") { + compute_ppl = false; + } else { + args.push_back(argv[iarg]); + } + } + + g_collector.set_parameters(std::move(sparams)); + + if (!combine_files.empty()) { + std::vector files; + size_t pos = 0; + while (true) { + auto new_pos = combine_files.find(',', pos); + if (new_pos != std::string::npos) { + files.emplace_back(combine_files.substr(pos, new_pos - pos)); + pos = new_pos + 1; + } else { + files.emplace_back(combine_files.substr(pos)); + break; + } + } + if (files.size() < 2) { + fprintf(stderr, "You must provide at least two comma separated files to use --combine\n"); + return 1; + } + printf("Combining the following %d files\n", int(files.size())); + for (auto& file : files) { + printf(" %s\n", file.c_str()); + if (!g_collector.load_imatrix(file.c_str(), true)) { + fprintf(stderr, "Failed to load %s\n", file.c_str()); + return 1; + } + } + g_collector.save_imatrix(); + return 0; + } + + if (!prev_result_file.empty()) { + if (!g_collector.load_imatrix(prev_result_file.c_str(), false)) { + fprintf(stderr, "=============== Failed to load %s\n", prev_result_file.c_str()); + return 1; + } + } + + gpt_params params; + params.n_batch = 512; + if (!gpt_params_parse(args.size(), args.data(), params)) { + return 1; + } + + params.logits_all = true; + params.n_batch = std::min(params.n_batch, params.n_ctx); + + print_build_info(); + + if (params.seed == LLAMA_DEFAULT_SEED) { + params.seed = time(NULL); + } + + fprintf(stderr, "%s: seed = %u\n", __func__, params.seed); + + std::mt19937 rng(params.seed); + if (params.random_prompt) { + params.prompt = gpt_random_prompt(rng); + } + + llama_backend_init(); + llama_numa_init(params.numa); + + llama_model_params mparams = llama_model_params_from_gpt_params(params); + + llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams); + if (model == NULL) { + fprintf(stderr, "%s: error: unable to load model\n", __func__); + return 1; + } + + llama_context_params cparams = llama_context_params_from_gpt_params(params); + + // pass the callback to the backend scheduler + // it will be executed for each node during the graph computation + cparams.cb_eval = ik_collect_imatrix; + cparams.cb_eval_user_data = NULL; + + llama_context * ctx = llama_new_context_with_model(model, cparams); + if (ctx == NULL) { + fprintf(stderr, "%s: error: unable to create context\n", __func__); + return 1; + } + + const int n_ctx_train = llama_n_ctx_train(model); + if (params.n_ctx > n_ctx_train) { + fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n", + __func__, n_ctx_train, params.n_ctx); + } + + // print system information + { + fprintf(stderr, "\n"); + fprintf(stderr, "%s\n", get_system_info(params).c_str()); + } + + bool OK = compute_imatrix(ctx, params, compute_ppl, from_chunk); + if (!OK) { + return 1; + } + + g_collector.save_imatrix(); + + llama_print_timings(ctx); + + llama_free(ctx); + llama_free_model(model); + + llama_backend_free(); + + return 0; +} diff --git a/examples/infill/CMakeLists.txt b/examples/infill/CMakeLists.txt index 046f9b1e7..e4e8028da 100644 --- a/examples/infill/CMakeLists.txt +++ b/examples/infill/CMakeLists.txt @@ -3,6 +3,3 @@ add_executable(${TARGET} infill.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) -if(TARGET BUILD_INFO) - add_dependencies(${TARGET} BUILD_INFO) -endif() diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp index 128d67080..92c67b7cf 100644 --- a/examples/infill/infill.cpp +++ b/examples/infill/infill.cpp @@ -2,7 +2,6 @@ #include "console.h" #include "llama.h" -#include "build-info.h" #include "grammar-parser.h" #include @@ -39,8 +38,8 @@ static gpt_params * g_params; static std::vector * g_input_tokens; static std::ostringstream * g_output_ss; static std::vector * g_output_tokens; -static bool is_interacting = false; +static bool is_interacting = false; static void write_logfile( const llama_context * ctx, const gpt_params & params, const llama_model * model, @@ -104,7 +103,7 @@ static void sigint_handler(int signo) { int main(int argc, char ** argv) { gpt_params params; - llama_sampling_params & sparams = params.sampling_params; + llama_sampling_params & sparams = params.sparams; g_params = ¶ms; if (!gpt_params_parse(argc, argv, params)) { @@ -147,6 +146,13 @@ int main(int argc, char ** argv) { return 0; } + if (params.chatml) { + printf("\n************\n"); + printf("%s: please use the 'main' tool for chatml mode\n", __func__); + printf("************\n\n"); + + return 0; + } if (!params.antiprompt.empty()) { printf("\n************\n"); printf("%s: please use the 'main' tool for antiprompt mode\n", __func__); @@ -184,8 +190,8 @@ int main(int argc, char ** argv) { LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale); } - LOG_TEE("%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); - LOG_TEE("%s: built with %s for %s\n", __func__, BUILD_COMPILER, BUILD_TARGET); + LOG_TEE("%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); + LOG_TEE("%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); if (params.seed == LLAMA_DEFAULT_SEED) { params.seed = time(NULL); @@ -196,7 +202,8 @@ int main(int argc, char ** argv) { std::mt19937 rng(params.seed); LOG("%s: llama backend init\n", __func__); - llama_backend_init(params.numa); + llama_backend_init(); + llama_numa_init(params.numa); llama_model * model; llama_context * ctx; @@ -231,11 +238,11 @@ int main(int argc, char ** argv) { LOG_TEE("\n"); LOG_TEE("%s\n", get_system_info(params).c_str()); } - const bool add_bos = llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM; + const bool add_bos = llama_should_add_bos_token(model); LOG("add_bos: %d\n", add_bos); bool suff_rm_leading_spc = params.escape; - if (suff_rm_leading_spc && params.input_suffix.find_first_of(" ") == 0 && params.input_suffix.size() > 1) { + if (suff_rm_leading_spc && params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) { params.input_suffix.erase(0, 1); suff_rm_leading_spc = false; } @@ -246,14 +253,14 @@ int main(int argc, char ** argv) { if (suff_rm_leading_spc && inp_sfx[0] == space_token) { inp_sfx.erase(inp_sfx.begin()); } - inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(ctx)); + inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model)); if (add_bos) { - inp_pfx.insert(inp_pfx.begin(), llama_token_bos(ctx)); + inp_pfx.insert(inp_pfx.begin(), llama_token_bos(model)); } - inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(ctx)); + inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model)); embd_inp = inp_pfx; embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end()); - embd_inp.push_back(llama_token_middle(ctx)); + embd_inp.push_back(llama_token_middle(model)); LOG("prefix: \"%s\"\n", log_tostr(params.input_prefix)); LOG("suffix: \"%s\"\n", log_tostr(params.input_suffix)); @@ -261,7 +268,7 @@ int main(int argc, char ** argv) { // Should not run without any tokens if (embd_inp.empty()) { - embd_inp.push_back(llama_token_bos(ctx)); + embd_inp.push_back(llama_token_bos(model)); LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str()); } @@ -358,36 +365,10 @@ int main(int argc, char ** argv) { LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str()); } } - LOG_TEE("sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n", - sparams.repeat_last_n, sparams.repeat_penalty, sparams.presence_penalty, sparams.frequency_penalty, sparams.top_k, sparams.tfs_z, sparams.top_p, sparams.typical_p, sparams.temp, sparams.mirostat, sparams.mirostat_eta, sparams.mirostat_tau); + LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str()); LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); LOG_TEE("\n\n"); - struct llama_grammar * grammar = NULL; - grammar_parser::parse_state parsed_grammar; - - if (!params.grammar.empty()) { - parsed_grammar = grammar_parser::parse(params.grammar.c_str()); - // will be empty (default) if there are parse errors - if (parsed_grammar.rules.empty()) { - return 1; - } - LOG_TEE("%s: grammar:\n", __func__); - grammar_parser::print_grammar(stderr, parsed_grammar); - LOG_TEE("\n"); - - { - auto it = sparams.logit_bias.find(llama_token_eos(ctx)); - if (it != sparams.logit_bias.end() && it->second == -INFINITY) { - LOG_TEE("%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__); - } - } - - std::vector grammar_rules(parsed_grammar.c_rules()); - grammar = llama_grammar_init( - grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root")); - } - LOG_TEE("\n##### Infill mode #####\n\n"); if (params.infill) { printf("\n************\n"); @@ -430,7 +411,7 @@ int main(int argc, char ** argv) { std::vector embd; std::vector embd_guidance; - struct llama_sampling_context * ctx_sampling = llama_sampling_init(params); + struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams); while (n_remain != 0 || params.interactive) { // predict @@ -549,7 +530,7 @@ int main(int argc, char ** argv) { const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance); - llama_sampling_accept(ctx_sampling, ctx, id); + llama_sampling_accept(ctx_sampling, ctx, id, true); LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str()); @@ -567,8 +548,11 @@ int main(int argc, char ** argv) { LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed); while ((int) embd_inp.size() > n_consumed) { embd.push_back(embd_inp[n_consumed]); - ctx_sampling->prev.erase(ctx_sampling->prev.begin()); - ctx_sampling->prev.push_back(embd_inp[n_consumed]); + + // push the prompt in the sampling context in order to apply repetition penalties later + // for the prompt, we don't apply grammar rules + llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], false); + ++n_consumed; if ((int) embd.size() >= params.n_batch) { break; @@ -600,10 +584,10 @@ int main(int argc, char ** argv) { if ((int) embd_inp.size() <= n_consumed) { // deal with eot token in infill mode - if ((ctx_sampling->prev.back() == llama_token_eot(ctx) || is_interacting) && params.interactive){ + if ((llama_sampling_last(ctx_sampling) == llama_token_eot(model) || is_interacting) && params.interactive){ if(is_interacting && !params.interactive_first) { // print an eot token - printf("%s", llama_token_to_piece(ctx, llama_token_eot(ctx)).c_str()); + printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str()); } fflush(stdout); printf("\n"); @@ -617,7 +601,7 @@ int main(int argc, char ** argv) { buffer += line; } while (another_line); // check if we got an empty line, if so we use the old input - if(!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) { + if (!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) { params.input_prefix = buffer; } buffer.clear(); @@ -627,7 +611,7 @@ int main(int argc, char ** argv) { buffer += line; } while (another_line); // check if we got an empty line - if(!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) { + if (!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) { params.input_suffix = buffer; } buffer.clear(); @@ -640,7 +624,7 @@ int main(int argc, char ** argv) { process_escapes(params.input_suffix); } suff_rm_leading_spc = params.escape; - if (suff_rm_leading_spc && params.input_suffix.find_first_of(" ") == 0 && params.input_suffix.size() > 1) { + if (suff_rm_leading_spc && params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) { params.input_suffix.erase(0, 1); suff_rm_leading_spc = false; } @@ -650,14 +634,14 @@ int main(int argc, char ** argv) { if (suff_rm_leading_spc && inp_sfx[0] == space_token) { inp_sfx.erase(inp_sfx.begin()); } - inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(ctx)); + inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model)); if (add_bos) { - inp_pfx.insert(inp_pfx.begin(), llama_token_bos(ctx)); + inp_pfx.insert(inp_pfx.begin(), llama_token_bos(model)); } - inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(ctx)); + inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model)); embd_inp = inp_pfx; embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end()); - embd_inp.push_back(llama_token_middle(ctx)); + embd_inp.push_back(llama_token_middle(model)); embd.clear(); embd_guidance.clear(); n_remain = params.n_predict; @@ -667,7 +651,7 @@ int main(int argc, char ** argv) { is_interacting = false; } // deal with end of text token in interactive mode - else if (ctx_sampling->prev.back() == llama_token_eos(ctx)) { + else if (llama_sampling_last(ctx_sampling) == llama_token_eos(model)) { LOG("found EOS token\n"); if (params.interactive) { @@ -684,7 +668,7 @@ int main(int argc, char ** argv) { if (params.input_prefix_bos) { LOG("adding input prefix BOS token\n"); - embd_inp.push_back(llama_token_bos(ctx)); + embd_inp.push_back(llama_token_bos(model)); } std::string buffer; @@ -740,22 +724,14 @@ int main(int argc, char ** argv) { if (n_past > 0) { if (is_interacting) { - // reset grammar state if we're restarting generation - if (grammar != NULL) { - llama_grammar_free(grammar); - - std::vector grammar_rules(parsed_grammar.c_rules()); - grammar = llama_grammar_init( - grammar_rules.data(), grammar_rules.size(), - parsed_grammar.symbol_ids.at("root")); - } + llama_sampling_reset(ctx_sampling); } is_interacting = false; } } // end of text token - if (!embd.empty() && embd.back() == llama_token_eos(ctx) && !params.interactive) { + if (!embd.empty() && embd.back() == llama_token_eos(model) && !params.interactive) { break; } @@ -767,7 +743,7 @@ int main(int argc, char ** argv) { } } if (!params.interactive && n_remain <= 0) { - printf("%s", llama_token_to_piece(ctx, llama_token_eot(ctx)).c_str()); + printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str()); fflush(stdout); } @@ -778,9 +754,7 @@ int main(int argc, char ** argv) { llama_free(ctx); llama_free_model(model); - if (grammar != NULL) { - llama_grammar_free(grammar); - } + llama_sampling_free(ctx_sampling); llama_backend_free(); #ifndef LOG_DISABLE_LOGS diff --git a/examples/json-schema-to-grammar.py b/examples/json-schema-to-grammar.py index 2a4cb65bc..6a977f031 100755 --- a/examples/json-schema-to-grammar.py +++ b/examples/json-schema-to-grammar.py @@ -87,7 +87,21 @@ class SchemaConverter: elif schema_type == 'array' and 'items' in schema: # TODO `prefixItems` keyword item_rule_name = self.visit(schema['items'], f'{name}{"-" if name else ""}item') - rule = f'"[" space ({item_rule_name} ("," space {item_rule_name})*)? "]" space' + list_item_operator = f'("," space {item_rule_name})' + successive_items = "" + min_items = schema.get("minItems", 0) + if min_items > 0: + first_item = f"({item_rule_name})" + successive_items = list_item_operator * (min_items - 1) + min_items -= 1 + else: + first_item = f"({item_rule_name})?" + max_items = schema.get("maxItems") + if max_items is not None and max_items > min_items: + successive_items += (list_item_operator + "?") * (max_items - min_items - 1) + else: + successive_items += list_item_operator + "*" + rule = f'"[" space {first_item} {successive_items} "]" space' return self._add_rule(rule_name, rule) else: diff --git a/examples/llama-bench/CMakeLists.txt b/examples/llama-bench/CMakeLists.txt index 7e395afd0..5bdbea4e2 100644 --- a/examples/llama-bench/CMakeLists.txt +++ b/examples/llama-bench/CMakeLists.txt @@ -3,6 +3,3 @@ add_executable(${TARGET} llama-bench.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) -if(TARGET BUILD_INFO) - add_dependencies(${TARGET} BUILD_INFO) -endif() diff --git a/examples/llama-bench/README.md b/examples/llama-bench/README.md index d02824bfa..374e40a7d 100644 --- a/examples/llama-bench/README.md +++ b/examples/llama-bench/README.md @@ -23,19 +23,23 @@ usage: ./llama-bench [options] options: -h, --help - -m, --model (default: models/7B/ggml-model-q4_0.gguf) - -p, --n-prompt (default: 512) - -n, --n-gen (default: 128) - -b, --batch-size (default: 512) - --memory-f32 <0|1> (default: 0) - -t, --threads (default: 16) - -ngl N, --n-gpu-layers (default: 99) - -mg i, --main-gpu (default: 0) - -mmq, --mul-mat-q <0|1> (default: 1) - -ts, --tensor_split - -r, --repetitions (default: 5) - -o, --output (default: md) - -v, --verbose (default: 0) + -m, --model (default: models/7B/ggml-model-q4_0.gguf) + -p, --n-prompt (default: 512) + -n, --n-gen (default: 128) + -b, --batch-size (default: 512) + -ctk , --cache-type-k (default: f16) + -ctv , --cache-type-v (default: f16) + -t, --threads (default: 112) + -ngl, --n-gpu-layers (default: 99) + -sm, --split-mode (default: layer) + -mg, --main-gpu (default: 0) + -nkvo, --no-kv-offload <0|1> (default: 0) + -mmp, --mmap <0|1> (default: 1) + -mmq, --mul-mat-q <0|1> (default: 1) + -ts, --tensor_split (default: 0) + -r, --repetitions (default: 5) + -o, --output (default: md) + -v, --verbose (default: 0) Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times. ``` @@ -51,6 +55,10 @@ Each test is repeated the number of times given by `-r`, and the results are ave For a description of the other options, see the [main example](../main/README.md). +Note: + +- When using SYCL backend, there would be hang issue in some cases. Please set `--mmp 0`. + ## Examples ### Text generation with different models diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index a04115c96..8fec3d43d 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -19,8 +19,8 @@ #include "ggml.h" #include "llama.h" #include "common.h" -#include "build-info.h" #include "ggml-cuda.h" +#include "ggml-sycl.h" // utils static uint64_t get_time_ns() { @@ -54,6 +54,13 @@ static std::vector split(const std::string & str, char delim) { return values; } +template +static std::vector transform_to_str(const std::vector & values, F f) { + std::vector str_values; + std::transform(values.begin(), values.end(), std::back_inserter(str_values), f); + return str_values; +} + template static T avg(const std::vector & v) { if (v.empty()) { @@ -114,6 +121,22 @@ static std::string get_gpu_info() { id += "/"; } } +#endif +#ifdef GGML_USE_SYCL + int device_list[GGML_SYCL_MAX_DEVICES]; + ggml_sycl_get_gpu_list(device_list, GGML_SYCL_MAX_DEVICES); + + for (int i = 0; i < GGML_SYCL_MAX_DEVICES; i++) { + if (device_list[i] >0 ){ + char buf[128]; + ggml_sycl_get_device_description(i, buf, sizeof(buf)); + id += buf; + id += "/"; + } + } + if (id.length() >2 ) { + id.pop_back(); + } #endif // TODO: other backends return id; @@ -122,17 +145,40 @@ static std::string get_gpu_info() { // command line params enum output_formats {CSV, JSON, MARKDOWN, SQL}; +static const char * output_format_str(output_formats format) { + switch (format) { + case CSV: return "csv"; + case JSON: return "json"; + case MARKDOWN: return "md"; + case SQL: return "sql"; + default: GGML_ASSERT(!"invalid output format"); + } +} + +static const char * split_mode_str(llama_split_mode mode) { + switch (mode) { + case LLAMA_SPLIT_MODE_NONE: return "none"; + case LLAMA_SPLIT_MODE_LAYER: return "layer"; + case LLAMA_SPLIT_MODE_ROW: return "row"; + default: GGML_ASSERT(!"invalid split mode"); + } +} + struct cmd_params { std::vector model; std::vector n_prompt; std::vector n_gen; std::vector n_batch; - std::vector f32_kv; + std::vector type_k; + std::vector type_v; std::vector n_threads; std::vector n_gpu_layers; + std::vector split_mode; std::vector main_gpu; + std::vector no_kv_offload; std::vector mul_mat_q; - std::vector> tensor_split; + std::vector> tensor_split; + std::vector use_mmap; int reps; bool verbose; output_formats output_format; @@ -143,12 +189,16 @@ static const cmd_params cmd_params_defaults = { /* n_prompt */ {512}, /* n_gen */ {128}, /* n_batch */ {512}, - /* f32_kv */ {false}, + /* type_k */ {GGML_TYPE_F16}, + /* type_v */ {GGML_TYPE_F16}, /* n_threads */ {get_num_physical_cores()}, /* n_gpu_layers */ {99}, + /* split_mode */ {LLAMA_SPLIT_MODE_LAYER}, /* main_gpu */ {0}, + /* no_kv_offload */ {false}, /* mul_mat_q */ {true}, - /* tensor_split */ {{}}, + /* tensor_split */ {std::vector(llama_max_devices(), 0.0f)}, + /* use_mmap */ {true}, /* reps */ 5, /* verbose */ false, /* output_format */ MARKDOWN @@ -159,24 +209,51 @@ static void print_usage(int /* argc */, char ** argv) { printf("\n"); printf("options:\n"); printf(" -h, --help\n"); - printf(" -m, --model (default: %s)\n", join(cmd_params_defaults.model, ",").c_str()); - printf(" -p, --n-prompt (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str()); - printf(" -n, --n-gen (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str()); - printf(" -b, --batch-size (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str()); - printf(" --memory-f32 <0|1> (default: %s)\n", join(cmd_params_defaults.f32_kv, ",").c_str()); - printf(" -t, --threads (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str()); - printf(" -ngl, --n-gpu-layers (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str()); - printf(" -mg, --main-gpu (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str()); - printf(" -mmq, --mul-mat-q <0|1> (default: %s)\n", join(cmd_params_defaults.mul_mat_q, ",").c_str()); - printf(" -ts, --tensor_split \n"); - printf(" -r, --repetitions (default: %d)\n", cmd_params_defaults.reps); - printf(" -o, --output (default: %s)\n", cmd_params_defaults.output_format == CSV ? "csv" : cmd_params_defaults.output_format == JSON ? "json" : cmd_params_defaults.output_format == MARKDOWN ? "md" : "sql"); - printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0"); + printf(" -m, --model (default: %s)\n", join(cmd_params_defaults.model, ",").c_str()); + printf(" -p, --n-prompt (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str()); + printf(" -n, --n-gen (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str()); + printf(" -b, --batch-size (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str()); + printf(" -ctk , --cache-type-k (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str()); + printf(" -ctv , --cache-type-v (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str()); + printf(" -t, --threads (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str()); + printf(" -ngl, --n-gpu-layers (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str()); + printf(" -sm, --split-mode (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str()); + printf(" -mg, --main-gpu (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str()); + printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str()); + printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str()); + printf(" -mmq, --mul-mat-q <0|1> (default: %s)\n", join(cmd_params_defaults.mul_mat_q, ",").c_str()); + printf(" -ts, --tensor_split (default: 0)\n"); + printf(" -r, --repetitions (default: %d)\n", cmd_params_defaults.reps); + printf(" -o, --output (default: %s)\n", output_format_str(cmd_params_defaults.output_format)); + printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0"); printf("\n"); printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n"); - } +static ggml_type ggml_type_from_name(const std::string & s) { + if (s == "f16") { + return GGML_TYPE_F16; + } + if (s == "q8_0") { + return GGML_TYPE_Q8_0; + } + if (s == "q4_0") { + return GGML_TYPE_Q4_0; + } + if (s == "q4_1") { + return GGML_TYPE_Q4_1; + } + if (s == "q5_0") { + return GGML_TYPE_Q5_0; + } + if (s == "q5_1") { + return GGML_TYPE_Q5_1; + } + + return GGML_TYPE_COUNT; +} + + static cmd_params parse_cmd_params(int argc, char ** argv) { cmd_params params; std::string arg; @@ -225,13 +302,38 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { } auto p = split(argv[i], split_delim); params.n_batch.insert(params.n_batch.end(), p.begin(), p.end()); - } else if (arg == "--memory-f32") { + } else if (arg == "-ctk" || arg == "--cache-type-k") { if (++i >= argc) { invalid_param = true; break; } - auto p = split(argv[i], split_delim); - params.f32_kv.insert(params.f32_kv.end(), p.begin(), p.end()); + auto p = split(argv[i], split_delim); + std::vector types; + for (const auto & t : p) { + ggml_type gt = ggml_type_from_name(t); + if (gt == GGML_TYPE_COUNT) { + invalid_param = true; + break; + } + types.push_back(gt); + } + params.type_k.insert(params.type_k.end(), types.begin(), types.end()); + } else if (arg == "-ctv" || arg == "--cache-type-v") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = split(argv[i], split_delim); + std::vector types; + for (const auto & t : p) { + ggml_type gt = ggml_type_from_name(t); + if (gt == GGML_TYPE_COUNT) { + invalid_param = true; + break; + } + types.push_back(gt); + } + params.type_v.insert(params.type_v.end(), types.begin(), types.end()); } else if (arg == "-t" || arg == "--threads") { if (++i >= argc) { invalid_param = true; @@ -246,12 +348,41 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { } auto p = split(argv[i], split_delim); params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end()); + } else if (arg == "-sm" || arg == "--split-mode") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = split(argv[i], split_delim); + std::vector modes; + for (const auto & m : p) { + llama_split_mode mode; + if (m == "none") { + mode = LLAMA_SPLIT_MODE_NONE; + } else if (m == "layer") { + mode = LLAMA_SPLIT_MODE_LAYER; + } else if (m == "row") { + mode = LLAMA_SPLIT_MODE_ROW; + } else { + invalid_param = true; + break; + } + modes.push_back(mode); + } + params.split_mode.insert(params.split_mode.end(), modes.begin(), modes.end()); } else if (arg == "-mg" || arg == "--main-gpu") { if (++i >= argc) { invalid_param = true; break; } params.main_gpu = split(argv[i], split_delim); + } else if (arg == "-nkvo" || arg == "--no-kv-offload") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = split(argv[i], split_delim); + params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end()); } else if (arg == "-mmq" || arg == "--mul-mat-q") { if (++i >= argc) { invalid_param = true; @@ -259,6 +390,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { } auto p = split(argv[i], split_delim); params.mul_mat_q.insert(params.mul_mat_q.end(), p.begin(), p.end()); + } else if (arg == "-mmp" || arg == "--mmap") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = split(argv[i], split_delim); + params.use_mmap.insert(params.use_mmap.end(), p.begin(), p.end()); } else if (arg == "-ts" || arg == "--tensor-split") { if (++i >= argc) { invalid_param = true; @@ -269,10 +407,10 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { const std::regex regex{R"([;/]+)"}; std::sregex_token_iterator it{ts.begin(), ts.end(), regex, -1}; std::vector split_arg{it, {}}; - GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES); + GGML_ASSERT(split_arg.size() <= llama_max_devices()); - std::array tensor_split; - for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i) { + std::vector tensor_split(llama_max_devices()); + for (size_t i = 0; i < llama_max_devices(); ++i) { if (i < split_arg.size()) { tensor_split[i] = std::stof(split_arg[i]); } else { @@ -322,11 +460,15 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { if (params.n_prompt.empty()) { params.n_prompt = cmd_params_defaults.n_prompt; } if (params.n_gen.empty()) { params.n_gen = cmd_params_defaults.n_gen; } if (params.n_batch.empty()) { params.n_batch = cmd_params_defaults.n_batch; } - if (params.f32_kv.empty()) { params.f32_kv = cmd_params_defaults.f32_kv; } + if (params.type_k.empty()) { params.type_k = cmd_params_defaults.type_k; } + if (params.type_v.empty()) { params.type_v = cmd_params_defaults.type_v; } if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; } + if (params.split_mode.empty()) { params.split_mode = cmd_params_defaults.split_mode; } if (params.main_gpu.empty()) { params.main_gpu = cmd_params_defaults.main_gpu; } + if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; } if (params.mul_mat_q.empty()) { params.mul_mat_q = cmd_params_defaults.mul_mat_q; } if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; } + if (params.use_mmap.empty()) { params.use_mmap = cmd_params_defaults.use_mmap; } if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; } return params; @@ -337,19 +479,25 @@ struct cmd_params_instance { int n_prompt; int n_gen; int n_batch; - bool f32_kv; + ggml_type type_k; + ggml_type type_v; int n_threads; int n_gpu_layers; + llama_split_mode split_mode; int main_gpu; + bool no_kv_offload; bool mul_mat_q; - std::array tensor_split; + std::vector tensor_split; + bool use_mmap; llama_model_params to_llama_mparams() const { llama_model_params mparams = llama_model_default_params(); mparams.n_gpu_layers = n_gpu_layers; + mparams.split_mode = split_mode; mparams.main_gpu = main_gpu; mparams.tensor_split = tensor_split.data(); + mparams.use_mmap = use_mmap; return mparams; } @@ -357,7 +505,9 @@ struct cmd_params_instance { bool equal_mparams(const cmd_params_instance & other) const { return model == other.model && n_gpu_layers == other.n_gpu_layers && + split_mode == other.split_mode && main_gpu == other.main_gpu && + use_mmap == other.use_mmap && tensor_split == other.tensor_split; } @@ -366,53 +516,30 @@ struct cmd_params_instance { cparams.n_ctx = n_prompt + n_gen; cparams.n_batch = n_batch; - cparams.f16_kv = !f32_kv; + cparams.type_k = type_k; + cparams.type_v = type_v; cparams.mul_mat_q = mul_mat_q; + cparams.offload_kqv = !no_kv_offload; return cparams; } }; -static std::vector get_cmd_params_instances_int(const cmd_params & params, int n_gen, int n_prompt) { - std::vector instances; - - for (const auto & m : params.model) - for (const auto & nl : params.n_gpu_layers) - for (const auto & mg : params.main_gpu) - for (const auto & ts : params.tensor_split) - for (const auto & nb : params.n_batch) - for (const auto & fk : params.f32_kv) - for (const auto & mmq : params.mul_mat_q) - for (const auto & nt : params.n_threads) { - cmd_params_instance instance = { - /* .model = */ m, - /* .n_prompt = */ n_prompt, - /* .n_gen = */ n_gen, - /* .n_batch = */ nb, - /* .f32_kv = */ fk, - /* .n_threads = */ nt, - /* .n_gpu_layers = */ nl, - /* .main_gpu = */ mg, - /* .mul_mat_q = */ mmq, - /* .tensor_split = */ ts, - }; - instances.push_back(instance); - } - return instances; -} - static std::vector get_cmd_params_instances(const cmd_params & params) { std::vector instances; -#if 1 // this ordering minimizes the number of times that each model needs to be reloaded for (const auto & m : params.model) for (const auto & nl : params.n_gpu_layers) + for (const auto & sm : params.split_mode) for (const auto & mg : params.main_gpu) for (const auto & ts : params.tensor_split) + for (const auto & mmp : params.use_mmap) for (const auto & nb : params.n_batch) - for (const auto & fk : params.f32_kv) + for (const auto & tk : params.type_k) + for (const auto & tv : params.type_v) for (const auto & mmq : params.mul_mat_q) + for (const auto & nkvo : params.no_kv_offload) for (const auto & nt : params.n_threads) { for (const auto & n_prompt : params.n_prompt) { if (n_prompt == 0) { @@ -423,12 +550,16 @@ static std::vector get_cmd_params_instances(const cmd_param /* .n_prompt = */ n_prompt, /* .n_gen = */ 0, /* .n_batch = */ nb, - /* .f32_kv = */ fk, + /* .type_k = */ tk, + /* .type_v = */ tv, /* .n_threads = */ nt, /* .n_gpu_layers = */ nl, + /* .split_mode = */ sm, /* .main_gpu = */ mg, + /* .no_kv_offload= */ nkvo, /* .mul_mat_q = */ mmq, /* .tensor_split = */ ts, + /* .use_mmap = */ mmp, }; instances.push_back(instance); } @@ -442,34 +573,20 @@ static std::vector get_cmd_params_instances(const cmd_param /* .n_prompt = */ 0, /* .n_gen = */ n_gen, /* .n_batch = */ nb, - /* .f32_kv = */ fk, + /* .type_k = */ tk, + /* .type_v = */ tv, /* .n_threads = */ nt, /* .n_gpu_layers = */ nl, + /* .split_mode = */ sm, /* .main_gpu = */ mg, + /* .no_kv_offload= */ nkvo, /* .mul_mat_q = */ mmq, /* .tensor_split = */ ts, + /* .use_mmap = */ mmp, }; instances.push_back(instance); } } -#else - // this ordering separates the prompt and generation tests - for (const auto & n_prompt : params.n_prompt) { - if (n_prompt == 0) { - continue; - } - auto instances_prompt = get_cmd_params_instances_int(params, 0, n_prompt); - instances.insert(instances.end(), instances_prompt.begin(), instances_prompt.end()); - } - - for (const auto & n_gen : params.n_gen) { - if (n_gen == 0) { - continue; - } - auto instances_gen = get_cmd_params_instances_int(params, n_gen, 0); - instances.insert(instances.end(), instances_gen.begin(), instances_gen.end()); - } -#endif return instances; } @@ -479,7 +596,10 @@ struct test { static const int build_number; static const bool cuda; static const bool opencl; + static const bool vulkan; + static const bool kompute; static const bool metal; + static const bool sycl; static const bool gpu_blas; static const bool blas; static const std::string cpu_info; @@ -490,11 +610,15 @@ struct test { uint64_t model_n_params; int n_batch; int n_threads; - bool f32_kv; + ggml_type type_k; + ggml_type type_v; int n_gpu_layers; + llama_split_mode split_mode; int main_gpu; + bool no_kv_offload; bool mul_mat_q; - std::array tensor_split; + std::vector tensor_split; + bool use_mmap; int n_prompt; int n_gen; std::string test_time; @@ -509,11 +633,15 @@ struct test { model_n_params = llama_model_n_params(lmodel); n_batch = inst.n_batch; n_threads = inst.n_threads; - f32_kv = inst.f32_kv; + type_k = inst.type_k; + type_v = inst.type_v; n_gpu_layers = inst.n_gpu_layers; + split_mode = inst.split_mode; main_gpu = inst.main_gpu; + no_kv_offload = inst.no_kv_offload; mul_mat_q = inst.mul_mat_q; tensor_split = inst.tensor_split; + use_mmap = inst.use_mmap; n_prompt = inst.n_prompt; n_gen = inst.n_gen; // RFC 3339 date-time format @@ -554,26 +682,38 @@ struct test { if (opencl) { return "OpenCL"; } + if (vulkan) { + return "Vulkan"; + } + if (kompute) { + return "Kompute"; + } if (metal) { return "Metal"; } + if (sycl) { + return GGML_SYCL_NAME; + } if (gpu_blas) { return "GPU BLAS"; } if (blas) { return "BLAS"; } + return "CPU"; } static const std::vector & get_fields() { static const std::vector fields = { "build_commit", "build_number", - "cuda", "opencl", "metal", "gpu_blas", "blas", + "cuda", "opencl", "vulkan", "kompute", "metal", "sycl", "gpu_blas", "blas", "cpu_info", "gpu_info", "model_filename", "model_type", "model_size", "model_n_params", - "n_batch", "n_threads", "f16_kv", - "n_gpu_layers", "main_gpu", "mul_mat_q", "tensor_split", + "n_batch", "n_threads", "type_k", "type_v", + "n_gpu_layers", "split_mode", + "main_gpu", "no_kv_offload", + "mul_mat_q", "tensor_split", "use_mmap", "n_prompt", "n_gen", "test_time", "avg_ns", "stddev_ns", "avg_ts", "stddev_ts" @@ -591,8 +731,9 @@ struct test { field == "avg_ns" || field == "stddev_ns") { return INT; } - if (field == "cuda" || field == "opencl" || field == "metal" || field == "gpu_blas" || field == "blas" || - field == "f16_kv" || field == "mul_mat_q") { + if (field == "cuda" || field == "opencl" || field == "vulkan" || field == "kompute" || field == "metal" || + field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" || + field == "mul_mat_q" || field == "use_mmap") { return BOOL; } if (field == "avg_ts" || field == "stddev_ts") { @@ -604,7 +745,7 @@ struct test { std::vector get_values() const { std::string tensor_split_str; int max_nonzero = 0; - for (int i = 0; i < LLAMA_MAX_DEVICES; i++) { + for (size_t i = 0; i < llama_max_devices(); i++) { if (tensor_split[i] > 0) { max_nonzero = i; } @@ -619,11 +760,14 @@ struct test { } std::vector values = { build_commit, std::to_string(build_number), - std::to_string(cuda), std::to_string(opencl), std::to_string(metal), std::to_string(gpu_blas), std::to_string(blas), + std::to_string(cuda), std::to_string(opencl), std::to_string(vulkan), std::to_string(vulkan), + std::to_string(metal), std::to_string(sycl), std::to_string(gpu_blas), std::to_string(blas), cpu_info, gpu_info, model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params), - std::to_string(n_batch), std::to_string(n_threads), std::to_string(!f32_kv), - std::to_string(n_gpu_layers), std::to_string(main_gpu), std::to_string(mul_mat_q), tensor_split_str, + std::to_string(n_batch), std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v), + std::to_string(n_gpu_layers), split_mode_str(split_mode), + std::to_string(main_gpu), std::to_string(no_kv_offload), + std::to_string(mul_mat_q), tensor_split_str, std::to_string(use_mmap), std::to_string(n_prompt), std::to_string(n_gen), test_time, std::to_string(avg_ns()), std::to_string(stdev_ns()), std::to_string(avg_ts()), std::to_string(stdev_ts()) @@ -641,13 +785,16 @@ struct test { } }; -const std::string test::build_commit = BUILD_COMMIT; -const int test::build_number = BUILD_NUMBER; +const std::string test::build_commit = LLAMA_COMMIT; +const int test::build_number = LLAMA_BUILD_NUMBER; const bool test::cuda = !!ggml_cpu_has_cublas(); const bool test::opencl = !!ggml_cpu_has_clblast(); +const bool test::vulkan = !!ggml_cpu_has_vulkan(); +const bool test::kompute = !!ggml_cpu_has_kompute(); const bool test::metal = !!ggml_cpu_has_metal(); const bool test::gpu_blas = !!ggml_cpu_has_gpublas(); const bool test::blas = !!ggml_cpu_has_blas(); +const bool test::sycl = !!ggml_cpu_has_sycl(); const std::string test::cpu_info = get_cpu_info(); const std::string test::gpu_info = get_gpu_info(); @@ -778,12 +925,21 @@ struct markdown_printer : public printer { if (field == "n_gpu_layers") { return "ngl"; } + if (field == "split_mode") { + return "sm"; + } if (field == "n_threads") { return "threads"; } if (field == "mul_mat_q") { return "mmq"; } + if (field == "no_kv_offload") { + return "nkvo"; + } + if (field == "use_mmap") { + return "mmap"; + } if (field == "tensor_split") { return "ts"; } @@ -792,34 +948,46 @@ struct markdown_printer : public printer { void print_header(const cmd_params & params) override { // select fields to print - fields.push_back("model"); - fields.push_back("size"); - fields.push_back("params"); - fields.push_back("backend"); + fields.emplace_back("model"); + fields.emplace_back("size"); + fields.emplace_back("params"); + fields.emplace_back("backend"); bool is_cpu_backend = test::get_backend() == "CPU" || test::get_backend() == "BLAS"; if (!is_cpu_backend) { - fields.push_back("n_gpu_layers"); + fields.emplace_back("n_gpu_layers"); } if (params.n_threads.size() > 1 || params.n_threads != cmd_params_defaults.n_threads || is_cpu_backend) { - fields.push_back("n_threads"); + fields.emplace_back("n_threads"); } if (params.n_batch.size() > 1 || params.n_batch != cmd_params_defaults.n_batch) { - fields.push_back("n_batch"); + fields.emplace_back("n_batch"); } - if (params.f32_kv.size() > 1 || params.f32_kv != cmd_params_defaults.f32_kv) { - fields.push_back("f16_kv"); + if (params.type_k.size() > 1 || params.type_k != cmd_params_defaults.type_k) { + fields.emplace_back("type_k"); + } + if (params.type_v.size() > 1 || params.type_v != cmd_params_defaults.type_v) { + fields.emplace_back("type_v"); } if (params.main_gpu.size() > 1 || params.main_gpu != cmd_params_defaults.main_gpu) { - fields.push_back("main_gpu"); + fields.emplace_back("main_gpu"); + } + if (params.split_mode.size() > 1 || params.split_mode != cmd_params_defaults.split_mode) { + fields.emplace_back("split_mode"); } if (params.mul_mat_q.size() > 1 || params.mul_mat_q != cmd_params_defaults.mul_mat_q) { - fields.push_back("mul_mat_q"); + fields.emplace_back("mul_mat_q"); + } + if (params.no_kv_offload.size() > 1 || params.no_kv_offload != cmd_params_defaults.no_kv_offload) { + fields.emplace_back("no_kv_offload"); } if (params.tensor_split.size() > 1 || params.tensor_split != cmd_params_defaults.tensor_split) { - fields.push_back("tensor_split"); + fields.emplace_back("tensor_split"); } - fields.push_back("test"); - fields.push_back("t/s"); + if (params.use_mmap.size() > 1 || params.use_mmap != cmd_params_defaults.use_mmap) { + fields.emplace_back("use_mmap"); + } + fields.emplace_back("test"); + fields.emplace_back("t/s"); fprintf(fout, "|"); for (const auto & field : fields) { @@ -933,7 +1101,7 @@ struct sql_printer : public printer { }; static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_batch, int n_threads) { - std::vector tokens(n_batch, llama_token_bos(ctx)); + std::vector tokens(n_batch, llama_token_bos(llama_get_model(ctx))); int n_processed = 0; llama_set_n_threads(ctx, n_threads, n_threads); @@ -946,7 +1114,7 @@ static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_bat } static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads) { - llama_token token = llama_token_bos(ctx); + llama_token token = llama_token_bos(llama_get_model(ctx)); llama_set_n_threads(ctx, n_threads, n_threads); @@ -983,8 +1151,7 @@ int main(int argc, char ** argv) { if (!params.verbose) { llama_log_set(llama_null_log_callback, NULL); } - bool numa = false; - llama_backend_init(numa); + llama_backend_init(); // initialize printer std::unique_ptr p; @@ -1037,7 +1204,7 @@ int main(int argc, char ** argv) { test t(inst, lmodel, ctx); - llama_kv_cache_tokens_rm(ctx, -1, -1); + llama_kv_cache_clear(ctx); // warmup run if (t.n_prompt > 0) { @@ -1048,7 +1215,7 @@ int main(int argc, char ** argv) { } for (int i = 0; i < params.reps; i++) { - llama_kv_cache_tokens_rm(ctx, -1, -1); + llama_kv_cache_clear(ctx); uint64_t t_start = get_time_ns(); if (t.n_prompt > 0) { diff --git a/examples/llama.android/.gitignore b/examples/llama.android/.gitignore new file mode 100644 index 000000000..347e252ef --- /dev/null +++ b/examples/llama.android/.gitignore @@ -0,0 +1,33 @@ +# Gradle files +.gradle/ +build/ + +# Local configuration file (sdk path, etc) +local.properties + +# Log/OS Files +*.log + +# Android Studio generated files and folders +captures/ +.externalNativeBuild/ +.cxx/ +*.apk +output.json + +# IntelliJ +*.iml +.idea/ +misc.xml +deploymentTargetDropDown.xml +render.experimental.xml + +# Keystore files +*.jks +*.keystore + +# Google Services (e.g. APIs or Firebase) +google-services.json + +# Android Profiling +*.hprof diff --git a/examples/llama.android/README.md b/examples/llama.android/README.md new file mode 100644 index 000000000..e69de29bb diff --git a/examples/llama.android/app/.gitignore b/examples/llama.android/app/.gitignore new file mode 100644 index 000000000..796b96d1c --- /dev/null +++ b/examples/llama.android/app/.gitignore @@ -0,0 +1 @@ +/build diff --git a/examples/llama.android/app/build.gradle.kts b/examples/llama.android/app/build.gradle.kts new file mode 100644 index 000000000..aadbe22c9 --- /dev/null +++ b/examples/llama.android/app/build.gradle.kts @@ -0,0 +1,92 @@ +plugins { + id("com.android.application") + id("org.jetbrains.kotlin.android") +} + +android { + namespace = "com.example.llama" + compileSdk = 34 + + ndkVersion = "26.1.10909125" + + defaultConfig { + applicationId = "com.example.llama" + minSdk = 33 + targetSdk = 34 + versionCode = 1 + versionName = "1.0" + + testInstrumentationRunner = "androidx.test.runner.AndroidJUnitRunner" + vectorDrawables { + useSupportLibrary = true + } + ndk { + // Workaround for https://github.com/llvm/llvm-project/issues/65820 + // affecting armeabi-v7a. Skip armeabi-v7a when invoked with + // -Pskip-armeabi-v7a (e.g., ./gradlew build -Pskip-armeabi-v7a). + if (project.hasProperty("skip-armeabi-v7a")) { + abiFilters += listOf("arm64-v8a", "x86_64", "x86") + } + } + externalNativeBuild { + cmake { + arguments += "-DCMAKE_BUILD_TYPE=Release" + cppFlags += listOf() + arguments += listOf() + } + } + } + + buildTypes { + release { + isMinifyEnabled = false + proguardFiles( + getDefaultProguardFile("proguard-android-optimize.txt"), + "proguard-rules.pro" + ) + } + } + compileOptions { + sourceCompatibility = JavaVersion.VERSION_1_8 + targetCompatibility = JavaVersion.VERSION_1_8 + } + kotlinOptions { + jvmTarget = "1.8" + } + buildFeatures { + compose = true + } + composeOptions { + kotlinCompilerExtensionVersion = "1.5.1" + } + packaging { + resources { + excludes += "/META-INF/{AL2.0,LGPL2.1}" + } + } + externalNativeBuild { + cmake { + path = file("src/main/cpp/CMakeLists.txt") + version = "3.22.1" + } + } +} + +dependencies { + + implementation("androidx.core:core-ktx:1.12.0") + implementation("androidx.lifecycle:lifecycle-runtime-ktx:2.6.2") + implementation("androidx.activity:activity-compose:1.8.2") + implementation(platform("androidx.compose:compose-bom:2023.08.00")) + implementation("androidx.compose.ui:ui") + implementation("androidx.compose.ui:ui-graphics") + implementation("androidx.compose.ui:ui-tooling-preview") + implementation("androidx.compose.material3:material3") + testImplementation("junit:junit:4.13.2") + androidTestImplementation("androidx.test.ext:junit:1.1.5") + androidTestImplementation("androidx.test.espresso:espresso-core:3.5.1") + androidTestImplementation(platform("androidx.compose:compose-bom:2023.08.00")) + androidTestImplementation("androidx.compose.ui:ui-test-junit4") + debugImplementation("androidx.compose.ui:ui-tooling") + debugImplementation("androidx.compose.ui:ui-test-manifest") +} diff --git a/examples/llama.android/app/proguard-rules.pro b/examples/llama.android/app/proguard-rules.pro new file mode 100644 index 000000000..f1b424510 --- /dev/null +++ b/examples/llama.android/app/proguard-rules.pro @@ -0,0 +1,21 @@ +# Add project specific ProGuard rules here. +# You can control the set of applied configuration files using the +# proguardFiles setting in build.gradle. +# +# For more details, see +# http://developer.android.com/guide/developing/tools/proguard.html + +# If your project uses WebView with JS, uncomment the following +# and specify the fully qualified class name to the JavaScript interface +# class: +#-keepclassmembers class fqcn.of.javascript.interface.for.webview { +# public *; +#} + +# Uncomment this to preserve the line number information for +# debugging stack traces. +#-keepattributes SourceFile,LineNumberTable + +# If you keep the line number information, uncomment this to +# hide the original source file name. +#-renamesourcefileattribute SourceFile diff --git a/examples/llama.android/app/src/main/AndroidManifest.xml b/examples/llama.android/app/src/main/AndroidManifest.xml new file mode 100644 index 000000000..41a358a29 --- /dev/null +++ b/examples/llama.android/app/src/main/AndroidManifest.xml @@ -0,0 +1,30 @@ + + + + + + + + + + + + + + + + + diff --git a/examples/llama.android/app/src/main/cpp/CMakeLists.txt b/examples/llama.android/app/src/main/cpp/CMakeLists.txt new file mode 100644 index 000000000..85139329a --- /dev/null +++ b/examples/llama.android/app/src/main/cpp/CMakeLists.txt @@ -0,0 +1,50 @@ + +# For more information about using CMake with Android Studio, read the +# documentation: https://d.android.com/studio/projects/add-native-code.html. +# For more examples on how to use CMake, see https://github.com/android/ndk-samples. + +# Sets the minimum CMake version required for this project. +cmake_minimum_required(VERSION 3.22.1) + +# Declares the project name. The project name can be accessed via ${ PROJECT_NAME}, +# Since this is the top level CMakeLists.txt, the project name is also accessible +# with ${CMAKE_PROJECT_NAME} (both CMake variables are in-sync within the top level +# build script scope). +project("llama-android") + +include(FetchContent) +FetchContent_Declare( + llama + GIT_REPOSITORY https://github.com/ggerganov/llama.cpp + GIT_TAG master +) + +# Also provides "common" +FetchContent_MakeAvailable(llama) + +# Creates and names a library, sets it as either STATIC +# or SHARED, and provides the relative paths to its source code. +# You can define multiple libraries, and CMake builds them for you. +# Gradle automatically packages shared libraries with your APK. +# +# In this top level CMakeLists.txt, ${CMAKE_PROJECT_NAME} is used to define +# the target library name; in the sub-module's CMakeLists.txt, ${PROJECT_NAME} +# is preferred for the same purpose. +# +# In order to load a library into your app from Java/Kotlin, you must call +# System.loadLibrary() and pass the name of the library defined here; +# for GameActivity/NativeActivity derived applications, the same library name must be +# used in the AndroidManifest.xml file. +add_library(${CMAKE_PROJECT_NAME} SHARED + # List C/C++ source files with relative paths to this CMakeLists.txt. + llama-android.cpp) + +# Specifies libraries CMake should link to your target library. You +# can link libraries from various origins, such as libraries defined in this +# build script, prebuilt third-party libraries, or Android system libraries. +target_link_libraries(${CMAKE_PROJECT_NAME} + # List libraries link to the target library + llama + common + android + log) diff --git a/examples/llama.android/app/src/main/cpp/llama-android.cpp b/examples/llama.android/app/src/main/cpp/llama-android.cpp new file mode 100644 index 000000000..2beb1e0d5 --- /dev/null +++ b/examples/llama.android/app/src/main/cpp/llama-android.cpp @@ -0,0 +1,394 @@ +#include +#include +#include +#include +#include +#include +#include "llama.h" +#include "common/common.h" + +// Write C++ code here. +// +// Do not forget to dynamically load the C++ library into your application. +// +// For instance, +// +// In MainActivity.java: +// static { +// System.loadLibrary("llama-android"); +// } +// +// Or, in MainActivity.kt: +// companion object { +// init { +// System.loadLibrary("llama-android") +// } +// } + +#define TAG "llama-android.cpp" +#define LOGi(...) __android_log_print(ANDROID_LOG_INFO, TAG, __VA_ARGS__) +#define LOGe(...) __android_log_print(ANDROID_LOG_ERROR, TAG, __VA_ARGS__) + +jclass la_int_var; +jmethodID la_int_var_value; +jmethodID la_int_var_inc; + +static void log_callback(ggml_log_level level, const char * fmt, void * data) { + if (level == GGML_LOG_LEVEL_ERROR) __android_log_print(ANDROID_LOG_ERROR, TAG, fmt, data); + else if (level == GGML_LOG_LEVEL_INFO) __android_log_print(ANDROID_LOG_INFO, TAG, fmt, data); + else if (level == GGML_LOG_LEVEL_WARN) __android_log_print(ANDROID_LOG_WARN, TAG, fmt, data); + else __android_log_print(ANDROID_LOG_DEFAULT, TAG, fmt, data); +} + +extern "C" +JNIEXPORT jlong JNICALL +Java_com_example_llama_Llm_load_1model(JNIEnv *env, jobject, jstring filename) { + llama_model_params model_params = llama_model_default_params(); + + auto path_to_model = env->GetStringUTFChars(filename, 0); + LOGi("Loading model from %s", path_to_model); + + auto model = llama_load_model_from_file(path_to_model, model_params); + env->ReleaseStringUTFChars(filename, path_to_model); + + if (!model) { + LOGe("load_model() failed"); + env->ThrowNew(env->FindClass("java/lang/IllegalStateException"), "load_model() failed"); + return 0; + } + + return reinterpret_cast(model); +} + +extern "C" +JNIEXPORT void JNICALL +Java_com_example_llama_Llm_free_1model(JNIEnv *, jobject, jlong model) { + llama_free_model(reinterpret_cast(model)); +} + +extern "C" +JNIEXPORT jlong JNICALL +Java_com_example_llama_Llm_new_1context(JNIEnv *env, jobject, jlong jmodel) { + auto model = reinterpret_cast(jmodel); + + if (!model) { + LOGe("new_context(): model cannot be null"); + env->ThrowNew(env->FindClass("java/lang/IllegalArgumentException"), "Model cannot be null"); + return 0; + } + + int n_threads = std::max(1, std::min(8, (int) sysconf(_SC_NPROCESSORS_ONLN) - 2)); + LOGi("Using %d threads", n_threads); + + llama_context_params ctx_params = llama_context_default_params(); + ctx_params.seed = 1234; + ctx_params.n_ctx = 2048; + ctx_params.n_threads = n_threads; + ctx_params.n_threads_batch = n_threads; + + llama_context * context = llama_new_context_with_model(model, ctx_params); + + if (!context) { + LOGe("llama_new_context_with_model() returned null)"); + env->ThrowNew(env->FindClass("java/lang/IllegalStateException"), + "llama_new_context_with_model() returned null)"); + return 0; + } + + return reinterpret_cast(context); +} + +extern "C" +JNIEXPORT void JNICALL +Java_com_example_llama_Llm_free_1context(JNIEnv *, jobject, jlong context) { + llama_free(reinterpret_cast(context)); +} + +extern "C" +JNIEXPORT void JNICALL +Java_com_example_llama_Llm_backend_1free(JNIEnv *, jobject) { + llama_backend_free(); +} + +extern "C" +JNIEXPORT void JNICALL +Java_com_example_llama_Llm_log_1to_1android(JNIEnv *, jobject) { + llama_log_set(log_callback, NULL); +} + +extern "C" +JNIEXPORT jstring JNICALL +Java_com_example_llama_Llm_bench_1model( + JNIEnv *env, + jobject, + jlong context_pointer, + jlong model_pointer, + jlong batch_pointer, + jint pp, + jint tg, + jint pl, + jint nr + ) { + auto pp_avg = 0.0; + auto tg_avg = 0.0; + auto pp_std = 0.0; + auto tg_std = 0.0; + + const auto context = reinterpret_cast(context_pointer); + const auto model = reinterpret_cast(model_pointer); + const auto batch = reinterpret_cast(batch_pointer); + + const int n_ctx = llama_n_ctx(context); + + LOGi("n_ctx = %d", n_ctx); + + int i, j; + int nri; + for (nri = 0; nri < nr; nri++) { + LOGi("Benchmark prompt processing (pp)"); + + llama_batch_clear(*batch); + + const int n_tokens = pp; + for (i = 0; i < n_tokens; i++) { + llama_batch_add(*batch, 0, i, { 0 }, false); + } + + batch->logits[batch->n_tokens - 1] = true; + llama_kv_cache_clear(context); + + const auto t_pp_start = ggml_time_us(); + if (llama_decode(context, *batch) != 0) { + LOGi("llama_decode() failed during prompt processing"); + } + const auto t_pp_end = ggml_time_us(); + + // bench text generation + + LOGi("Benchmark text generation (tg)"); + + llama_kv_cache_clear(context); + const auto t_tg_start = ggml_time_us(); + for (i = 0; i < tg; i++) { + + llama_batch_clear(*batch); + for (j = 0; j < pl; j++) { + llama_batch_add(*batch, 0, i, { j }, true); + } + + LOGi("llama_decode() text generation: %d", i); + if (llama_decode(context, *batch) != 0) { + LOGi("llama_decode() failed during text generation"); + } + } + + const auto t_tg_end = ggml_time_us(); + + llama_kv_cache_clear(context); + + const auto t_pp = double(t_pp_end - t_pp_start) / 1000000.0; + const auto t_tg = double(t_tg_end - t_tg_start) / 1000000.0; + + const auto speed_pp = double(pp) / t_pp; + const auto speed_tg = double(pl * tg) / t_tg; + + pp_avg += speed_pp; + tg_avg += speed_tg; + + pp_std += speed_pp * speed_pp; + tg_std += speed_tg * speed_tg; + + LOGi("pp %f t/s, tg %f t/s", speed_pp, speed_tg); + } + + pp_avg /= double(nr); + tg_avg /= double(nr); + + if (nr > 1) { + pp_std = sqrt(pp_std / double(nr - 1) - pp_avg * pp_avg * double(nr) / double(nr - 1)); + tg_std = sqrt(tg_std / double(nr - 1) - tg_avg * tg_avg * double(nr) / double(nr - 1)); + } else { + pp_std = 0; + tg_std = 0; + } + + char model_desc[128]; + llama_model_desc(model, model_desc, sizeof(model_desc)); + + const auto model_size = double(llama_model_size(model)) / 1024.0 / 1024.0 / 1024.0; + const auto model_n_params = double(llama_model_n_params(model)) / 1e9; + + const auto backend = "(Android)"; // TODO: What should this be? + + std::stringstream result; + result << std::setprecision(2); + result << "| model | size | params | backend | test | t/s |\n"; + result << "| --- | --- | --- | --- | --- | --- |\n"; + result << "| " << model_desc << " | " << model_size << "GiB | " << model_n_params << "B | " << backend << " | pp " << pp << " | " << pp_avg << " ± " << pp_std << " |\n"; + result << "| " << model_desc << " | " << model_size << "GiB | " << model_n_params << "B | " << backend << " | tg " << tg << " | " << tg_avg << " ± " << tg_std << " |\n"; + + return env->NewStringUTF(result.str().c_str()); +} + +extern "C" +JNIEXPORT void JNICALL +Java_com_example_llama_Llm_free_1batch(JNIEnv *, jobject, jlong batch_pointer) { + llama_batch_free(*reinterpret_cast(batch_pointer)); +} + +extern "C" +JNIEXPORT jlong JNICALL +Java_com_example_llama_Llm_new_1batch(JNIEnv *, jobject, jint n_tokens, jint embd, jint n_seq_max) { + + // Source: Copy of llama.cpp:llama_batch_init but heap-allocated. + + llama_batch *batch = new llama_batch { + 0, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + 0, + 0, + 0, + }; + + if (embd) { + batch->embd = (float *) malloc(sizeof(float) * n_tokens * embd); + } else { + batch->token = (llama_token *) malloc(sizeof(llama_token) * n_tokens); + } + + batch->pos = (llama_pos *) malloc(sizeof(llama_pos) * n_tokens); + batch->n_seq_id = (int32_t *) malloc(sizeof(int32_t) * n_tokens); + batch->seq_id = (llama_seq_id **) malloc(sizeof(llama_seq_id *) * n_tokens); + for (int i = 0; i < n_tokens; ++i) { + batch->seq_id[i] = (llama_seq_id *) malloc(sizeof(llama_seq_id) * n_seq_max); + } + batch->logits = (int8_t *) malloc(sizeof(int8_t) * n_tokens); + + return reinterpret_cast(batch); +} + +extern "C" +JNIEXPORT void JNICALL +Java_com_example_llama_Llm_backend_1init(JNIEnv *, jobject) { + llama_backend_init(); +} + +extern "C" +JNIEXPORT jstring JNICALL +Java_com_example_llama_Llm_system_1info(JNIEnv *env, jobject) { + return env->NewStringUTF(llama_print_system_info()); +} + +extern "C" +JNIEXPORT jint JNICALL +Java_com_example_llama_Llm_completion_1init( + JNIEnv *env, + jobject, + jlong context_pointer, + jlong batch_pointer, + jstring jtext, + jint n_len + ) { + + const auto text = env->GetStringUTFChars(jtext, 0); + const auto context = reinterpret_cast(context_pointer); + const auto batch = reinterpret_cast(batch_pointer); + + const auto tokens_list = llama_tokenize(context, text, 1); + + auto n_ctx = llama_n_ctx(context); + auto n_kv_req = tokens_list.size() + (n_len - tokens_list.size()); + + LOGi("n_len = %d, n_ctx = %d, n_kv_req = %d", n_len, n_ctx, n_kv_req); + + if (n_kv_req > n_ctx) { + LOGe("error: n_kv_req > n_ctx, the required KV cache size is not big enough"); + } + + for (auto id : tokens_list) { + LOGi("%s", llama_token_to_piece(context, id).c_str()); + } + + llama_batch_clear(*batch); + + // evaluate the initial prompt + for (auto i = 0; i < tokens_list.size(); i++) { + llama_batch_add(*batch, tokens_list[i], i, { 0 }, false); + } + + // llama_decode will output logits only for the last token of the prompt + batch->logits[batch->n_tokens - 1] = true; + + if (llama_decode(context, *batch) != 0) { + LOGe("llama_decode() failed"); + } + + env->ReleaseStringUTFChars(jtext, text); + + return batch->n_tokens; +} + +extern "C" +JNIEXPORT jstring JNICALL +Java_com_example_llama_Llm_completion_1loop( + JNIEnv * env, + jobject, + jlong context_pointer, + jlong batch_pointer, + jint n_len, + jobject intvar_ncur +) { + const auto context = reinterpret_cast(context_pointer); + const auto batch = reinterpret_cast(batch_pointer); + const auto model = llama_get_model(context); + + if (!la_int_var) la_int_var = env->GetObjectClass(intvar_ncur); + if (!la_int_var_value) la_int_var_value = env->GetMethodID(la_int_var, "getValue", "()I"); + if (!la_int_var_inc) la_int_var_inc = env->GetMethodID(la_int_var, "inc", "()V"); + + auto n_vocab = llama_n_vocab(model); + auto logits = llama_get_logits_ith(context, batch->n_tokens - 1); + + std::vector candidates; + candidates.reserve(n_vocab); + + for (llama_token token_id = 0; token_id < n_vocab; token_id++) { + candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f }); + } + + llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; + + // sample the most likely token + const auto new_token_id = llama_sample_token_greedy(context, &candidates_p); + + const auto n_cur = env->CallIntMethod(intvar_ncur, la_int_var_value); + if (new_token_id == llama_token_eos(model) || n_cur == n_len) { + return env->NewStringUTF(""); + } + + auto new_token_chars = llama_token_to_piece(context, new_token_id); + LOGi("new_token_chars: `%s`", new_token_chars.c_str()); + auto new_token = env->NewStringUTF(new_token_chars.c_str()); + + llama_batch_clear(*batch); + llama_batch_add(*batch, new_token_id, n_cur, { 0 }, true); + + env->CallVoidMethod(intvar_ncur, la_int_var_inc); + + if (llama_decode(context, *batch) != 0) { + LOGe("llama_decode() returned null"); + } + + return new_token; +} + +extern "C" +JNIEXPORT void JNICALL +Java_com_example_llama_Llm_kv_1cache_1clear(JNIEnv *, jobject, jlong context) { + llama_kv_cache_clear(reinterpret_cast(context)); +} diff --git a/examples/llama.android/app/src/main/java/com/example/llama/Downloadable.kt b/examples/llama.android/app/src/main/java/com/example/llama/Downloadable.kt new file mode 100644 index 000000000..78c231ae5 --- /dev/null +++ b/examples/llama.android/app/src/main/java/com/example/llama/Downloadable.kt @@ -0,0 +1,119 @@ +package com.example.llama + +import android.app.DownloadManager +import android.net.Uri +import android.util.Log +import androidx.compose.material3.Button +import androidx.compose.material3.Text +import androidx.compose.runtime.Composable +import androidx.compose.runtime.getValue +import androidx.compose.runtime.mutableDoubleStateOf +import androidx.compose.runtime.mutableStateOf +import androidx.compose.runtime.remember +import androidx.compose.runtime.rememberCoroutineScope +import androidx.compose.runtime.setValue +import androidx.core.database.getLongOrNull +import androidx.core.net.toUri +import kotlinx.coroutines.delay +import kotlinx.coroutines.launch +import java.io.File + +data class Downloadable(val name: String, val source: Uri, val destination: File) { + companion object { + @JvmStatic + private val tag: String? = this::class.qualifiedName + + sealed interface State + data object Ready: State + data class Downloading(val id: Long): State + data class Downloaded(val downloadable: Downloadable): State + data class Error(val message: String): State + + @JvmStatic + @Composable + fun Button(viewModel: MainViewModel, dm: DownloadManager, item: Downloadable) { + var status: State by remember { + mutableStateOf( + if (item.destination.exists()) Downloaded(item) + else Ready + ) + } + var progress by remember { mutableDoubleStateOf(0.0) } + + val coroutineScope = rememberCoroutineScope() + + suspend fun waitForDownload(result: Downloading, item: Downloadable): State { + while (true) { + val cursor = dm.query(DownloadManager.Query().setFilterById(result.id)) + + if (cursor == null) { + Log.e(tag, "dm.query() returned null") + return Error("dm.query() returned null") + } + + if (!cursor.moveToFirst() || cursor.count < 1) { + cursor.close() + Log.i(tag, "cursor.moveToFirst() returned false or cursor.count < 1, download canceled?") + return Ready + } + + val pix = cursor.getColumnIndex(DownloadManager.COLUMN_BYTES_DOWNLOADED_SO_FAR) + val tix = cursor.getColumnIndex(DownloadManager.COLUMN_TOTAL_SIZE_BYTES) + val sofar = cursor.getLongOrNull(pix) ?: 0 + val total = cursor.getLongOrNull(tix) ?: 1 + cursor.close() + + if (sofar == total) { + return Downloaded(item) + } + + progress = (sofar * 1.0) / total + + delay(1000L) + } + } + + fun onClick() { + when (val s = status) { + is Downloaded -> { + viewModel.load(item.destination.path) + } + + is Downloading -> { + coroutineScope.launch { + status = waitForDownload(s, item) + } + } + + else -> { + item.destination.delete() + + val request = DownloadManager.Request(item.source).apply { + setTitle("Downloading model") + setDescription("Downloading model: ${item.name}") + setAllowedNetworkTypes(DownloadManager.Request.NETWORK_WIFI) + setDestinationUri(item.destination.toUri()) + } + + viewModel.log("Saving ${item.name} to ${item.destination.path}") + Log.i(tag, "Saving ${item.name} to ${item.destination.path}") + + val id = dm.enqueue(request) + status = Downloading(id) + onClick() + } + } + } + + Button(onClick = { onClick() }, enabled = status !is Downloading) { + when (status) { + is Downloading -> Text(text = "Downloading ${(progress * 100).toInt()}%") + is Downloaded -> Text("Load ${item.name}") + is Ready -> Text("Download ${item.name}") + is Error -> Text("Download ${item.name}") + } + } + } + + } +} diff --git a/examples/llama.android/app/src/main/java/com/example/llama/Llm.kt b/examples/llama.android/app/src/main/java/com/example/llama/Llm.kt new file mode 100644 index 000000000..5f3270372 --- /dev/null +++ b/examples/llama.android/app/src/main/java/com/example/llama/Llm.kt @@ -0,0 +1,172 @@ +package com.example.llama + +import android.util.Log +import kotlinx.coroutines.CoroutineDispatcher +import kotlinx.coroutines.asCoroutineDispatcher +import kotlinx.coroutines.flow.Flow +import kotlinx.coroutines.flow.flow +import kotlinx.coroutines.flow.flowOn +import kotlinx.coroutines.withContext +import java.util.concurrent.Executors +import kotlin.concurrent.thread + +class Llm { + private val tag: String? = this::class.simpleName + + private val threadLocalState: ThreadLocal = ThreadLocal.withInitial { State.Idle } + + private val runLoop: CoroutineDispatcher = Executors.newSingleThreadExecutor { + thread(start = false, name = "Llm-RunLoop") { + Log.d(tag, "Dedicated thread for native code: ${Thread.currentThread().name}") + + // No-op if called more than once. + System.loadLibrary("llama-android") + + // Set llama log handler to Android + log_to_android() + backend_init(false) + + Log.d(tag, system_info()) + + it.run() + }.apply { + uncaughtExceptionHandler = Thread.UncaughtExceptionHandler { _, exception: Throwable -> + Log.e(tag, "Unhandled exception", exception) + } + } + }.asCoroutineDispatcher() + + private val nlen: Int = 64 + + private external fun log_to_android() + private external fun load_model(filename: String): Long + private external fun free_model(model: Long) + private external fun new_context(model: Long): Long + private external fun free_context(context: Long) + private external fun backend_init(numa: Boolean) + private external fun backend_free() + private external fun free_batch(batch: Long) + private external fun new_batch(nTokens: Int, embd: Int, nSeqMax: Int): Long + private external fun bench_model( + context: Long, + model: Long, + batch: Long, + pp: Int, + tg: Int, + pl: Int, + nr: Int + ): String + + private external fun system_info(): String + + private external fun completion_init( + context: Long, + batch: Long, + text: String, + nLen: Int + ): Int + + private external fun completion_loop( + context: Long, + batch: Long, + nLen: Int, + ncur: IntVar + ): String + + private external fun kv_cache_clear(context: Long) + + suspend fun bench(pp: Int, tg: Int, pl: Int, nr: Int = 1): String { + return withContext(runLoop) { + when (val state = threadLocalState.get()) { + is State.Loaded -> { + Log.d(tag, "bench(): $state") + bench_model(state.context, state.model, state.batch, pp, tg, pl, nr) + } + + else -> throw IllegalStateException("No model loaded") + } + } + } + + suspend fun load(pathToModel: String) { + withContext(runLoop) { + when (threadLocalState.get()) { + is State.Idle -> { + val model = load_model(pathToModel) + if (model == 0L) throw IllegalStateException("load_model() failed") + + val context = new_context(model) + if (context == 0L) throw IllegalStateException("new_context() failed") + + val batch = new_batch(512, 0, 1) + if (batch == 0L) throw IllegalStateException("new_batch() failed") + + Log.i(tag, "Loaded model $pathToModel") + threadLocalState.set(State.Loaded(model, context, batch)) + } + else -> throw IllegalStateException("Model already loaded") + } + } + } + + fun send(message: String): Flow = flow { + when (val state = threadLocalState.get()) { + is State.Loaded -> { + val ncur = IntVar(completion_init(state.context, state.batch, message, nlen)) + while (ncur.value <= nlen) { + val str = completion_loop(state.context, state.batch, nlen, ncur) + if (str.isEmpty()) { + break + } + emit(str) + } + kv_cache_clear(state.context) + } + else -> {} + } + }.flowOn(runLoop) + + /** + * Unloads the model and frees resources. + * + * This is a no-op if there's no model loaded. + */ + suspend fun unload() { + withContext(runLoop) { + when (val state = threadLocalState.get()) { + is State.Loaded -> { + free_context(state.context) + free_model(state.model) + free_batch(state.batch) + + threadLocalState.set(State.Idle) + } + else -> {} + } + } + } + + companion object { + private class IntVar(value: Int) { + @Volatile + var value: Int = value + private set + + fun inc() { + synchronized(this) { + value += 1 + } + } + } + + private sealed interface State { + data object Idle: State + data class Loaded(val model: Long, val context: Long, val batch: Long): State + } + + // Enforce only one instance of Llm. + private val _instance: Llm = Llm() + + fun instance(): Llm = _instance + } +} diff --git a/examples/llama.android/app/src/main/java/com/example/llama/MainActivity.kt b/examples/llama.android/app/src/main/java/com/example/llama/MainActivity.kt new file mode 100644 index 000000000..9da04f7d3 --- /dev/null +++ b/examples/llama.android/app/src/main/java/com/example/llama/MainActivity.kt @@ -0,0 +1,154 @@ +package com.example.llama + +import android.app.ActivityManager +import android.app.DownloadManager +import android.content.ClipData +import android.content.ClipboardManager +import android.net.Uri +import android.os.Bundle +import android.os.StrictMode +import android.os.StrictMode.VmPolicy +import android.text.format.Formatter +import androidx.activity.ComponentActivity +import androidx.activity.compose.setContent +import androidx.activity.viewModels +import androidx.compose.foundation.layout.Box +import androidx.compose.foundation.layout.Column +import androidx.compose.foundation.layout.Row +import androidx.compose.foundation.layout.fillMaxSize +import androidx.compose.foundation.layout.padding +import androidx.compose.foundation.lazy.LazyColumn +import androidx.compose.foundation.lazy.items +import androidx.compose.foundation.lazy.rememberLazyListState +import androidx.compose.material3.Button +import androidx.compose.material3.LocalContentColor +import androidx.compose.material3.MaterialTheme +import androidx.compose.material3.OutlinedTextField +import androidx.compose.material3.Surface +import androidx.compose.material3.Text +import androidx.compose.runtime.Composable +import androidx.compose.ui.Modifier +import androidx.compose.ui.unit.dp +import androidx.core.content.getSystemService +import com.example.llama.ui.theme.LlamaAndroidTheme +import java.io.File + +class MainActivity( + activityManager: ActivityManager? = null, + downloadManager: DownloadManager? = null, + clipboardManager: ClipboardManager? = null, +): ComponentActivity() { + private val tag: String? = this::class.simpleName + + private val activityManager by lazy { activityManager ?: getSystemService()!! } + private val downloadManager by lazy { downloadManager ?: getSystemService()!! } + private val clipboardManager by lazy { clipboardManager ?: getSystemService()!! } + + private val viewModel: MainViewModel by viewModels() + + // Get a MemoryInfo object for the device's current memory status. + private fun availableMemory(): ActivityManager.MemoryInfo { + return ActivityManager.MemoryInfo().also { memoryInfo -> + activityManager.getMemoryInfo(memoryInfo) + } + } + + override fun onCreate(savedInstanceState: Bundle?) { + super.onCreate(savedInstanceState) + + StrictMode.setVmPolicy( + VmPolicy.Builder(StrictMode.getVmPolicy()) + .detectLeakedClosableObjects() + .build() + ) + + val free = Formatter.formatFileSize(this, availableMemory().availMem) + val total = Formatter.formatFileSize(this, availableMemory().totalMem) + + viewModel.log("Current memory: $free / $total") + viewModel.log("Downloads directory: ${getExternalFilesDir(null)}") + + val extFilesDir = getExternalFilesDir(null) + + val models = listOf( + Downloadable( + "Phi-2 7B (Q4_0, 1.6 GiB)", + Uri.parse("https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf?download=true"), + File(extFilesDir, "phi-2-q4_0.gguf"), + ), + Downloadable( + "TinyLlama 1.1B (f16, 2.2 GiB)", + Uri.parse("https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf?download=true"), + File(extFilesDir, "tinyllama-1.1-f16.gguf"), + ), + Downloadable( + "Phi 2 DPO (Q3_K_M, 1.48 GiB)", + Uri.parse("https://huggingface.co/TheBloke/phi-2-dpo-GGUF/resolve/main/phi-2-dpo.Q3_K_M.gguf?download=true"), + File(extFilesDir, "phi-2-dpo.Q3_K_M.gguf") + ), + ) + + setContent { + LlamaAndroidTheme { + // A surface container using the 'background' color from the theme + Surface( + modifier = Modifier.fillMaxSize(), + color = MaterialTheme.colorScheme.background + ) { + MainCompose( + viewModel, + clipboardManager, + downloadManager, + models, + ) + } + + } + } + } +} + +@Composable +fun MainCompose( + viewModel: MainViewModel, + clipboard: ClipboardManager, + dm: DownloadManager, + models: List +) { + Column { + val scrollState = rememberLazyListState() + + Box(modifier = Modifier.weight(1f)) { + LazyColumn(state = scrollState) { + items(viewModel.messages) { + Text( + it, + style = MaterialTheme.typography.bodyLarge.copy(color = LocalContentColor.current), + modifier = Modifier.padding(16.dp) + ) + } + } + } + OutlinedTextField( + value = viewModel.message, + onValueChange = { viewModel.updateMessage(it) }, + label = { Text("Message") }, + ) + Row { + Button({ viewModel.send() }) { Text("Send") } + Button({ viewModel.bench(8, 4, 1) }) { Text("Bench") } + Button({ viewModel.clear() }) { Text("Clear") } + Button({ + viewModel.messages.joinToString("\n").let { + clipboard.setPrimaryClip(ClipData.newPlainText("", it)) + } + }) { Text("Copy") } + } + + Column { + for (model in models) { + Downloadable.Button(viewModel, dm, model) + } + } + } +} diff --git a/examples/llama.android/app/src/main/java/com/example/llama/MainViewModel.kt b/examples/llama.android/app/src/main/java/com/example/llama/MainViewModel.kt new file mode 100644 index 000000000..be95e2221 --- /dev/null +++ b/examples/llama.android/app/src/main/java/com/example/llama/MainViewModel.kt @@ -0,0 +1,104 @@ +package com.example.llama + +import android.util.Log +import androidx.compose.runtime.getValue +import androidx.compose.runtime.mutableStateOf +import androidx.compose.runtime.setValue +import androidx.lifecycle.ViewModel +import androidx.lifecycle.viewModelScope +import kotlinx.coroutines.flow.catch +import kotlinx.coroutines.launch + +class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() { + companion object { + @JvmStatic + private val NanosPerSecond = 1_000_000_000.0 + } + + private val tag: String? = this::class.simpleName + + var messages by mutableStateOf(listOf("Initializing...")) + private set + + var message by mutableStateOf("") + private set + + override fun onCleared() { + super.onCleared() + + viewModelScope.launch { + try { + llm.unload() + } catch (exc: IllegalStateException) { + messages += exc.message!! + } + } + } + + fun send() { + val text = message + message = "" + + // Add to messages console. + messages += text + messages += "" + + viewModelScope.launch { + llm.send(text) + .catch { + Log.e(tag, "send() failed", it) + messages += it.message!! + } + .collect { messages = messages.dropLast(1) + (messages.last() + it) } + } + } + + fun bench(pp: Int, tg: Int, pl: Int, nr: Int = 1) { + viewModelScope.launch { + try { + val start = System.nanoTime() + val warmupResult = llm.bench(pp, tg, pl, nr) + val end = System.nanoTime() + + messages += warmupResult + + val warmup = (end - start).toDouble() / NanosPerSecond + messages += "Warm up time: $warmup seconds, please wait..." + + if (warmup > 5.0) { + messages += "Warm up took too long, aborting benchmark" + return@launch + } + + messages += llm.bench(512, 128, 1, 3) + } catch (exc: IllegalStateException) { + Log.e(tag, "bench() failed", exc) + messages += exc.message!! + } + } + } + + fun load(pathToModel: String) { + viewModelScope.launch { + try { + llm.load(pathToModel) + messages += "Loaded $pathToModel" + } catch (exc: IllegalStateException) { + Log.e(tag, "load() failed", exc) + messages += exc.message!! + } + } + } + + fun updateMessage(newMessage: String) { + message = newMessage + } + + fun clear() { + messages = listOf() + } + + fun log(message: String) { + messages += message + } +} diff --git a/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Color.kt b/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Color.kt new file mode 100644 index 000000000..40c30e8d9 --- /dev/null +++ b/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Color.kt @@ -0,0 +1,11 @@ +package com.example.llama.ui.theme + +import androidx.compose.ui.graphics.Color + +val Purple80 = Color(0xFFD0BCFF) +val PurpleGrey80 = Color(0xFFCCC2DC) +val Pink80 = Color(0xFFEFB8C8) + +val Purple40 = Color(0xFF6650a4) +val PurpleGrey40 = Color(0xFF625b71) +val Pink40 = Color(0xFF7D5260) diff --git a/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Theme.kt b/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Theme.kt new file mode 100644 index 000000000..e742220a8 --- /dev/null +++ b/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Theme.kt @@ -0,0 +1,70 @@ +package com.example.llama.ui.theme + +import android.app.Activity +import android.os.Build +import androidx.compose.foundation.isSystemInDarkTheme +import androidx.compose.material3.MaterialTheme +import androidx.compose.material3.darkColorScheme +import androidx.compose.material3.dynamicDarkColorScheme +import androidx.compose.material3.dynamicLightColorScheme +import androidx.compose.material3.lightColorScheme +import androidx.compose.runtime.Composable +import androidx.compose.runtime.SideEffect +import androidx.compose.ui.graphics.toArgb +import androidx.compose.ui.platform.LocalContext +import androidx.compose.ui.platform.LocalView +import androidx.core.view.WindowCompat + +private val DarkColorScheme = darkColorScheme( + primary = Purple80, + secondary = PurpleGrey80, + tertiary = Pink80 +) + +private val LightColorScheme = lightColorScheme( + primary = Purple40, + secondary = PurpleGrey40, + tertiary = Pink40 + + /* Other default colors to override + background = Color(0xFFFFFBFE), + surface = Color(0xFFFFFBFE), + onPrimary = Color.White, + onSecondary = Color.White, + onTertiary = Color.White, + onBackground = Color(0xFF1C1B1F), + onSurface = Color(0xFF1C1B1F), + */ +) + +@Composable +fun LlamaAndroidTheme( + darkTheme: Boolean = isSystemInDarkTheme(), + // Dynamic color is available on Android 12+ + dynamicColor: Boolean = true, + content: @Composable () -> Unit +) { + val colorScheme = when { + dynamicColor && Build.VERSION.SDK_INT >= Build.VERSION_CODES.S -> { + val context = LocalContext.current + if (darkTheme) dynamicDarkColorScheme(context) else dynamicLightColorScheme(context) + } + + darkTheme -> DarkColorScheme + else -> LightColorScheme + } + val view = LocalView.current + if (!view.isInEditMode) { + SideEffect { + val window = (view.context as Activity).window + window.statusBarColor = colorScheme.primary.toArgb() + WindowCompat.getInsetsController(window, view).isAppearanceLightStatusBars = darkTheme + } + } + + MaterialTheme( + colorScheme = colorScheme, + typography = Typography, + content = content + ) +} diff --git a/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Type.kt b/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Type.kt new file mode 100644 index 000000000..0b87946ca --- /dev/null +++ b/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Type.kt @@ -0,0 +1,34 @@ +package com.example.llama.ui.theme + +import androidx.compose.material3.Typography +import androidx.compose.ui.text.TextStyle +import androidx.compose.ui.text.font.FontFamily +import androidx.compose.ui.text.font.FontWeight +import androidx.compose.ui.unit.sp + +// Set of Material typography styles to start with +val Typography = Typography( + bodyLarge = TextStyle( + fontFamily = FontFamily.Default, + fontWeight = FontWeight.Normal, + fontSize = 16.sp, + lineHeight = 24.sp, + letterSpacing = 0.5.sp + ) + /* Other default text styles to override + titleLarge = TextStyle( + fontFamily = FontFamily.Default, + fontWeight = FontWeight.Normal, + fontSize = 22.sp, + lineHeight = 28.sp, + letterSpacing = 0.sp + ), + labelSmall = TextStyle( + fontFamily = FontFamily.Default, + fontWeight = FontWeight.Medium, + fontSize = 11.sp, + lineHeight = 16.sp, + letterSpacing = 0.5.sp + ) + */ +) diff --git a/examples/llama.android/app/src/main/res/drawable/ic_launcher_background.xml b/examples/llama.android/app/src/main/res/drawable/ic_launcher_background.xml new file mode 100644 index 000000000..07d5da9cb --- /dev/null +++ b/examples/llama.android/app/src/main/res/drawable/ic_launcher_background.xml @@ -0,0 +1,170 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/examples/llama.android/app/src/main/res/drawable/ic_launcher_foreground.xml b/examples/llama.android/app/src/main/res/drawable/ic_launcher_foreground.xml new file mode 100644 index 000000000..7706ab9e6 --- /dev/null +++ b/examples/llama.android/app/src/main/res/drawable/ic_launcher_foreground.xml @@ -0,0 +1,30 @@ + + + + + + + + + + + diff --git a/examples/llama.android/app/src/main/res/mipmap-anydpi/ic_launcher.xml b/examples/llama.android/app/src/main/res/mipmap-anydpi/ic_launcher.xml new file mode 100644 index 000000000..b3e26b4c6 --- /dev/null +++ b/examples/llama.android/app/src/main/res/mipmap-anydpi/ic_launcher.xml @@ -0,0 +1,6 @@ + + + + + + diff --git a/examples/llama.android/app/src/main/res/mipmap-anydpi/ic_launcher_round.xml b/examples/llama.android/app/src/main/res/mipmap-anydpi/ic_launcher_round.xml new file mode 100644 index 000000000..b3e26b4c6 --- /dev/null +++ b/examples/llama.android/app/src/main/res/mipmap-anydpi/ic_launcher_round.xml @@ -0,0 +1,6 @@ + + + + + + diff --git a/examples/llama.android/app/src/main/res/mipmap-hdpi/ic_launcher.webp b/examples/llama.android/app/src/main/res/mipmap-hdpi/ic_launcher.webp new file mode 100644 index 000000000..c209e78ec Binary files /dev/null and b/examples/llama.android/app/src/main/res/mipmap-hdpi/ic_launcher.webp differ diff --git a/examples/llama.android/app/src/main/res/mipmap-hdpi/ic_launcher_round.webp b/examples/llama.android/app/src/main/res/mipmap-hdpi/ic_launcher_round.webp new file mode 100644 index 000000000..b2dfe3d1b Binary files /dev/null and b/examples/llama.android/app/src/main/res/mipmap-hdpi/ic_launcher_round.webp differ diff --git a/examples/llama.android/app/src/main/res/mipmap-mdpi/ic_launcher.webp b/examples/llama.android/app/src/main/res/mipmap-mdpi/ic_launcher.webp new file mode 100644 index 000000000..4f0f1d64e Binary files /dev/null and b/examples/llama.android/app/src/main/res/mipmap-mdpi/ic_launcher.webp differ diff --git a/examples/llama.android/app/src/main/res/mipmap-mdpi/ic_launcher_round.webp b/examples/llama.android/app/src/main/res/mipmap-mdpi/ic_launcher_round.webp new file mode 100644 index 000000000..62b611da0 Binary files /dev/null and b/examples/llama.android/app/src/main/res/mipmap-mdpi/ic_launcher_round.webp differ diff --git a/examples/llama.android/app/src/main/res/mipmap-xhdpi/ic_launcher.webp b/examples/llama.android/app/src/main/res/mipmap-xhdpi/ic_launcher.webp new file mode 100644 index 000000000..948a3070f Binary files /dev/null and b/examples/llama.android/app/src/main/res/mipmap-xhdpi/ic_launcher.webp differ diff --git a/examples/llama.android/app/src/main/res/mipmap-xhdpi/ic_launcher_round.webp b/examples/llama.android/app/src/main/res/mipmap-xhdpi/ic_launcher_round.webp new file mode 100644 index 000000000..1b9a6956b Binary files /dev/null and b/examples/llama.android/app/src/main/res/mipmap-xhdpi/ic_launcher_round.webp differ diff --git a/examples/llama.android/app/src/main/res/mipmap-xxhdpi/ic_launcher.webp b/examples/llama.android/app/src/main/res/mipmap-xxhdpi/ic_launcher.webp new file mode 100644 index 000000000..28d4b77f9 Binary files /dev/null and b/examples/llama.android/app/src/main/res/mipmap-xxhdpi/ic_launcher.webp differ diff --git a/examples/llama.android/app/src/main/res/mipmap-xxhdpi/ic_launcher_round.webp b/examples/llama.android/app/src/main/res/mipmap-xxhdpi/ic_launcher_round.webp new file mode 100644 index 000000000..9287f5083 Binary files /dev/null and b/examples/llama.android/app/src/main/res/mipmap-xxhdpi/ic_launcher_round.webp differ diff --git a/examples/llama.android/app/src/main/res/mipmap-xxxhdpi/ic_launcher.webp b/examples/llama.android/app/src/main/res/mipmap-xxxhdpi/ic_launcher.webp new file mode 100644 index 000000000..aa7d6427e Binary files /dev/null and b/examples/llama.android/app/src/main/res/mipmap-xxxhdpi/ic_launcher.webp differ diff --git a/examples/llama.android/app/src/main/res/mipmap-xxxhdpi/ic_launcher_round.webp b/examples/llama.android/app/src/main/res/mipmap-xxxhdpi/ic_launcher_round.webp new file mode 100644 index 000000000..9126ae37c Binary files /dev/null and b/examples/llama.android/app/src/main/res/mipmap-xxxhdpi/ic_launcher_round.webp differ diff --git a/examples/llama.android/app/src/main/res/values/colors.xml b/examples/llama.android/app/src/main/res/values/colors.xml new file mode 100644 index 000000000..ca1931bca --- /dev/null +++ b/examples/llama.android/app/src/main/res/values/colors.xml @@ -0,0 +1,10 @@ + + + #FFBB86FC + #FF6200EE + #FF3700B3 + #FF03DAC5 + #FF018786 + #FF000000 + #FFFFFFFF + diff --git a/examples/llama.android/app/src/main/res/values/strings.xml b/examples/llama.android/app/src/main/res/values/strings.xml new file mode 100644 index 000000000..7a9d314e2 --- /dev/null +++ b/examples/llama.android/app/src/main/res/values/strings.xml @@ -0,0 +1,3 @@ + + LlamaAndroid + diff --git a/examples/llama.android/app/src/main/res/values/themes.xml b/examples/llama.android/app/src/main/res/values/themes.xml new file mode 100644 index 000000000..8a24fda56 --- /dev/null +++ b/examples/llama.android/app/src/main/res/values/themes.xml @@ -0,0 +1,5 @@ + + + +