* llama : remove write/read of output ids/logits/embeddings This commit removes the write/read of output ids, logits and embeddings from the llama context state. Refs: https://github.com/ggml-org/llama.cpp/pull/18862#issuecomment-3756330941 * completion : add replaying of session state This commit updates the session handling in the completion tool to handle the fact that logits are no longer stored in the session file. Instead, we need to replay the last token to get the logits for sampling. * common : add common_prompt_batch_decode function This commit adds a new function which is responsible for decoding the prompt and optionally handling the saving of session data. * update save-state.cpp to use llama_state_load_file This commit updates the save-load-state example to utilize the new llama_state_load_file function for loading the model state from a file. It also replays the last token after loading, since the state is now stored before the last token is processed. * examples : set n_seq_max = 2 for ctx3 This commit updates the save-load-state example to set the n_seq_max parameter to 2 when initializing the ctx3 context. The motivation for this change is that when using 1 as n_parallel/n_seq_max the context only supports one sequence, but the test later tries to use a second sequence, which results in the following error: ```console main : loaded state with 4 tokens main : seq 0 copied, 225760 bytes main : kv cache cleared find_slot: seq_id=1 >= n_seq_max=1 Try using a bigger --parallel value state_read_meta: failed to find available cells in kv cache ``` This seems to only happen for recurrent/hybrid models.
1824 lines
61 KiB
C++
1824 lines
61 KiB
C++
#include "ggml.h"
|
|
#include "gguf.h"
|
|
|
|
#include "common.h"
|
|
#include "log.h"
|
|
#include "llama.h"
|
|
#include "sampling.h"
|
|
#include "unicode.h"
|
|
|
|
#include <algorithm>
|
|
#include <cinttypes>
|
|
#include <climits>
|
|
#include <cmath>
|
|
#include <chrono>
|
|
#include <cstdarg>
|
|
#include <cstring>
|
|
#include <ctime>
|
|
#include <filesystem>
|
|
#include <fstream>
|
|
#include <iostream>
|
|
#include <iterator>
|
|
#include <regex>
|
|
#include <sstream>
|
|
#include <string>
|
|
#include <thread>
|
|
#include <unordered_set>
|
|
#include <vector>
|
|
|
|
#if defined(__APPLE__) && defined(__MACH__)
|
|
#include <sys/types.h>
|
|
#include <sys/sysctl.h>
|
|
#endif
|
|
|
|
#if defined(_WIN32)
|
|
#define WIN32_LEAN_AND_MEAN
|
|
#ifndef NOMINMAX
|
|
# define NOMINMAX
|
|
#endif
|
|
#include <locale>
|
|
#include <windows.h>
|
|
#include <string.h>
|
|
#include <fcntl.h>
|
|
#include <io.h>
|
|
#else
|
|
#include <sys/ioctl.h>
|
|
#include <sys/stat.h>
|
|
#include <unistd.h>
|
|
#endif
|
|
|
|
#if defined(__linux__)
|
|
#include <sys/types.h>
|
|
#include <pwd.h>
|
|
#endif
|
|
|
|
#if defined(_MSC_VER)
|
|
#pragma warning(disable: 4244 4267) // possible loss of data
|
|
#endif
|
|
|
|
// Begins a scoped time measurement that accumulates into `t_acc`.
// When `disable` is true the start stamp is set to -1, which the destructor
// treats as "do not accumulate"; otherwise it is the current ggml_time_us().
common_time_meas::common_time_meas(int64_t & t_acc, bool disable)
    : t_start_us(disable ? -1 : ggml_time_us()),
      t_acc(t_acc) {
}
|
|
|
|
// Adds the ggml_time_us() delta since construction to the referenced accumulator.
// A negative start stamp means the measurement was disabled, so nothing is added.
common_time_meas::~common_time_meas() {
    if (t_start_us < 0) {
        return;
    }
    t_acc += ggml_time_us() - t_start_us;
}
|
|
|
|
//
|
|
// CPU utils
|
|
//
|
|
|
|
// Returns an estimate of the number of physical CPU cores.
//
// A platform-specific probe is tried first (sysfs on Linux, sysctl on macOS,
// GetLogicalProcessorInformationEx on Windows). If the probe yields nothing,
// the result is derived from std::thread::hardware_concurrency():
// the value itself when it is <= 4, half of it otherwise, and 4 when unknown.
int32_t cpu_get_num_physical_cores() {
#ifdef __linux__
    // every distinct thread_siblings line is counted as one core
    std::unordered_set<std::string> sibling_masks;
    for (uint32_t idx = 0; idx < UINT32_MAX; ++idx) {
        std::ifstream fin("/sys/devices/system/cpu/cpu"
            + std::to_string(idx) + "/topology/thread_siblings");
        if (!fin.is_open()) {
            break; // no more cpus
        }
        std::string mask_line;
        if (std::getline(fin, mask_line)) {
            sibling_masks.insert(mask_line);
        }
    }
    if (!sibling_masks.empty()) {
        return static_cast<int32_t>(sibling_masks.size());
    }
#elif defined(__APPLE__) && defined(__MACH__)
    int32_t n_cores;
    size_t  n_bytes = sizeof(n_cores);
    // try the performance-level-0 count first, then the overall physical count
    if (sysctlbyname("hw.perflevel0.physicalcpu", &n_cores, &n_bytes, NULL, 0) == 0) {
        return n_cores;
    }
    if (sysctlbyname("hw.physicalcpu", &n_cores, &n_bytes, NULL, 0) == 0) {
        return n_cores;
    }
#elif defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later
    // TODO: windows + arm64 + mingw64
    const unsigned int hw_threads_win = std::thread::hardware_concurrency();
    unsigned int fallback_win = 4;
    if (hw_threads_win > 0) {
        fallback_win = hw_threads_win <= 4 ? hw_threads_win : hw_threads_win / 2;
    }

    // first call only queries the required buffer size
    DWORD bytes_left = 0;
    if (!GetLogicalProcessorInformationEx(RelationProcessorCore, nullptr, &bytes_left) &&
        GetLastError() != ERROR_INSUFFICIENT_BUFFER) {
        return fallback_win;
    }

    std::vector<char> storage(bytes_left);
    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX entry =
        reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(storage.data());
    if (!GetLogicalProcessorInformationEx(RelationProcessorCore, entry, &bytes_left)) {
        return fallback_win;
    }

    // walk the variable-sized records
    int32_t n_cores = 0;
    while (bytes_left > 0) {
        if (entry->Relationship == RelationProcessorCore) {
            n_cores += entry->Processor.GroupCount;
        }
        bytes_left -= entry->Size;
        entry = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(reinterpret_cast<char*>(entry) + entry->Size);
    }

    return n_cores > 0 ? n_cores : fallback_win;
#endif
    const unsigned int hw_threads = std::thread::hardware_concurrency();
    if (hw_threads == 0) {
        return 4;
    }
    return hw_threads <= 4 ? hw_threads : hw_threads / 2;
}
|
|
|
|
#if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
|
|
#include <pthread.h>
|
|
|
|
// Runs the CPUID instruction with `leaf` in eax and `subleaf` in ecx and stores
// the four result registers through the output pointers.
// rbx is moved into rsi before the instruction and swapped back afterwards, so
// the ebx result is read out of rsi.
static void cpuid(unsigned leaf, unsigned subleaf,
                  unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx) {
    __asm__("movq\t%%rbx,%%rsi\n\t"
            "cpuid\n\t"
            "xchgq\t%%rbx,%%rsi"
            : "=a"(*eax), "=S"(*ebx), "=c"(*ecx), "=d"(*edx)
            : "0"(leaf), "2"(subleaf));
}
|
|
|
|
// Restricts the calling thread's affinity to the single CPU `cpu`.
// Returns the result of pthread_setaffinity_np (0 on success).
static int pin_cpu(int cpu) {
    cpu_set_t only_this_cpu;
    CPU_ZERO(&only_this_cpu);
    CPU_SET(cpu, &only_this_cpu);

    const pthread_t self = pthread_self();
    return pthread_setaffinity_np(self, sizeof(only_this_cpu), &only_this_cpu);
}
|
|
|
|
// Reports whether bit 15 of EDX is set in CPUID leaf 7, subleaf 0.
static bool is_hybrid_cpu(void) {
    unsigned eax, ebx, ecx, edx;
    cpuid(7, 0, &eax, &ebx, &ecx, &edx);

    const unsigned hybrid_bit = 1u << 15;
    return (edx & hybrid_bit) != 0;
}
|
|
|
|
// Queries CPUID leaf 0x1a on the core the calling thread is currently running on
// and compares the top byte of EAX (the core type) against the Intel Atom id 0x20.
static bool is_running_on_efficiency_core(void) {
    unsigned eax, ebx, ecx, edx;
    cpuid(0x1a, 0, &eax, &ebx, &ecx, &edx);

    const unsigned core_type_atom = 0x20;
    const unsigned core_type      = eax >> 24; // bits 31..24
    return core_type == core_type_atom;
}
|
|
|
|
static int cpu_count_math_cpus(int n_cpu) {
|
|
int result = 0;
|
|
for (int cpu = 0; cpu < n_cpu; ++cpu) {
|
|
if (pin_cpu(cpu)) {
|
|
return -1;
|
|
}
|
|
if (is_running_on_efficiency_core()) {
|
|
continue; // efficiency cores harm lockstep threading
|
|
}
|
|
++cpu; // hyperthreading isn't useful for linear algebra
|
|
++result;
|
|
}
|
|
return result;
|
|
}
|
|
|
|
#endif // __x86_64__ && __linux__
|
|
|
|
/**
|
|
* Returns number of CPUs on system that are useful for math.
|
|
*/
|
|
int32_t cpu_get_num_math() {
|
|
#if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
|
|
int n_cpu = sysconf(_SC_NPROCESSORS_ONLN);
|
|
if (n_cpu < 1) {
|
|
return cpu_get_num_physical_cores();
|
|
}
|
|
if (is_hybrid_cpu()) {
|
|
cpu_set_t affinity;
|
|
if (!pthread_getaffinity_np(pthread_self(), sizeof(affinity), &affinity)) {
|
|
int result = cpu_count_math_cpus(n_cpu);
|
|
pthread_setaffinity_np(pthread_self(), sizeof(affinity), &affinity);
|
|
if (result > 0) {
|
|
return result;
|
|
}
|
|
}
|
|
}
|
|
#endif
|
|
return cpu_get_num_physical_cores();
|
|
}
|
|
|
|
// Helper for setting process priority
|
|
|
|
#if defined(_WIN32)
|
|
|
|
bool set_process_priority(enum ggml_sched_priority prio) {
|
|
if (prio == GGML_SCHED_PRIO_NORMAL) {
|
|
return true;
|
|
}
|
|
|
|
DWORD p = NORMAL_PRIORITY_CLASS;
|
|
switch (prio) {
|
|
case GGML_SCHED_PRIO_LOW: p = BELOW_NORMAL_PRIORITY_CLASS; break;
|
|
case GGML_SCHED_PRIO_NORMAL: p = NORMAL_PRIORITY_CLASS; break;
|
|
case GGML_SCHED_PRIO_MEDIUM: p = ABOVE_NORMAL_PRIORITY_CLASS; break;
|
|
case GGML_SCHED_PRIO_HIGH: p = HIGH_PRIORITY_CLASS; break;
|
|
case GGML_SCHED_PRIO_REALTIME: p = REALTIME_PRIORITY_CLASS; break;
|
|
}
|
|
|
|
if (!SetPriorityClass(GetCurrentProcess(), p)) {
|
|
LOG_WRN("failed to set process priority class %d : (%d)\n", prio, (int) GetLastError());
|
|
return false;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
#else // MacOS and POSIX
|
|
#include <sys/types.h>
|
|
#include <sys/resource.h>
|
|
|
|
bool set_process_priority(enum ggml_sched_priority prio) {
|
|
if (prio == GGML_SCHED_PRIO_NORMAL) {
|
|
return true;
|
|
}
|
|
|
|
int p = 0;
|
|
switch (prio) {
|
|
case GGML_SCHED_PRIO_LOW: p = 5; break;
|
|
case GGML_SCHED_PRIO_NORMAL: p = 0; break;
|
|
case GGML_SCHED_PRIO_MEDIUM: p = -5; break;
|
|
case GGML_SCHED_PRIO_HIGH: p = -10; break;
|
|
case GGML_SCHED_PRIO_REALTIME: p = -20; break;
|
|
}
|
|
|
|
if (setpriority(PRIO_PROCESS, 0, p) != 0) {
|
|
LOG_WRN("failed to set process priority %d : %s (%d)\n", prio, strerror(errno), errno);
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
#endif
|
|
|
|
//
|
|
// CLI argument parsing
|
|
//
|
|
|
|
|
|
// Fills in defaults for `cpuparams` and warns about an undersized CPU mask.
// A negative n_threads marks the whole struct as unset: it is then copied from
// `role_model` when one is given, otherwise n_threads becomes cpu_get_num_math().
void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model) {
    if (cpuparams.n_threads < 0) {
        // Assuming everything about cpuparams is invalid
        if (role_model == nullptr) {
            cpuparams.n_threads = cpu_get_num_math();
        } else {
            cpuparams = *role_model;
        }
    }

    // count the CPUs enabled in the mask
    int32_t n_mask_bits = 0;
    for (int32_t idx = 0; idx < GGML_MAX_N_THREADS; ++idx) {
        n_mask_bits += cpuparams.cpumask[idx] ? 1 : 0;
    }

    // an all-zero mask is not checked against the thread count
    if (n_mask_bits != 0 && n_mask_bits < cpuparams.n_threads) {
        // Not enough set bits, may experience performance issues.
        LOG_WRN("Not enough set bits in CPU mask (%d) to satisfy requested thread count: %d\n", n_mask_bits, cpuparams.n_threads);
    }
}
|
|
|
|
bool parse_cpu_range(const std::string & range, bool (&boolmask)[GGML_MAX_N_THREADS]) {
|
|
size_t dash_loc = range.find('-');
|
|
if (dash_loc == std::string::npos) {
|
|
LOG_ERR("Format of CPU range is invalid! Expected [<start>]-[<end>].\n");
|
|
return false;
|
|
}
|
|
|
|
size_t start_i;
|
|
size_t end_i;
|
|
|
|
if (dash_loc == 0) {
|
|
start_i = 0;
|
|
} else {
|
|
start_i = std::stoull(range.substr(0, dash_loc));
|
|
if (start_i >= GGML_MAX_N_THREADS) {
|
|
LOG_ERR("Start index out of bounds!\n");
|
|
return false;
|
|
}
|
|
}
|
|
|
|
if (dash_loc == range.length() - 1) {
|
|
end_i = GGML_MAX_N_THREADS - 1;
|
|
} else {
|
|
end_i = std::stoull(range.substr(dash_loc + 1));
|
|
if (end_i >= GGML_MAX_N_THREADS) {
|
|
LOG_ERR("End index out of bounds!\n");
|
|
return false;
|
|
}
|
|
}
|
|
|
|
for (size_t i = start_i; i <= end_i; i++) {
|
|
boolmask[i] = true;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREADS]) {
|
|
// Discard potential 0x prefix
|
|
size_t start_i = 0;
|
|
if (mask.length() >= 2 && mask.substr(0, 2) == "0x") {
|
|
start_i = 2;
|
|
}
|
|
|
|
size_t num_digits = mask.length() - start_i;
|
|
if (num_digits > 128) num_digits = 128;
|
|
|
|
size_t end_i = num_digits + start_i;
|
|
|
|
for (size_t i = start_i, n = (num_digits*4 - 1); i < end_i; i++, n-=4) {
|
|
char c = mask.at(i);
|
|
int8_t id = c;
|
|
|
|
if ((c >= '0' && c <= '9')) {
|
|
id -= '0';
|
|
} else if (c >= 'a' && c <= 'f') {
|
|
id -= 'a' - 10;
|
|
} else if (c >= 'A' && c <= 'F') {
|
|
id -= 'A' - 10;
|
|
} else {
|
|
LOG_ERR("Invalid hex character '%c' at position %d\n", c, int32_t(i));
|
|
return false;
|
|
}
|
|
|
|
boolmask[ n ] = boolmask[ n ] || ((id & 8) != 0);
|
|
boolmask[n - 1] = boolmask[n - 1] || ((id & 4) != 0);
|
|
boolmask[n - 2] = boolmask[n - 2] || ((id & 2) != 0);
|
|
boolmask[n - 3] = boolmask[n - 3] || ((id & 1) != 0);
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
// Installs the default common log callback for llama and logs the build information.
void common_init() {
    llama_log_set(common_log_default_callback, NULL);

    // suffix appended to the build line when NDEBUG is not defined
    const char * build_type =
#ifdef NDEBUG
        "";
#else
        " (debug)";
#endif

    LOG_INF("build: %d (%s) with %s for %s%s\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET, build_type);
}
|
|
|
|
std::string common_params_get_system_info(const common_params & params) {
|
|
std::ostringstream os;
|
|
|
|
os << "system_info: n_threads = " << params.cpuparams.n_threads;
|
|
if (params.cpuparams_batch.n_threads != -1) {
|
|
os << " (n_threads_batch = " << params.cpuparams_batch.n_threads << ")";
|
|
}
|
|
#if defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later
|
|
// TODO: windows + arm64 + mingw64
|
|
DWORD logicalProcessorCount = GetActiveProcessorCount(ALL_PROCESSOR_GROUPS);
|
|
os << " / " << logicalProcessorCount << " | " << llama_print_system_info();
|
|
#else
|
|
os << " / " << std::thread::hardware_concurrency() << " | " << llama_print_system_info();
|
|
#endif
|
|
|
|
return os.str();
|
|
}
|
|
|
|
//
|
|
// String utils
|
|
//
|
|
|
|
// printf-style formatting into a std::string.
// The arguments are consumed twice: once to measure the output, once to write it.
std::string string_format(const char * fmt, ...) {
    va_list args_measure;
    va_list args_write;
    va_start(args_measure, fmt);
    va_copy(args_write, args_measure);

    // first pass: number of characters needed, excluding the terminator
    const int n_chars = vsnprintf(NULL, 0, fmt, args_measure);
    GGML_ASSERT(n_chars >= 0 && n_chars < INT_MAX); // NOLINT

    // second pass: format into a buffer with room for the terminator
    std::vector<char> storage(n_chars + 1);
    const int n_written = vsnprintf(storage.data(), n_chars + 1, fmt, args_write);
    GGML_ASSERT(n_written == n_chars);

    va_end(args_write);
    va_end(args_measure);

    return std::string(storage.data(), n_chars);
}
|
|
|
|
// Returns a copy of `str` with leading and trailing whitespace (as classified by
// std::isspace) removed. Interior whitespace is kept.
std::string string_strip(const std::string & str) {
    // std::isspace has undefined behaviour for negative values other than EOF, so
    // bytes >= 0x80 (e.g. UTF-8 continuation bytes) must go through unsigned char
    const auto is_space = [](char ch) {
        return std::isspace(static_cast<unsigned char>(ch)) != 0;
    };

    size_t start = 0;
    size_t end   = str.size();
    while (start < end && is_space(str[start])) {
        start++;
    }
    while (end > start && is_space(str[end - 1])) {
        end--;
    }
    return str.substr(start, end - start);
}
|
|
|
|
// Returns the current local time as "YYYY_MM_DD-hh_mm_ss.nnnnnnnnn", where the
// part after the dot is the sub-second fraction as 9 zero-padded digits.
std::string string_get_sortable_timestamp() {
    using clock = std::chrono::system_clock;

    const clock::time_point current_time = clock::now();
    const time_t as_time_t = clock::to_time_t(current_time);
    char timestamp_no_ns[100];
    std::strftime(timestamp_no_ns, 100, "%Y_%m_%d-%H_%M_%S", std::localtime(&as_time_t));

    // Convert to nanoseconds BEFORE taking the remainder: the native tick of
    // system_clock is implementation-defined (e.g. microseconds or 100 ns), so a
    // remainder taken on raw ticks would not be the sub-second part.
    const auto ns_since_epoch = std::chrono::duration_cast<std::chrono::nanoseconds>(
        current_time.time_since_epoch());
    const int64_t ns = (ns_since_epoch % std::chrono::seconds(1)).count();
    char timestamp_ns[11];
    snprintf(timestamp_ns, 11, "%09" PRId64, ns);

    return std::string(timestamp_no_ns) + "." + std::string(timestamp_ns);
}
|
|
|
|
// Replaces every non-overlapping occurrence of `search` in `s` with `replace`,
// scanning left to right. Text inserted by a replacement is never rescanned.
// An empty `search` leaves `s` untouched.
void string_replace_all(std::string & s, const std::string & search, const std::string & replace) {
    if (search.empty()) {
        return;
    }

    std::string out;
    out.reserve(s.length());

    size_t cursor = 0; // start of the text not yet copied to `out`
    for (size_t hit = s.find(search, cursor);
         hit != std::string::npos;
         hit = s.find(search, cursor)) {
        out.append(s, cursor, hit - cursor);
        out += replace;
        cursor = hit + search.length();
    }

    // copy whatever follows the last match
    out.append(s, cursor, std::string::npos);
    s = std::move(out);
}
|
|
|
|
// Returns `s` with a backslash inserted in front of each of the characters
// . ^ $ | ( ) * + ? [ ] { } and backslash.
std::string regex_escape(const std::string & s) {
    static const std::regex meta_chars("[.^$|()*+?\\[\\]{}\\\\]");
    // "$&" expands to the matched character itself
    static const char * const backslash_then_match = "\\$&";

    return std::regex_replace(s, meta_chars, backslash_then_match);
}
|
|
|
|
// Concatenates `values`, inserting `separator` between consecutive elements.
// An empty input yields an empty string.
std::string string_join(const std::vector<std::string> & values, const std::string & separator) {
    std::ostringstream joined;

    bool need_separator = false;
    for (const std::string & value : values) {
        if (need_separator) {
            joined << separator;
        }
        joined << value;
        need_separator = true;
    }

    return joined.str();
}
|
|
|
|
// Splits `str` at every occurrence of `delimiter`.
// Empty parts are kept, so the result always has (number of delimiters + 1)
// elements. An empty `delimiter` cannot split anything and yields `str` as the
// single part.
std::vector<std::string> string_split(const std::string & str, const std::string & delimiter) {
    std::vector<std::string> parts;

    // find("") matches at every position without advancing, which would loop
    // forever while growing `parts`
    if (delimiter.empty()) {
        parts.push_back(str);
        return parts;
    }

    size_t start = 0;
    size_t end   = str.find(delimiter);

    while (end != std::string::npos) {
        parts.push_back(str.substr(start, end - start));
        start = end + delimiter.length();
        end   = str.find(delimiter, start);
    }

    // text after the last delimiter (possibly empty)
    parts.push_back(str.substr(start));

    return parts;
}
|
|
|
|
// Returns `str` concatenated with itself `n` times (empty when n is 0).
std::string string_repeat(const std::string & str, size_t n) {
    std::string repeated;
    if (n == 0) {
        return repeated;
    }

    // allocate the final size once
    repeated.reserve(str.length() * n);
    for (size_t remaining = n; remaining > 0; --remaining) {
        repeated.append(str);
    }

    return repeated;
}
|
|
|
|
// Converts a bool to the text "true" or "false".
std::string string_from(bool value) {
    if (value) {
        return "true";
    }
    return "false";
}
|
|
|
|
// Formats a list of ints as "[ a, b, c ]". An empty list yields "[  ]".
std::string string_from(const std::vector<int> & values) {
    std::stringstream out;

    out << "[ ";
    for (size_t idx = 0; idx < values.size(); ++idx) {
        if (idx != 0) {
            out << ", ";
        }
        out << std::to_string(values[idx]);
    }
    out << " ]";

    return out.str();
}
|
|
|
|
std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens) {
|
|
std::stringstream buf;
|
|
|
|
buf << "[ ";
|
|
|
|
bool first = true;
|
|
for (const auto & token : tokens) {
|
|
if (!first) {
|
|
buf << ", ";
|
|
} else {
|
|
first = false;
|
|
}
|
|
|
|
auto detokenized = common_token_to_piece(ctx, token);
|
|
|
|
buf << "'" << detokenized << "'"
|
|
<< ":" << std::to_string(token);
|
|
}
|
|
|
|
buf << " ]";
|
|
|
|
return buf.str();
|
|
}
|
|
|
|
std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch) {
|
|
std::stringstream buf;
|
|
|
|
buf << "[ ";
|
|
|
|
bool first = true;
|
|
for (int i = 0; i < batch.n_tokens; ++i) {
|
|
if (!first) {
|
|
buf << ", ";
|
|
} else {
|
|
first = false;
|
|
}
|
|
|
|
auto detokenized = common_token_to_piece(ctx, batch.token[i]);
|
|
|
|
buf << "\n" << std::to_string(i)
|
|
<< ", token '" << detokenized << "'"
|
|
<< ", pos " << std::to_string(batch.pos[i])
|
|
<< ", n_seq_id " << std::to_string(batch.n_seq_id[i])
|
|
<< ", seq_id " << std::to_string(batch.seq_id[i][0])
|
|
<< ", logits " << std::to_string(batch.logits[i]);
|
|
}
|
|
|
|
buf << " ]";
|
|
|
|
return buf.str();
|
|
}
|
|
|
|
// Decodes backslash escape sequences in `input` in place.
//
// Recognised: \n \r \t \' \" \\ and \xHH (exactly two hex digits).
// Anything else, including a malformed \x sequence, is kept verbatim together
// with its backslash. A lone backslash at the very end is kept as is.
void string_process_escapes(std::string & input) {
    std::size_t input_len = input.length();
    std::size_t output_idx = 0; // never ahead of input_idx, so in-place writing is safe

    for (std::size_t input_idx = 0; input_idx < input_len; ++input_idx) {
        if (input[input_idx] == '\\' && input_idx + 1 < input_len) {
            switch (input[++input_idx]) {
                case 'n':  input[output_idx++] = '\n'; break;
                case 'r':  input[output_idx++] = '\r'; break;
                case 't':  input[output_idx++] = '\t'; break;
                case '\'': input[output_idx++] = '\''; break;
                case '\"': input[output_idx++] = '\"'; break;
                case '\\': input[output_idx++] = '\\'; break;
                case 'x':
                    // Handle \x12, etc
                    if (input_idx + 2 < input_len) {
                        const unsigned char hi = static_cast<unsigned char>(input[input_idx + 1]);
                        const unsigned char lo = static_cast<unsigned char>(input[input_idx + 2]);
                        // Both characters must be hex digits. Checking this up front
                        // (rather than relying on strtol alone) rejects input such as
                        // "\x-1" or "\x 1", because strtol accepts a sign and skips
                        // leading whitespace.
                        if (std::isxdigit(hi) && std::isxdigit(lo)) {
                            const char x[3] = { input[input_idx + 1], input[input_idx + 2], 0 };
                            const long val = std::strtol(x, nullptr, 16);
                            input_idx += 2;
                            input[output_idx++] = char(val);
                            break;
                        }
                    }
                    // fall through
                default:
                    // unknown escape: keep the backslash and the character after it
                    input[output_idx++] = '\\';
                    input[output_idx++] = input[input_idx];
                    break;
            }
        } else {
            input[output_idx++] = input[input_idx];
        }
    }

    input.resize(output_idx);
}
|
|
|
|
bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides) {
|
|
const char * sep = strchr(data, '=');
|
|
if (sep == nullptr || sep - data >= 128) {
|
|
LOG_ERR("%s: malformed KV override '%s'\n", __func__, data);
|
|
return false;
|
|
}
|
|
llama_model_kv_override kvo;
|
|
std::strncpy(kvo.key, data, sep - data);
|
|
kvo.key[sep - data] = 0;
|
|
sep++;
|
|
if (strncmp(sep, "int:", 4) == 0) {
|
|
sep += 4;
|
|
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
|
|
kvo.val_i64 = std::atol(sep);
|
|
} else if (strncmp(sep, "float:", 6) == 0) {
|
|
sep += 6;
|
|
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
|
|
kvo.val_f64 = std::atof(sep);
|
|
} else if (strncmp(sep, "bool:", 5) == 0) {
|
|
sep += 5;
|
|
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
|
|
if (std::strcmp(sep, "true") == 0) {
|
|
kvo.val_bool = true;
|
|
} else if (std::strcmp(sep, "false") == 0) {
|
|
kvo.val_bool = false;
|
|
} else {
|
|
LOG_ERR("%s: invalid boolean value for KV override '%s'\n", __func__, data);
|
|
return false;
|
|
}
|
|
} else if (strncmp(sep, "str:", 4) == 0) {
|
|
sep += 4;
|
|
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
|
|
if (strlen(sep) > 127) {
|
|
LOG_ERR("%s: malformed KV override '%s', value cannot exceed 127 chars\n", __func__, data);
|
|
return false;
|
|
}
|
|
strncpy(kvo.val_str, sep, 127);
|
|
kvo.val_str[127] = '\0';
|
|
} else {
|
|
LOG_ERR("%s: invalid type for KV override '%s'\n", __func__, data);
|
|
return false;
|
|
}
|
|
overrides.emplace_back(std::move(kvo));
|
|
return true;
|
|
}
|
|
|
|
//
|
|
// Filesystem utils
|
|
//
|
|
|
|
// Validate if a filename is safe to use
|
|
// To validate a full path, split the path by the OS-specific path separator, and validate each part with this function
|
|
bool fs_validate_filename(const std::string & filename, bool allow_subdirs) {
|
|
if (!filename.length()) {
|
|
// Empty filename invalid
|
|
return false;
|
|
}
|
|
if (filename.length() > 255) {
|
|
// Limit at common largest possible filename on Linux filesystems
|
|
// to avoid unnecessary further validation
|
|
// (On systems with smaller limits it will be caught by the OS)
|
|
return false;
|
|
}
|
|
|
|
size_t offset = 0;
|
|
while (offset < filename.size()) {
|
|
utf8_parse_result result = parse_utf8_codepoint(filename, offset);
|
|
|
|
if (result.status != utf8_parse_result::SUCCESS) {
|
|
return false;
|
|
}
|
|
uint32_t c = result.codepoint;
|
|
|
|
if ((result.bytes_consumed == 2 && c < 0x80) ||
|
|
(result.bytes_consumed == 3 && c < 0x800) ||
|
|
(result.bytes_consumed == 4 && c < 0x10000)) {
|
|
return false;
|
|
}
|
|
|
|
// Check for forbidden codepoints:
|
|
// - Control characters
|
|
// - Unicode equivalents of illegal characters
|
|
// - UTF-16 surrogate pairs
|
|
// - UTF-8 replacement character
|
|
// - Byte order mark (BOM)
|
|
// - Illegal characters: / \ : * ? " < > |
|
|
if (c <= 0x1F // Control characters (C0)
|
|
|| c == 0x7F // Control characters (DEL)
|
|
|| (c >= 0x80 && c <= 0x9F) // Control characters (C1)
|
|
|| c == 0xFF0E // Fullwidth Full Stop (period equivalent)
|
|
|| c == 0x2215 // Division Slash (forward slash equivalent)
|
|
|| c == 0x2216 // Set Minus (backslash equivalent)
|
|
|| (c >= 0xD800 && c <= 0xDFFF) // UTF-16 surrogate pairs
|
|
|| c > 0x10FFFF // Max Unicode limit
|
|
|| c == 0xFFFD // Replacement Character (UTF-8)
|
|
|| c == 0xFEFF // Byte Order Mark (BOM)
|
|
|| c == ':' || c == '*' // Illegal characters
|
|
|| c == '?' || c == '"' || c == '<' || c == '>' || c == '|') {
|
|
return false;
|
|
}
|
|
if (!allow_subdirs && (c == '/' || c == '\\')) {
|
|
// Subdirectories not allowed, reject path separators
|
|
return false;
|
|
}
|
|
offset += result.bytes_consumed;
|
|
}
|
|
|
|
// Reject any leading or trailing ' ', or any trailing '.', these are stripped on Windows and will cause a different filename
|
|
// Unicode and other whitespace is not affected, only 0x20 space
|
|
if (filename.front() == ' ' || filename.back() == ' ' || filename.back() == '.') {
|
|
return false;
|
|
}
|
|
|
|
// Reject any ".." (currently stricter than necessary, it should be fine to just check for == ".." instead)
|
|
if (filename.find("..") != std::string::npos) {
|
|
return false;
|
|
}
|
|
|
|
// Reject "."
|
|
if (filename == ".") {
|
|
return false;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
#include <iostream>
|
|
|
|
|
|
#ifdef _WIN32
|
|
// Converts a UTF-8 string to a wide string with MultiByteToWideChar.
// Returns an empty wide string for empty input or when the size query
// reports a non-positive length.
static std::wstring utf8_to_wstring(const std::string & str) {
    std::wstring wide;
    if (str.empty()) {
        return wide;
    }

    const int n_bytes = (int)str.size();

    // first call: how many wide characters are needed
    const int n_wchars = MultiByteToWideChar(CP_UTF8, 0, str.c_str(), n_bytes, NULL, 0);
    if (n_wchars <= 0) {
        return wide;
    }

    // second call: perform the conversion into the sized buffer
    wide.resize(n_wchars, 0);
    MultiByteToWideChar(CP_UTF8, 0, str.c_str(), n_bytes, &wide[0], n_wchars);

    return wide;
}
|
|
#endif
|
|
|
|
// Creates the directories named by `path`, front to back.
// returns true if successful, false otherwise
//
// Only the components that are followed by a separator ('\\' on Windows, '/'
// elsewhere) are created, so pass a path with a trailing separator to have its
// last component created as well.
// If `path` already exists, the result says whether it is a directory.
bool fs_create_directory_with_parents(const std::string & path) {
#ifdef _WIN32
    std::wstring wpath = utf8_to_wstring(path);

    // if the path already exists, check whether it's a directory
    const DWORD attributes = GetFileAttributesW(wpath.c_str());
    if ((attributes != INVALID_FILE_ATTRIBUTES) && (attributes & FILE_ATTRIBUTE_DIRECTORY)) {
        return true;
    }

    size_t pos_slash = 0;

    // process path from front to back, procedurally creating directories
    // NOTE: the separator is searched in the wide string, because offsets into
    //       the UTF-8 `path` do not match offsets into `wpath` once the path
    //       contains non-ASCII characters
    while ((pos_slash = wpath.find(L'\\', pos_slash)) != std::wstring::npos) {
        const std::wstring subpath = wpath.substr(0, pos_slash);

        pos_slash += 1;

        // skip the drive letter, in some systems it can return an access denied error
        if (subpath.length() == 2 && subpath[1] == ':') {
            continue;
        }

        const bool success = CreateDirectoryW(subpath.c_str(), NULL);

        if (!success) {
            const DWORD error = GetLastError();

            // if the path already exists, ensure that it's a directory
            if (error == ERROR_ALREADY_EXISTS) {
                const DWORD sub_attributes = GetFileAttributesW(subpath.c_str());
                if (sub_attributes == INVALID_FILE_ATTRIBUTES || !(sub_attributes & FILE_ATTRIBUTE_DIRECTORY)) {
                    return false;
                }
            } else {
                return false;
            }
        }
    }

    return true;
#else
    // if the path already exists, check whether it's a directory
    struct stat info;
    if (stat(path.c_str(), &info) == 0) {
        return S_ISDIR(info.st_mode);
    }

    size_t pos_slash = 1; // skip leading slashes for directory creation

    // process path from front to back, procedurally creating directories
    while ((pos_slash = path.find('/', pos_slash)) != std::string::npos) {
        const std::string subpath = path.substr(0, pos_slash);
        struct stat sub_info;

        // if the path already exists, ensure that it's a directory
        if (stat(subpath.c_str(), &sub_info) == 0) {
            if (!S_ISDIR(sub_info.st_mode)) {
                return false;
            }
        } else {
            // create parent directories
            const int ret = mkdir(subpath.c_str(), 0755);
            if (ret != 0) {
                return false;
            }
        }

        pos_slash += 1;
    }

    return true;
#endif // _WIN32
}
|
|
|
|
// Returns true when `path` exists and refers to a directory.
bool fs_is_directory(const std::string & path) {
    const std::filesystem::path candidate(path);
    if (!std::filesystem::exists(candidate)) {
        return false;
    }
    return std::filesystem::is_directory(candidate);
}
|
|
|
|
std::string fs_get_cache_directory() {
|
|
std::string cache_directory = "";
|
|
auto ensure_trailing_slash = [](std::string p) {
|
|
// Make sure to add trailing slash
|
|
if (p.back() != DIRECTORY_SEPARATOR) {
|
|
p += DIRECTORY_SEPARATOR;
|
|
}
|
|
return p;
|
|
};
|
|
if (getenv("LLAMA_CACHE")) {
|
|
cache_directory = std::getenv("LLAMA_CACHE");
|
|
} else {
|
|
#if defined(__linux__) || defined(__FreeBSD__) || defined(_AIX) || \
|
|
defined(__OpenBSD__) || defined(__NetBSD__)
|
|
if (std::getenv("XDG_CACHE_HOME")) {
|
|
cache_directory = std::getenv("XDG_CACHE_HOME");
|
|
} else if (std::getenv("HOME")) {
|
|
cache_directory = std::getenv("HOME") + std::string("/.cache/");
|
|
} else {
|
|
#if defined(__linux__)
|
|
/* no $HOME is defined, fallback to getpwuid */
|
|
struct passwd *pw = getpwuid(getuid());
|
|
if ((!pw) || (!pw->pw_dir)) {
|
|
throw std::runtime_error("Failed to find $HOME directory");
|
|
}
|
|
|
|
cache_directory = std::string(pw->pw_dir) + std::string("/.cache/");
|
|
#else /* defined(__linux__) */
|
|
throw std::runtime_error("Failed to find $HOME directory");
|
|
#endif /* defined(__linux__) */
|
|
}
|
|
#elif defined(__APPLE__)
|
|
cache_directory = std::getenv("HOME") + std::string("/Library/Caches/");
|
|
#elif defined(_WIN32)
|
|
cache_directory = std::getenv("LOCALAPPDATA");
|
|
#elif defined(__EMSCRIPTEN__)
|
|
GGML_ABORT("not implemented on this platform");
|
|
#else
|
|
# error Unknown architecture
|
|
#endif
|
|
cache_directory = ensure_trailing_slash(cache_directory);
|
|
cache_directory += "llama.cpp";
|
|
}
|
|
return ensure_trailing_slash(cache_directory);
|
|
}
|
|
|
|
std::string fs_get_cache_file(const std::string & filename) {
|
|
GGML_ASSERT(filename.find(DIRECTORY_SEPARATOR) == std::string::npos);
|
|
std::string cache_directory = fs_get_cache_directory();
|
|
const bool success = fs_create_directory_with_parents(cache_directory);
|
|
if (!success) {
|
|
throw std::runtime_error("failed to create cache directory: " + cache_directory);
|
|
}
|
|
return cache_directory + filename;
|
|
}
|
|
|
|
std::vector<common_file_info> fs_list(const std::string & path, bool include_directories) {
|
|
std::vector<common_file_info> files;
|
|
if (path.empty()) return files;
|
|
|
|
std::filesystem::path dir(path);
|
|
if (!std::filesystem::exists(dir) || !std::filesystem::is_directory(dir)) {
|
|
return files;
|
|
}
|
|
|
|
for (const auto & entry : std::filesystem::directory_iterator(dir)) {
|
|
try {
|
|
// Only include regular files (skip directories)
|
|
const auto & p = entry.path();
|
|
if (std::filesystem::is_regular_file(p)) {
|
|
common_file_info info;
|
|
info.path = p.string();
|
|
info.name = p.filename().string();
|
|
info.is_dir = false;
|
|
try {
|
|
info.size = static_cast<size_t>(std::filesystem::file_size(p));
|
|
} catch (const std::filesystem::filesystem_error &) {
|
|
info.size = 0;
|
|
}
|
|
files.push_back(std::move(info));
|
|
} else if (include_directories && std::filesystem::is_directory(p)) {
|
|
common_file_info info;
|
|
info.path = p.string();
|
|
info.name = p.filename().string();
|
|
info.size = 0; // Directories have no size
|
|
info.is_dir = true;
|
|
files.push_back(std::move(info));
|
|
}
|
|
} catch (const std::filesystem::filesystem_error &) {
|
|
// skip entries we cannot inspect
|
|
continue;
|
|
}
|
|
}
|
|
|
|
return files;
|
|
}
|
|
|
|
//
|
|
// TTY utils
|
|
//
|
|
|
|
// Decides whether colored output may be used.
// Returns false when NO_COLOR is set to a non-empty value or TERM is "dumb";
// otherwise returns true if stdout or stderr is attached to a terminal.
bool tty_can_use_colors() {
    // Check NO_COLOR environment variable (https://no-color.org/)
    const char * no_color = std::getenv("NO_COLOR");
    if (no_color != nullptr && no_color[0] != '\0') {
        return false;
    }

    // Check TERM environment variable
    const char * term = std::getenv("TERM");
    if (term != nullptr && std::strcmp(term, "dumb") == 0) {
        return false;
    }

    // Check if stdout and stderr are connected to a terminal
    // We check both because log messages can go to either
    const bool out_is_tty = isatty(fileno(stdout));
    const bool err_is_tty = isatty(fileno(stderr));
    if (out_is_tty) {
        return true;
    }
    return err_is_tty;
}
|
|
|
|
//
|
|
// Model utils
|
|
//
|
|
|
|
// TODO: move to common/sampling
|
|
static void common_init_sampler_from_model(
|
|
const llama_model * model,
|
|
common_params_sampling & sparams) {
|
|
|
|
const uint64_t config = sparams.user_sampling_config;
|
|
|
|
auto get_int32 = [&](const char * key, int32_t & dst, uint64_t user_config) {
|
|
if (config & user_config) {
|
|
return;
|
|
}
|
|
|
|
char buf[64] = {0};
|
|
if (llama_model_meta_val_str(model, key, buf, sizeof(buf)) > 0) {
|
|
char * end = nullptr;
|
|
int32_t v = strtol(buf, &end, 10);
|
|
if (end && end != buf) {
|
|
dst = v;
|
|
}
|
|
}
|
|
};
|
|
|
|
auto get_float = [&](const char * key, float & dst, uint64_t user_config) {
|
|
if (config & user_config) {
|
|
return;
|
|
}
|
|
|
|
char buf[128] = {0};
|
|
if (llama_model_meta_val_str(model, key, buf, sizeof(buf)) > 0) {
|
|
char * end = nullptr;
|
|
float v = strtof(buf, &end);
|
|
if (end && end != buf) {
|
|
dst = v;
|
|
}
|
|
}
|
|
};
|
|
|
|
// Sampling sequence
|
|
if (!(config & common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_SAMPLERS)) {
|
|
char buf[512] = {0};
|
|
if (llama_model_meta_val_str(model, llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_SEQUENCE), buf, sizeof(buf)) > 0) {
|
|
const std::vector<std::string> sampler_names = string_split<std::string>(std::string(buf), ';');
|
|
if (!sampler_names.empty()) {
|
|
sparams.samplers = common_sampler_types_from_names(sampler_names, true);
|
|
}
|
|
}
|
|
}
|
|
|
|
get_int32(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_TOP_K), sparams.top_k, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TOP_K);
|
|
get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_TOP_P), sparams.top_p, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TOP_P);
|
|
get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIN_P), sparams.min_p, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIN_P);
|
|
get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_XTC_PROBABILITY), sparams.xtc_probability, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_XTC_PROBABILITY);
|
|
get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_XTC_THRESHOLD), sparams.xtc_threshold, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_XTC_THRESHOLD);
|
|
get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_TEMP), sparams.temp, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TEMP);
|
|
get_int32(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_LAST_N), sparams.penalty_last_n, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_LAST_N);
|
|
get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_REPEAT), sparams.penalty_repeat, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_REPEAT);
|
|
get_int32(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT), sparams.mirostat, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT);
|
|
get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_TAU), sparams.mirostat_tau, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_TAU);
|
|
get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_ETA), sparams.mirostat_eta, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_ETA);
|
|
}
|
|
|
|
// Private state of common_init_result: owns the model, the context, the loaded
// LoRA adapters and one sampler per sequence.
struct common_init_result::impl {
    impl()  = default;
    ~impl() = default;

    // NOTE: keep the declaration order below. Members are destroyed in reverse
    //       order of declaration (bottom-to-top), so the model is released last.

    llama_model_ptr   model;
    llama_context_ptr context;

    // adapters loaded for `model`
    std::vector<llama_adapter_lora_ptr> lora;

    // samplers[i] belongs to sequence i; samplers_seq_config[i] is the matching
    // {seq_id, sampler} entry that can be handed to the context parameters
    std::vector<common_sampler_ptr>       samplers;
    std::vector<llama_sampler_seq_config> samplers_seq_config;
};
|
|
|
|
// Load the model, its LoRA adapters, one sampler per sequence and the llama
// context described by `params`.
//
// Failure is reported through the accessors rather than by throwing:
//   - model()   == nullptr : the model file could not be loaded
//   - context() == nullptr : a LoRA adapter or the context could not be created
//
// Side effects on `params`:
//   - every entry of params.lora_adapters gets ptr / task_name / prompt_prefix filled in
//   - params.sampling is updated from the model metadata
//   - params.sampling.logit_bias_eog receives a -INFINITY bias for every EOG token
//     (also appended to params.sampling.logit_bias when ignore_eos is active)
//   - params.sampling.ignore_eos is cleared when the vocab has no EOS token
common_init_result::common_init_result(common_params & params) :
    pimpl(new impl{}) {
    auto mparams = common_model_params_to_llama(params);
    auto cparams = common_context_params_to_llama(params);

    if (params.fit_params) {
        LOG_INF("%s: fitting params to device memory, for bugs during this step try to reproduce them with -fit off, or provide --verbose logs if the bug only occurs with -fit on\n", __func__);
        llama_params_fit(params.model.path.c_str(), &mparams, &cparams,
            params.tensor_split,
            params.tensor_buft_overrides.data(),
            params.fit_params_target.data(),
            params.fit_params_min_ctx,
            params.verbosity >= 4 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);
    }

    llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
    if (model == NULL) {
        return;
    }

    // from here on the model is owned by pimpl
    pimpl->model.reset(model);

    const llama_vocab * vocab = llama_model_get_vocab(model);

    // load and optionally apply lora adapters (must be loaded before context creation)
    for (auto & la : params.lora_adapters) {
        llama_adapter_lora_ptr lora;
        lora.reset(llama_adapter_lora_init(model, la.path.c_str()));
        if (lora == nullptr) {
            LOG_ERR("%s: failed to load lora adapter '%s'\n", __func__, la.path.c_str());
            // The model is already owned by pimpl->model. Do NOT reset() the pointer
            // with the same raw pointer again: that would free the model while the
            // smart pointer keeps referring to it (dangling model() + double free).
            // The context stays empty, which is how callers detect the failure.
            return;
        }

        // zero-initialized so the strings below are empty if a key is not written
        char buf[1024] = {0};
        la.ptr = lora.get();
        llama_adapter_meta_val_str(la.ptr, "adapter.lora.task_name", buf, sizeof(buf));
        la.task_name = buf;
        llama_adapter_meta_val_str(la.ptr, "adapter.lora.prompt_prefix", buf, sizeof(buf));
        la.prompt_prefix = buf;
        pimpl->lora.emplace_back(std::move(lora)); // move into the list of loaded adapters
    }

    // updates params.sampling
    // TODO: fix naming
    common_init_sampler_from_model(model, params.sampling);

    if (params.sampling.ignore_eos && llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
        LOG_WRN("%s: warning: vocab does not have an EOS token, ignoring --ignore-eos\n", __func__);
        params.sampling.ignore_eos = false;
    }

    // initialize once
    for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
        if (llama_vocab_is_eog(vocab, i)) {
            LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(vocab, i).c_str(), -INFINITY);
            params.sampling.logit_bias_eog.push_back({i, -INFINITY});
        }
    }

    if (params.sampling.ignore_eos) {
        // add EOG biases to the active set of logit biases
        params.sampling.logit_bias.insert(
                params.sampling.logit_bias.end(),
                params.sampling.logit_bias_eog.begin(), params.sampling.logit_bias_eog.end());
    }

    //if (params.sampling.penalty_last_n == -1) {
    //    LOG_INF("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
    //    params.sampling.penalty_last_n = llama_n_ctx(lctx);
    //}

    //if (params.sampling.dry_penalty_last_n == -1) {
    //    LOG_INF("%s: setting dry_penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
    //    params.sampling.dry_penalty_last_n = llama_n_ctx(lctx);
    //}

    // init the backend samplers as part of the context creation
    pimpl->samplers.resize(cparams.n_seq_max);
    pimpl->samplers_seq_config.resize(cparams.n_seq_max);

    for (int i = 0; i < (int) cparams.n_seq_max; ++i) {
        pimpl->samplers[i].reset(common_sampler_init(model, params.sampling));
        pimpl->samplers_seq_config[i] = { i, common_sampler_get(pimpl->samplers[i].get()) };
    }

    // the samplers are only handed to the context when backend sampling is requested
    if (params.sampling.backend_sampling) {
        cparams.samplers   = pimpl->samplers_seq_config.data();
        cparams.n_samplers = pimpl->samplers_seq_config.size();
    }

    llama_context * lctx = llama_init_from_model(model, cparams);
    if (lctx == NULL) {
        LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str());
        return;
    }

    pimpl->context.reset(lctx);
}
|
|
|
|
// Non-owning pointer to the loaded model (null when nothing is held).
llama_model * common_init_result::model() {
    llama_model * const held = pimpl->model.get();
    return held;
}
|
|
|
|
// Non-owning pointer to the created context (null when nothing is held).
llama_context * common_init_result::context() {
    llama_context * const held = pimpl->context.get();
    return held;
}
|
|
|
|
// Non-owning pointer to the sampler stored at index `seq_id`.
// No bounds check is performed: seq_id must be a valid index into the sampler list.
common_sampler * common_init_result::sampler(llama_seq_id seq_id) {
    auto & slot = pimpl->samplers[seq_id];
    return slot.get();
}
|
|
|
|
void common_init_result::reset_samplers() {
|
|
for (int i = 0; i < (int) pimpl->samplers.size(); ++i) {
|
|
llama_sampler_reset(common_sampler_get(pimpl->samplers[i].get()));
|
|
}
|
|
}
|
|
|
|
std::vector<llama_adapter_lora_ptr> & common_init_result::lora() {
|
|
return pimpl->lora;
|
|
}
|
|
|
|
// Build a common_init_result from `params` and finish its setup: control vectors,
// reranking sanity checks, LoRA application and the optional warmup run.
//
// The result is always returned. Callers detect failure through model()/context()
// being null. The control-vector and reranking checks also return early, without
// applying LoRA or running the warmup.
common_init_result_ptr common_init_from_params(common_params & params) {
    common_init_result_ptr res(new common_init_result(params));

    llama_model * const model = res->model();
    if (!model) {
        LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.path.c_str());
        return res;
    }

    llama_context * const lctx = res->context();
    if (!lctx) {
        LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str());
        return res;
    }

    const llama_vocab * vocab = llama_model_get_vocab(model);

    // context shifting is only kept enabled when the memory supports it
    if (params.ctx_shift) {
        if (!llama_memory_can_shift(llama_get_memory(lctx))) {
            LOG_WRN("%s: KV cache shifting is not supported for this context, disabling KV cache shifting\n", __func__);
            params.ctx_shift = false;
        }
    }

    // control vectors
    if (!params.control_vectors.empty()) {
        // non-positive bounds mean "use the full layer range"
        if (params.control_vector_layer_start <= 0) {
            params.control_vector_layer_start = 1;
        }
        if (params.control_vector_layer_end <= 0) {
            params.control_vector_layer_end = llama_model_n_layer(model);
        }

        const auto cvec = common_control_vector_load(params.control_vectors);
        if (cvec.n_embd == -1) {
            return res;
        }

        const int cvec_status = llama_set_adapter_cvec(
                lctx,
                cvec.data.data(),
                cvec.data.size(),
                cvec.n_embd,
                params.control_vector_layer_start,
                params.control_vector_layer_end);
        if (cvec_status != 0) {
            return res;
        }
    }

    // reranking needs a BOS token and at least one of EOS / SEP / rerank template
    if (llama_pooling_type(lctx) == LLAMA_POOLING_TYPE_RANK) {
        const bool has_bos = llama_vocab_bos(vocab) != LLAMA_TOKEN_NULL;
        if (!has_bos) {
            LOG_WRN("%s: warning: vocab does not have a BOS token, reranking will not work\n", __func__);
        }

        const bool has_eos           = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
        const bool has_sep           = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;
        const bool has_rerank_prompt = llama_model_chat_template(model, "rerank") != NULL;

        const bool has_separator = has_eos || has_sep || has_rerank_prompt;
        if (!has_separator) {
            LOG_WRN("%s: warning: vocab does not have an EOS token, SEP token, or rerank prompt. Reranking will not work\n", __func__);
        } else if (!has_eos) {
            LOG_WRN("%s: warning: vocab does not have an EOS token, using SEP token as fallback\n", __func__);
        }

        if (!has_bos || !has_separator) {
            return res;
        }
    }

    if (!params.lora_init_without_apply) {
        common_set_adapter_lora(lctx, params.lora_adapters);
    }

    if (params.warmup) {
        LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);

        llama_set_warmup(lctx, true);

        const llama_token bos = llama_vocab_bos(vocab);
        const llama_token eos = llama_vocab_eos(vocab);

        // warmup prompt: BOS and EOS when available (some models, e.g. T5, have no BOS),
        // otherwise token 0
        std::vector<llama_token> warm;
        for (const llama_token tok : { bos, eos }) {
            if (tok != LLAMA_TOKEN_NULL) {
                warm.push_back(tok);
            }
        }
        if (warm.empty()) {
            warm.push_back(0);
        }

        if (llama_model_has_encoder(model)) {
            llama_encode(lctx, llama_batch_get_one(warm.data(), warm.size()));

            // the decoder is then warmed up with its start token only
            llama_token start_id = llama_model_decoder_start_token(model);
            if (start_id == LLAMA_TOKEN_NULL) {
                start_id = bos;
            }
            warm.assign(1, start_id);
        }
        if (llama_model_has_decoder(model)) {
            const size_t n_warm = std::min(warm.size(), (size_t) params.n_batch);
            llama_decode(lctx, llama_batch_get_one(warm.data(), n_warm));
        }

        // drop everything the warmup run left behind
        llama_memory_clear(llama_get_memory(lctx), true);
        llama_synchronize(lctx);
        llama_perf_context_reset(lctx);
        llama_set_warmup(lctx, false);

        // reset samplers to reset RNG state after warmup to the seeded state
        res->reset_samplers();
    }

    return res;
}
|
|
|
|
// Defaulted out of line, after the definition of common_init_result::impl earlier in this file.
common_init_result::~common_init_result() = default;
|
|
|
|
// Return the base URL used for model downloads, always ending in '/'.
//
// The value is taken from the MODEL_ENDPOINT environment variable, or from
// HF_ENDPOINT (kept for backward compatibility) when MODEL_ENDPOINT is not set.
// If the selected variable is unset or empty, "https://huggingface.co/" is used.
std::string get_model_endpoint() {
    const char * model_endpoint_env = getenv("MODEL_ENDPOINT");
    // We still respect the use of environment-variable "HF_ENDPOINT" for backward-compatibility.
    const char * hf_endpoint_env = getenv("HF_ENDPOINT");
    const char * endpoint_env = model_endpoint_env ? model_endpoint_env : hf_endpoint_env;

    std::string model_endpoint = "https://huggingface.co/";
    // an empty value is treated like an unset one; calling back() on an empty
    // string would be undefined behavior
    if (endpoint_env && endpoint_env[0] != '\0') {
        model_endpoint = endpoint_env;
        if (model_endpoint.back() != '/') {
            model_endpoint += '/';
        }
    }
    return model_endpoint;
}
|
|
|
|
void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora) {
|
|
std::vector<llama_adapter_lora *> loras;
|
|
std::vector<float> scales;
|
|
|
|
for (auto & la: lora) {
|
|
loras.push_back(la.ptr);
|
|
scales.push_back(la.scale);
|
|
}
|
|
|
|
llama_set_adapters_lora(ctx, loras.data(), loras.size(), scales.data());
|
|
}
|
|
|
|
// Translate the model-related fields of `params` into a llama_model_params,
// starting from the library defaults.
//
// The returned struct holds pointers into containers owned by `params`
// (devices, kv_overrides, tensor_buft_overrides).
struct llama_model_params common_model_params_to_llama(common_params & params) {
    auto mparams = llama_model_default_params();

    // keep the default device list unless the user selected devices explicitly
    if (!params.devices.empty()) {
        mparams.devices = params.devices.data();
    }

    // GPU placement
    mparams.n_gpu_layers = params.n_gpu_layers;
    mparams.main_gpu     = params.main_gpu;
    mparams.split_mode   = params.split_mode;
    mparams.tensor_split = params.tensor_split;

    // loading behaviour
    mparams.use_mmap        = params.use_mmap;
    mparams.use_direct_io   = params.use_direct_io;
    mparams.use_mlock       = params.use_mlock;
    mparams.check_tensors   = params.check_tensors;
    mparams.use_extra_bufts = !params.no_extra_bufts;
    mparams.no_host         = params.no_host;

    // metadata overrides: the list must end with an entry whose key is empty
    mparams.kv_overrides = NULL;
    if (!params.kv_overrides.empty()) {
        GGML_ASSERT(params.kv_overrides.back().key[0] == 0 && "KV overrides not terminated with empty key");
        mparams.kv_overrides = params.kv_overrides.data();
    }

    // tensor buffer overrides: the list must end with an entry whose pattern is null
    mparams.tensor_buft_overrides = NULL;
    if (!params.tensor_buft_overrides.empty()) {
        GGML_ASSERT(params.tensor_buft_overrides.back().pattern == nullptr && "Tensor buffer overrides not terminated with empty pattern");
        mparams.tensor_buft_overrides = params.tensor_buft_overrides.data();
    }

    mparams.progress_callback           = params.load_progress_callback;
    mparams.progress_callback_user_data = params.load_progress_callback_user_data;

    return mparams;
}
|
|
|
|
struct llama_context_params common_context_params_to_llama(const common_params & params) {
|
|
auto cparams = llama_context_default_params();
|
|
|
|
cparams.n_ctx = params.n_ctx;
|
|
cparams.n_seq_max = params.n_parallel;
|
|
cparams.n_batch = params.n_batch;
|
|
cparams.n_ubatch = params.n_ubatch;
|
|
cparams.n_threads = params.cpuparams.n_threads;
|
|
cparams.n_threads_batch = params.cpuparams_batch.n_threads == -1 ?
|
|
params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
|
|
cparams.embeddings = params.embedding;
|
|
cparams.rope_scaling_type = params.rope_scaling_type;
|
|
cparams.rope_freq_base = params.rope_freq_base;
|
|
cparams.rope_freq_scale = params.rope_freq_scale;
|
|
cparams.yarn_ext_factor = params.yarn_ext_factor;
|
|
cparams.yarn_attn_factor = params.yarn_attn_factor;
|
|
cparams.yarn_beta_fast = params.yarn_beta_fast;
|
|
cparams.yarn_beta_slow = params.yarn_beta_slow;
|
|
cparams.yarn_orig_ctx = params.yarn_orig_ctx;
|
|
cparams.pooling_type = params.pooling_type;
|
|
cparams.attention_type = params.attention_type;
|
|
cparams.flash_attn_type = params.flash_attn_type;
|
|
cparams.cb_eval = params.cb_eval;
|
|
cparams.cb_eval_user_data = params.cb_eval_user_data;
|
|
cparams.offload_kqv = !params.no_kv_offload;
|
|
cparams.no_perf = params.no_perf;
|
|
cparams.op_offload = !params.no_op_offload;
|
|
cparams.swa_full = params.swa_full;
|
|
cparams.kv_unified = params.kv_unified;
|
|
|
|
cparams.type_k = params.cache_type_k;
|
|
cparams.type_v = params.cache_type_v;
|
|
|
|
return cparams;
|
|
}
|
|
|
|
struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params) {
|
|
struct ggml_threadpool_params tpp;
|
|
|
|
ggml_threadpool_params_init(&tpp, params.n_threads); // setup the defaults
|
|
|
|
if (params.mask_valid) {
|
|
std::memcpy(&tpp.cpumask, ¶ms.cpumask, GGML_MAX_N_THREADS);
|
|
}
|
|
|
|
tpp.prio = params.priority;
|
|
tpp.poll = params.poll;
|
|
tpp.strict_cpu = params.strict_cpu;
|
|
|
|
return tpp;
|
|
}
|
|
|
|
//
|
|
// Batch utils
|
|
//
|
|
|
|
// Mark the batch as empty by resetting its token count to zero.
void common_batch_clear(struct llama_batch & batch) {
    batch.n_tokens = 0;
}
|
|
|
|
void common_batch_add(
|
|
struct llama_batch & batch,
|
|
llama_token id,
|
|
llama_pos pos,
|
|
const std::vector<llama_seq_id> & seq_ids,
|
|
bool logits) {
|
|
GGML_ASSERT(batch.seq_id[batch.n_tokens] && "llama_batch size exceeded");
|
|
|
|
batch.token [batch.n_tokens] = id;
|
|
batch.pos [batch.n_tokens] = pos;
|
|
batch.n_seq_id[batch.n_tokens] = seq_ids.size();
|
|
for (size_t i = 0; i < seq_ids.size(); ++i) {
|
|
batch.seq_id[batch.n_tokens][i] = seq_ids[i];
|
|
}
|
|
batch.logits [batch.n_tokens] = logits;
|
|
|
|
batch.n_tokens++;
|
|
}
|
|
|
|
//
|
|
// Vocab utils
|
|
//
|
|
|
|
std::vector<llama_token> common_tokenize(
|
|
const struct llama_context * ctx,
|
|
const std::string & text,
|
|
bool add_special,
|
|
bool parse_special) {
|
|
const llama_model * model = llama_get_model(ctx);
|
|
const llama_vocab * vocab = llama_model_get_vocab(model);
|
|
return common_tokenize(vocab, text, add_special, parse_special);
|
|
}
|
|
|
|
std::vector<llama_token> common_tokenize(
|
|
const struct llama_vocab * vocab,
|
|
const std::string & text,
|
|
bool add_special,
|
|
bool parse_special) {
|
|
// upper limit for the number of tokens
|
|
int n_tokens = text.length() + 2 * add_special;
|
|
std::vector<llama_token> result(n_tokens);
|
|
n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
|
|
if (n_tokens == std::numeric_limits<int32_t>::min()) {
|
|
throw std::runtime_error("Tokenization failed: input text too large, tokenization result exceeds int32_t limit");
|
|
}
|
|
if (n_tokens < 0) {
|
|
result.resize(-n_tokens);
|
|
int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
|
|
GGML_ASSERT(check == -n_tokens);
|
|
} else {
|
|
result.resize(n_tokens);
|
|
}
|
|
return result;
|
|
}
|
|
|
|
std::string common_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
|
|
const llama_model * model = llama_get_model(ctx);
|
|
const llama_vocab * vocab = llama_model_get_vocab(model);
|
|
return common_token_to_piece(vocab, token, special);
|
|
}
|
|
|
|
std::string common_token_to_piece(const struct llama_vocab * vocab, llama_token token, bool special) {
|
|
std::string piece;
|
|
piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n'
|
|
const int n_chars = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
|
|
if (n_chars < 0) {
|
|
piece.resize(-n_chars);
|
|
int check = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
|
|
GGML_ASSERT(check == -n_chars);
|
|
}
|
|
else {
|
|
piece.resize(n_chars);
|
|
}
|
|
|
|
return piece;
|
|
}
|
|
|
|
std::string common_detokenize(const struct llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
|
|
const llama_model * model = llama_get_model(ctx);
|
|
const llama_vocab * vocab = llama_model_get_vocab(model);
|
|
return common_detokenize(vocab, tokens, special);
|
|
}
|
|
|
|
std::string common_detokenize(const struct llama_vocab * vocab, const std::vector<llama_token> & tokens, bool special) {
|
|
std::string text;
|
|
text.resize(std::max(text.capacity(), tokens.size()));
|
|
int32_t n_chars = llama_detokenize(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
|
|
if (n_chars < 0) {
|
|
text.resize(-n_chars);
|
|
n_chars = llama_detokenize(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
|
|
GGML_ASSERT(n_chars <= (int32_t)text.size()); // whitespace trimming is performed after per-token detokenization
|
|
}
|
|
|
|
text.resize(n_chars);
|
|
|
|
// NOTE: the original tokenizer decodes bytes after collecting the pieces.
|
|
return text;
|
|
}
|
|
|
|
//
|
|
// Embedding utils
|
|
//
|
|
|
|
// Scale the n values of `inp` into `out` according to `embd_norm`:
//   -1    : no normalisation (values are copied)
//    0    : divide by (max absolute value / 32760), i.e. scale into an int16-like range
//    2    : divide by the euclidean norm
//   other : divide by the p-norm with p = embd_norm
// When the computed divisor is not positive, all outputs are 0.
void common_embd_normalize(const float * inp, float * out, int n, int embd_norm) {
    double denom = 0.0;

    if (embd_norm == -1) {
        // no normalisation
        denom = 1.0;
    } else if (embd_norm == 0) {
        // max absolute
        for (int k = 0; k < n; ++k) {
            const float mag = std::abs(inp[k]);
            if (denom < mag) {
                denom = mag;
            }
        }
        denom /= 32760.0; // make an int16 range
    } else if (embd_norm == 2) {
        // euclidean
        for (int k = 0; k < n; ++k) {
            denom += inp[k] * inp[k];
        }
        denom = std::sqrt(denom);
    } else {
        // p-norm (euclidean is p-norm p=2)
        for (int k = 0; k < n; ++k) {
            denom += std::pow(std::abs(inp[k]), embd_norm);
        }
        denom = std::pow(denom, 1.0 / embd_norm);
    }

    float scale = 0.0f;
    if (denom > 0.0) {
        scale = static_cast<float>(1.0 / denom);
    }

    for (int k = 0; k < n; ++k) {
        out[k] = inp[k] * scale;
    }
}
|
|
|
|
// Cosine similarity of two vectors of length n.
// Two zero vectors yield 1.0f; exactly one zero vector yields 0.0f.
float common_embd_similarity_cos(const float * embd1, const float * embd2, int n){
    double dot = 0.0; // sum of products
    double sq1 = 0.0; // squared length of embd1
    double sq2 = 0.0; // squared length of embd2

    for (int k = 0; k < n; ++k) {
        const float a = embd1[k];
        const float b = embd2[k];

        dot += a * b;
        sq1 += a * a;
        sq2 += b * b;
    }

    // Handle the case where one or both vectors are zero vectors
    const bool zero1 = sq1 == 0.0;
    const bool zero2 = sq2 == 0.0;
    if (zero1 && zero2) {
        return 1.0f; // two zero vectors are similar
    }
    if (zero1 || zero2) {
        return 0.0f;
    }

    return dot / (sqrt(sq1) * sqrt(sq2));
}
|
|
|
|
//
|
|
// Control vector utils
|
|
//
|
|
|
|
// Load one control vector file.
//
// Every tensor in the file must be named "direction.<layer>" with <layer> >= 1,
// be of type F32, be one-dimensional, and have the same number of elements as the
// other tensors. The tensors are scaled by load_info.strength and accumulated into
// result.data, with layer 1 stored at offset 0 (layer 0 is never stored).
//
// Returns { n_embd, data } on success. On any error (file cannot be opened, no
// tensors, invalid tensor) returns n_embd == -1 and empty data.
static common_control_vector_data common_control_vector_load_one(const common_control_vector_load_info & load_info) {
    common_control_vector_data result = { -1, {} };

    ggml_context * ctx = nullptr;
    struct gguf_init_params meta_gguf_params = {
        /* .no_alloc = */ false,
        /* .ctx      = */ &ctx,
    };
    struct gguf_context * ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), meta_gguf_params);
    if (!ctx_gguf) {
        LOG_ERR("%s: failed to load control vector file from %s\n", __func__, load_info.fname.c_str());
        return result;
    }

    int32_t n_tensors = gguf_get_n_tensors(ctx_gguf);
    if (n_tensors == 0) {
        LOG_WRN("%s: no direction tensors found in %s\n", __func__, load_info.fname.c_str());
    }

    for (int i = 0; i < n_tensors; i++) {
        std::string name = gguf_get_tensor_name(ctx_gguf, i);

        int layer_idx = -1;

        // split on '.'
        size_t dotpos = name.find('.');
        if (dotpos != std::string::npos && name.substr(0, dotpos) == "direction") {
            try {
                layer_idx = std::stoi(name.substr(dotpos + 1));
            } catch (...) {
                layer_idx = -1;
            }
        }
        if (layer_idx < 0) {
            LOG_ERR("%s: invalid/unparsable direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
            result.n_embd = -1;
            break;
        } else if (layer_idx == 0) {
            LOG_ERR("%s: invalid (zero) direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
            result.n_embd = -1;
            break;
        }

        struct ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str());
        // guard against a tensor that is listed in the file but missing from the data context
        if (tensor == nullptr) {
            LOG_ERR("%s: direction tensor %s not found in the data of %s\n", __func__, name.c_str(), load_info.fname.c_str());
            result.n_embd = -1;
            break;
        }
        if (tensor->type != GGML_TYPE_F32) {
            LOG_ERR("%s: invalid (non-F32) direction tensor type in %s\n", __func__, load_info.fname.c_str());
            result.n_embd = -1;
            break;
        }
        if (ggml_n_dims(tensor) != 1) {
            LOG_ERR("%s: invalid (non-1D) direction tensor shape in %s\n", __func__, load_info.fname.c_str());
            result.n_embd = -1;
            break;
        }

        // the first tensor fixes the embedding size, all others have to match it
        if (result.n_embd == -1) {
            result.n_embd = ggml_nelements(tensor);
        } else if (ggml_nelements(tensor) != result.n_embd) {
            LOG_ERR("%s: direction tensor in %s does not match previous dimensions\n", __func__, load_info.fname.c_str());
            result.n_embd = -1;
            break;
        }

        // extend if necessary - do not store data for layer 0 (it's not used)
        // (sizes are computed in size_t so the product cannot overflow int)
        const size_t n_embd = static_cast<size_t>(result.n_embd);
        result.data.resize(std::max(result.data.size(), n_embd * static_cast<size_t>(layer_idx)), 0.0f);

        const float * src = (const float *) tensor->data;
        float * dst = result.data.data() + n_embd * static_cast<size_t>(layer_idx - 1);  // layer 1 at [0]
        for (int j = 0; j < result.n_embd; j++) {
            dst[j] += src[j] * load_info.strength;  // allows multiple directions for same layer in same file
        }
    }

    if (result.n_embd == -1) {
        LOG_WRN("%s: skipping %s due to invalid direction tensors\n", __func__, load_info.fname.c_str());
        result.data.clear();
    }

    gguf_free(ctx_gguf);
    ggml_free(ctx);

    return result;
}
|
|
|
|
// Load all control vector files in `load_infos` and sum them element-wise.
//
// All files must agree on n_embd. If any file fails to load or disagrees, or if
// the list is empty, the result has n_embd == -1 and empty data.
common_control_vector_data common_control_vector_load(const std::vector<common_control_vector_load_info> & load_infos) {
    common_control_vector_data result = { -1, {} };

    for (const auto & info : load_infos) {
        auto cur = common_control_vector_load_one(info);

        // a single bad file invalidates the whole set
        if (cur.n_embd == -1) {
            result.n_embd = -1;
            break;
        }

        const bool is_first = result.n_embd == -1;
        if (!is_first && result.n_embd != cur.n_embd) {
            LOG_ERR("%s: control vectors in %s does not match previous dimensions\n", __func__, info.fname.c_str());
            result.n_embd = -1;
            break;
        }

        if (is_first) {
            result = std::move(cur);
            continue;
        }

        // accumulate into the running sum, extending it if necessary
        const size_t n_cur = cur.data.size();
        result.data.resize(std::max(result.data.size(), n_cur), 0.0f);
        for (size_t k = 0; k < n_cur; ++k) {
            result.data[k] += cur.data[k];
        }
    }

    if (result.n_embd == -1) {
        LOG_ERR("%s: no valid control vector files passed\n", __func__);
        result.data.clear();
    }

    return result;
}
|
|
|
|
// Build a next-token-prediction dataset from `tokens`.
//
// Each datapoint is a window of llama_n_ctx(ctx) tokens starting at idata*stride;
// its labels are the same window shifted by one token. The number of datapoints is
// (tokens.size() - n_ctx - 1) / stride.
//
// Asserts that `stride` is positive and that there are enough tokens for at least
// one datapoint.
ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std::vector<llama_token> & tokens, int64_t stride) {
    GGML_ASSERT(stride > 0 && "stride must be positive");

    const int64_t ne_datapoint = llama_n_ctx(ctx);
    // computed in signed arithmetic: size_t math would wrap around to a huge
    // value when there are fewer tokens than one datapoint needs
    const int64_t n_tokens = static_cast<int64_t>(tokens.size());
    const int64_t ndata    = (n_tokens - ne_datapoint - 1) / stride;
    GGML_ASSERT(ndata > 0 && "not enough tokens for a single datapoint");

    ggml_opt_dataset_t result = ggml_opt_dataset_init(
        GGML_TYPE_I32, GGML_TYPE_I32, ne_datapoint, ne_datapoint, ndata, /*ndata_shard =*/ 1);

    llama_token * data   = (llama_token *) ggml_opt_dataset_data(result)->data;
    llama_token * labels = (llama_token *) ggml_opt_dataset_labels(result)->data;

    for (int64_t idata = 0; idata < ndata; ++idata) {
        // inputs start at idata*stride, labels one token later
        memcpy(data   + idata*ne_datapoint, tokens.data() + idata*stride + 0, ne_datapoint*sizeof(llama_token));
        memcpy(labels + idata*ne_datapoint, tokens.data() + idata*stride + 1, ne_datapoint*sizeof(llama_token));
    }

    return result;
}
|
|
|
|
// Optimizer parameter callback: `userdata` must point to an lr_opt.
// Starts from the default optimizer parameters and sets learning rate and weight
// decay for both AdamW and SGD from the lr_opt.
ggml_opt_optimizer_params common_opt_lr_pars(void * userdata) {
    const lr_opt * opt = static_cast<const lr_opt *>(userdata);

    ggml_opt_optimizer_params result = ggml_opt_get_default_optimizer_params(nullptr);

    // learning rate for the current epoch, shared by both optimizers
    const float lr = opt->get_lr(opt->epoch);
    result.sgd.alpha   = lr;
    result.adamw.alpha = lr;

    // weight decay, shared by both optimizers
    result.adamw.wd = opt->wd;
    result.sgd.wd   = opt->wd;

    return result;
}
|
|
|
|
// TODO make all command line args case-insensitive
|
|
// Case-insensitive equality of two C strings (_stricmp on MSVC, strcasecmp elsewhere).
static inline bool eq_case_insensitive(char const* a, char const* b) {
#if defined(_MSC_VER)
    const int cmp = _stricmp(a, b);
#else
    const int cmp = strcasecmp(a, b);
#endif // defined(_MSC_VER)
    return cmp == 0;
}
|
|
|
|
enum ggml_opt_optimizer_type common_opt_get_optimizer(const char * n) {
|
|
if (eq_case_insensitive("adamw", n)) {
|
|
return GGML_OPT_OPTIMIZER_TYPE_ADAMW;
|
|
}
|
|
if (eq_case_insensitive("sgd", n)) {
|
|
return GGML_OPT_OPTIMIZER_TYPE_SGD;
|
|
}
|
|
return GGML_OPT_OPTIMIZER_TYPE_COUNT;
|
|
}
|
|
|
|
// TODO simplify to use just log and exp
|
|
static float const k_log_2 = std::log(2.f);
|
|
|
|
void lr_opt::init() {
|
|
if (lr_min > 0 && lr_min < lr0) {
|
|
float nhalf = std::log(lr0 / lr_min) / k_log_2;
|
|
float e = epochs;
|
|
if (decay_epochs > 0 && decay_epochs < e) {
|
|
e = decay_epochs;
|
|
} else {
|
|
decay_epochs = e;
|
|
}
|
|
scale_epoch = nhalf / e;
|
|
}
|
|
}
|
|
|
|
float lr_opt::get_lr(float epoch) const {
|
|
float r = lr_min <= 0 ? lr0 :
|
|
epoch >= decay_epochs ? lr_min :
|
|
lr0 * std::pow(0.5f, epoch * scale_epoch);
|
|
LOG_INF("epoch %.2g lr=%.2g\n", epoch, r);
|
|
return r;
|
|
}
|
|
|
|
bool common_replay_last_token(struct llama_context * ctx, llama_token last_token, int32_t pos) {
|
|
llama_batch batch = llama_batch_get_one(&last_token, 1);
|
|
batch.pos = &pos;
|
|
if (llama_decode(ctx, batch)) {
|
|
LOG_ERR("%s: failed to replay last token\n", __func__);
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
bool common_prompt_batch_decode(
|
|
struct llama_context * ctx,
|
|
const std::vector<llama_token> & tokens,
|
|
int & n_past,
|
|
int n_batch,
|
|
std::string_view state_path,
|
|
bool save_state) {
|
|
const int n_eval = tokens.size();
|
|
if (n_eval == 0) {
|
|
return true;
|
|
}
|
|
|
|
if (save_state && n_eval > 1) {
|
|
const int n_tokens_before_last = n_eval - 1;
|
|
|
|
GGML_ASSERT(n_eval <= n_batch);
|
|
|
|
// Decode all but the last token so we can save the memory state before decoding the last token.
|
|
// This is done so we can restore the session state later and replay the last token.
|
|
// Memory implementations in recurrent/hybrid models don't support removing tokens from their
|
|
// memory, so we can't just remove the last token from the memory and replay the last token which
|
|
// is the reason for this logic.
|
|
if (llama_decode(ctx, llama_batch_get_one(const_cast<llama_token*>(tokens.data()), n_tokens_before_last))) {
|
|
LOG_ERR("%s : failed to eval\n", __func__);
|
|
return false;
|
|
}
|
|
n_past += n_tokens_before_last;
|
|
|
|
llama_state_save_file(ctx, state_path.data(), tokens.data(), n_tokens_before_last);
|
|
LOG_INF("saved session before last token to %s, n_tokens = %d\n", state_path.data(), n_tokens_before_last);
|
|
|
|
llama_token last_token = tokens.back();
|
|
llama_batch batch = llama_batch_get_one(&last_token, 1);
|
|
int32_t pos = n_past;
|
|
batch.pos = &pos;
|
|
|
|
if (llama_decode(ctx, batch)) {
|
|
LOG_ERR("%s : failed to eval last token\n", __func__);
|
|
return false;
|
|
}
|
|
n_past++;
|
|
} else {
|
|
if (llama_decode(ctx, llama_batch_get_one(const_cast<llama_token*>(tokens.data()), n_eval))) {
|
|
LOG_ERR("%s : failed to eval\n", __func__);
|
|
return false;
|
|
}
|
|
n_past += n_eval;
|
|
}
|
|
|
|
return true;
|
|
}
|