llama: end-to-end tests (#19802)

* tests: add end-to-end tests per model architecture

* fixup for rebase

* fix use-after-free in llama-model-loader.cpp

* fix CI

* fix WebGPU

* fix CI

* disable CI for macOS-latest-cmake-arm64

* use expert_weights_scale only if != 0.0f

* comments
This commit is contained in:
Johannes Gäßler 2026-03-08 12:30:21 +01:00 committed by GitHub
parent a95047979a
commit a976ff081b
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
33 changed files with 1607 additions and 633 deletions

View file

@@ -1,5 +1,6 @@
#include "llama.h"
#include "ggml-cpp.h"
#include "llama-impl.h"
#include "llama-chat.h"
@@ -12,6 +13,7 @@
#include "ggml.h"
#include "ggml-backend.h"
#include "gguf.h"
#include <algorithm>
#include <cassert>
@@ -825,7 +827,8 @@ int64_t llama_time_us(void) {
}
// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
static int llama_model_load(const std::string & fname, std::vector<std::string> & splits, llama_model & model, llama_model_params & params) {
static int llama_model_load(struct gguf_context * metadata, llama_model_set_tensor_data_t set_tensor_data, void * set_tensor_data_ud,
const std::string & fname, std::vector<std::string> & splits, llama_model & model, llama_model_params & params) {
// loading time will be recalculated after the first eval, so
// we take page faults deferred by mmap() into consideration
model.t_load_us = 0;
@@ -834,7 +837,8 @@ static int llama_model_load(const std::string & fname, std::vector<std::string>
model.t_start_us = tm.t_start_us;
try {
llama_model_loader ml(fname, splits, params.use_mmap, params.use_direct_io, params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);
llama_model_loader ml(metadata, set_tensor_data, set_tensor_data_ud, fname, splits, params.use_mmap, params.use_direct_io,
params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);
ml.print_info();
@@ -880,9 +884,13 @@ static int llama_model_load(const std::string & fname, std::vector<std::string>
}
static struct llama_model * llama_model_load_from_file_impl(
struct gguf_context * metadata,
llama_model_set_tensor_data_t set_tensor_data,
void * set_tensor_data_ud,
const std::string & path_model,
std::vector<std::string> & splits,
struct llama_model_params params) {
GGML_ASSERT((metadata == nullptr) != path_model.empty() && "exactly one out of metadata and path_model needs to be defined");
ggml_time_init();
if (!params.vocab_only && ggml_backend_reg_count() == 0) {
@@ -1003,7 +1011,7 @@ static struct llama_model * llama_model_load_from_file_impl(
props.memory_free/1024/1024);
}
const int status = llama_model_load(path_model, splits, *model, params);
const int status = llama_model_load(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, *model, params);
GGML_ASSERT(status <= 0);
if (status < 0) {
if (status == -1) {
@@ -1019,6 +1027,18 @@ static struct llama_model * llama_model_load_from_file_impl(
return model;
}
// Build a model from caller-supplied GGUF metadata instead of reading a file from
// disk. Tensor contents are presumably provided via the set_tensor_data callback
// (with its user-data pointer) — confirm against the callback typedef. Exactly one
// of metadata/path_model may be set downstream, so the path is left empty here.
struct llama_model * llama_model_init_from_user(
        struct gguf_context * metadata,
        llama_model_set_tensor_data_t set_tensor_data,
        void * set_tensor_data_ud,
        struct llama_model_params params) {
    GGML_ASSERT(metadata != nullptr);

    // Without a backing file there is nothing to mmap and no extra buffer types to use.
    params.use_extra_bufts = false;
    params.use_mmap        = false;

    std::string path_model;           // intentionally empty: metadata replaces the file path
    std::vector<std::string> splits;  // no split files in the in-memory case

    return llama_model_load_from_file_impl(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, params);
}
// deprecated
struct llama_model * llama_load_model_from_file(
const char * path_model,
@@ -1030,7 +1050,7 @@ struct llama_model * llama_model_load_from_file(
const char * path_model,
struct llama_model_params params) {
std::vector<std::string> splits = {};
return llama_model_load_from_file_impl(path_model, splits, params);
return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, path_model, splits, params);
}
struct llama_model * llama_model_load_from_splits(
@@ -1046,11 +1066,11 @@ struct llama_model * llama_model_load_from_splits(
for (size_t i = 0; i < n_paths; ++i) {
splits.push_back(paths[i]);
}
return llama_model_load_from_file_impl(splits.front(), splits, params);
return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, splits.front(), splits, params);
}
// Write the given model (metadata and tensors) to a file at path_model.
void llama_model_save_to_file(const struct llama_model * model, const char * path_model) {
    // Fix: the rendered diff left both the old declaration `ms(*model)` and the new
    // one `ms(model)` in place, redefining `ms`. Keep only the post-image form, which
    // passes the model pointer directly to the saver.
    llama_model_saver ms(model);
    ms.add_kv_from_model();      // collect all key/value metadata from the model
    ms.add_tensors_from_model(); // register the model's tensors for writing
    ms.save(path_model);         // write everything out to path_model
}