llama: end-to-end tests (#19802)

* tests: add end-to-end tests per model architecture (see the sketch after this list)

* fixup for rebase

* fix use-after-free in llama-model-loader.cpp

* fix CI

* fix WebGPU

* fix CI

* disable CI for macOS-latest-cmake-arm64

* use expert_weights_scale only if != 0.0f

* comments
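
Below is a minimal sketch of what an end-to-end smoke test over a single model file might look like against the public llama.h API. This is not the harness added by this PR; the model path is a placeholder and the "finite logits" check is just one plausible pass/fail criterion.

    // Minimal end-to-end smoke test sketch (not the harness added in this PR):
    // load a GGUF model, decode a short prompt, check the logits are finite.
    #include "llama.h"

    #include <cmath>
    #include <cstdio>
    #include <cstring>
    #include <vector>

    int main() {
        llama_backend_init();

        llama_model * model = llama_model_load_from_file(
            "models/test-arch.gguf" /* placeholder path */, llama_model_default_params());
        if (!model) {
            fprintf(stderr, "failed to load model\n");
            return 1;
        }
        llama_context * ctx = llama_init_from_model(model, llama_context_default_params());

        // tokenize a short prompt
        const llama_vocab * vocab = llama_model_get_vocab(model);
        const char * prompt = "Hello world";
        std::vector<llama_token> tokens(64);
        const int n_tokens = llama_tokenize(vocab, prompt, (int32_t) strlen(prompt),
                                            tokens.data(), (int32_t) tokens.size(),
                                            /*add_special=*/true, /*parse_special=*/false);
        GGML_ASSERT(n_tokens > 0);
        tokens.resize(n_tokens);

        // single decode pass over the whole prompt
        if (llama_decode(ctx, llama_batch_get_one(tokens.data(), (int32_t) tokens.size())) != 0) {
            fprintf(stderr, "decode failed\n");
            return 1;
        }

        // sanity-check the logits of the last token
        const float * logits = llama_get_logits_ith(ctx, -1);
        for (int i = 0; i < llama_vocab_n_tokens(vocab); ++i) {
            if (!std::isfinite(logits[i])) {
                fprintf(stderr, "non-finite logit at %d\n", i);
                return 1;
            }
        }

        llama_free(ctx);
        llama_model_free(model);
        llama_backend_free();
        return 0;
    }
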
Author: Johannes Gäßler, 2026-03-08 12:30:21 +01:00 (committed by GitHub)
Commit: a976ff081b, parent: a95047979a
33 changed files with 1607 additions and 633 deletions


@@ -509,6 +509,7 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
     float * data = (float *) cross_kq_mask->data;
 
     for (int i = 0; i < n_tokens; ++i) {
+        GGML_ASSERT(!cross->seq_ids_enc.empty() && "llama_encode must be called first");
         for (int j = 0; j < n_enc; ++j) {
             float f = -INFINITY;
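
The new assert exists because, for encoder-decoder architectures, cross->seq_ids_enc is only populated by the encoder pass. A minimal sketch of the required call order, assuming batch_enc and batch_dec are already-populated llama_batch values:

    #include "llama.h"

    // Sketch: on encoder-decoder models the encoder pass must run before any
    // decode, otherwise cross->seq_ids_enc is empty and the assert above fires.
    static int run_enc_dec(llama_context * ctx, const llama_model * model,
                           llama_batch batch_enc, llama_batch batch_dec) {
        if (llama_model_has_encoder(model)) {
            if (llama_encode(ctx, batch_enc) != 0) {
                return -1; // encoder pass failed
            }
        }
        // the cross-attention KQ mask is only well-defined after llama_encode
        return llama_decode(ctx, batch_dec);
    }
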
@@ -1150,6 +1151,7 @@ ggml_tensor * llm_graph_context::build_ffn(
     return cur;
 }
 
+// TODO remove redundant scale_w argument
 ggml_tensor * llm_graph_context::build_moe_ffn(
         ggml_tensor * cur,
         ggml_tensor * gate_inp,
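
The TODO ties into the commit-message bullet "use expert_weights_scale only if != 0.0f": once the float scale is applied only when nonzero, a separate boolean flag becomes redundant. A hedged sketch of the pattern; the names are assumptions based on the commit message, not the verbatim body of build_moe_ffn:

    #include "ggml.h"

    // Sketch: treat a scale of 0.0f as "not configured" instead of carrying a
    // separate bool; this is what makes the scale_w argument above redundant.
    static ggml_tensor * apply_expert_scale(ggml_context * ctx0, ggml_tensor * weights, float w_scale) {
        if (w_scale != 0.0f) {
            weights = ggml_scale(ctx0, weights, w_scale);
        }
        return weights;
    }
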
@@ -1607,6 +1609,7 @@ ggml_tensor * llm_graph_context::build_inp_attn_scale() const {
     // this need to be 1x1xN for broadcasting
     cur = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, 1, n_tokens);
     ggml_set_input(cur);
+    ggml_set_name(cur, "attn_scale");
 
     res->add_input(std::move(inp));
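
Naming the tensor makes the attention-scale input addressable in the built compute graph, which is useful for end-to-end tests that need to locate specific inputs. A sketch, assuming gf is an already-built ggml_cgraph:

    #include "ggml.h"

    // Sketch: look up the newly named input in a built graph and sanity-check
    // its shape (it must be 1x1xN so it broadcasts, per the comment above).
    static void check_attn_scale(struct ggml_cgraph * gf) {
        struct ggml_tensor * t = ggml_graph_get_tensor(gf, "attn_scale");
        if (t != nullptr) {
            GGML_ASSERT(t->ne[0] == 1 && t->ne[1] == 1);
        }
    }
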