common : add standard Hugging Face cache support (#20775)

* common : add standard Hugging Face cache support

- Use HF API to find all files
- Migrate all manifests to hugging face cache at startup

Signed-off-by: Adrien Gallouët <angt@huggingface.co>

* Check with the quant tag

Signed-off-by: Adrien Gallouët <angt@huggingface.co>

* Cleanup

Signed-off-by: Adrien Gallouët <angt@huggingface.co>

* Improve error handling and report API errors

Signed-off-by: Adrien Gallouët <angt@huggingface.co>

* Restore common_cached_model_info and align mmproj filtering

Signed-off-by: Adrien Gallouët <angt@huggingface.co>

* Prefer main when getting cached ref

Signed-off-by: Adrien Gallouët <angt@huggingface.co>

* Use cached files when HF API fails

Signed-off-by: Adrien Gallouët <angt@huggingface.co>

* Use final_path

Signed-off-by: Adrien Gallouët <angt@huggingface.co>

* Check all inputs

Signed-off-by: Adrien Gallouët <angt@huggingface.co>

---------

Signed-off-by: Adrien Gallouët <angt@huggingface.co>
This commit is contained in:
Adrien Gallouët 2026-03-24 07:30:33 +01:00 committed by GitHub
parent e852eb4901
commit 8c7957ca33
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
8 changed files with 1061 additions and 330 deletions

View file

@@ -979,37 +979,20 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
for (size_t i = 0; i < params.hf_repo.size(); i++) {
common_params_model model;
// step 1: no `-hff` provided, we auto-detect based on the `-hf` flag
if (params.hf_file.empty() || params.hf_file[i].empty()) {
auto auto_detected = common_get_hf_file(params.hf_repo[i], params.hf_token, false);
if (auto_detected.repo.empty() || auto_detected.ggufFile.empty()) {
exit(1);
}
model.name = params.hf_repo[i];
model.hf_repo = auto_detected.repo;
model.hf_file = auto_detected.ggufFile;
model.hf_repo = params.hf_repo[i];
} else {
model.hf_repo = params.hf_repo[i];
model.hf_file = params.hf_file[i];
}
// step 2: construct the model cache path
std::string clean_fname = model.hf_repo + "_" + model.hf_file;
string_replace_all(clean_fname, "\\", "_");
string_replace_all(clean_fname, "/", "_");
model.path = fs_get_cache_file(clean_fname);
// step 3: download the model if not exists
std::string model_endpoint = get_model_endpoint();
model.url = model_endpoint + model.hf_repo + "/resolve/main/" + model.hf_file;
bool ok = common_download_model(model, params.hf_token, false);
if (!ok) {
fprintf(stderr, "error: failed to download model from %s\n", model.url.c_str());
auto download_result = common_download_model(model, params.hf_token);
if (download_result.model_path.empty()) {
fprintf(stderr, "error: failed to download model from HuggingFace\n");
exit(1);
}
params.model.push_back(model.path);
params.model.push_back(download_result.model_path);
}
}

View file

@@ -103,8 +103,8 @@ def test_router_models_max_evicts_lru():
candidate_models = [
"ggml-org/tinygemma3-GGUF:Q8_0",
"ggml-org/test-model-stories260K",
"ggml-org/test-model-stories260K-infill",
"ggml-org/test-model-stories260K:F32",
"ggml-org/test-model-stories260K-infill:F32",
]
# Load only the first 2 models to fill the cache