common : add standard Hugging Face cache support (#20775)

* common : add standard Hugging Face cache support

- Use HF API to find all files
- Migrate all manifests to hugging face cache at startup

Signed-off-by: Adrien Gallouët <angt@huggingface.co>

* Check with the quant tag

Signed-off-by: Adrien Gallouët <angt@huggingface.co>

* Cleanup

Signed-off-by: Adrien Gallouët <angt@huggingface.co>

* Improve error handling and report API errors

Signed-off-by: Adrien Gallouët <angt@huggingface.co>

* Restore common_cached_model_info and align mmproj filtering

Signed-off-by: Adrien Gallouët <angt@huggingface.co>

* Prefer main when getting cached ref

Signed-off-by: Adrien Gallouët <angt@huggingface.co>

* Use cached files when HF API fails

Signed-off-by: Adrien Gallouët <angt@huggingface.co>

* Use final_path

Signed-off-by: Adrien Gallouët <angt@huggingface.co>

* Check all inputs

Signed-off-by: Adrien Gallouët <angt@huggingface.co>

---------

Signed-off-by: Adrien Gallouët <angt@huggingface.co>
This commit is contained in:
Adrien Gallouët 2026-03-24 07:30:33 +01:00 committed by GitHub
parent e852eb4901
commit 8c7957ca33
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
8 changed files with 1061 additions and 330 deletions

View file

@@ -979,37 +979,20 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
for (size_t i = 0; i < params.hf_repo.size(); i++) {
common_params_model model;
// step 1: no `-hff` provided, we auto-detect based on the `-hf` flag
if (params.hf_file.empty() || params.hf_file[i].empty()) {
auto auto_detected = common_get_hf_file(params.hf_repo[i], params.hf_token, false);
if (auto_detected.repo.empty() || auto_detected.ggufFile.empty()) {
exit(1);
}
model.name = params.hf_repo[i];
model.hf_repo = auto_detected.repo;
model.hf_file = auto_detected.ggufFile;
model.hf_repo = params.hf_repo[i];
} else {
model.hf_repo = params.hf_repo[i];
model.hf_file = params.hf_file[i];
}
// step 2: construct the model cache path
std::string clean_fname = model.hf_repo + "_" + model.hf_file;
string_replace_all(clean_fname, "\\", "_");
string_replace_all(clean_fname, "/", "_");
model.path = fs_get_cache_file(clean_fname);
// step 3: download the model if not exists
std::string model_endpoint = get_model_endpoint();
model.url = model_endpoint + model.hf_repo + "/resolve/main/" + model.hf_file;
bool ok = common_download_model(model, params.hf_token, false);
if (!ok) {
fprintf(stderr, "error: failed to download model from %s\n", model.url.c_str());
auto download_result = common_download_model(model, params.hf_token);
if (download_result.model_path.empty()) {
fprintf(stderr, "error: failed to download model from HuggingFace\n");
exit(1);
}
params.model.push_back(model.path);
params.model.push_back(download_result.model_path);
}
}

View file

@@ -103,8 +103,8 @@ def test_router_models_max_evicts_lru():
candidate_models = [
"ggml-org/tinygemma3-GGUF:Q8_0",
"ggml-org/test-model-stories260K",
"ggml-org/test-model-stories260K-infill",
"ggml-org/test-model-stories260K:F32",
"ggml-org/test-model-stories260K-infill:F32",
]
# Load only the first 2 models to fill the cache