convert : support Qwen3.5 / Qwen3.5-MoE NVFP4 and add input scales (#20505)

* convert : fix Qwen3.5 NVFP4 conversion

* Addressed Copilot review concerns and rebased

* move into _LinearAttentionVReorderBase and simplify

* --flake

* new_name not needed

* Added input_scale to gguf

* Fixed input_scale to be added as a tensor

* Added input scale to the loader, with fields named *_in_s

* Update convert_hf_to_gguf.py

Re-removed input_scale from aux cleanup

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

---------

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
Michael Wand, 2026-03-26 08:52:06 -07:00, committed by GitHub
commit f8d4abae86 (parent 3d5acab3e7)
3 changed files with 209 additions and 11 deletions
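
For context on the "Added input_scale to gguf" and "Fixed input_scale addition as tensor" steps above: each input scale is stored in the GGUF file as a small tensor (rather than as KV metadata), so the loader can pick it up with the usual tensor machinery. The snippet below is a minimal, hypothetical sketch of that idea using the gguf Python package; the ".input_scale" suffix and the one-element shape match the loader hunk further down, but the helper function, the arch string, and the example value are illustrative and are not the actual convert_hf_to_gguf.py code.

# Hypothetical sketch, not the real converter: write a per-tensor input scale
# as a float32 tensor of shape (1,) so it ships alongside the quantized weights.
import numpy as np
import gguf

def add_input_scale(writer: gguf.GGUFWriter, base_name: str, scale: float) -> None:
    # e.g. base_name = "blk.0.attn_q" -> tensor "blk.0.attn_q.input_scale"
    writer.add_tensor(f"{base_name}.input_scale", np.array([scale], dtype=np.float32))

writer = gguf.GGUFWriter("model-nvfp4.gguf", arch="qwen3_5")  # arch string is illustrative
add_input_scale(writer, "blk.0.attn_q", 0.0137)               # example value only

# finalize the file: header, KV data, then tensor data
writer.write_header_to_file()
writer.write_kv_data_to_file()
writer.write_tensors_to_file()
writer.close()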

View file

@@ -7578,6 +7578,65 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
            if (!layer.ssm_beta_s && layer.ssm_beta) {
                layer.ssm_beta_s = create_tensor(tn(LLM_TENSOR_SSM_BETA, "scale", i), {1}, TENSOR_NOT_REQUIRED);
            }
            // input scales
            if (!layer.wq_in_s && layer.wq) {
                layer.wq_in_s = create_tensor(tn(LLM_TENSOR_ATTN_Q, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
            }
            if (!layer.wk_in_s && layer.wk) {
                layer.wk_in_s = create_tensor(tn(LLM_TENSOR_ATTN_K, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
            }
            if (!layer.wv_in_s && layer.wv) {
                layer.wv_in_s = create_tensor(tn(LLM_TENSOR_ATTN_V, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
            }
            if (!layer.wo_in_s && layer.wo) {
                layer.wo_in_s = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
            }
            if (!layer.wqkv_in_s && layer.wqkv) {
                layer.wqkv_in_s = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
            }
            if (!layer.wqkv_gate_in_s && layer.wqkv_gate) {
                layer.wqkv_gate_in_s = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
            }
            if (!layer.ffn_gate_in_s && layer.ffn_gate) {
                layer.ffn_gate_in_s = create_tensor(tn(LLM_TENSOR_FFN_GATE, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
            }
            if (!layer.ffn_down_in_s && layer.ffn_down) {
                layer.ffn_down_in_s = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
            }
            if (!layer.ffn_up_in_s && layer.ffn_up) {
                layer.ffn_up_in_s = create_tensor(tn(LLM_TENSOR_FFN_UP, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
            }
            if (!layer.ffn_gate_exps_in_s && layer.ffn_gate_exps) {
                layer.ffn_gate_exps_in_s = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "input_scale", i), {n_expert}, TENSOR_NOT_REQUIRED);
            }
            if (!layer.ffn_down_exps_in_s && layer.ffn_down_exps) {
                layer.ffn_down_exps_in_s = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "input_scale", i), {n_expert}, TENSOR_NOT_REQUIRED);
            }
            if (!layer.ffn_up_exps_in_s && layer.ffn_up_exps) {
                layer.ffn_up_exps_in_s = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "input_scale", i), {n_expert}, TENSOR_NOT_REQUIRED);
            }
            if (!layer.ffn_gate_shexp_in_s && layer.ffn_gate_shexp) {
                layer.ffn_gate_shexp_in_s = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
            }
            if (!layer.ffn_down_shexp_in_s && layer.ffn_down_shexp) {
                layer.ffn_down_shexp_in_s = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
            }
            if (!layer.ffn_up_shexp_in_s && layer.ffn_up_shexp) {
                layer.ffn_up_shexp_in_s = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
            }
            if (!layer.ssm_in_in_s && layer.ssm_in) {
                layer.ssm_in_in_s = create_tensor(tn(LLM_TENSOR_SSM_IN, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
            }
            if (!layer.ssm_out_in_s && layer.ssm_out) {
                layer.ssm_out_in_s = create_tensor(tn(LLM_TENSOR_SSM_OUT, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
            }
            if (!layer.ssm_alpha_in_s && layer.ssm_alpha) {
                layer.ssm_alpha_in_s = create_tensor(tn(LLM_TENSOR_SSM_ALPHA, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
            }
            if (!layer.ssm_beta_in_s && layer.ssm_beta) {
                layer.ssm_beta_in_s = create_tensor(tn(LLM_TENSOR_SSM_BETA, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
            }
        }
    }
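
Reading the hunk above: each input scale is an optional companion tensor that is only looked up when its base weight exists, and every one is flagged TENSOR_NOT_REQUIRED, so GGUF files produced before this change keep loading. Dense projections get a single-element scale ({1}); the MoE expert tensors carry one scale per expert ({n_expert}). Assuming the usual llama.cpp name expansion, where tn(LLM_TENSOR_ATTN_Q, "input_scale", i) becomes "blk.{i}.attn_q.input_scale", a converter would have to emit names along these lines (illustrative Python; the newer gate/alpha/beta tensors are left out because their GGUF base names are not shown in this hunk):

# Illustrative only: the kind of per-block names the loader above looks up.
def input_scale_names(i: int) -> list[str]:
    dense = ["attn_q", "attn_k", "attn_v", "attn_output", "attn_qkv",
             "ffn_gate", "ffn_down", "ffn_up",
             "ffn_gate_shexp", "ffn_down_shexp", "ffn_up_shexp",
             "ssm_in", "ssm_out"]                                   # shape {1}
    experts = ["ffn_gate_exps", "ffn_down_exps", "ffn_up_exps"]     # shape {n_expert}
    return [f"blk.{i}.{t}.input_scale" for t in dense + experts]

print(input_scale_names(0))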