llama : refactor llama_model_quantize_params to expose a pure C interface (#20346)

* Refactor llama_model_quantize_params to expose a pure C interface

* Restore comment and cleanup struct def

* Code review refactoring

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Code review refactoring

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
Ed Addario 2026-04-01 06:43:00 +01:00 committed by GitHub
parent 82764c341a
commit 4951250235
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 66 additions and 50 deletions

View file

@ -380,22 +380,33 @@ extern "C" {
size_t n_samplers;
};
// Pairs a pattern string with a ggml type.
// llama_model_quantize_params.tt_overrides points to data of this type.
// NOTE(review): the pattern syntax (regex, glob or substring) is not visible here - confirm against the implementation.
struct llama_model_tensor_override {
const char * pattern; // pattern string (presumably matched against tensor names - TODO confirm)
enum ggml_type type; // ggml type paired with the pattern
};
// One named block of float data.
// llama_model_quantize_params.imatrix (importance matrix data) points to data of this type.
struct llama_model_imatrix_data {
const char * name; // entry name (presumably the tensor the values belong to - TODO confirm)
const float * data; // read-only float values
size_t size; // size of `data` (presumably a count of floats rather than bytes - TODO confirm)
};
// model quantization parameters
// NOTE(review): this is a diff hunk with the +/- markers stripped. The field list appears twice:
// the first copy (void * pointers) looks like the pre-change declaration and the second copy
// (typed const pointers) like its replacement. Only one copy can exist in the real header.
typedef struct llama_model_quantize_params {
// --- first copy of the field list: untyped void * pointers ---
int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
enum llama_ftype ftype; // quantize to this llama_ftype
enum ggml_type output_tensor_type; // output tensor type
enum ggml_type token_embedding_type; // token embeddings tensor type
bool allow_requantize; // allow quantizing non-f32/f16 tensors
bool quantize_output_tensor; // quantize output.weight
bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
bool pure; // quantize all tensors to the default type
bool keep_split; // quantize to the same number of shards
bool dry_run; // calculate and show the final quantization size without performing quantization
void * imatrix; // pointer to importance matrix data
void * kv_overrides; // pointer to vector containing overrides
void * tensor_types; // pointer to vector containing tensor types
void * prune_layers; // pointer to vector containing layer indices to prune
// --- second copy of the field list: typed const pointers (tensor_types is named tt_overrides here) ---
int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
enum llama_ftype ftype; // quantize to this llama_ftype
enum ggml_type output_tensor_type; // output tensor type
enum ggml_type token_embedding_type; // token embeddings tensor type
bool allow_requantize; // allow quantizing non-f32/f16 tensors
bool quantize_output_tensor; // quantize output.weight
bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
bool pure; // quantize all tensors to the default type
bool keep_split; // quantize to the same number of shards
bool dry_run; // calculate and show the final quantization size without performing quantization
// NOTE(review): no element-count fields accompany the four pointers below; how the end of each
// array is determined (sentinel entry, NULL meaning "none", ...) is not visible here - confirm.
const struct llama_model_imatrix_data * imatrix; // pointer to importance matrix data
const struct llama_model_kv_override * kv_overrides; // pointer to kv overrides
const struct llama_model_tensor_override * tt_overrides; // pointer to tensor overrides
const int32_t * prune_layers; // pointer to layer indices to prune
} llama_model_quantize_params;
typedef struct llama_logit_bias {