llama : refactor llama_model_quantize_params to expose a pure C interface (#20346)
* Refactor llama_model_quantize_params to expose a pure C interface * Restore comment and cleanup struct def * Code review refactoring Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> * Code review refactoring --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
parent
82764c341a
commit
4951250235
3 changed files with 66 additions and 50 deletions
|
|
@ -380,22 +380,33 @@ extern "C" {
|
|||
size_t n_samplers;
|
||||
};
|
||||
|
||||
struct llama_model_tensor_override {
|
||||
const char * pattern;
|
||||
enum ggml_type type;
|
||||
};
|
||||
|
||||
struct llama_model_imatrix_data {
|
||||
const char * name;
|
||||
const float * data;
|
||||
size_t size;
|
||||
};
|
||||
|
||||
// model quantization parameters
|
||||
typedef struct llama_model_quantize_params {
|
||||
int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
|
||||
enum llama_ftype ftype; // quantize to this llama_ftype
|
||||
enum ggml_type output_tensor_type; // output tensor type
|
||||
enum ggml_type token_embedding_type; // token embeddings tensor type
|
||||
bool allow_requantize; // allow quantizing non-f32/f16 tensors
|
||||
bool quantize_output_tensor; // quantize output.weight
|
||||
bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
|
||||
bool pure; // quantize all tensors to the default type
|
||||
bool keep_split; // quantize to the same number of shards
|
||||
bool dry_run; // calculate and show the final quantization size without performing quantization
|
||||
void * imatrix; // pointer to importance matrix data
|
||||
void * kv_overrides; // pointer to vector containing overrides
|
||||
void * tensor_types; // pointer to vector containing tensor types
|
||||
void * prune_layers; // pointer to vector containing layer indices to prune
|
||||
int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
|
||||
enum llama_ftype ftype; // quantize to this llama_ftype
|
||||
enum ggml_type output_tensor_type; // output tensor type
|
||||
enum ggml_type token_embedding_type; // token embeddings tensor type
|
||||
bool allow_requantize; // allow quantizing non-f32/f16 tensors
|
||||
bool quantize_output_tensor; // quantize output.weight
|
||||
bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
|
||||
bool pure; // quantize all tensors to the default type
|
||||
bool keep_split; // quantize to the same number of shards
|
||||
bool dry_run; // calculate and show the final quantization size without performing quantization
|
||||
const struct llama_model_imatrix_data * imatrix; // pointer to importance matrix data
|
||||
const struct llama_model_kv_override * kv_overrides; // pointer to kv overrides
|
||||
const struct llama_model_tensor_override * tt_overrides; // pointer to tensor overrides
|
||||
const int32_t * prune_layers; // pointer to layer indices to prune
|
||||
} llama_model_quantize_params;
|
||||
|
||||
typedef struct llama_logit_bias {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue