llama : refactor llama_model_quantize_params to expose a pure C interface (#20346)
* Refactor llama_model_quantize_params to expose a pure C interface * Restore comment and cleanup struct def * Code review refactoring Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> * Code review refactoring --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
parent
82764c341a
commit
4951250235
3 changed files with 66 additions and 50 deletions
|
|
@ -13,13 +13,10 @@
|
|||
#include <unordered_map>
|
||||
#include <map>
|
||||
#include <fstream>
|
||||
#include <cmath>
|
||||
#include <cctype>
|
||||
#include <algorithm>
|
||||
#include <filesystem>
|
||||
|
||||
// result of parsing --tensor-type option
|
||||
// (changes to this struct must be reflected in src/llama-quant.cpp)
|
||||
// changes to this struct must also be reflected in src/llama-quant.cpp
|
||||
struct tensor_type_option {
|
||||
std::string name;
|
||||
ggml_type type = GGML_TYPE_COUNT;
|
||||
|
|
@ -491,7 +488,6 @@ static bool parse_layer_prune(const char * data, std::vector<int> & prune_layers
|
|||
|
||||
int main(int argc, char ** argv) {
|
||||
std::setlocale(LC_NUMERIC, "C");
|
||||
|
||||
if (argc < 3) {
|
||||
usage(argv[0]);
|
||||
}
|
||||
|
|
@ -584,8 +580,16 @@ int main(int argc, char ** argv) {
|
|||
std::vector<std::string> imatrix_datasets;
|
||||
std::unordered_map<std::string, std::vector<float>> imatrix_data;
|
||||
int m_last_call = prepare_imatrix(imatrix_file, imatrix_datasets, included_weights, excluded_weights, imatrix_data);
|
||||
|
||||
std::vector<llama_model_imatrix_data> i_data;
|
||||
std::vector<llama_model_tensor_override> t_override;
|
||||
if (!imatrix_data.empty()) {
|
||||
params.imatrix = &imatrix_data;
|
||||
i_data.reserve(imatrix_data.size() + 1);
|
||||
for (const auto & kv : imatrix_data) {
|
||||
i_data.push_back({kv.first.c_str(), kv.second.data(), kv.second.size()});
|
||||
}
|
||||
i_data.push_back({nullptr, nullptr, 0}); // array terminator
|
||||
params.imatrix = i_data.data();
|
||||
{
|
||||
llama_model_kv_override kvo;
|
||||
std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_FILE);
|
||||
|
|
@ -603,7 +607,6 @@ int main(int argc, char ** argv) {
|
|||
kvo.val_str[127] = '\0';
|
||||
kv_overrides.emplace_back(std::move(kvo));
|
||||
}
|
||||
|
||||
{
|
||||
llama_model_kv_override kvo;
|
||||
std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES);
|
||||
|
|
@ -611,7 +614,6 @@ int main(int argc, char ** argv) {
|
|||
kvo.val_i64 = imatrix_data.size();
|
||||
kv_overrides.emplace_back(std::move(kvo));
|
||||
}
|
||||
|
||||
if (m_last_call > 0) {
|
||||
llama_model_kv_override kvo;
|
||||
std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_N_CHUNKS);
|
||||
|
|
@ -623,13 +625,19 @@ int main(int argc, char ** argv) {
|
|||
if (!kv_overrides.empty()) {
|
||||
kv_overrides.emplace_back();
|
||||
kv_overrides.back().key[0] = 0;
|
||||
params.kv_overrides = &kv_overrides;
|
||||
params.kv_overrides = kv_overrides.data();
|
||||
}
|
||||
if (!tensor_type_opts.empty()) {
|
||||
params.tensor_types = &tensor_type_opts;
|
||||
t_override.reserve(tensor_type_opts.size() + 1);
|
||||
for (const auto & tt : tensor_type_opts) {
|
||||
t_override.push_back({tt.name.c_str(), tt.type});
|
||||
}
|
||||
t_override.push_back({nullptr, GGML_TYPE_COUNT}); // array terminator
|
||||
params.tt_overrides = t_override.data();
|
||||
}
|
||||
if (!prune_layers.empty()) {
|
||||
params.prune_layers = &prune_layers;
|
||||
prune_layers.push_back(-1); // array terminator
|
||||
params.prune_layers = prune_layers.data();
|
||||
}
|
||||
|
||||
llama_backend_init();
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue