* server: introduce self-speculative decoding * server: moved self-call into speculative.cpp * can_speculate() includes self-speculation Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> * server: can_speculate() tests self-spec * server: replace can_speculate() with slot.can_speculate() Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> * common: use %zu format specifier for size_t in logging Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> * server: can_speculate() requires a task instance * common: ngram map, config self-speculative decoding * common: add enum common_speculative_type * common: add vector of speculative states * common: add option --spec-draftless * server: cleanup (remove slot.batch_spec, rename) * common: moved self-spec impl to ngram-map * common: cleanup (use common_speculative_state_draft) * spec : refactor * cont : naming * spec: remove --spec-config * doc: (draftless) speculative decoding * common: print performance in spec decoding * minor : cleanup * common : better names * minor : cleanup + fix build * minor: comments * CODEOWNERS: add common/ngram-map.* (#18471) * common : rename speculative.draftless_type -> speculative.type * ngram-map : fix uninitialized values * ngram-map : take into account the input can become shorter * ngram-map : revert len check for now * arg : change `--spec-draftless` -> `--spec-type` * spec : add common_speculative_state::accept() * spec : refactor + add common_speculative_begin() * spec : fix begin() call with mtmd * spec : additional refactor + remove common_speculative_params --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
105 lines
4.3 KiB
C++
105 lines
4.3 KiB
C++
#pragma once
|
|
//
// common/ngram-map.h: structures used to manage a map from n-grams to a list of m-grams
//
// These structures are used to do a lookup of n-grams followed by m-grams in token history.
//
// There are two algorithms implemented:
// 1. ngram_simple: lookup of n-grams followed by m-grams in token history.
// 2. ngram_map: lookup of n-grams followed by m-grams in token history using a map.
//    The map is a vector of key n-grams, and for each key n-gram there is a list of value m-grams.
//
|
|
|
|
#include "llama.h"
|
|
|
|
#include <vector>
|
|
|
|
// n-gram simple
|
|
//
|
|
|
|
// config of n-gram simple.
//
// Every field defaults to 0 so that a default-constructed config never holds
// indeterminate values. The struct remains an aggregate (C++14 and later), so
// brace-initialization in declaration order {size_ngram, size_mgram, check_rate}
// keeps working.
struct common_ngram_simple_config {
    uint16_t size_ngram = 0; // size of n-grams to lookup in self-mode
    uint16_t size_mgram = 0; // size of m-grams to draft in self-mode
    uint16_t check_rate = 0; // check for speculative decoding without draft model for each check_rate token
};
|
|
|
|
// current state (and config) of n-gram simple.
|
|
struct common_ngram_simple_state {
|
|
common_ngram_simple_config config;
|
|
|
|
size_t idx_last_check = 0; // index of last check in context history (mutable)
|
|
|
|
common_ngram_simple_state(const common_ngram_simple_config & config)
|
|
: config(config) {}
|
|
};
|
|
|
|
// Searches for a n-gram in the history and checks whether a draft sequence should be generated.
|
|
// state: the ngram simple state to search in.
|
|
// inp: the tokens generated so far.
|
|
// sampled: the token that was just sampled.
|
|
// draft: vector to store the draft tokens, initially empty.
|
|
llama_tokens common_ngram_simple_draft(
|
|
common_ngram_simple_state & state,
|
|
const llama_tokens & tokens, llama_token sampled);
|
|
|
|
|
|
// n-gram map
|
|
//
|
|
|
|
// maximum number of m-gram values stored for each key n-gram
// (this is the length of the `values` array inside common_ngram_map_key).
#define COMMON_NGRAM_MAX_VALUES 4
|
|
|
|
// statistics of a m-gram after a known n-gram
//
// A default-constructed value represents an unused slot:
// value_idx == 0, value_num == 0 and n_accepted == -1.
struct common_ngram_map_value {
    size_t   value_idx  =  0; // index of value m-gram in token-history (0 if unused)
    uint16_t value_num  =  0; // number of occurrences of this value m-gram after the key n-gram (0 in an unused values-slot)
    int16_t  n_accepted = -1; // number of accepted tokens at last draft (-1 if unused)
};
|
|
|
|
// statistics of a n-gram
|
|
struct common_ngram_map_key {
|
|
size_t key_idx; // index of key n-gram in token-history
|
|
size_t stat_idx; // index of last token of stastistics computation (key_num, values)
|
|
|
|
uint16_t key_num; // number of occurences of this key n-gram in token-history
|
|
common_ngram_map_value values[COMMON_NGRAM_MAX_VALUES]; // some known values after the key
|
|
};
|
|
|
|
// map from n-grams to following m-grams in token-history
|
|
struct common_ngram_map {
|
|
uint16_t size_key; // size of key n-grams
|
|
uint16_t size_value; // size of value m-grams
|
|
|
|
bool key_only; // true if only key n-grams are used, no values.
|
|
|
|
// first draft: vector only, no map.
|
|
std::vector<common_ngram_map_key> keys; // key n-grams which occur several times in token-history
|
|
uint16_t check_rate; // check for speculative decoding without draft model for each check_rate token
|
|
uint16_t min_hits; // minimum number of key hits to consider a draft
|
|
|
|
common_ngram_map(uint16_t sz_key, uint16_t sz_value, bool only_keys,
|
|
uint16_t check_rate, uint16_t min_hits)
|
|
: size_key(sz_key), size_value(sz_value), key_only(only_keys),
|
|
check_rate(check_rate), min_hits(min_hits) {}
|
|
|
|
bool last_draft_created = false; // true if a draft was created at last call.
|
|
size_t last_draft_key_idx = 0; // index of last key used for draft generation.
|
|
uint16_t last_draft_value_idx = 0; // index of last value used for draft generation.
|
|
|
|
size_t idx_last_check = 0; // index of last check in context history
|
|
};
|
|
|
|
|
|
// Searches for the n-gram in the history and checks whether a draft sequence should be generated.
|
|
// map: the ngram map to search in.
|
|
// inp: the tokens generated so far.
|
|
// sampled: the token that was just sampled.
|
|
// draft: vector to store the draft tokens, initially empty.
|
|
void common_ngram_map_draft(
|
|
common_ngram_map & map,
|
|
const llama_tokens & inp, llama_token sampled,
|
|
llama_tokens & draft);
|
|
|
|
// Update the statistics of a value after a draft was processed.
|
|
void common_ngram_map_accept(common_ngram_map & map, uint16_t n_accepted);
|