llama: add custom newline split for Gemma 4 (#21406)
This commit is contained in:
parent
d006858316
commit
b7ad48ebda
1 changed file with 31 additions and 0 deletions
|
|
@ -753,6 +753,35 @@ static std::vector<size_t> unicode_regex_split_custom_afmoe(const std::string &
|
||||||
return bpe_offsets;
|
return bpe_offsets;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// regex: [^\n]+|[\n]+
|
||||||
|
// splits text into runs of non-newline characters and runs of newline characters
|
||||||
|
static std::vector<size_t> unicode_regex_split_custom_newlines(const std::string & text, const std::vector<size_t> & offsets) {
|
||||||
|
std::vector<size_t> bpe_offsets;
|
||||||
|
bpe_offsets.reserve(offsets.size());
|
||||||
|
|
||||||
|
const auto cpts = unicode_cpts_from_utf8(text);
|
||||||
|
|
||||||
|
size_t start = 0;
|
||||||
|
for (auto offset : offsets) {
|
||||||
|
const size_t offset_ini = start;
|
||||||
|
const size_t offset_end = start + offset;
|
||||||
|
assert(offset_end <= cpts.size());
|
||||||
|
start = offset_end;
|
||||||
|
|
||||||
|
size_t pos = offset_ini;
|
||||||
|
while (pos < offset_end) {
|
||||||
|
const bool is_newline = (cpts[pos] == '\n');
|
||||||
|
const size_t run_start = pos;
|
||||||
|
while (pos < offset_end && (cpts[pos] == '\n') == is_newline) {
|
||||||
|
pos++;
|
||||||
|
}
|
||||||
|
bpe_offsets.push_back(pos - run_start);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return bpe_offsets;
|
||||||
|
}
|
||||||
|
|
||||||
static std::vector<size_t> unicode_regex_split_custom(const std::string & text, const std::string & regex_expr, const std::vector<size_t> & offsets) {
|
static std::vector<size_t> unicode_regex_split_custom(const std::string & text, const std::string & regex_expr, const std::vector<size_t> & offsets) {
|
||||||
std::vector<size_t> bpe_offsets;
|
std::vector<size_t> bpe_offsets;
|
||||||
|
|
||||||
|
|
@ -769,6 +798,8 @@ static std::vector<size_t> unicode_regex_split_custom(const std::string & text,
|
||||||
} else if (regex_expr == "\\p{AFMoE_digits}") {
|
} else if (regex_expr == "\\p{AFMoE_digits}") {
|
||||||
// AFMOE digit pattern - use custom implementation for proper splitting
|
// AFMOE digit pattern - use custom implementation for proper splitting
|
||||||
bpe_offsets = unicode_regex_split_custom_afmoe(text, offsets);
|
bpe_offsets = unicode_regex_split_custom_afmoe(text, offsets);
|
||||||
|
} else if (regex_expr == "[^\\n]+|[\\n]+") {
|
||||||
|
bpe_offsets = unicode_regex_split_custom_newlines(text, offsets);
|
||||||
} else if (regex_expr == "\\d{1,3}(?=(?:\\d{3})*\\b)") {
|
} else if (regex_expr == "\\d{1,3}(?=(?:\\d{3})*\\b)") {
|
||||||
// tiny_aya digit grouping pattern from tokenizer.json:
|
// tiny_aya digit grouping pattern from tokenizer.json:
|
||||||
// {"type": "Split", "pattern": {"Regex": "\\d{1,3}(?=(?:\\d{3})*\\b)"}, "behavior": "Isolated"}
|
// {"type": "Split", "pattern": {"Regex": "\\d{1,3}(?=(?:\\d{3})*\\b)"}, "behavior": "Isolated"}
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue