Autoparser - complete refactoring of parser architecture (#18675)

* Autoparser - full single commit squish

* Final pre-merge changes: minor fixes, Kimi 2.5 model parser
This commit is contained in:
Piotr Wilkin (ilintar) 2026-03-06 21:01:00 +01:00 committed by GitHub
parent 34df42f7be
commit 566059a26b
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
63 changed files with 12967 additions and 10071 deletions

View file

@ -1,14 +1,18 @@
#include "unicode.h"
#include <cassert>
#include <stdexcept>
#include <vector>
#include <string>
// implementation adopted from src/unicode.cpp
size_t utf8_sequence_length(unsigned char first_byte) {
size_t common_utf8_sequence_length(unsigned char first_byte) {
const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
uint8_t highbits = static_cast<uint8_t>(first_byte) >> 4;
return lookup[highbits];
}
utf8_parse_result parse_utf8_codepoint(std::string_view input, size_t offset) {
utf8_parse_result common_parse_utf8_codepoint(std::string_view input, size_t offset) {
if (offset >= input.size()) {
return utf8_parse_result(utf8_parse_result::INCOMPLETE);
}
@ -62,3 +66,43 @@ utf8_parse_result parse_utf8_codepoint(std::string_view input, size_t offset) {
// Invalid first byte
return utf8_parse_result(utf8_parse_result::INVALID);
}
std::string common_unicode_cpts_to_utf8(const std::vector<uint32_t> & cps) {
std::string result;
for (size_t i = 0; i < cps.size(); ++i) {
result.append(common_unicode_cpt_to_utf8(cps[i]));
}
return result;
}
std::string common_unicode_cpt_to_utf8(uint32_t cpt) {
std::string result;
if (/* 0x00 <= cpt && */ cpt <= 0x7f) {
result.push_back(cpt);
return result;
}
if (0x80 <= cpt && cpt <= 0x7ff) {
result.push_back(0xc0 | ((cpt >> 6) & 0x1f));
result.push_back(0x80 | (cpt & 0x3f));
return result;
}
if (0x800 <= cpt && cpt <= 0xffff) {
result.push_back(0xe0 | ((cpt >> 12) & 0x0f));
result.push_back(0x80 | ((cpt >> 6) & 0x3f));
result.push_back(0x80 | (cpt & 0x3f));
return result;
}
if (0x10000 <= cpt && cpt <= 0x10ffff) {
result.push_back(0xf0 | ((cpt >> 18) & 0x07));
result.push_back(0x80 | ((cpt >> 12) & 0x3f));
result.push_back(0x80 | ((cpt >> 6) & 0x3f));
result.push_back(0x80 | (cpt & 0x3f));
return result;
}
throw std::invalid_argument("invalid codepoint");
}