common : gracefully handle incomplete output (#20191)
* common : handle incomplete UTF-8 at end of input in PEG parser * cont : if reached end prematurely, emit needs_more_input to propagate partial output * cont: refactor peg parse context to add lenient flag * cont : remove partial flag, keep lenient flag
This commit is contained in:
parent
9b24886f78
commit
451ef08432
11 changed files with 139 additions and 125 deletions
|
|
@ -349,7 +349,7 @@ struct parser_executor {
|
|||
auto pos = start_pos;
|
||||
for (auto i = 0u; i < p.literal.size(); ++i) {
|
||||
if (pos >= ctx.input.size()) {
|
||||
if (!ctx.is_partial) {
|
||||
if (!ctx.is_lenient()) {
|
||||
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
|
||||
}
|
||||
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, pos);
|
||||
|
|
@ -364,7 +364,7 @@ struct parser_executor {
|
|||
}
|
||||
|
||||
common_peg_parse_result operator()(const common_peg_sequence_parser & p) {
|
||||
if (ctx.debug) {
|
||||
if (ctx.is_debug()) {
|
||||
LOG_DBG("%sSEQ start at %zu '%s' (%zu children)\n", debug_indent().c_str(), start_pos,
|
||||
debug_input_snippet(start_pos).c_str(), p.children.size());
|
||||
}
|
||||
|
|
@ -375,26 +375,19 @@ struct parser_executor {
|
|||
|
||||
for (size_t i = 0; i < p.children.size(); i++) {
|
||||
const auto & child_id = p.children[i];
|
||||
if (ctx.debug) {
|
||||
if (ctx.is_debug()) {
|
||||
fprintf(stderr, "%sSEQ child %zu: %s\n", debug_indent().c_str(), i, arena.dump(child_id).c_str());
|
||||
}
|
||||
auto result = arena.parse(child_id, ctx, pos);
|
||||
|
||||
if (ctx.debug) {
|
||||
if (ctx.is_debug()) {
|
||||
fprintf(stderr, "%sSEQ child %zu: %s at %zu->%zu\n", debug_indent().c_str(), i,
|
||||
common_peg_parse_result_type_name(result.type), result.start, result.end);
|
||||
}
|
||||
|
||||
if (result.fail()) {
|
||||
ctx.parse_depth--;
|
||||
if (ctx.is_partial && result.end >= ctx.input.size()) {
|
||||
if (ctx.debug) {
|
||||
fprintf(stderr, "%sSEQ -> NEED_MORE (child failed at end)\n", debug_indent().c_str());
|
||||
}
|
||||
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, result.end,
|
||||
std::move(nodes));
|
||||
}
|
||||
if (ctx.debug) {
|
||||
if (ctx.is_debug()) {
|
||||
fprintf(stderr, "%sSEQ -> FAIL\n", debug_indent().c_str());
|
||||
}
|
||||
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos, result.end);
|
||||
|
|
@ -406,7 +399,7 @@ struct parser_executor {
|
|||
|
||||
if (result.need_more_input()) {
|
||||
ctx.parse_depth--;
|
||||
if (ctx.debug) {
|
||||
if (ctx.is_debug()) {
|
||||
fprintf(stderr, "%sSEQ -> NEED_MORE\n", debug_indent().c_str());
|
||||
}
|
||||
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, result.end, std::move(nodes));
|
||||
|
|
@ -416,14 +409,14 @@ struct parser_executor {
|
|||
}
|
||||
|
||||
ctx.parse_depth--;
|
||||
if (ctx.debug) {
|
||||
if (ctx.is_debug()) {
|
||||
fprintf(stderr, "%sSEQ -> SUCCESS at %zu->%zu\n", debug_indent().c_str(), start_pos, pos);
|
||||
}
|
||||
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos, std::move(nodes));
|
||||
}
|
||||
|
||||
common_peg_parse_result operator()(const common_peg_choice_parser & p) {
|
||||
if (ctx.debug) {
|
||||
if (ctx.is_debug()) {
|
||||
fprintf(stderr, "%sCHOICE start at %zu '%s' (%zu options)\n", debug_indent().c_str(), start_pos,
|
||||
debug_input_snippet(start_pos).c_str(), p.children.size());
|
||||
}
|
||||
|
|
@ -432,17 +425,17 @@ struct parser_executor {
|
|||
auto pos = start_pos;
|
||||
for (size_t i = 0; i < p.children.size(); i++) {
|
||||
const auto & child_id = p.children[i];
|
||||
if (ctx.debug) {
|
||||
if (ctx.is_debug()) {
|
||||
fprintf(stderr, "%sCHOICE option %zu: %s\n", debug_indent().c_str(), i, arena.dump(child_id).c_str());
|
||||
}
|
||||
auto result = arena.parse(child_id, ctx, pos);
|
||||
if (ctx.debug) {
|
||||
if (ctx.is_debug()) {
|
||||
fprintf(stderr, "%sCHOICE option %zu: %s\n", debug_indent().c_str(), i,
|
||||
common_peg_parse_result_type_name(result.type));
|
||||
}
|
||||
if (!result.fail()) {
|
||||
ctx.parse_depth--;
|
||||
if (ctx.debug) {
|
||||
if (ctx.is_debug()) {
|
||||
fprintf(stderr, "%sCHOICE -> %s (option %zu)\n", debug_indent().c_str(),
|
||||
common_peg_parse_result_type_name(result.type), i);
|
||||
}
|
||||
|
|
@ -451,14 +444,14 @@ struct parser_executor {
|
|||
}
|
||||
|
||||
ctx.parse_depth--;
|
||||
if (ctx.debug) {
|
||||
if (ctx.is_debug()) {
|
||||
fprintf(stderr, "%sCHOICE -> FAIL (no options matched)\n", debug_indent().c_str());
|
||||
}
|
||||
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
|
||||
}
|
||||
|
||||
common_peg_parse_result operator()(const common_peg_repetition_parser & p) {
|
||||
if (ctx.debug) {
|
||||
if (ctx.is_debug()) {
|
||||
fprintf(stderr, "%sREPEAT start at %zu '%s' (min=%d, max=%d)\n", debug_indent().c_str(), start_pos,
|
||||
debug_input_snippet(start_pos).c_str(), p.min_count, p.max_count);
|
||||
}
|
||||
|
|
@ -471,7 +464,7 @@ struct parser_executor {
|
|||
// Try to match up to max_count times (or unlimited if max_count is -1)
|
||||
while (p.max_count == -1 || match_count < p.max_count) {
|
||||
if (pos >= ctx.input.size()) {
|
||||
if (ctx.debug) {
|
||||
if (ctx.is_debug()) {
|
||||
fprintf(stderr, "%sREPEAT: at end of input, count=%d\n", debug_indent().c_str(), match_count);
|
||||
}
|
||||
break;
|
||||
|
|
@ -479,7 +472,7 @@ struct parser_executor {
|
|||
|
||||
auto result = arena.parse(p.child, ctx, pos);
|
||||
|
||||
if (ctx.debug) {
|
||||
if (ctx.is_debug()) {
|
||||
fprintf(stderr, "%sREPEAT iter %d: %s at %zu->%zu, nodes=%zu\n", debug_indent().c_str(), match_count,
|
||||
common_peg_parse_result_type_name(result.type), result.start, result.end, result.nodes.size());
|
||||
fprintf(stderr, "%sREPEAT CHILD: %s\n", debug_indent().c_str(), arena.dump(p.child).c_str());
|
||||
|
|
@ -488,7 +481,7 @@ struct parser_executor {
|
|||
if (result.success()) {
|
||||
// Prevent infinite loop on empty matches
|
||||
if (result.end == pos) {
|
||||
if (ctx.debug) {
|
||||
if (ctx.is_debug()) {
|
||||
fprintf(stderr, "%s REPEAT: empty match, stopping\n", debug_indent().c_str());
|
||||
}
|
||||
break;
|
||||
|
|
@ -509,7 +502,7 @@ struct parser_executor {
|
|||
}
|
||||
|
||||
ctx.parse_depth--;
|
||||
if (ctx.debug) {
|
||||
if (ctx.is_debug()) {
|
||||
fprintf(stderr, "%sREPEAT -> NEED_MORE (count=%d, nodes=%zu)\n", debug_indent().c_str(),
|
||||
match_count, nodes.size());
|
||||
}
|
||||
|
|
@ -517,7 +510,7 @@ struct parser_executor {
|
|||
}
|
||||
|
||||
// Child failed - stop trying
|
||||
if (ctx.debug) {
|
||||
if (ctx.is_debug()) {
|
||||
fprintf(stderr, "%sREPEAT: child failed, stopping\n", debug_indent().c_str());
|
||||
}
|
||||
break;
|
||||
|
|
@ -526,14 +519,14 @@ struct parser_executor {
|
|||
// Check if we got enough matches
|
||||
if (p.min_count > 0 && match_count < p.min_count) {
|
||||
ctx.parse_depth--;
|
||||
if (pos >= ctx.input.size() && ctx.is_partial) {
|
||||
if (ctx.debug) {
|
||||
if (pos >= ctx.input.size() && ctx.is_lenient()) {
|
||||
if (ctx.is_debug()) {
|
||||
fprintf(stderr, "%sREPEAT -> NEED_MORE (not enough matches: %d < %d)\n", debug_indent().c_str(),
|
||||
match_count, p.min_count);
|
||||
}
|
||||
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, pos, std::move(nodes));
|
||||
}
|
||||
if (ctx.debug) {
|
||||
if (ctx.is_debug()) {
|
||||
fprintf(stderr, "%sREPEAT -> FAIL (not enough matches: %d < %d)\n", debug_indent().c_str(), match_count,
|
||||
p.min_count);
|
||||
}
|
||||
|
|
@ -541,7 +534,7 @@ struct parser_executor {
|
|||
}
|
||||
|
||||
ctx.parse_depth--;
|
||||
if (ctx.debug) {
|
||||
if (ctx.is_debug()) {
|
||||
fprintf(stderr, "%sREPEAT -> SUCCESS (count=%d, nodes=%zu)\n", debug_indent().c_str(), match_count,
|
||||
nodes.size());
|
||||
}
|
||||
|
|
@ -576,7 +569,7 @@ struct parser_executor {
|
|||
auto result = common_parse_utf8_codepoint(ctx.input, start_pos);
|
||||
|
||||
if (result.status == utf8_parse_result::INCOMPLETE) {
|
||||
if (!ctx.is_partial) {
|
||||
if (!ctx.is_lenient()) {
|
||||
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
|
||||
}
|
||||
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos);
|
||||
|
|
@ -615,7 +608,7 @@ struct parser_executor {
|
|||
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos);
|
||||
}
|
||||
// Not enough matches yet
|
||||
if (!ctx.is_partial) {
|
||||
if (!ctx.is_lenient()) {
|
||||
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
|
||||
}
|
||||
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, pos);
|
||||
|
|
@ -656,7 +649,7 @@ struct parser_executor {
|
|||
|
||||
// Check if we got enough matches
|
||||
if (match_count < p.min_count) {
|
||||
if (pos >= ctx.input.size() && ctx.is_partial) {
|
||||
if (pos >= ctx.input.size() && ctx.is_lenient()) {
|
||||
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, pos);
|
||||
}
|
||||
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos, pos);
|
||||
|
|
@ -668,7 +661,7 @@ struct parser_executor {
|
|||
static common_peg_parse_result handle_escape_sequence(common_peg_parse_context & ctx, size_t start, size_t & pos) {
|
||||
++pos; // consume '\'
|
||||
if (pos >= ctx.input.size()) {
|
||||
if (!ctx.is_partial) {
|
||||
if (!ctx.is_lenient()) {
|
||||
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start);
|
||||
}
|
||||
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start, pos);
|
||||
|
|
@ -698,7 +691,7 @@ struct parser_executor {
|
|||
++pos; // consume 'u'
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
if (pos >= ctx.input.size()) {
|
||||
if (!ctx.is_partial) {
|
||||
if (!ctx.is_lenient()) {
|
||||
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start);
|
||||
}
|
||||
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start, pos);
|
||||
|
|
@ -732,7 +725,7 @@ struct parser_executor {
|
|||
auto utf8_result = common_parse_utf8_codepoint(ctx.input, pos);
|
||||
|
||||
if (utf8_result.status == utf8_parse_result::INCOMPLETE) {
|
||||
if (!ctx.is_partial) {
|
||||
if (!ctx.is_lenient()) {
|
||||
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
|
||||
}
|
||||
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, pos);
|
||||
|
|
@ -747,7 +740,7 @@ struct parser_executor {
|
|||
}
|
||||
|
||||
// Reached end without finding closing quote
|
||||
if (!ctx.is_partial) {
|
||||
if (!ctx.is_lenient()) {
|
||||
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos, pos);
|
||||
}
|
||||
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, pos);
|
||||
|
|
@ -774,7 +767,7 @@ struct parser_executor {
|
|||
auto utf8_result = common_parse_utf8_codepoint(ctx.input, pos);
|
||||
|
||||
if (utf8_result.status == utf8_parse_result::INCOMPLETE) {
|
||||
if (!ctx.is_partial) {
|
||||
if (!ctx.is_lenient()) {
|
||||
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
|
||||
}
|
||||
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, pos);
|
||||
|
|
@ -789,7 +782,7 @@ struct parser_executor {
|
|||
}
|
||||
|
||||
// Reached end without finding closing quote
|
||||
if (!ctx.is_partial) {
|
||||
if (!ctx.is_lenient()) {
|
||||
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos, pos);
|
||||
}
|
||||
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, pos);
|
||||
|
|
@ -807,7 +800,7 @@ struct parser_executor {
|
|||
|
||||
if (utf8_result.status == utf8_parse_result::INCOMPLETE) {
|
||||
// Incomplete UTF-8 sequence
|
||||
if (!ctx.is_partial) {
|
||||
if (!ctx.is_lenient()) {
|
||||
// Input is complete but UTF-8 is incomplete = malformed
|
||||
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
|
||||
}
|
||||
|
|
@ -837,7 +830,7 @@ struct parser_executor {
|
|||
last_valid_pos = pos;
|
||||
}
|
||||
|
||||
if (last_valid_pos == ctx.input.size() && ctx.is_partial) {
|
||||
if (last_valid_pos == ctx.input.size() && ctx.is_lenient()) {
|
||||
// Reached the end of a partial stream, there might still be more input that we need to consume.
|
||||
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, last_valid_pos);
|
||||
}
|
||||
|
|
@ -876,7 +869,7 @@ struct parser_executor {
|
|||
|
||||
common_peg_parse_result operator()(const common_peg_tag_parser & p) {
|
||||
// Parse the child
|
||||
if (ctx.debug) {
|
||||
if (ctx.is_debug()) {
|
||||
fprintf(stderr, "%sTAG: %s\n", debug_indent().c_str(), p.tag.c_str());
|
||||
}
|
||||
auto result = arena.parse(p.child, ctx, start_pos);
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue