This should fix an issue we have with unclosed strings with document_stream (#1321)
* This should fix an issue we have with unclosed strings. * Patching the fallback kernel. * Better guarding the code. * Patching the fallback.
This commit is contained in:
parent
3fa40b8dc2
commit
d9c4191e8a
|
@ -34,4 +34,4 @@ jobs:
|
|||
./alpine.sh cmake --build build_for_alpine
|
||||
- name: test
|
||||
run: |
|
||||
./alpine.sh bash -c "cd build_for_alpine && ctest"
|
||||
./alpine.sh bash -c "cd build_for_alpine && ctest -E checkperf"
|
||||
|
|
|
@ -91,7 +91,8 @@ simdjson_really_inline void validate_utf8_character() {
|
|||
idx += 4;
|
||||
}
|
||||
|
||||
simdjson_really_inline void validate_string() {
|
||||
// Returns true if the string is unclosed.
|
||||
simdjson_really_inline bool validate_string() {
|
||||
idx++; // skip first quote
|
||||
while (idx < len && buf[idx] != '"') {
|
||||
if (buf[idx] == '\\') {
|
||||
|
@ -103,7 +104,8 @@ simdjson_really_inline void validate_string() {
|
|||
idx++;
|
||||
}
|
||||
}
|
||||
if (idx >= len && !partial) { error = UNCLOSED_STRING; }
|
||||
if (idx >= len) { return true; }
|
||||
return false;
|
||||
}
|
||||
|
||||
simdjson_really_inline bool is_whitespace_or_operator(uint8_t c) {
|
||||
|
@ -120,12 +122,13 @@ simdjson_really_inline bool is_whitespace_or_operator(uint8_t c) {
|
|||
// Parse the entire input in STEP_SIZE-byte chunks.
|
||||
//
|
||||
simdjson_really_inline error_code scan() {
|
||||
bool unclosed_string = false;
|
||||
for (;idx<len;idx++) {
|
||||
switch (buf[idx]) {
|
||||
// String
|
||||
case '"':
|
||||
add_structural();
|
||||
validate_string();
|
||||
unclosed_string |= validate_string();
|
||||
break;
|
||||
// Operator
|
||||
case '{': case '}': case '[': case ']': case ',': case ':':
|
||||
|
@ -150,20 +153,19 @@ simdjson_really_inline error_code scan() {
|
|||
next_structural_index[1] = len;
|
||||
next_structural_index[2] = 0;
|
||||
parser.n_structural_indexes = uint32_t(next_structural_index - parser.structural_indexes.get());
|
||||
if (simdjson_unlikely(parser.n_structural_indexes == 0)) { return EMPTY; }
|
||||
parser.next_structural_index = 0;
|
||||
|
||||
if (simdjson_unlikely(parser.n_structural_indexes == 0)) {
|
||||
return EMPTY;
|
||||
}
|
||||
|
||||
if (partial) {
|
||||
if(unclosed_string) {
|
||||
parser.n_structural_indexes--;
|
||||
if (simdjson_unlikely(parser.n_structural_indexes == 0)) { return CAPACITY; }
|
||||
}
|
||||
auto new_structural_indexes = find_next_document_index(parser);
|
||||
if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) {
|
||||
return CAPACITY; // If the buffer is partial but the document is incomplete, it's too big to parse.
|
||||
}
|
||||
parser.n_structural_indexes = new_structural_indexes;
|
||||
}
|
||||
|
||||
} else if(unclosed_string) { error = UNCLOSED_STRING; }
|
||||
return error;
|
||||
}
|
||||
|
||||
|
|
|
@ -32,7 +32,7 @@ simdjson_really_inline void json_minifier::next(const simd::simd8x64<uint8_t>& i
|
|||
}
|
||||
|
||||
simdjson_really_inline error_code json_minifier::finish(uint8_t *dst_start, size_t &dst_len) {
|
||||
error_code error = scanner.finish(false);
|
||||
error_code error = scanner.finish();
|
||||
if (error) { dst_len = 0; return error; }
|
||||
dst_len = dst - dst_start;
|
||||
return SUCCESS;
|
||||
|
|
|
@ -92,7 +92,8 @@ class json_scanner {
|
|||
public:
|
||||
json_scanner() {}
|
||||
simdjson_really_inline json_block next(const simd::simd8x64<uint8_t>& in);
|
||||
simdjson_really_inline error_code finish(bool streaming);
|
||||
// Returns either UNCLOSED_STRING or SUCCESS
|
||||
simdjson_really_inline error_code finish();
|
||||
|
||||
private:
|
||||
// Whether the last character of the previous iteration is part of a scalar token
|
||||
|
@ -138,8 +139,8 @@ simdjson_really_inline json_block json_scanner::next(const simd::simd8x64<uint8_
|
|||
};
|
||||
}
|
||||
|
||||
simdjson_really_inline error_code json_scanner::finish(bool streaming) {
|
||||
return string_scanner.finish(streaming);
|
||||
simdjson_really_inline error_code json_scanner::finish() {
|
||||
return string_scanner.finish();
|
||||
}
|
||||
|
||||
} // namespace stage1
|
||||
|
|
|
@ -37,7 +37,8 @@ struct json_string_block {
|
|||
class json_string_scanner {
|
||||
public:
|
||||
simdjson_really_inline json_string_block next(const simd::simd8x64<uint8_t>& in);
|
||||
simdjson_really_inline error_code finish(bool streaming);
|
||||
// Returns either UNCLOSED_STRING or SUCCESS
|
||||
simdjson_really_inline error_code finish();
|
||||
|
||||
private:
|
||||
// Intended to be defined by the implementation
|
||||
|
@ -132,8 +133,8 @@ simdjson_really_inline json_string_block json_string_scanner::next(const simd::s
|
|||
};
|
||||
}
|
||||
|
||||
simdjson_really_inline error_code json_string_scanner::finish(bool streaming) {
|
||||
if (prev_in_string and (not streaming)) {
|
||||
simdjson_really_inline error_code json_string_scanner::finish() {
|
||||
if (prev_in_string) {
|
||||
return UNCLOSED_STRING;
|
||||
}
|
||||
return SUCCESS;
|
||||
|
|
|
@ -182,8 +182,14 @@ simdjson_really_inline error_code json_structural_indexer::finish(dom_parser_imp
|
|||
// Write out the final iteration's structurals
|
||||
indexer.write(uint32_t(idx-64), prev_structurals);
|
||||
|
||||
error_code error = scanner.finish(partial);
|
||||
if (simdjson_unlikely(error != SUCCESS)) { return error; }
|
||||
error_code error = scanner.finish();
|
||||
// We deliberately break down the next expression so that it is
|
||||
// human readable.
|
||||
const bool should_we_exit = partial ?
|
||||
((error != SUCCESS) && (error != UNCLOSED_STRING)) // when partial we tolerate UNCLOSED_STRING
|
||||
: (error != SUCCESS); // if partial is false, we must have SUCCESS
|
||||
const bool have_unclosed_string = (error == UNCLOSED_STRING);
|
||||
if (simdjson_unlikely(should_we_exit)) { return error; }
|
||||
|
||||
if (unescaped_chars_error) {
|
||||
return UNESCAPED_CHARS;
|
||||
|
@ -216,6 +222,13 @@ simdjson_really_inline error_code json_structural_indexer::finish(dom_parser_imp
|
|||
return UNEXPECTED_ERROR;
|
||||
}
|
||||
if (partial) {
|
||||
// If we have an unclosed string, then the last structural
|
||||
// will be the quote and we want to make sure to omit it.
|
||||
if(have_unclosed_string) {
|
||||
parser.n_structural_indexes--;
|
||||
// a valid JSON file cannot have zero structural indexes - we should have found something
|
||||
if (simdjson_unlikely(parser.n_structural_indexes == 0u)) { return CAPACITY; }
|
||||
}
|
||||
auto new_structural_indexes = find_next_document_index(parser);
|
||||
if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) {
|
||||
return CAPACITY; // If the buffer is partial but the document is incomplete, it's too big to parse.
|
||||
|
|
|
@ -143,21 +143,33 @@ namespace document_stream_tests {
|
|||
|
||||
bool issue1310() {
|
||||
std::cout << "Running " << __func__ << std::endl;
|
||||
const simdjson::padded_string input = decode_base64("AwA5ICIg");
|
||||
// hex : 20 20 5B 20 33 2C 31 5D 20 22 22 22 22 22 22 22 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20
|
||||
// ascii: __ __[__ __3__,__1__]__ __"__"__"__"__"__"__"__ __ __ __ __ __ __ __ __ __ __ __ __ __ __ __ __ __
|
||||
// We have four full documents followed by an unclosed string.
|
||||
const simdjson::padded_string input = decode_base64("ICBbIDMsMV0gIiIiIiIiIiAgICAgICAgICAgICAgICAg");
|
||||
print_hex(input);
|
||||
for(size_t window = 0; window <= 100; window++) {
|
||||
simdjson::dom::parser parser;
|
||||
simdjson::dom::document_stream stream;
|
||||
ASSERT_SUCCESS(parser.parse_many(input, window).get(stream));
|
||||
size_t count{0};
|
||||
for(auto doc: stream) {
|
||||
auto error = doc.error();
|
||||
if(!error) {
|
||||
std::cout << "Expected an error but got " << error << std::endl;
|
||||
std::cout << "Window = " << window << std::endl;
|
||||
return false;
|
||||
count++;
|
||||
if(count <= 4) {
|
||||
if(window < input.size() && error) {
|
||||
std::cout << "Expected no error but got " << error << std::endl;
|
||||
std::cout << "Window = " << window << std::endl;
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
if(!error) {
|
||||
std::cout << "Expected an error but got " << error << std::endl;
|
||||
std::cout << "Window = " << window << std::endl;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue