This should fix an issue we have with unclosed strings with document_stream (#1321)

* This should fix an issue we have with unclosed strings.

* Patching the fallback kernel.

* Better guarding the code.

* Patching the fallback.
This commit is contained in:
Daniel Lemire 2020-11-30 13:43:57 -05:00 committed by GitHub
parent 3fa40b8dc2
commit d9c4191e8a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 55 additions and 26 deletions

View File

@ -34,4 +34,4 @@ jobs:
./alpine.sh cmake --build build_for_alpine ./alpine.sh cmake --build build_for_alpine
- name: test - name: test
run: | run: |
./alpine.sh bash -c "cd build_for_alpine && ctest" ./alpine.sh bash -c "cd build_for_alpine && ctest -E checkperf"

View File

@ -91,7 +91,8 @@ simdjson_really_inline void validate_utf8_character() {
idx += 4; idx += 4;
} }
simdjson_really_inline void validate_string() { // Returns true if the string is unclosed.
simdjson_really_inline bool validate_string() {
idx++; // skip first quote idx++; // skip first quote
while (idx < len && buf[idx] != '"') { while (idx < len && buf[idx] != '"') {
if (buf[idx] == '\\') { if (buf[idx] == '\\') {
@ -103,7 +104,8 @@ simdjson_really_inline void validate_string() {
idx++; idx++;
} }
} }
if (idx >= len && !partial) { error = UNCLOSED_STRING; } if (idx >= len) { return true; }
return false;
} }
simdjson_really_inline bool is_whitespace_or_operator(uint8_t c) { simdjson_really_inline bool is_whitespace_or_operator(uint8_t c) {
@ -120,12 +122,13 @@ simdjson_really_inline bool is_whitespace_or_operator(uint8_t c) {
// Parse the entire input in STEP_SIZE-byte chunks. // Parse the entire input in STEP_SIZE-byte chunks.
// //
simdjson_really_inline error_code scan() { simdjson_really_inline error_code scan() {
bool unclosed_string = false;
for (;idx<len;idx++) { for (;idx<len;idx++) {
switch (buf[idx]) { switch (buf[idx]) {
// String // String
case '"': case '"':
add_structural(); add_structural();
validate_string(); unclosed_string |= validate_string();
break; break;
// Operator // Operator
case '{': case '}': case '[': case ']': case ',': case ':': case '{': case '}': case '[': case ']': case ',': case ':':
@ -150,20 +153,19 @@ simdjson_really_inline error_code scan() {
next_structural_index[1] = len; next_structural_index[1] = len;
next_structural_index[2] = 0; next_structural_index[2] = 0;
parser.n_structural_indexes = uint32_t(next_structural_index - parser.structural_indexes.get()); parser.n_structural_indexes = uint32_t(next_structural_index - parser.structural_indexes.get());
if (simdjson_unlikely(parser.n_structural_indexes == 0)) { return EMPTY; }
parser.next_structural_index = 0; parser.next_structural_index = 0;
if (simdjson_unlikely(parser.n_structural_indexes == 0)) {
return EMPTY;
}
if (partial) { if (partial) {
if(unclosed_string) {
parser.n_structural_indexes--;
if (simdjson_unlikely(parser.n_structural_indexes == 0)) { return CAPACITY; }
}
auto new_structural_indexes = find_next_document_index(parser); auto new_structural_indexes = find_next_document_index(parser);
if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) { if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) {
return CAPACITY; // If the buffer is partial but the document is incomplete, it's too big to parse. return CAPACITY; // If the buffer is partial but the document is incomplete, it's too big to parse.
} }
parser.n_structural_indexes = new_structural_indexes; parser.n_structural_indexes = new_structural_indexes;
} } else if(unclosed_string) { error = UNCLOSED_STRING; }
return error; return error;
} }

View File

@ -32,7 +32,7 @@ simdjson_really_inline void json_minifier::next(const simd::simd8x64<uint8_t>& i
} }
simdjson_really_inline error_code json_minifier::finish(uint8_t *dst_start, size_t &dst_len) { simdjson_really_inline error_code json_minifier::finish(uint8_t *dst_start, size_t &dst_len) {
error_code error = scanner.finish(false); error_code error = scanner.finish();
if (error) { dst_len = 0; return error; } if (error) { dst_len = 0; return error; }
dst_len = dst - dst_start; dst_len = dst - dst_start;
return SUCCESS; return SUCCESS;

View File

@ -92,7 +92,8 @@ class json_scanner {
public: public:
json_scanner() {} json_scanner() {}
simdjson_really_inline json_block next(const simd::simd8x64<uint8_t>& in); simdjson_really_inline json_block next(const simd::simd8x64<uint8_t>& in);
simdjson_really_inline error_code finish(bool streaming); // Returns either UNCLOSED_STRING or SUCCESS
simdjson_really_inline error_code finish();
private: private:
// Whether the last character of the previous iteration is part of a scalar token // Whether the last character of the previous iteration is part of a scalar token
@ -138,8 +139,8 @@ simdjson_really_inline json_block json_scanner::next(const simd::simd8x64<uint8_
}; };
} }
simdjson_really_inline error_code json_scanner::finish(bool streaming) { simdjson_really_inline error_code json_scanner::finish() {
return string_scanner.finish(streaming); return string_scanner.finish();
} }
} // namespace stage1 } // namespace stage1

View File

@ -37,7 +37,8 @@ struct json_string_block {
class json_string_scanner { class json_string_scanner {
public: public:
simdjson_really_inline json_string_block next(const simd::simd8x64<uint8_t>& in); simdjson_really_inline json_string_block next(const simd::simd8x64<uint8_t>& in);
simdjson_really_inline error_code finish(bool streaming); // Returns either UNCLOSED_STRING or SUCCESS
simdjson_really_inline error_code finish();
private: private:
// Intended to be defined by the implementation // Intended to be defined by the implementation
@ -132,8 +133,8 @@ simdjson_really_inline json_string_block json_string_scanner::next(const simd::s
}; };
} }
simdjson_really_inline error_code json_string_scanner::finish(bool streaming) { simdjson_really_inline error_code json_string_scanner::finish() {
if (prev_in_string and (not streaming)) { if (prev_in_string) {
return UNCLOSED_STRING; return UNCLOSED_STRING;
} }
return SUCCESS; return SUCCESS;

View File

@ -182,8 +182,14 @@ simdjson_really_inline error_code json_structural_indexer::finish(dom_parser_imp
// Write out the final iteration's structurals // Write out the final iteration's structurals
indexer.write(uint32_t(idx-64), prev_structurals); indexer.write(uint32_t(idx-64), prev_structurals);
error_code error = scanner.finish(partial); error_code error = scanner.finish();
if (simdjson_unlikely(error != SUCCESS)) { return error; } // We deliberately break down the next expression so that it is
// human readable.
const bool should_we_exit = partial ?
((error != SUCCESS) && (error != UNCLOSED_STRING)) // when partial we tolerate UNCLOSED_STRING
: (error != SUCCESS); // if partial is false, we must have SUCCESS
const bool have_unclosed_string = (error == UNCLOSED_STRING);
if (simdjson_unlikely(should_we_exit)) { return error; }
if (unescaped_chars_error) { if (unescaped_chars_error) {
return UNESCAPED_CHARS; return UNESCAPED_CHARS;
@ -216,6 +222,13 @@ simdjson_really_inline error_code json_structural_indexer::finish(dom_parser_imp
return UNEXPECTED_ERROR; return UNEXPECTED_ERROR;
} }
if (partial) { if (partial) {
// If we have an unclosed string, then the last structural
// will be the quote and we want to make sure to omit it.
if(have_unclosed_string) {
parser.n_structural_indexes--;
// a valid JSON file cannot have zero structural indexes - we should have found something
if (simdjson_unlikely(parser.n_structural_indexes == 0u)) { return CAPACITY; }
}
auto new_structural_indexes = find_next_document_index(parser); auto new_structural_indexes = find_next_document_index(parser);
if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) { if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) {
return CAPACITY; // If the buffer is partial but the document is incomplete, it's too big to parse. return CAPACITY; // If the buffer is partial but the document is incomplete, it's too big to parse.

View File

@ -143,21 +143,33 @@ namespace document_stream_tests {
bool issue1310() { bool issue1310() {
std::cout << "Running " << __func__ << std::endl; std::cout << "Running " << __func__ << std::endl;
const simdjson::padded_string input = decode_base64("AwA5ICIg"); // hex : 20 20 5B 20 33 2C 31 5D 20 22 22 22 22 22 22 22 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20
// ascii: __ __[__ __3__,__1__]__ __"__"__"__"__"__"__"__ __ __ __ __ __ __ __ __ __ __ __ __ __ __ __ __ __
// We have four full documents followed by an unclosed string.
const simdjson::padded_string input = decode_base64("ICBbIDMsMV0gIiIiIiIiIiAgICAgICAgICAgICAgICAg");
print_hex(input); print_hex(input);
for(size_t window = 0; window <= 100; window++) { for(size_t window = 0; window <= 100; window++) {
simdjson::dom::parser parser; simdjson::dom::parser parser;
simdjson::dom::document_stream stream; simdjson::dom::document_stream stream;
ASSERT_SUCCESS(parser.parse_many(input, window).get(stream)); ASSERT_SUCCESS(parser.parse_many(input, window).get(stream));
size_t count{0};
for(auto doc: stream) { for(auto doc: stream) {
auto error = doc.error(); auto error = doc.error();
count++;
if(count <= 4) {
if(window < input.size() && error) {
std::cout << "Expected no error but got " << error << std::endl;
std::cout << "Window = " << window << std::endl;
return false;
}
} else {
if(!error) { if(!error) {
std::cout << "Expected an error but got " << error << std::endl; std::cout << "Expected an error but got " << error << std::endl;
std::cout << "Window = " << window << std::endl; std::cout << "Window = " << window << std::endl;
return false; return false;
} }
} }
}
} }
return true; return true;
} }