This should fix an issue we have with unclosed strings with document_stream (#1321)
* This should fix an issue we have with unclosed strings. * Patching the fallback kernel. * Better guarding the code. * Patching the fallback.
This commit is contained in:
parent
3fa40b8dc2
commit
d9c4191e8a
|
@ -34,4 +34,4 @@ jobs:
|
||||||
./alpine.sh cmake --build build_for_alpine
|
./alpine.sh cmake --build build_for_alpine
|
||||||
- name: test
|
- name: test
|
||||||
run: |
|
run: |
|
||||||
./alpine.sh bash -c "cd build_for_alpine && ctest"
|
./alpine.sh bash -c "cd build_for_alpine && ctest -E checkperf"
|
||||||
|
|
|
@ -91,7 +91,8 @@ simdjson_really_inline void validate_utf8_character() {
|
||||||
idx += 4;
|
idx += 4;
|
||||||
}
|
}
|
||||||
|
|
||||||
simdjson_really_inline void validate_string() {
|
// Returns true if the string is unclosed.
|
||||||
|
simdjson_really_inline bool validate_string() {
|
||||||
idx++; // skip first quote
|
idx++; // skip first quote
|
||||||
while (idx < len && buf[idx] != '"') {
|
while (idx < len && buf[idx] != '"') {
|
||||||
if (buf[idx] == '\\') {
|
if (buf[idx] == '\\') {
|
||||||
|
@ -103,7 +104,8 @@ simdjson_really_inline void validate_string() {
|
||||||
idx++;
|
idx++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (idx >= len && !partial) { error = UNCLOSED_STRING; }
|
if (idx >= len) { return true; }
|
||||||
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
simdjson_really_inline bool is_whitespace_or_operator(uint8_t c) {
|
simdjson_really_inline bool is_whitespace_or_operator(uint8_t c) {
|
||||||
|
@ -120,12 +122,13 @@ simdjson_really_inline bool is_whitespace_or_operator(uint8_t c) {
|
||||||
// Parse the entire input in STEP_SIZE-byte chunks.
|
// Parse the entire input in STEP_SIZE-byte chunks.
|
||||||
//
|
//
|
||||||
simdjson_really_inline error_code scan() {
|
simdjson_really_inline error_code scan() {
|
||||||
|
bool unclosed_string = false;
|
||||||
for (;idx<len;idx++) {
|
for (;idx<len;idx++) {
|
||||||
switch (buf[idx]) {
|
switch (buf[idx]) {
|
||||||
// String
|
// String
|
||||||
case '"':
|
case '"':
|
||||||
add_structural();
|
add_structural();
|
||||||
validate_string();
|
unclosed_string |= validate_string();
|
||||||
break;
|
break;
|
||||||
// Operator
|
// Operator
|
||||||
case '{': case '}': case '[': case ']': case ',': case ':':
|
case '{': case '}': case '[': case ']': case ',': case ':':
|
||||||
|
@ -150,20 +153,19 @@ simdjson_really_inline error_code scan() {
|
||||||
next_structural_index[1] = len;
|
next_structural_index[1] = len;
|
||||||
next_structural_index[2] = 0;
|
next_structural_index[2] = 0;
|
||||||
parser.n_structural_indexes = uint32_t(next_structural_index - parser.structural_indexes.get());
|
parser.n_structural_indexes = uint32_t(next_structural_index - parser.structural_indexes.get());
|
||||||
|
if (simdjson_unlikely(parser.n_structural_indexes == 0)) { return EMPTY; }
|
||||||
parser.next_structural_index = 0;
|
parser.next_structural_index = 0;
|
||||||
|
|
||||||
if (simdjson_unlikely(parser.n_structural_indexes == 0)) {
|
|
||||||
return EMPTY;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (partial) {
|
if (partial) {
|
||||||
|
if(unclosed_string) {
|
||||||
|
parser.n_structural_indexes--;
|
||||||
|
if (simdjson_unlikely(parser.n_structural_indexes == 0)) { return CAPACITY; }
|
||||||
|
}
|
||||||
auto new_structural_indexes = find_next_document_index(parser);
|
auto new_structural_indexes = find_next_document_index(parser);
|
||||||
if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) {
|
if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) {
|
||||||
return CAPACITY; // If the buffer is partial but the document is incomplete, it's too big to parse.
|
return CAPACITY; // If the buffer is partial but the document is incomplete, it's too big to parse.
|
||||||
}
|
}
|
||||||
parser.n_structural_indexes = new_structural_indexes;
|
parser.n_structural_indexes = new_structural_indexes;
|
||||||
}
|
} else if(unclosed_string) { error = UNCLOSED_STRING; }
|
||||||
|
|
||||||
return error;
|
return error;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -32,7 +32,7 @@ simdjson_really_inline void json_minifier::next(const simd::simd8x64<uint8_t>& i
|
||||||
}
|
}
|
||||||
|
|
||||||
simdjson_really_inline error_code json_minifier::finish(uint8_t *dst_start, size_t &dst_len) {
|
simdjson_really_inline error_code json_minifier::finish(uint8_t *dst_start, size_t &dst_len) {
|
||||||
error_code error = scanner.finish(false);
|
error_code error = scanner.finish();
|
||||||
if (error) { dst_len = 0; return error; }
|
if (error) { dst_len = 0; return error; }
|
||||||
dst_len = dst - dst_start;
|
dst_len = dst - dst_start;
|
||||||
return SUCCESS;
|
return SUCCESS;
|
||||||
|
|
|
@ -92,7 +92,8 @@ class json_scanner {
|
||||||
public:
|
public:
|
||||||
json_scanner() {}
|
json_scanner() {}
|
||||||
simdjson_really_inline json_block next(const simd::simd8x64<uint8_t>& in);
|
simdjson_really_inline json_block next(const simd::simd8x64<uint8_t>& in);
|
||||||
simdjson_really_inline error_code finish(bool streaming);
|
// Returns either UNCLOSED_STRING or SUCCESS
|
||||||
|
simdjson_really_inline error_code finish();
|
||||||
|
|
||||||
private:
|
private:
|
||||||
// Whether the last character of the previous iteration is part of a scalar token
|
// Whether the last character of the previous iteration is part of a scalar token
|
||||||
|
@ -138,8 +139,8 @@ simdjson_really_inline json_block json_scanner::next(const simd::simd8x64<uint8_
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
simdjson_really_inline error_code json_scanner::finish(bool streaming) {
|
simdjson_really_inline error_code json_scanner::finish() {
|
||||||
return string_scanner.finish(streaming);
|
return string_scanner.finish();
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace stage1
|
} // namespace stage1
|
||||||
|
|
|
@ -37,7 +37,8 @@ struct json_string_block {
|
||||||
class json_string_scanner {
|
class json_string_scanner {
|
||||||
public:
|
public:
|
||||||
simdjson_really_inline json_string_block next(const simd::simd8x64<uint8_t>& in);
|
simdjson_really_inline json_string_block next(const simd::simd8x64<uint8_t>& in);
|
||||||
simdjson_really_inline error_code finish(bool streaming);
|
// Returns either UNCLOSED_STRING or SUCCESS
|
||||||
|
simdjson_really_inline error_code finish();
|
||||||
|
|
||||||
private:
|
private:
|
||||||
// Intended to be defined by the implementation
|
// Intended to be defined by the implementation
|
||||||
|
@ -132,8 +133,8 @@ simdjson_really_inline json_string_block json_string_scanner::next(const simd::s
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
simdjson_really_inline error_code json_string_scanner::finish(bool streaming) {
|
simdjson_really_inline error_code json_string_scanner::finish() {
|
||||||
if (prev_in_string and (not streaming)) {
|
if (prev_in_string) {
|
||||||
return UNCLOSED_STRING;
|
return UNCLOSED_STRING;
|
||||||
}
|
}
|
||||||
return SUCCESS;
|
return SUCCESS;
|
||||||
|
|
|
@ -182,8 +182,14 @@ simdjson_really_inline error_code json_structural_indexer::finish(dom_parser_imp
|
||||||
// Write out the final iteration's structurals
|
// Write out the final iteration's structurals
|
||||||
indexer.write(uint32_t(idx-64), prev_structurals);
|
indexer.write(uint32_t(idx-64), prev_structurals);
|
||||||
|
|
||||||
error_code error = scanner.finish(partial);
|
error_code error = scanner.finish();
|
||||||
if (simdjson_unlikely(error != SUCCESS)) { return error; }
|
// We deliberately break down the next expression so that it is
|
||||||
|
// human readable.
|
||||||
|
const bool should_we_exit = partial ?
|
||||||
|
((error != SUCCESS) && (error != UNCLOSED_STRING)) // when partial we tolerate UNCLOSED_STRING
|
||||||
|
: (error != SUCCESS); // if partial is false, we must have SUCCESS
|
||||||
|
const bool have_unclosed_string = (error == UNCLOSED_STRING);
|
||||||
|
if (simdjson_unlikely(should_we_exit)) { return error; }
|
||||||
|
|
||||||
if (unescaped_chars_error) {
|
if (unescaped_chars_error) {
|
||||||
return UNESCAPED_CHARS;
|
return UNESCAPED_CHARS;
|
||||||
|
@ -216,6 +222,13 @@ simdjson_really_inline error_code json_structural_indexer::finish(dom_parser_imp
|
||||||
return UNEXPECTED_ERROR;
|
return UNEXPECTED_ERROR;
|
||||||
}
|
}
|
||||||
if (partial) {
|
if (partial) {
|
||||||
|
// If we have an unclosed string, then the last structural
|
||||||
|
// will be the quote and we want to make sure to omit it.
|
||||||
|
if(have_unclosed_string) {
|
||||||
|
parser.n_structural_indexes--;
|
||||||
|
// a valid JSON file cannot have zero structural indexes - we should have found something
|
||||||
|
if (simdjson_unlikely(parser.n_structural_indexes == 0u)) { return CAPACITY; }
|
||||||
|
}
|
||||||
auto new_structural_indexes = find_next_document_index(parser);
|
auto new_structural_indexes = find_next_document_index(parser);
|
||||||
if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) {
|
if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) {
|
||||||
return CAPACITY; // If the buffer is partial but the document is incomplete, it's too big to parse.
|
return CAPACITY; // If the buffer is partial but the document is incomplete, it's too big to parse.
|
||||||
|
|
|
@ -143,21 +143,33 @@ namespace document_stream_tests {
|
||||||
|
|
||||||
bool issue1310() {
|
bool issue1310() {
|
||||||
std::cout << "Running " << __func__ << std::endl;
|
std::cout << "Running " << __func__ << std::endl;
|
||||||
const simdjson::padded_string input = decode_base64("AwA5ICIg");
|
// hex : 20 20 5B 20 33 2C 31 5D 20 22 22 22 22 22 22 22 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20
|
||||||
|
// ascii: __ __[__ __3__,__1__]__ __"__"__"__"__"__"__"__ __ __ __ __ __ __ __ __ __ __ __ __ __ __ __ __ __
|
||||||
|
// We have four full documents followed by an unclosed string.
|
||||||
|
const simdjson::padded_string input = decode_base64("ICBbIDMsMV0gIiIiIiIiIiAgICAgICAgICAgICAgICAg");
|
||||||
print_hex(input);
|
print_hex(input);
|
||||||
for(size_t window = 0; window <= 100; window++) {
|
for(size_t window = 0; window <= 100; window++) {
|
||||||
simdjson::dom::parser parser;
|
simdjson::dom::parser parser;
|
||||||
simdjson::dom::document_stream stream;
|
simdjson::dom::document_stream stream;
|
||||||
ASSERT_SUCCESS(parser.parse_many(input, window).get(stream));
|
ASSERT_SUCCESS(parser.parse_many(input, window).get(stream));
|
||||||
|
size_t count{0};
|
||||||
for(auto doc: stream) {
|
for(auto doc: stream) {
|
||||||
auto error = doc.error();
|
auto error = doc.error();
|
||||||
|
count++;
|
||||||
|
if(count <= 4) {
|
||||||
|
if(window < input.size() && error) {
|
||||||
|
std::cout << "Expected no error but got " << error << std::endl;
|
||||||
|
std::cout << "Window = " << window << std::endl;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
if(!error) {
|
if(!error) {
|
||||||
std::cout << "Expected an error but got " << error << std::endl;
|
std::cout << "Expected an error but got " << error << std::endl;
|
||||||
std::cout << "Window = " << window << std::endl;
|
std::cout << "Window = " << window << std::endl;
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue