diff --git a/.github/workflows/alpine.yml b/.github/workflows/alpine.yml index b1ea7a92..a24f7e10 100644 --- a/.github/workflows/alpine.yml +++ b/.github/workflows/alpine.yml @@ -34,4 +34,4 @@ jobs: ./alpine.sh cmake --build build_for_alpine - name: test run: | - ./alpine.sh bash -c "cd build_for_alpine && ctest" + ./alpine.sh bash -c "cd build_for_alpine && ctest -E checkperf" diff --git a/src/fallback/dom_parser_implementation.cpp b/src/fallback/dom_parser_implementation.cpp index 7e40d280..a0374963 100644 --- a/src/fallback/dom_parser_implementation.cpp +++ b/src/fallback/dom_parser_implementation.cpp @@ -91,7 +91,8 @@ simdjson_really_inline void validate_utf8_character() { idx += 4; } -simdjson_really_inline void validate_string() { +// Returns true if the string is unclosed. +simdjson_really_inline bool validate_string() { idx++; // skip first quote while (idx < len && buf[idx] != '"') { if (buf[idx] == '\\') { @@ -103,7 +104,8 @@ simdjson_really_inline void validate_string() { idx++; } } - if (idx >= len && !partial) { error = UNCLOSED_STRING; } + if (idx >= len) { return true; } + return false; } simdjson_really_inline bool is_whitespace_or_operator(uint8_t c) { @@ -120,12 +122,13 @@ simdjson_really_inline bool is_whitespace_or_operator(uint8_t c) { // Parse the entire input in STEP_SIZE-byte chunks. // simdjson_really_inline error_code scan() { + bool unclosed_string = false; for (;idx 0) { return CAPACITY; // If the buffer is partial but the document is incomplete, it's too big to parse. } parser.n_structural_indexes = new_structural_indexes; - } - + } else if(unclosed_string) { error = UNCLOSED_STRING; } return error; } diff --git a/src/generic/stage1/json_minifier.h b/src/generic/stage1/json_minifier.h index 054f02f0..97fb07de 100644 --- a/src/generic/stage1/json_minifier.h +++ b/src/generic/stage1/json_minifier.h @@ -32,7 +32,7 @@ simdjson_really_inline void json_minifier::next(const simd::simd8x64& i } simdjson_really_inline error_code json_minifier::finish(uint8_t *dst_start, size_t &dst_len) { - error_code error = scanner.finish(false); + error_code error = scanner.finish(); if (error) { dst_len = 0; return error; } dst_len = dst - dst_start; return SUCCESS; diff --git a/src/generic/stage1/json_scanner.h b/src/generic/stage1/json_scanner.h index 2c03b532..0715f3d3 100644 --- a/src/generic/stage1/json_scanner.h +++ b/src/generic/stage1/json_scanner.h @@ -92,7 +92,8 @@ class json_scanner { public: json_scanner() {} simdjson_really_inline json_block next(const simd::simd8x64& in); - simdjson_really_inline error_code finish(bool streaming); + // Returns either UNCLOSED_STRING or SUCCESS + simdjson_really_inline error_code finish(); private: // Whether the last character of the previous iteration is part of a scalar token @@ -138,8 +139,8 @@ simdjson_really_inline json_block json_scanner::next(const simd::simd8x64& in); - simdjson_really_inline error_code finish(bool streaming); + // Returns either UNCLOSED_STRING or SUCCESS + simdjson_really_inline error_code finish(); private: // Intended to be defined by the implementation @@ -132,8 +133,8 @@ simdjson_really_inline json_string_block json_string_scanner::next(const simd::s }; } -simdjson_really_inline error_code json_string_scanner::finish(bool streaming) { - if (prev_in_string and (not streaming)) { +simdjson_really_inline error_code json_string_scanner::finish() { + if (prev_in_string) { return UNCLOSED_STRING; } return SUCCESS; diff --git a/src/generic/stage1/json_structural_indexer.h b/src/generic/stage1/json_structural_indexer.h index e731642e..756cbcdc 100644 --- a/src/generic/stage1/json_structural_indexer.h +++ b/src/generic/stage1/json_structural_indexer.h @@ -182,8 +182,14 @@ simdjson_really_inline error_code json_structural_indexer::finish(dom_parser_imp // Write out the final iteration's structurals indexer.write(uint32_t(idx-64), prev_structurals); - error_code error = scanner.finish(partial); - if (simdjson_unlikely(error != SUCCESS)) { return error; } + error_code error = scanner.finish(); + // We deliberately break down the next expression so that it is + // human readable. + const bool should_we_exit = partial ? + ((error != SUCCESS) && (error != UNCLOSED_STRING)) // when partial we tolerate UNCLOSED_STRING + : (error != SUCCESS); // if partial is false, we must have SUCCESS + const bool have_unclosed_string = (error == UNCLOSED_STRING); + if (simdjson_unlikely(should_we_exit)) { return error; } if (unescaped_chars_error) { return UNESCAPED_CHARS; @@ -216,6 +222,13 @@ simdjson_really_inline error_code json_structural_indexer::finish(dom_parser_imp return UNEXPECTED_ERROR; } if (partial) { + // If we have an unclosed string, then the last structural + // will be the quote and we want to make sure to omit it. + if(have_unclosed_string) { + parser.n_structural_indexes--; + // a valid JSON file cannot have zero structural indexes - we should have found something + if (simdjson_unlikely(parser.n_structural_indexes == 0u)) { return CAPACITY; } + } auto new_structural_indexes = find_next_document_index(parser); if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) { return CAPACITY; // If the buffer is partial but the document is incomplete, it's too big to parse. diff --git a/tests/document_stream_tests.cpp b/tests/document_stream_tests.cpp index 2e88fed9..328b6260 100644 --- a/tests/document_stream_tests.cpp +++ b/tests/document_stream_tests.cpp @@ -143,21 +143,33 @@ namespace document_stream_tests { bool issue1310() { std::cout << "Running " << __func__ << std::endl; - const simdjson::padded_string input = decode_base64("AwA5ICIg"); + // hex : 20 20 5B 20 33 2C 31 5D 20 22 22 22 22 22 22 22 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 + // ascii: __ __[__ __3__,__1__]__ __"__"__"__"__"__"__"__ __ __ __ __ __ __ __ __ __ __ __ __ __ __ __ __ __ + // We have four full documents followed by an unclosed string. + const simdjson::padded_string input = decode_base64("ICBbIDMsMV0gIiIiIiIiIiAgICAgICAgICAgICAgICAg"); print_hex(input); for(size_t window = 0; window <= 100; window++) { simdjson::dom::parser parser; simdjson::dom::document_stream stream; ASSERT_SUCCESS(parser.parse_many(input, window).get(stream)); + size_t count{0}; for(auto doc: stream) { auto error = doc.error(); - if(!error) { - std::cout << "Expected an error but got " << error << std::endl; - std::cout << "Window = " << window << std::endl; - return false; + count++; + if(count <= 4) { + if(window < input.size() && error) { + std::cout << "Expected no error but got " << error << std::endl; + std::cout << "Window = " << window << std::endl; + return false; + } + } else { + if(!error) { + std::cout << "Expected an error but got " << error << std::endl; + std::cout << "Window = " << window << std::endl; + return false; + } } } - } return true; }