diff --git a/src/arm64/dom_parser_implementation.cpp b/src/arm64/dom_parser_implementation.cpp index 2e95cd68..9116965b 100644 --- a/src/arm64/dom_parser_implementation.cpp +++ b/src/arm64/dom_parser_implementation.cpp @@ -82,6 +82,7 @@ WARN_UNUSED error_code implementation::minify(const uint8_t *buf, size_t len, ui return arm64::stage1::json_minifier::minify<64>(buf, len, dst, dst_len); } +#include "generic/stage1/find_next_document_index.h" #include "generic/stage1/utf8_lookup2_algorithm.h" #include "generic/stage1/json_structural_indexer.h" WARN_UNUSED error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, bool streaming) noexcept { diff --git a/src/fallback/dom_parser_implementation.cpp b/src/fallback/dom_parser_implementation.cpp index e77e5e97..ae0944c8 100644 --- a/src/fallback/dom_parser_implementation.cpp +++ b/src/fallback/dom_parser_implementation.cpp @@ -9,15 +9,17 @@ namespace simdjson { namespace fallback { namespace stage1 { +#include "generic/stage1/find_next_document_index.h" + class structural_scanner { public: -really_inline structural_scanner(dom_parser_implementation &_parser, bool _streaming) +really_inline structural_scanner(dom_parser_implementation &_parser, bool _partial) : buf{_parser.buf}, next_structural_index{_parser.structural_indexes.get()}, parser{_parser}, len{static_cast<uint32_t>(_parser.len)}, - streaming{_streaming} { + partial{_partial} { } really_inline void add_structural() { @@ -41,7 +43,12 @@ really_inline void validate_utf8_character() { // 2-byte if ((buf[idx] & 0b00100000) == 0) { // missing continuation - if (unlikely(idx+1 > len || !is_continuation(buf[idx+1]))) { error = UTF8_ERROR; idx++; return; } + if (unlikely(idx+1 > len || !is_continuation(buf[idx+1]))) { + if (idx+1 > len && partial) { idx = len; return; } + error = UTF8_ERROR; + idx++; + return; + } // overlong: 1100000_ 10______ if (buf[idx] <= 0b11000001) { error = UTF8_ERROR; } idx += 2; @@ -51,7 +58,12 @@ really_inline void 
validate_utf8_character() { // 3-byte if ((buf[idx] & 0b00010000) == 0) { // missing continuation - if (unlikely(idx+2 > len || !is_continuation(buf[idx+1]) || !is_continuation(buf[idx+2]))) { error = UTF8_ERROR; idx++; return; } + if (unlikely(idx+2 > len || !is_continuation(buf[idx+1]) || !is_continuation(buf[idx+2]))) { + if (idx+2 > len && partial) { idx = len; return; } + error = UTF8_ERROR; + idx++; + return; + } // overlong: 11100000 100_____ ________ if (buf[idx] == 0b11100000 && buf[idx+1] <= 0b10011111) { error = UTF8_ERROR; } // surrogates: U+D800-U+DFFF 11101101 101_____ @@ -62,7 +74,12 @@ really_inline void validate_utf8_character() { // 4-byte // missing continuation - if (unlikely(idx+3 > len || !is_continuation(buf[idx+1]) || !is_continuation(buf[idx+2]) || !is_continuation(buf[idx+3]))) { error = UTF8_ERROR; idx++; return; } + if (unlikely(idx+3 > len || !is_continuation(buf[idx+1]) || !is_continuation(buf[idx+2]) || !is_continuation(buf[idx+3]))) { + if (idx+3 > len && partial) { idx = len; return; } + error = UTF8_ERROR; + idx++; + return; + } // overlong: 11110000 1000____ ________ ________ if (buf[idx] == 0b11110000 && buf[idx+1] <= 0b10001111) { error = UTF8_ERROR; } // too large: > U+10FFFF: @@ -87,7 +104,7 @@ really_inline void validate_string() { idx++; } } - if (idx >= len && !streaming) { error = UNCLOSED_STRING; } + if (idx >= len && !partial) { error = UNCLOSED_STRING; } } really_inline bool is_whitespace_or_operator(uint8_t c) { @@ -128,16 +145,26 @@ really_inline error_code scan() { break; } } - if (unlikely(next_structural_index == parser.structural_indexes.get())) { - return EMPTY; - } *next_structural_index = len; - next_structural_index++; // We pad beyond. 
// https://github.com/simdjson/simdjson/issues/906 - next_structural_index[0] = len; - next_structural_index[1] = 0; + next_structural_index[1] = len; + next_structural_index[2] = 0; parser.n_structural_indexes = uint32_t(next_structural_index - parser.structural_indexes.get()); + parser.next_structural_index = 0; + + if (unlikely(parser.n_structural_indexes == 0)) { + return EMPTY; + } + + if (partial) { + auto new_structural_indexes = find_next_document_index(parser); + if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) { + return CAPACITY; // If the buffer is partial but the document is incomplete, it's too big to parse. + } + parser.n_structural_indexes = new_structural_indexes; + } + return error; } @@ -148,16 +175,16 @@ private: uint32_t len; uint32_t idx{0}; error_code error{SUCCESS}; - bool streaming; + bool partial; }; // structural_scanner } // namespace stage1 -WARN_UNUSED error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, bool streaming) noexcept { +WARN_UNUSED error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, bool partial) noexcept { this->buf = _buf; this->len = _len; - stage1::structural_scanner scanner(*this, streaming); + stage1::structural_scanner scanner(*this, partial); return scanner.scan(); } diff --git a/src/generic/stage1/find_next_document_index.h b/src/generic/stage1/find_next_document_index.h new file mode 100644 index 00000000..302af175 --- /dev/null +++ b/src/generic/stage1/find_next_document_index.h @@ -0,0 +1,86 @@ +/** + * This algorithm is used to quickly identify the last structural position that + * makes up a complete document. + * + * It does this by going backwards and finding the last *document boundary* (a + * place where one value follows another without a comma between them). If the + * last document (the characters after the boundary) has an equal number of + * start and end brackets, it is considered complete. 
+ * + * Simply put, we iterate over the structural characters, starting from + * the end. We consider that we found the end of a JSON document when the + * first element of the pair is NOT one of these characters: '{' '[' ':' ',' + * and when the second element is NOT one of these characters: '}' ']' ':' ','. + * + * This simple comparison works most of the time, but it does not cover cases + * where the batch's structural indexes contain a perfect amount of documents. + * In such a case, we do not have access to the structural index which follows + * the last document, therefore, we do not have access to the second element in + * the pair, and that means we cannot identify the last document. To fix this + * issue, we keep a count of the open and closed curly/square braces we found + * while searching for the pair. When we find a pair AND the count of open and + * closed curly/square braces is the same, we know that we just passed a + * complete document, therefore the last json buffer location is the end of the + * batch. + */ +really_inline static uint32_t find_next_document_index(dom_parser_implementation &parser) { + // TODO don't count separately, just figure out depth + auto arr_cnt = 0; + auto obj_cnt = 0; + for (auto i = parser.n_structural_indexes - 1; i > 0; i--) { + auto idxb = parser.structural_indexes[i]; + switch (parser.buf[idxb]) { + case ':': + case ',': + continue; + case '}': + obj_cnt--; + continue; + case ']': + arr_cnt--; + continue; + case '{': + obj_cnt++; + break; + case '[': + arr_cnt++; + break; + } + auto idxa = parser.structural_indexes[i - 1]; + switch (parser.buf[idxa]) { + case '{': + case '[': + case ':': + case ',': + continue; + } + // Last document is complete, so the next document will appear after! 
+ if (!arr_cnt && !obj_cnt) { + return parser.n_structural_indexes; + } + // Last document is incomplete; mark the document at i + 1 as the next one + return i; + } + return 0; +} + +// Skip the last character if it is partial +really_inline static size_t trim_partial_utf8(const uint8_t *buf, size_t len) { + if (unlikely(len < 3)) { + switch (len) { + case 2: + if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left + if (buf[len-2] >= 0b11100000) { return len-2; } // 3- and 4-byte characters with only 2 bytes left + return len; + case 1: + if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left + return len; + case 0: + return len; + } + } + if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left + if (buf[len-2] >= 0b11100000) { return len-2; } // 3- and 4-byte characters with only 2 bytes left + if (buf[len-3] >= 0b11110000) { return len-3; } // 4-byte characters with only 3 bytes left + return len; +} diff --git a/src/generic/stage1/json_structural_indexer.h b/src/generic/stage1/json_structural_indexer.h index 0cb2c9f3..6f80123e 100644 --- a/src/generic/stage1/json_structural_indexer.h +++ b/src/generic/stage1/json_structural_indexer.h @@ -73,8 +73,6 @@ private: really_inline void step(const uint8_t *block, buf_block_reader &reader) noexcept; really_inline void next(simd::simd8x64 in, json_block block, size_t idx); really_inline error_code finish(dom_parser_implementation &parser, size_t idx, size_t len, bool partial); - static really_inline uint32_t find_next_document_index(dom_parser_implementation &parser); - static really_inline size_t trim_partial_utf8(const uint8_t *buf, size_t len); json_scanner scanner{}; utf8_checker checker{}; @@ -98,7 +96,7 @@ really_inline json_structural_indexer::json_structural_indexer(uint32_t *structu // 3. 
Step 1 doesn't use enough capacity, so we run some extra stuff while we're waiting for that // to finish: utf-8 checks and generating the output from the last iteration. // -// The reason we run 2 inputs at a time, is steps 2 and 3 are//still* not enough to soak up all +// The reason we run 2 inputs at a time, is steps 2 and 3 are *still* not enough to soak up all // available capacity with just one input. Running 2 at a time seems to give the CPU a good enough // workout. // @@ -162,13 +160,6 @@ really_inline error_code json_structural_indexer::finish(dom_parser_implementati } parser.n_structural_indexes = uint32_t(indexer.tail - parser.structural_indexes.get()); - // a valid JSON file cannot have zero structural indexes - we should have found something - if (unlikely(parser.n_structural_indexes == 0u)) { - return EMPTY; - } - if (unlikely(parser.structural_indexes[parser.n_structural_indexes - 1] > len)) { - return UNEXPECTED_ERROR; - } /*** * This is related to https://github.com/simdjson/simdjson/issues/906 * Basically, we want to make sure that if the parsing continues beyond the last (valid) @@ -186,6 +177,14 @@ really_inline error_code json_structural_indexer::finish(dom_parser_implementati parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len); parser.structural_indexes[parser.n_structural_indexes + 1] = uint32_t(len); parser.structural_indexes[parser.n_structural_indexes + 2] = 0; + parser.next_structural_index = 0; + // a valid JSON file cannot have zero structural indexes - we should have found something + if (unlikely(parser.n_structural_indexes == 0u)) { + return EMPTY; + } + if (unlikely(parser.structural_indexes[parser.n_structural_indexes - 1] > len)) { + return UNEXPECTED_ERROR; + } if (partial) { auto new_structural_indexes = find_next_document_index(parser); if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) { @@ -193,95 +192,7 @@ really_inline error_code json_structural_indexer::finish(dom_parser_implementati } 
parser.n_structural_indexes = new_structural_indexes; } - parser.next_structural_index = 0; return checker.errors(); } -/** - * This algorithm is used to quickly identify the last structural position that - * makes up a complete document. - * - * It does this by going backwards and finding the last *document boundary* (a - * place where one value follows another without a comma between them). If the - * last document (the characters after the boundary) has an equal number of - * start and end brackets, it is considered complete. - * - * Simply put, we iterate over the structural characters, starting from - * the end. We consider that we found the end of a JSON document when the - * first element of the pair is NOT one of these characters: '{' '[' ';' ',' - * and when the second element is NOT one of these characters: '}' '}' ';' ','. - * - * This simple comparison works most of the time, but it does not cover cases - * where the batch's structural indexes contain a perfect amount of documents. - * In such a case, we do not have access to the structural index which follows - * the last document, therefore, we do not have access to the second element in - * the pair, and means that we cannot identify the last document. To fix this - * issue, we keep a count of the open and closed curly/square braces we found - * while searching for the pair. 
When we find a pair AND the count of open and - * closed curly/square braces is the same, we know that we just passed a - * complete - * document, therefore the last json buffer location is the end of the batch - */ -really_inline uint32_t json_structural_indexer::find_next_document_index(dom_parser_implementation &parser) { - // TODO don't count separately, just figure out depth - auto arr_cnt = 0; - auto obj_cnt = 0; - for (auto i = parser.n_structural_indexes - 1; i > 0; i--) { - auto idxb = parser.structural_indexes[i]; - switch (parser.buf[idxb]) { - case ':': - case ',': - continue; - case '}': - obj_cnt--; - continue; - case ']': - arr_cnt--; - continue; - case '{': - obj_cnt++; - break; - case '[': - arr_cnt++; - break; - } - auto idxa = parser.structural_indexes[i - 1]; - switch (parser.buf[idxa]) { - case '{': - case '[': - case ':': - case ',': - continue; - } - // Last document is complete, so the next document will appear after! - if (!arr_cnt && !obj_cnt) { - return parser.n_structural_indexes; - } - // Last document is incomplete; mark the document at i + 1 as the next one - return i; - } - return 0; -} - -// Skip the last character if it is partial -really_inline size_t json_structural_indexer::trim_partial_utf8(const uint8_t *buf, size_t len) { - if (unlikely(len < 3)) { - switch (len) { - case 2: - if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left - if (buf[len-2] >= 0b11100000) { return len-2; } // 3- and 4-byte characters with only 2 bytes left - return len; - case 1: - if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left - return len; - case 0: - return len; - } - } - if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left - if (buf[len-2] >= 0b11100000) { return len-2; } // 3- and 4-byte characters with only 1 byte left - if (buf[len-3] >= 0b11110000) { return len-3; } // 4-byte characters with only 
3 bytes left - return len; -} - } // namespace stage1 diff --git a/src/haswell/dom_parser_implementation.cpp b/src/haswell/dom_parser_implementation.cpp index 5377f7eb..c5e6c291 100644 --- a/src/haswell/dom_parser_implementation.cpp +++ b/src/haswell/dom_parser_implementation.cpp @@ -70,6 +70,7 @@ WARN_UNUSED error_code implementation::minify(const uint8_t *buf, size_t len, ui return haswell::stage1::json_minifier::minify<128>(buf, len, dst, dst_len); } +#include "generic/stage1/find_next_document_index.h" #include "generic/stage1/utf8_lookup2_algorithm.h" #include "generic/stage1/json_structural_indexer.h" WARN_UNUSED error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, bool streaming) noexcept { diff --git a/src/westmere/dom_parser_implementation.cpp b/src/westmere/dom_parser_implementation.cpp index 7c5a5c85..376fe0f7 100644 --- a/src/westmere/dom_parser_implementation.cpp +++ b/src/westmere/dom_parser_implementation.cpp @@ -71,6 +71,7 @@ WARN_UNUSED error_code implementation::minify(const uint8_t *buf, size_t len, ui return westmere::stage1::json_minifier::minify<64>(buf, len, dst, dst_len); } +#include "generic/stage1/find_next_document_index.h" #include "generic/stage1/utf8_lookup2_algorithm.h" #include "generic/stage1/json_structural_indexer.h" WARN_UNUSED error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, bool streaming) noexcept {