Fix fallback implementation

This commit is contained in:
John Keiser 2020-06-04 19:15:35 -07:00
parent fe01da077e
commit 0dbda65e44
6 changed files with 140 additions and 113 deletions

View File

@ -82,6 +82,7 @@ WARN_UNUSED error_code implementation::minify(const uint8_t *buf, size_t len, ui
return arm64::stage1::json_minifier::minify<64>(buf, len, dst, dst_len);
}
#include "generic/stage1/find_next_document_index.h"
#include "generic/stage1/utf8_lookup2_algorithm.h"
#include "generic/stage1/json_structural_indexer.h"
WARN_UNUSED error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, bool streaming) noexcept {

View File

@ -9,15 +9,17 @@ namespace simdjson {
namespace fallback {
namespace stage1 {
#include "generic/stage1/find_next_document_index.h"
class structural_scanner {
public:
really_inline structural_scanner(dom_parser_implementation &_parser, bool _streaming)
really_inline structural_scanner(dom_parser_implementation &_parser, bool _partial)
: buf{_parser.buf},
next_structural_index{_parser.structural_indexes.get()},
parser{_parser},
len{static_cast<uint32_t>(_parser.len)},
streaming{_streaming} {
partial{_partial} {
}
really_inline void add_structural() {
@ -41,7 +43,12 @@ really_inline void validate_utf8_character() {
// 2-byte
if ((buf[idx] & 0b00100000) == 0) {
// missing continuation
if (unlikely(idx+1 > len || !is_continuation(buf[idx+1]))) { error = UTF8_ERROR; idx++; return; }
if (unlikely(idx+1 > len || !is_continuation(buf[idx+1]))) {
if (idx+1 > len && partial) { idx = len; return; }
error = UTF8_ERROR;
idx++;
return;
}
// overlong: 1100000_ 10______
if (buf[idx] <= 0b11000001) { error = UTF8_ERROR; }
idx += 2;
@ -51,7 +58,12 @@ really_inline void validate_utf8_character() {
// 3-byte
if ((buf[idx] & 0b00010000) == 0) {
// missing continuation
if (unlikely(idx+2 > len || !is_continuation(buf[idx+1]) || !is_continuation(buf[idx+2]))) { error = UTF8_ERROR; idx++; return; }
if (unlikely(idx+2 > len || !is_continuation(buf[idx+1]) || !is_continuation(buf[idx+2]))) {
if (idx+2 > len && partial) { idx = len; return; }
error = UTF8_ERROR;
idx++;
return;
}
// overlong: 11100000 100_____ ________
if (buf[idx] == 0b11100000 && buf[idx+1] <= 0b10011111) { error = UTF8_ERROR; }
// surrogates: U+D800-U+DFFF 11101101 101_____
@ -62,7 +74,12 @@ really_inline void validate_utf8_character() {
// 4-byte
// missing continuation
if (unlikely(idx+3 > len || !is_continuation(buf[idx+1]) || !is_continuation(buf[idx+2]) || !is_continuation(buf[idx+3]))) { error = UTF8_ERROR; idx++; return; }
if (unlikely(idx+3 > len || !is_continuation(buf[idx+1]) || !is_continuation(buf[idx+2]) || !is_continuation(buf[idx+3]))) {
if (idx+2 > len && partial) { idx = len; return; }
error = UTF8_ERROR;
idx++;
return;
}
// overlong: 11110000 1000____ ________ ________
if (buf[idx] == 0b11110000 && buf[idx+1] <= 0b10001111) { error = UTF8_ERROR; }
// too large: > U+10FFFF:
@ -87,7 +104,7 @@ really_inline void validate_string() {
idx++;
}
}
if (idx >= len && !streaming) { error = UNCLOSED_STRING; }
if (idx >= len && !partial) { error = UNCLOSED_STRING; }
}
really_inline bool is_whitespace_or_operator(uint8_t c) {
@ -128,16 +145,26 @@ really_inline error_code scan() {
break;
}
}
if (unlikely(next_structural_index == parser.structural_indexes.get())) {
return EMPTY;
}
*next_structural_index = len;
next_structural_index++;
// We pad beyond.
// https://github.com/simdjson/simdjson/issues/906
next_structural_index[0] = len;
next_structural_index[1] = 0;
next_structural_index[1] = len;
next_structural_index[2] = 0;
parser.n_structural_indexes = uint32_t(next_structural_index - parser.structural_indexes.get());
parser.next_structural_index = 0;
if (unlikely(parser.n_structural_indexes == 0)) {
return EMPTY;
}
if (partial) {
auto new_structural_indexes = find_next_document_index(parser);
if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) {
return CAPACITY; // If the buffer is partial but the document is incomplete, it's too big to parse.
}
parser.n_structural_indexes = new_structural_indexes;
}
return error;
}
@ -148,16 +175,16 @@ private:
uint32_t len;
uint32_t idx{0};
error_code error{SUCCESS};
bool streaming;
bool partial;
}; // structural_scanner
} // namespace stage1
WARN_UNUSED error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, bool streaming) noexcept {
WARN_UNUSED error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, bool partial) noexcept {
this->buf = _buf;
this->len = _len;
stage1::structural_scanner scanner(*this, streaming);
stage1::structural_scanner scanner(*this, partial);
return scanner.scan();
}

View File

@ -0,0 +1,86 @@
/**
* This algorithm is used to quickly identify the last structural position that
* makes up a complete document.
*
* It does this by going backwards and finding the last *document boundary* (a
* place where one value follows another without a comma between them). If the
* last document (the characters after the boundary) has an equal number of
* start and end brackets, it is considered complete.
*
* Simply put, we iterate over the structural characters, starting from
* the end. We consider that we found the end of a JSON document when the
* first element of the pair is NOT one of these characters: '{' '[' ';' ','
* and when the second element is NOT one of these characters: '}' '}' ';' ','.
*
* This simple comparison works most of the time, but it does not cover cases
* where the batch's structural indexes contain a perfect amount of documents.
* In such a case, we do not have access to the structural index which follows
* the last document, therefore, we do not have access to the second element in
* the pair, and that means we cannot identify the last document. To fix this
* issue, we keep a count of the open and closed curly/square braces we found
* while searching for the pair. When we find a pair AND the count of open and
* closed curly/square braces is the same, we know that we just passed a
* complete document, therefore the last json buffer location is the end of the
* batch.
*/
really_inline static uint32_t find_next_document_index(dom_parser_implementation &parser) {
// TODO don't count separately, just figure out depth
auto arr_cnt = 0;
auto obj_cnt = 0;
for (auto i = parser.n_structural_indexes - 1; i > 0; i--) {
auto idxb = parser.structural_indexes[i];
switch (parser.buf[idxb]) {
case ':':
case ',':
continue;
case '}':
obj_cnt--;
continue;
case ']':
arr_cnt--;
continue;
case '{':
obj_cnt++;
break;
case '[':
arr_cnt++;
break;
}
auto idxa = parser.structural_indexes[i - 1];
switch (parser.buf[idxa]) {
case '{':
case '[':
case ':':
case ',':
continue;
}
// Last document is complete, so the next document will appear after!
if (!arr_cnt && !obj_cnt) {
return parser.n_structural_indexes;
}
// Last document is incomplete; mark the document at i + 1 as the next one
return i;
}
return 0;
}
// Skip the last character if it is partial
really_inline static size_t trim_partial_utf8(const uint8_t *buf, size_t len) {
if (unlikely(len < 3)) {
switch (len) {
case 2:
if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left
if (buf[len-2] >= 0b11100000) { return len-2; } // 3- and 4-byte characters with only 2 bytes left
return len;
case 1:
if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left
return len;
case 0:
return len;
}
}
if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left
if (buf[len-2] >= 0b11100000) { return len-2; } // 3- and 4-byte characters with only 1 byte left
if (buf[len-3] >= 0b11110000) { return len-3; } // 4-byte characters with only 3 bytes left
return len;
}

View File

@ -73,8 +73,6 @@ private:
really_inline void step(const uint8_t *block, buf_block_reader<STEP_SIZE> &reader) noexcept;
really_inline void next(simd::simd8x64<uint8_t> in, json_block block, size_t idx);
really_inline error_code finish(dom_parser_implementation &parser, size_t idx, size_t len, bool partial);
static really_inline uint32_t find_next_document_index(dom_parser_implementation &parser);
static really_inline size_t trim_partial_utf8(const uint8_t *buf, size_t len);
json_scanner scanner{};
utf8_checker checker{};
@ -98,7 +96,7 @@ really_inline json_structural_indexer::json_structural_indexer(uint32_t *structu
// 3. Step 1 doesn't use enough capacity, so we run some extra stuff while we're waiting for that
// to finish: utf-8 checks and generating the output from the last iteration.
//
// The reason we run 2 inputs at a time, is steps 2 and 3 are//still* not enough to soak up all
// The reason we run 2 inputs at a time, is steps 2 and 3 are *still* not enough to soak up all
// available capacity with just one input. Running 2 at a time seems to give the CPU a good enough
// workout.
//
@ -162,13 +160,6 @@ really_inline error_code json_structural_indexer::finish(dom_parser_implementati
}
parser.n_structural_indexes = uint32_t(indexer.tail - parser.structural_indexes.get());
// a valid JSON file cannot have zero structural indexes - we should have found something
if (unlikely(parser.n_structural_indexes == 0u)) {
return EMPTY;
}
if (unlikely(parser.structural_indexes[parser.n_structural_indexes - 1] > len)) {
return UNEXPECTED_ERROR;
}
/***
* This is related to https://github.com/simdjson/simdjson/issues/906
* Basically, we want to make sure that if the parsing continues beyond the last (valid)
@ -186,6 +177,14 @@ really_inline error_code json_structural_indexer::finish(dom_parser_implementati
parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len);
parser.structural_indexes[parser.n_structural_indexes + 1] = uint32_t(len);
parser.structural_indexes[parser.n_structural_indexes + 2] = 0;
parser.next_structural_index = 0;
// a valid JSON file cannot have zero structural indexes - we should have found something
if (unlikely(parser.n_structural_indexes == 0u)) {
return EMPTY;
}
if (unlikely(parser.structural_indexes[parser.n_structural_indexes - 1] > len)) {
return UNEXPECTED_ERROR;
}
if (partial) {
auto new_structural_indexes = find_next_document_index(parser);
if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) {
@ -193,95 +192,7 @@ really_inline error_code json_structural_indexer::finish(dom_parser_implementati
}
parser.n_structural_indexes = new_structural_indexes;
}
parser.next_structural_index = 0;
return checker.errors();
}
/**
* This algorithm is used to quickly identify the last structural position that
* makes up a complete document.
*
* It does this by going backwards and finding the last *document boundary* (a
* place where one value follows another without a comma between them). If the
* last document (the characters after the boundary) has an equal number of
* start and end brackets, it is considered complete.
*
* Simply put, we iterate over the structural characters, starting from
* the end. We consider that we found the end of a JSON document when the
* first element of the pair is NOT one of these characters: '{' '[' ';' ','
* and when the second element is NOT one of these characters: '}' '}' ';' ','.
*
* This simple comparison works most of the time, but it does not cover cases
* where the batch's structural indexes contain a perfect amount of documents.
* In such a case, we do not have access to the structural index which follows
* the last document, therefore, we do not have access to the second element in
* the pair, and means that we cannot identify the last document. To fix this
* issue, we keep a count of the open and closed curly/square braces we found
* while searching for the pair. When we find a pair AND the count of open and
* closed curly/square braces is the same, we know that we just passed a
* complete
* document, therefore the last json buffer location is the end of the batch
*/
really_inline uint32_t json_structural_indexer::find_next_document_index(dom_parser_implementation &parser) {
// TODO don't count separately, just figure out depth
auto arr_cnt = 0;
auto obj_cnt = 0;
for (auto i = parser.n_structural_indexes - 1; i > 0; i--) {
auto idxb = parser.structural_indexes[i];
switch (parser.buf[idxb]) {
case ':':
case ',':
continue;
case '}':
obj_cnt--;
continue;
case ']':
arr_cnt--;
continue;
case '{':
obj_cnt++;
break;
case '[':
arr_cnt++;
break;
}
auto idxa = parser.structural_indexes[i - 1];
switch (parser.buf[idxa]) {
case '{':
case '[':
case ':':
case ',':
continue;
}
// Last document is complete, so the next document will appear after!
if (!arr_cnt && !obj_cnt) {
return parser.n_structural_indexes;
}
// Last document is incomplete; mark the document at i + 1 as the next one
return i;
}
return 0;
}
// Skip the last character if it is partial
really_inline size_t json_structural_indexer::trim_partial_utf8(const uint8_t *buf, size_t len) {
if (unlikely(len < 3)) {
switch (len) {
case 2:
if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left
if (buf[len-2] >= 0b11100000) { return len-2; } // 3- and 4-byte characters with only 2 bytes left
return len;
case 1:
if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left
return len;
case 0:
return len;
}
}
if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left
if (buf[len-2] >= 0b11100000) { return len-2; } // 3- and 4-byte characters with only 1 byte left
if (buf[len-3] >= 0b11110000) { return len-3; } // 4-byte characters with only 3 bytes left
return len;
}
} // namespace stage1

View File

@ -70,6 +70,7 @@ WARN_UNUSED error_code implementation::minify(const uint8_t *buf, size_t len, ui
return haswell::stage1::json_minifier::minify<128>(buf, len, dst, dst_len);
}
#include "generic/stage1/find_next_document_index.h"
#include "generic/stage1/utf8_lookup2_algorithm.h"
#include "generic/stage1/json_structural_indexer.h"
WARN_UNUSED error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, bool streaming) noexcept {

View File

@ -71,6 +71,7 @@ WARN_UNUSED error_code implementation::minify(const uint8_t *buf, size_t len, ui
return westmere::stage1::json_minifier::minify<64>(buf, len, dst, dst_len);
}
#include "generic/stage1/find_next_document_index.h"
#include "generic/stage1/utf8_lookup2_algorithm.h"
#include "generic/stage1/json_structural_indexer.h"
WARN_UNUSED error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, bool streaming) noexcept {