Structural iterator

This commit is contained in:
John Keiser 2020-03-12 14:16:15 -07:00
parent d9a9fd387d
commit 81c86d7090
4 changed files with 117 additions and 97 deletions

View File

@ -145,9 +145,8 @@ really_inline double subnormal_power10(double base, int64_t negative_exponent) {
//
// Note: a redesign could avoid this function entirely.
//
never_inline bool parse_float(const uint8_t *const buf, document::parser &parser,
const uint32_t offset, bool found_minus) {
const char *p = reinterpret_cast<const char *>(buf + offset);
never_inline bool parse_float(const uint8_t *const src, document::parser &parser, bool found_minus) {
const char *p = reinterpret_cast<const char *>(src);
bool negative = false;
if (found_minus) {
++p;
@ -179,7 +178,7 @@ never_inline bool parse_float(const uint8_t *const buf, document::parser &parser
: 0);
} else {
#ifdef JSON_TEST_NUMBERS // for unit testing
found_invalid_number(buf + offset);
found_invalid_number(src);
#endif
return false;
}
@ -202,7 +201,7 @@ never_inline bool parse_float(const uint8_t *const buf, document::parser &parser
}
if (!is_integer(*p)) {
#ifdef JSON_TEST_NUMBERS // for unit testing
found_invalid_number(buf + offset);
found_invalid_number(src);
#endif
return false;
}
@ -228,7 +227,7 @@ never_inline bool parse_float(const uint8_t *const buf, document::parser &parser
if (exp_number > 0x100000000) { // we need to check for overflows
// we refuse to parse this
#ifdef JSON_TEST_NUMBERS // for unit testing
found_invalid_number(buf + offset);
found_invalid_number(src);
#endif
return false;
}
@ -246,7 +245,7 @@ never_inline bool parse_float(const uint8_t *const buf, document::parser &parser
// We know for sure that we have a number that is too large,
// we refuse to parse this
#ifdef JSON_TEST_NUMBERS // for unit testing
found_invalid_number(buf + offset);
found_invalid_number(src);
#endif
return false;
}
@ -264,14 +263,14 @@ never_inline bool parse_float(const uint8_t *const buf, document::parser &parser
// check that we can go from long double to double safely.
if(i > std::numeric_limits<double>::max()) {
#ifdef JSON_TEST_NUMBERS // for unit testing
found_invalid_number(buf + offset);
found_invalid_number(src);
#endif
return false;
}
double d = negative ? -i : i;
parser.on_number_double(d);
#ifdef JSON_TEST_NUMBERS // for unit testing
found_float(d, buf + offset);
found_float(d, src);
#endif
return is_structural_or_whitespace(*p);
}
@ -284,11 +283,8 @@ never_inline bool parse_float(const uint8_t *const buf, document::parser &parser
//
// This function will almost never be called!!!
//
never_inline bool parse_large_integer(const uint8_t *const buf,
document::parser &parser,
const uint32_t offset,
bool found_minus) {
const char *p = reinterpret_cast<const char *>(buf + offset);
never_inline bool parse_large_integer(const uint8_t *const src, document::parser &parser, bool found_minus) {
const char *p = reinterpret_cast<const char *>(src);
bool negative = false;
if (found_minus) {
@ -309,13 +305,13 @@ never_inline bool parse_large_integer(const uint8_t *const buf,
digit = *p - '0';
if (mul_overflow(i, 10, &i)) {
#ifdef JSON_TEST_NUMBERS // for unit testing
found_invalid_number(buf + offset);
found_invalid_number(src);
#endif
return false; // overflow
}
if (add_overflow(i, digit, &i)) {
#ifdef JSON_TEST_NUMBERS // for unit testing
found_invalid_number(buf + offset);
found_invalid_number(src);
#endif
return false; // overflow
}
@ -326,7 +322,7 @@ never_inline bool parse_large_integer(const uint8_t *const buf,
if (i > 0x8000000000000000) {
// overflows!
#ifdef JSON_TEST_NUMBERS // for unit testing
found_invalid_number(buf + offset);
found_invalid_number(src);
#endif
return false; // overflow
} else if (i == 0x8000000000000000) {
@ -336,14 +332,14 @@ never_inline bool parse_large_integer(const uint8_t *const buf,
constexpr int64_t signed_answer = INT64_MIN;
parser.on_number_s64(signed_answer);
#ifdef JSON_TEST_NUMBERS // for unit testing
found_integer(signed_answer, buf + offset);
found_integer(signed_answer, src);
#endif
} else {
// we can negate safely
int64_t signed_answer = -static_cast<int64_t>(i);
parser.on_number_s64(signed_answer);
#ifdef JSON_TEST_NUMBERS // for unit testing
found_integer(signed_answer, buf + offset);
found_integer(signed_answer, src);
#endif
}
} else {
@ -352,12 +348,12 @@ never_inline bool parse_large_integer(const uint8_t *const buf,
// fallback on unsigned integers if absolutely necessary.
if(i < 0x8000000000000000) {
#ifdef JSON_TEST_NUMBERS // for unit testing
found_integer(i, buf + offset);
found_integer(i, src);
#endif
parser.on_number_s64(i);
} else {
#ifdef JSON_TEST_NUMBERS // for unit testing
found_unsigned_integer(i, buf + offset);
found_unsigned_integer(i, src);
#endif
parser.on_number_u64(i);
}
@ -365,7 +361,7 @@ never_inline bool parse_large_integer(const uint8_t *const buf,
return is_structural_or_whitespace(*p);
}
// parse the number at buf + offset
// parse the number at src
// define JSON_TEST_NUMBERS for unit testing
//
// It is assumed that the number is followed by a structural ({,},],[) character
@ -374,8 +370,7 @@ never_inline bool parse_large_integer(const uint8_t *const buf,
// content and append a space before calling this function.
//
// Our objective is accurate parsing (ULP of 0 or 1) at high speed.
really_inline bool parse_number(UNUSED const uint8_t *const buf,
UNUSED const uint32_t offset,
really_inline bool parse_number(UNUSED const uint8_t *const src,
UNUSED bool found_minus,
document::parser &parser) {
#ifdef SIMDJSON_SKIPNUMBERPARSING // for performance analysis, it is sometimes
@ -383,14 +378,14 @@ really_inline bool parse_number(UNUSED const uint8_t *const buf,
parser.on_number_s64(0); // always write zero
return true; // always succeeds
#else
const char *p = reinterpret_cast<const char *>(buf + offset);
const char *p = reinterpret_cast<const char *>(src);
bool negative = false;
if (found_minus) {
++p;
negative = true;
if (!is_integer(*p)) { // a negative sign must be followed by an integer
#ifdef JSON_TEST_NUMBERS // for unit testing
found_invalid_number(buf + offset);
found_invalid_number(src);
#endif
return false;
}
@ -402,7 +397,7 @@ really_inline bool parse_number(UNUSED const uint8_t *const buf,
++p;
if (is_not_structural_or_whitespace_or_exponent_or_decimal(*p)) {
#ifdef JSON_TEST_NUMBERS // for unit testing
found_invalid_number(buf + offset);
found_invalid_number(src);
#endif
return false;
}
@ -410,7 +405,7 @@ really_inline bool parse_number(UNUSED const uint8_t *const buf,
} else {
if (!(is_integer(*p))) { // must start with an integer
#ifdef JSON_TEST_NUMBERS // for unit testing
found_invalid_number(buf + offset);
found_invalid_number(src);
#endif
return false;
}
@ -445,7 +440,7 @@ really_inline bool parse_number(UNUSED const uint8_t *const buf,
// we will handle the overflow later
} else {
#ifdef JSON_TEST_NUMBERS // for unit testing
found_invalid_number(buf + offset);
found_invalid_number(src);
#endif
return false;
}
@ -480,7 +475,7 @@ really_inline bool parse_number(UNUSED const uint8_t *const buf,
}
if (!is_integer(*p)) {
#ifdef JSON_TEST_NUMBERS // for unit testing
found_invalid_number(buf + offset);
found_invalid_number(src);
#endif
return false;
}
@ -501,7 +496,7 @@ really_inline bool parse_number(UNUSED const uint8_t *const buf,
if (exp_number > 0x100000000) { // we need to check for overflows
// we refuse to parse this
#ifdef JSON_TEST_NUMBERS // for unit testing
found_invalid_number(buf + offset);
found_invalid_number(src);
#endif
return false;
}
@ -526,31 +521,31 @@ really_inline bool parse_number(UNUSED const uint8_t *const buf,
// Ok, chances are good that we had an overflow!
// this is almost never going to get called!!!
// we start anew, going slowly!!!
return parse_float(buf, parser, offset, found_minus);
return parse_float(src, parser, found_minus);
}
}
if (unlikely((power_index > 2 * 308))) { // this is uncommon!!!
// this is almost never going to get called!!!
// we start anew, going slowly!!!
return parse_float(buf, parser, offset, found_minus);
return parse_float(src, parser, found_minus);
}
double factor = power_of_ten[power_index];
factor = negative ? -factor : factor;
double d = i * factor;
parser.on_number_double(d);
#ifdef JSON_TEST_NUMBERS // for unit testing
found_float(d, buf + offset);
found_float(d, src);
#endif
} else {
if (unlikely(digit_count >= 18)) { // this is uncommon!!!
// there is a good chance that we had an overflow, so we
// need to recover: we parse the whole thing again.
return parse_large_integer(buf, parser, offset, found_minus);
return parse_large_integer(src, parser, found_minus);
}
i = negative ? 0 - i : i;
parser.on_number_s64(i);
#ifdef JSON_TEST_NUMBERS // for unit testing
found_integer(i, buf + offset);
found_integer(i, src);
#endif
}
return is_structural_or_whitespace(*p);

View File

@ -47,28 +47,22 @@ struct unified_machine_addresses {
#undef FAIL_IF
#define FAIL_IF(EXPR) { if (EXPR) { return addresses.error; } }
struct structural_parser {
const uint8_t* const buf;
const size_t len;
document::parser &doc_parser;
size_t i; // next structural index
size_t idx; // location of the structural character in the input (buf)
uint8_t c; // used to track the (structural) character we are looking at
uint32_t depth = 0; // could have an arbitrary starting depth
really_inline structural_parser(
const uint8_t *_buf,
size_t _len,
document::parser &_doc_parser,
uint32_t _i = 0
) : buf{_buf}, len{_len}, doc_parser{_doc_parser}, i{_i} {}
class structural_iterator {
public:
really_inline structural_iterator(const uint8_t* _buf, size_t _len, const uint32_t *_structural_indexes, size_t next_structural_index)
: buf{_buf}, len{_len}, structural_indexes{_structural_indexes}, next_structural{next_structural_index} {}
really_inline char advance_char() {
idx = doc_parser.structural_indexes[i++];
c = buf[idx];
idx = structural_indexes[next_structural];
next_structural++;
c = *current();
return c;
}
// Returns the cached character `c`, which advance_char() last loaded from
// the input. Does not move the iterator.
really_inline char current_char() {
  return static_cast<char>(c);
}
// Returns a pointer into `buf` at offset `idx`, i.e. the address of the
// structural character the iterator is currently positioned on.
really_inline const uint8_t* current() {
  return buf + idx;
}
template<typename F>
really_inline bool with_space_terminated_copy(const F& f) {
/**
@ -94,6 +88,36 @@ struct structural_parser {
free(copy);
return result;
}
// True when next_structural + 1 is strictly greater than the supplied
// number of structural indexes.
really_inline bool past_end(uint32_t n_structural_indexes) {
  return n_structural_indexes < next_structural + 1;
}
// True when next_structural + 1 is exactly the supplied number of
// structural indexes.
really_inline bool at_end(uint32_t n_structural_indexes) {
  return n_structural_indexes == next_structural + 1;
}
// Returns the position in the structural index array that the next call to
// advance_char() will read from.
really_inline size_t next_structural_index() {
return next_structural;
}
private:
const uint8_t* const buf;
const size_t len;
const uint32_t* const structural_indexes;
size_t next_structural; // next structural index
size_t idx; // location of the structural character in the input (buf)
uint8_t c; // used to track the (structural) character we are looking at
};
struct structural_parser {
structural_iterator structurals;
document::parser &doc_parser;
uint32_t depth;
really_inline structural_parser(
const uint8_t *buf,
size_t len,
document::parser &_doc_parser,
uint32_t next_structural = 0
) : structurals(buf, len, _doc_parser.structural_indexes.get(), next_structural), doc_parser{_doc_parser}, depth{0} {}
WARN_UNUSED really_inline bool start_document(ret_address continue_state) {
doc_parser.on_start_document(depth);
@ -134,32 +158,32 @@ struct structural_parser {
WARN_UNUSED really_inline bool parse_string() {
uint8_t *dst = doc_parser.on_start_string();
dst = stringparsing::parse_string(buf, idx, dst);
dst = stringparsing::parse_string(structurals.current(), dst);
if (dst == nullptr) {
return true;
}
return !doc_parser.on_end_string(dst);
}
WARN_UNUSED really_inline bool parse_number(const uint8_t *copy, uint32_t offset, bool found_minus) {
return !numberparsing::parse_number(copy, offset, found_minus, doc_parser);
WARN_UNUSED really_inline bool parse_number(const uint8_t *src, bool found_minus) {
return !numberparsing::parse_number(src, found_minus, doc_parser);
}
WARN_UNUSED really_inline bool parse_number(bool found_minus) {
return parse_number(buf, idx, found_minus);
return parse_number(structurals.current(), found_minus);
}
WARN_UNUSED really_inline bool parse_atom(const uint8_t *copy, uint32_t offset) {
switch (c) {
WARN_UNUSED really_inline bool parse_atom(const uint8_t *src) {
switch (structurals.current_char()) {
case 't':
if (!is_valid_true_atom(copy + offset)) { return true; }
if (!is_valid_true_atom(src)) { return true; }
doc_parser.on_true_atom();
break;
case 'f':
if (!is_valid_false_atom(copy + offset)) { return true; }
if (!is_valid_false_atom(src)) { return true; }
doc_parser.on_false_atom();
break;
case 'n':
if (!is_valid_null_atom(copy + offset)) { return true; }
if (!is_valid_null_atom(src)) { return true; }
doc_parser.on_null_atom();
break;
default:
@ -169,11 +193,11 @@ struct structural_parser {
}
WARN_UNUSED really_inline bool parse_atom() {
return parse_atom(buf, idx);
return parse_atom(structurals.current());
}
WARN_UNUSED really_inline ret_address parse_value(const unified_machine_addresses &addresses, ret_address continue_state) {
switch (c) {
switch (structurals.current_char()) {
case '"':
FAIL_IF( parse_string() );
return continue_state;
@ -200,7 +224,7 @@ struct structural_parser {
WARN_UNUSED really_inline error_code finish() {
// the string might not be NULL terminated.
if ( i + 1 != doc_parser.n_structural_indexes ) {
if ( !structurals.at_end(doc_parser.n_structural_indexes) ) {
return doc_parser.on_error(TAPE_ERROR);
}
end_document();
@ -228,7 +252,7 @@ struct structural_parser {
if (depth >= doc_parser.max_depth()) {
return doc_parser.on_error(DEPTH_ERROR);
}
switch (c) {
switch (structurals.current_char()) {
case '"':
return doc_parser.on_error(STRING_ERROR);
case '0':
@ -254,19 +278,23 @@ struct structural_parser {
}
}
WARN_UNUSED really_inline error_code start(ret_address finish_state) {
WARN_UNUSED really_inline error_code start(size_t len, ret_address finish_state) {
doc_parser.init_stage2(); // sets is_valid to false
if (len > doc_parser.capacity()) {
return CAPACITY;
}
// Advance to the first character as soon as possible
advance_char();
structurals.advance_char();
// Push the root scope (there is always at least one scope)
if (start_document(finish_state)) {
return doc_parser.on_error(DEPTH_ERROR);
}
return SUCCESS;
}
// Convenience forwarder: advances the embedded structural iterator and
// returns the character it reports.
really_inline char advance_char() {
return structurals.advance_char();
}
};
// Redefine FAIL_IF to use goto since it'll be used inside the function now
@ -282,13 +310,13 @@ struct structural_parser {
WARN_UNUSED error_code implementation::stage2(const uint8_t *buf, size_t len, document::parser &doc_parser) const noexcept {
static constexpr stage2::unified_machine_addresses addresses = INIT_ADDRESSES();
stage2::structural_parser parser(buf, len, doc_parser);
error_code result = parser.start(addresses.finish);
error_code result = parser.start(len, addresses.finish);
if (result) { return result; }
//
// Read first value
//
switch (parser.c) {
switch (parser.structurals.current_char()) {
case '{':
FAIL_IF( parser.start_object(addresses.finish) );
goto object_begin;
@ -300,23 +328,23 @@ WARN_UNUSED error_code implementation::stage2(const uint8_t *buf, size_t len, do
goto finish;
case 't': case 'f': case 'n':
FAIL_IF(
parser.with_space_terminated_copy([&](auto copy, auto idx) {
return parser.parse_atom(copy, idx);
parser.structurals.with_space_terminated_copy([&](auto copy, auto idx) {
return parser.parse_atom(&copy[idx]);
})
);
goto finish;
case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':
FAIL_IF(
parser.with_space_terminated_copy([&](auto copy, auto idx) {
return parser.parse_number(copy, idx, false);
parser.structurals.with_space_terminated_copy([&](auto copy, auto idx) {
return parser.parse_number(&copy[idx], false);
})
);
goto finish;
case '-':
FAIL_IF(
parser.with_space_terminated_copy([&](auto copy, auto idx) {
return parser.parse_number(copy, idx, true);
parser.structurals.with_space_terminated_copy([&](auto copy, auto idx) {
return parser.parse_number(&copy[idx], true);
})
);
goto finish;
@ -328,8 +356,7 @@ WARN_UNUSED error_code implementation::stage2(const uint8_t *buf, size_t len, do
// Object parser states
//
object_begin:
parser.advance_char();
switch (parser.c) {
switch (parser.advance_char()) {
case '"': {
FAIL_IF( parser.parse_string() );
goto object_key_state;

View File

@ -4,7 +4,7 @@ struct streaming_structural_parser: structural_parser {
really_inline streaming_structural_parser(const uint8_t *_buf, size_t _len, document::parser &_doc_parser, size_t _i) : structural_parser(_buf, _len, _doc_parser, _i) {}
// override to add streaming
WARN_UNUSED really_inline error_code start(ret_address finish_parser) {
WARN_UNUSED really_inline error_code start(UNUSED size_t len, ret_address finish_parser) {
doc_parser.init_stage2(); // sets is_valid to false
// Capacity ain't no thang for streaming, so we don't check it.
// Advance to the first character as soon as possible
@ -18,7 +18,7 @@ struct streaming_structural_parser: structural_parser {
// override to add streaming
WARN_UNUSED really_inline error_code finish() {
if ( i + 1 > doc_parser.n_structural_indexes ) {
if ( structurals.past_end(doc_parser.n_structural_indexes) ) {
return doc_parser.on_error(TAPE_ERROR);
}
end_document();
@ -28,7 +28,7 @@ struct streaming_structural_parser: structural_parser {
if (doc_parser.containing_scope_offset[depth] != 0) {
return doc_parser.on_error(TAPE_ERROR);
}
bool finished = i + 1 == doc_parser.n_structural_indexes;
bool finished = structurals.at_end(doc_parser.n_structural_indexes);
return doc_parser.on_success(finished ? SUCCESS : SUCCESS_AND_HAS_MORE);
}
};
@ -42,12 +42,12 @@ struct streaming_structural_parser: structural_parser {
WARN_UNUSED error_code implementation::stage2(const uint8_t *buf, size_t len, document::parser &doc_parser, size_t &next_json) const noexcept {
static constexpr stage2::unified_machine_addresses addresses = INIT_ADDRESSES();
stage2::streaming_structural_parser parser(buf, len, doc_parser, next_json);
error_code result = parser.start(addresses.finish);
error_code result = parser.start(len, addresses.finish);
if (result) { return result; }
//
// Read first value
//
switch (parser.c) {
switch (parser.structurals.current_char()) {
case '{':
FAIL_IF( parser.start_object(addresses.finish) );
goto object_begin;
@ -59,23 +59,23 @@ WARN_UNUSED error_code implementation::stage2(const uint8_t *buf, size_t len, do
goto finish;
case 't': case 'f': case 'n':
FAIL_IF(
parser.with_space_terminated_copy([&](auto copy, auto idx) {
return parser.parse_atom(copy, idx);
parser.structurals.with_space_terminated_copy([&](auto copy, auto idx) {
return parser.parse_atom(&copy[idx]);
})
);
goto finish;
case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':
FAIL_IF(
parser.with_space_terminated_copy([&](auto copy, auto idx) {
return parser.parse_number(copy, idx, false);
parser.structurals.with_space_terminated_copy([&](auto copy, auto idx) {
return parser.parse_number(&copy[idx], false);
})
);
goto finish;
case '-':
FAIL_IF(
parser.with_space_terminated_copy([&](auto copy, auto idx) {
return parser.parse_number(copy, idx, true);
parser.structurals.with_space_terminated_copy([&](auto copy, auto idx) {
return parser.parse_number(&copy[idx], true);
})
);
goto finish;
@ -87,8 +87,7 @@ WARN_UNUSED error_code implementation::stage2(const uint8_t *buf, size_t len, do
// Object parser parsers
//
object_begin:
parser.advance_char();
switch (parser.c) {
switch (parser.advance_char()) {
case '"': {
FAIL_IF( parser.parse_string() );
goto object_key_parser;
@ -148,7 +147,7 @@ array_continue:
}
finish:
next_json = parser.i;
next_json = parser.structurals.next_structural_index();
return parser.finish();
error:

View File

@ -71,10 +71,9 @@ really_inline bool handle_unicode_codepoint(const uint8_t **src_ptr,
return offset > 0;
}
WARN_UNUSED really_inline uint8_t *parse_string(const uint8_t *buf,
uint32_t offset,
WARN_UNUSED really_inline uint8_t *parse_string(const uint8_t *src,
uint8_t *dst) {
const uint8_t *src = &buf[offset + 1]; /* we know that buf at offset is a " */
src++;
while (1) {
parse_string_helper helper = find_bs_bits_and_quote_bits(src, dst);
if (((helper.bs_bits - 1) & helper.quote_bits) != 0) {