Move stage 2 tape writing to ParsedJson (#477)
This is a first step to allowing alternate tape formats.
This commit is contained in:
parent
0c8f2b9d85
commit
76c706644a
|
@ -35,7 +35,6 @@ int json_parse_implementation(const uint8_t *buf, size_t len, ParsedJson &pj,
|
|||
if (reallocated) { // must free before we exit
|
||||
aligned_free((void *)buf);
|
||||
}
|
||||
pj.error_code = stage1_is_ok;
|
||||
return pj.error_code;
|
||||
}
|
||||
int res = unified_machine<T>(buf, len, pj);
|
||||
|
|
|
@ -60,49 +60,105 @@ public:
|
|||
WARN_UNUSED
|
||||
bool dump_raw_tape(std::ostream &os) const;
|
||||
|
||||
// all nodes are stored on the tape using a 64-bit word.
|
||||
//
|
||||
// strings, double and ints are stored as
|
||||
// a 64-bit word with a pointer to the actual value
|
||||
//
|
||||
//
|
||||
//
|
||||
// for objects or arrays, store [ or { at the beginning and } and ] at the
|
||||
// end. For the openings ([ or {), we annotate them with a reference to the
|
||||
// location on the tape of the end, and for the closings (} and ]), we
|
||||
// annotate them with a reference to the location of the opening
|
||||
//
|
||||
//
|
||||
|
||||
// this should be considered a private function
|
||||
really_inline void write_tape(uint64_t val, uint8_t c) {
|
||||
tape[current_loc++] = val | ((static_cast<uint64_t>(c)) << 56);
|
||||
really_inline ErrorValues on_error(ErrorValues new_error_code) {
|
||||
error_code = new_error_code;
|
||||
return new_error_code;
|
||||
}
|
||||
// Records a successful parse: stores success_code in error_code, marks the
// document as valid, and returns the same code so that callers can write
// `return pj.on_success(code);`.
really_inline ErrorValues on_success(ErrorValues success_code) {
|
||||
error_code = success_code;
|
||||
valid = true;
|
||||
return success_code;
|
||||
}
|
||||
// Opens the root scope at nesting level `depth`: remembers the current tape
// location as that scope's start, then writes an 'r' tape word with a zero
// payload. on_end_document() later ORs the end location into that word.
// Always returns true.
really_inline bool on_start_document(uint32_t depth) {
|
||||
containing_scope_offset[depth] = get_current_loc();
|
||||
write_tape(0, 'r');
|
||||
return true;
|
||||
}
|
||||
// Opens an object scope at nesting level `depth`: remembers the current tape
// location as that scope's start, then writes a '{' tape word with a zero
// payload. on_end_object() later ORs a location into that word.
// Always returns true.
really_inline bool on_start_object(uint32_t depth) {
|
||||
containing_scope_offset[depth] = get_current_loc();
|
||||
write_tape(0, '{');
|
||||
return true;
|
||||
}
|
||||
// Opens an array scope at nesting level `depth`: remembers the current tape
// location as that scope's start, then writes a '[' tape word with a zero
// payload. on_end_array() later ORs a location into that word.
// Always returns true.
really_inline bool on_start_array(uint32_t depth) {
|
||||
containing_scope_offset[depth] = get_current_loc();
|
||||
write_tape(0, '[');
|
||||
return true;
|
||||
}
|
||||
// TODO we're not checking this bool
// Closes the root scope at nesting level `depth`. Always returns true.
really_inline bool on_end_document(uint32_t depth) {
|
||||
// write our tape location to the header scope
|
||||
// The root scope gets written *at* the previous location.
|
||||
// Note the order: the opening 'r' word is annotated *before* the closing
|
||||
// word is written, so it receives the location of the closing 'r' word
|
||||
// itself (on_end_object/on_end_array annotate after writing instead).
|
||||
annotate_previous_loc(containing_scope_offset[depth], get_current_loc());
|
||||
// The closing 'r' word carries the location of the opening word.
|
||||
write_tape(containing_scope_offset[depth], 'r');
|
||||
return true;
|
||||
}
|
||||
// Closes the object scope at nesting level `depth`. Always returns true.
really_inline bool on_end_object(uint32_t depth) {
|
||||
// write our tape location to the header scope
|
||||
// The closing '}' word carries the location of the matching opening word.
|
||||
write_tape(containing_scope_offset[depth], '}');
|
||||
// write_tape() has already advanced current_loc, so the opening word is
|
||||
// annotated with the location just past the closing '}' word.
|
||||
annotate_previous_loc(containing_scope_offset[depth], get_current_loc());
|
||||
return true;
|
||||
}
|
||||
// Closes the array scope at nesting level `depth`. Always returns true.
really_inline bool on_end_array(uint32_t depth) {
|
||||
// write our tape location to the header scope
|
||||
// The closing ']' word carries the location of the matching opening word.
|
||||
write_tape(containing_scope_offset[depth], ']');
|
||||
// write_tape() has already advanced current_loc, so the opening word is
|
||||
// annotated with the location just past the closing ']' word.
|
||||
annotate_previous_loc(containing_scope_offset[depth], get_current_loc());
|
||||
return true;
|
||||
}
|
||||
|
||||
really_inline void write_tape_s64(int64_t i) {
|
||||
// Appends a 't' tape word with a zero payload. Always returns true.
really_inline bool on_true_atom() {
|
||||
write_tape(0, 't');
|
||||
return true;
|
||||
}
|
||||
// Appends an 'f' tape word with a zero payload. Always returns true.
really_inline bool on_false_atom() {
|
||||
write_tape(0, 'f');
|
||||
return true;
|
||||
}
|
||||
// Appends an 'n' tape word with a zero payload. Always returns true.
really_inline bool on_null_atom() {
|
||||
write_tape(0, 'n');
|
||||
return true;
|
||||
}
|
||||
|
||||
// Begins a string value. Writes a '"' tape word whose payload is the offset
// of current_string_buf_loc from the start of string_buf, and returns the
// address at which the caller should start writing the string bytes: just
// past a uint32_t-sized slot that on_end_string() fills with the length.
really_inline uint8_t *on_start_string() {
|
||||
/* we advance the point, accounting for the fact that we have a NULL
|
||||
* termination */
|
||||
write_tape(current_string_buf_loc - string_buf.get(), '"');
|
||||
return current_string_buf_loc + sizeof(uint32_t);
|
||||
}
|
||||
|
||||
// Finishes a string value. `dst` is the address one past the last string byte
// written, where writing began at the address returned by on_start_string().
// Stores the length prefix, NUL-terminates the string, and advances
// current_string_buf_loc past the terminator. Always returns true.
really_inline bool on_end_string(uint8_t *dst) {
|
||||
// Length is the distance from the first byte after the length slot to dst.
|
||||
uint32_t str_length = dst - (current_string_buf_loc + sizeof(uint32_t));
|
||||
// TODO check for overflow in case someone has a crazy string (>=4GB?)
|
||||
// But only add the overflow check when the document itself exceeds 4GB
|
||||
// Currently unneeded because we refuse to parse docs larger or equal to 4GB.
|
||||
// The length goes into the uint32_t slot at the start of this string's entry.
|
||||
memcpy(current_string_buf_loc, &str_length, sizeof(uint32_t));
|
||||
// NULL termination is still handy if you expect all your strings to
|
||||
// be NULL terminated? It comes at a small cost
|
||||
*dst = 0;
|
||||
// The next string entry starts right after the terminator.
|
||||
current_string_buf_loc = dst + 1;
|
||||
return true;
|
||||
}
|
||||
|
||||
really_inline bool on_number_s64(int64_t value) {
|
||||
write_tape(0, 'l');
|
||||
std::memcpy(&tape[current_loc], &i, sizeof(i));
|
||||
std::memcpy(&tape[current_loc], &value, sizeof(value));
|
||||
++current_loc;
|
||||
return true;
|
||||
}
|
||||
|
||||
really_inline void write_tape_u64(uint64_t i) {
|
||||
really_inline bool on_number_u64(uint64_t value) {
|
||||
write_tape(0, 'u');
|
||||
tape[current_loc++] = i;
|
||||
tape[current_loc++] = value;
|
||||
return true;
|
||||
}
|
||||
|
||||
really_inline void write_tape_double(double d) {
|
||||
really_inline bool on_number_double(double value) {
|
||||
write_tape(0, 'd');
|
||||
static_assert(sizeof(d) == sizeof(tape[current_loc]), "mismatch size");
|
||||
memcpy(&tape[current_loc++], &d, sizeof(double));
|
||||
static_assert(sizeof(value) == sizeof(tape[current_loc]), "mismatch size");
|
||||
memcpy(&tape[current_loc++], &value, sizeof(double));
|
||||
// tape[current_loc++] = *((uint64_t *)&d);
|
||||
return true;
|
||||
}
|
||||
|
||||
really_inline uint32_t get_current_loc() const { return current_loc; }
|
||||
|
||||
really_inline void annotate_previous_loc(uint32_t saved_loc, uint64_t val) {
|
||||
tape[saved_loc] |= val;
|
||||
}
|
||||
|
||||
struct InvalidJSON : public std::exception {
|
||||
const char *what() const noexcept { return "JSON document is invalid"; }
|
||||
};
|
||||
|
@ -134,6 +190,29 @@ public:
|
|||
bool valid{false};
|
||||
int error_code{simdjson::UNINITIALIZED};
|
||||
|
||||
private:
|
||||
// all nodes are stored on the tape using a 64-bit word.
|
||||
//
|
||||
// strings, double and ints are stored as
|
||||
// a 64-bit word with a pointer to the actual value
|
||||
//
|
||||
//
|
||||
//
|
||||
// for objects or arrays, store [ or { at the beginning and } and ] at the
|
||||
// end. For the openings ([ or {), we annotate them with a reference to the
|
||||
// location on the tape of the end, and for the closings (} and ]), we
|
||||
// annotate them with a reference to the location of the opening
|
||||
//
|
||||
//
|
||||
|
||||
// this should be considered a private function
|
||||
// Appends one 64-bit word to the tape and advances current_loc by one.
// The word is `val` OR-ed with the type character `c` shifted into the top
// byte (bits 56-63). `val` is not masked here, so presumably callers keep it
// below 2^56 -- TODO confirm.
really_inline void write_tape(uint64_t val, uint8_t c) {
|
||||
tape[current_loc++] = val | ((static_cast<uint64_t>(c)) << 56);
|
||||
}
|
||||
|
||||
// ORs `val` into the tape word already stored at index `saved_loc`.
// The existing bits of that word are kept, so its type byte is preserved
// as long as `val` has no bits set in the top byte.
really_inline void annotate_previous_loc(uint32_t saved_loc, uint64_t val) {
|
||||
tape[saved_loc] |= val;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
namespace numberparsing {
|
||||
|
||||
// Allowable floating-point values range
|
||||
// std::numeric_limits<double>::lowest() to std::numeric_limits<double>::max(),
|
||||
|
@ -75,7 +76,7 @@ static const double power_of_ten[] = {
|
|||
1e295, 1e296, 1e297, 1e298, 1e299, 1e300, 1e301, 1e302, 1e303,
|
||||
1e304, 1e305, 1e306, 1e307, 1e308};
|
||||
|
||||
static inline bool is_integer(char c) {
|
||||
really_inline bool is_integer(char c) {
|
||||
return (c >= '0' && c <= '9');
|
||||
// this gets compiled to (uint8_t)(c - '0') <= 9 on all decent compilers
|
||||
}
|
||||
|
@ -104,7 +105,7 @@ is_not_structural_or_whitespace_or_exponent_or_decimal(unsigned char c) {
|
|||
// check quickly whether the next 8 chars are made of digits
|
||||
// at a glance, it looks better than Mula's
|
||||
// http://0x80.pl/articles/swar-digits-validate.html
|
||||
static inline bool is_made_of_eight_digits_fast(const char *chars) {
|
||||
really_inline bool is_made_of_eight_digits_fast(const char *chars) {
|
||||
uint64_t val;
|
||||
// this can read up to 7 bytes beyond the buffer size, but we require
|
||||
// SIMDJSON_PADDING of padding
|
||||
|
@ -123,7 +124,7 @@ static inline bool is_made_of_eight_digits_fast(const char *chars) {
|
|||
//
|
||||
// This function computes base * 10 ^ (- negative_exponent ).
|
||||
// It is only ever going to be used when negative_exponent is tiny.
|
||||
static double subnormal_power10(double base, int64_t negative_exponent) {
|
||||
really_inline double subnormal_power10(double base, int64_t negative_exponent) {
|
||||
// avoid integer overflows in the pow expression, those values would
|
||||
// become zero anyway.
|
||||
if(negative_exponent < -1000) {
|
||||
|
@ -144,8 +145,8 @@ static double subnormal_power10(double base, int64_t negative_exponent) {
|
|||
//
|
||||
// Note: a redesign could avoid this function entirely.
|
||||
//
|
||||
static never_inline bool parse_float(const uint8_t *const buf, ParsedJson &pj,
|
||||
const uint32_t offset, bool found_minus) {
|
||||
never_inline bool parse_float(const uint8_t *const buf, ParsedJson &pj,
|
||||
const uint32_t offset, bool found_minus) {
|
||||
const char *p = reinterpret_cast<const char *>(buf + offset);
|
||||
bool negative = false;
|
||||
if (found_minus) {
|
||||
|
@ -268,7 +269,7 @@ static never_inline bool parse_float(const uint8_t *const buf, ParsedJson &pj,
|
|||
return false;
|
||||
}
|
||||
double d = negative ? -i : i;
|
||||
pj.write_tape_double(d);
|
||||
pj.on_number_double(d);
|
||||
#ifdef JSON_TEST_NUMBERS // for unit testing
|
||||
found_float(d, buf + offset);
|
||||
#endif
|
||||
|
@ -283,7 +284,7 @@ static never_inline bool parse_float(const uint8_t *const buf, ParsedJson &pj,
|
|||
//
|
||||
// This function will almost never be called!!!
|
||||
//
|
||||
static never_inline bool parse_large_integer(const uint8_t *const buf,
|
||||
never_inline bool parse_large_integer(const uint8_t *const buf,
|
||||
ParsedJson &pj,
|
||||
const uint32_t offset,
|
||||
bool found_minus) {
|
||||
|
@ -333,14 +334,14 @@ static never_inline bool parse_large_integer(const uint8_t *const buf,
|
|||
// as a positive signed integer, but the negative version is
|
||||
// possible.
|
||||
constexpr int64_t signed_answer = INT64_MIN;
|
||||
pj.write_tape_s64(signed_answer);
|
||||
pj.on_number_s64(signed_answer);
|
||||
#ifdef JSON_TEST_NUMBERS // for unit testing
|
||||
found_integer(signed_answer, buf + offset);
|
||||
#endif
|
||||
} else {
|
||||
// we can negate safely
|
||||
int64_t signed_answer = -static_cast<int64_t>(i);
|
||||
pj.write_tape_s64(signed_answer);
|
||||
pj.on_number_s64(signed_answer);
|
||||
#ifdef JSON_TEST_NUMBERS // for unit testing
|
||||
found_integer(signed_answer, buf + offset);
|
||||
#endif
|
||||
|
@ -353,12 +354,12 @@ static never_inline bool parse_large_integer(const uint8_t *const buf,
|
|||
#ifdef JSON_TEST_NUMBERS // for unit testing
|
||||
found_integer(i, buf + offset);
|
||||
#endif
|
||||
pj.write_tape_s64(i);
|
||||
pj.on_number_s64(i);
|
||||
} else {
|
||||
#ifdef JSON_TEST_NUMBERS // for unit testing
|
||||
found_unsigned_integer(i, buf + offset);
|
||||
#endif
|
||||
pj.write_tape_u64(i);
|
||||
pj.on_number_u64(i);
|
||||
}
|
||||
}
|
||||
return is_structural_or_whitespace(*p);
|
||||
|
@ -373,12 +374,13 @@ static never_inline bool parse_large_integer(const uint8_t *const buf,
|
|||
// content and append a space before calling this function.
|
||||
//
|
||||
// Our objective is accurate parsing (ULP of 0 or 1) at high speed.
|
||||
static really_inline bool parse_number(const uint8_t *const buf, ParsedJson &pj,
|
||||
const uint32_t offset,
|
||||
bool found_minus) {
|
||||
really_inline bool parse_number(const uint8_t *const buf,
|
||||
const uint32_t offset,
|
||||
bool found_minus,
|
||||
ParsedJson &pj) {
|
||||
#ifdef SIMDJSON_SKIPNUMBERPARSING // for performance analysis, it is sometimes
|
||||
// useful to skip parsing
|
||||
pj.write_tape_s64(0); // always write zero
|
||||
pj.on_number_s64(0); // always write zero
|
||||
return true; // always succeeds
|
||||
#else
|
||||
const char *p = reinterpret_cast<const char *>(buf + offset);
|
||||
|
@ -535,7 +537,7 @@ static really_inline bool parse_number(const uint8_t *const buf, ParsedJson &pj,
|
|||
double factor = power_of_ten[power_index];
|
||||
factor = negative ? -factor : factor;
|
||||
double d = i * factor;
|
||||
pj.write_tape_double(d);
|
||||
pj.on_number_double(d);
|
||||
#ifdef JSON_TEST_NUMBERS // for unit testing
|
||||
found_float(d, buf + offset);
|
||||
#endif
|
||||
|
@ -546,7 +548,7 @@ static really_inline bool parse_number(const uint8_t *const buf, ParsedJson &pj,
|
|||
return parse_large_integer(buf, pj, offset, found_minus);
|
||||
}
|
||||
i = negative ? 0 - i : i;
|
||||
pj.write_tape_s64(i);
|
||||
pj.on_number_s64(i);
|
||||
#ifdef JSON_TEST_NUMBERS // for unit testing
|
||||
found_integer(i, buf + offset);
|
||||
#endif
|
||||
|
@ -555,3 +557,4 @@ static really_inline bool parse_number(const uint8_t *const buf, ParsedJson &pj,
|
|||
#endif // SIMDJSON_SKIPNUMBERPARSING
|
||||
}
|
||||
|
||||
} // namespace numberparsing
|
|
@ -24,7 +24,7 @@ typedef char ret_address;
|
|||
case 'o': goto object_continue; \
|
||||
} \
|
||||
}
|
||||
// For the more constrained pop_scope() situation
|
||||
// For the more constrained end_xxx() situation
|
||||
#define CONTINUE(address) \
|
||||
{ \
|
||||
switch(address) { \
|
||||
|
@ -47,16 +47,6 @@ struct unified_machine_addresses {
|
|||
#undef FAIL_IF
|
||||
#define FAIL_IF(EXPR) { if (EXPR) { return addresses.error; } }
|
||||
|
||||
// This is just so we can call parse_string() from parser.parse_string() without conflict.
|
||||
WARN_UNUSED really_inline bool
|
||||
really_parse_string(const uint8_t *buf, size_t len, ParsedJson &pj, uint32_t depth, uint32_t idx) {
|
||||
return parse_string(buf, len, pj, depth, idx);
|
||||
}
|
||||
WARN_UNUSED really_inline bool
|
||||
really_parse_number(const uint8_t *const buf, ParsedJson &pj, const uint32_t offset, bool found_minus) {
|
||||
return parse_number(buf, pj, offset, found_minus);
|
||||
}
|
||||
|
||||
struct structural_parser {
|
||||
const uint8_t* const buf;
|
||||
const size_t len;
|
||||
|
@ -66,12 +56,12 @@ struct structural_parser {
|
|||
uint8_t c; // used to track the (structural) character we are looking at
|
||||
uint32_t depth = 0; // could have an arbitrary starting depth
|
||||
|
||||
really_inline structural_parser(const uint8_t *_buf, size_t _len, ParsedJson &_pj, uint32_t _i = 0) : buf{_buf}, len{_len}, pj{_pj}, i{_i} {}
|
||||
|
||||
WARN_UNUSED really_inline int set_error_code(ErrorValues error_code) {
|
||||
pj.error_code = error_code;
|
||||
return error_code;
|
||||
}
|
||||
really_inline structural_parser(
|
||||
const uint8_t *_buf,
|
||||
size_t _len,
|
||||
ParsedJson &_pj,
|
||||
uint32_t _i = 0
|
||||
) : buf{_buf}, len{_len}, pj{_pj}, i{_i} {}
|
||||
|
||||
really_inline char advance_char() {
|
||||
idx = pj.structural_indexes[i++];
|
||||
|
@ -105,47 +95,54 @@ struct structural_parser {
|
|||
return result;
|
||||
}
|
||||
|
||||
WARN_UNUSED really_inline bool push_start_scope(ret_address continue_state, char type) {
|
||||
pj.containing_scope_offset[depth] = pj.get_current_loc();
|
||||
pj.ret_address[depth] = continue_state;
|
||||
depth++;
|
||||
pj.write_tape(0, type);
|
||||
return depth >= pj.depth_capacity;
|
||||
}
|
||||
|
||||
WARN_UNUSED really_inline bool push_start_scope(ret_address continue_state) {
|
||||
return push_start_scope(continue_state, c);
|
||||
}
|
||||
|
||||
WARN_UNUSED really_inline bool push_scope(ret_address continue_state) {
|
||||
pj.containing_scope_offset[depth] = pj.get_current_loc();
|
||||
pj.write_tape(0, c); // Do this as early as possible
|
||||
WARN_UNUSED really_inline bool start_document(ret_address continue_state) {
|
||||
pj.on_start_document(depth);
|
||||
pj.ret_address[depth] = continue_state;
|
||||
depth++;
|
||||
return depth >= pj.depth_capacity;
|
||||
}
|
||||
|
||||
WARN_UNUSED really_inline ret_address pop_scope() {
|
||||
// write our tape location to the header scope
|
||||
depth--;
|
||||
pj.write_tape(pj.containing_scope_offset[depth], c);
|
||||
pj.annotate_previous_loc(pj.containing_scope_offset[depth], pj.get_current_loc());
|
||||
return pj.ret_address[depth];
|
||||
// Enters an object scope: notifies pj via on_start_object(depth), records
// continue_state as the address to resume at when this scope ends, and
// increments depth. Returns true when the new depth has reached
// pj.depth_capacity; callers wrap the call in FAIL_IF, so true means failure.
WARN_UNUSED really_inline bool start_object(ret_address continue_state) {
|
||||
pj.on_start_object(depth);
|
||||
pj.ret_address[depth] = continue_state;
|
||||
depth++;
|
||||
return depth >= pj.depth_capacity;
|
||||
}
|
||||
really_inline void pop_root_scope() {
|
||||
// write our tape location to the header scope
|
||||
// The root scope gets written *at* the previous location.
|
||||
|
||||
// Enters an array scope: notifies pj via on_start_array(depth), records
// continue_state as the address to resume at when this scope ends, and
// increments depth. Returns true when the new depth has reached
// pj.depth_capacity; callers wrap the call in FAIL_IF, so true means failure.
WARN_UNUSED really_inline bool start_array(ret_address continue_state) {
|
||||
pj.on_start_array(depth);
|
||||
pj.ret_address[depth] = continue_state;
|
||||
depth++;
|
||||
return depth >= pj.depth_capacity;
|
||||
}
|
||||
|
||||
really_inline bool end_object() {
|
||||
depth--;
|
||||
pj.annotate_previous_loc(pj.containing_scope_offset[depth], pj.get_current_loc());
|
||||
pj.write_tape(pj.containing_scope_offset[depth], 'r');
|
||||
pj.on_end_object(depth);
|
||||
return false;
|
||||
}
|
||||
// Leaves an array scope: decrements depth first, so that pj.on_end_array()
// is called with the depth at which the array was opened.
// Always returns false.
really_inline bool end_array() {
|
||||
depth--;
|
||||
pj.on_end_array(depth);
|
||||
return false;
|
||||
}
|
||||
// Leaves the root scope: decrements depth first, so that pj.on_end_document()
// is called with the depth at which the document was opened.
// Always returns false.
really_inline bool end_document() {
|
||||
depth--;
|
||||
pj.on_end_document(depth);
|
||||
return false;
|
||||
}
|
||||
|
||||
WARN_UNUSED really_inline bool parse_string() {
|
||||
return !really_parse_string(buf, len, pj, depth, idx);
|
||||
uint8_t *dst = pj.on_start_string();
|
||||
dst = stringparsing::parse_string(buf, idx, dst);
|
||||
if (dst == nullptr) {
|
||||
return true;
|
||||
}
|
||||
return !pj.on_end_string(dst);
|
||||
}
|
||||
|
||||
WARN_UNUSED really_inline bool parse_number(const uint8_t *copy, uint32_t offset, bool found_minus) {
|
||||
return !really_parse_number(copy, pj, offset, found_minus);
|
||||
return !numberparsing::parse_number(copy, offset, found_minus, pj);
|
||||
}
|
||||
WARN_UNUSED really_inline bool parse_number(bool found_minus) {
|
||||
return parse_number(buf, idx, found_minus);
|
||||
|
@ -154,18 +151,20 @@ struct structural_parser {
|
|||
WARN_UNUSED really_inline bool parse_atom(const uint8_t *copy, uint32_t offset) {
|
||||
switch (c) {
|
||||
case 't':
|
||||
if (!is_valid_true_atom(copy + offset)) { return true; };
|
||||
if (!is_valid_true_atom(copy + offset)) { return true; }
|
||||
pj.on_true_atom();
|
||||
break;
|
||||
case 'f':
|
||||
if (!is_valid_false_atom(copy + offset)) { return true; }
|
||||
pj.on_false_atom();
|
||||
break;
|
||||
case 'n':
|
||||
if (!is_valid_null_atom(copy + offset)) { return true; }
|
||||
pj.on_null_atom();
|
||||
break;
|
||||
default:
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
pj.write_tape(0, c);
|
||||
return false;
|
||||
}
|
||||
|
||||
|
@ -189,34 +188,33 @@ struct structural_parser {
|
|||
FAIL_IF( parse_number(true) );
|
||||
return continue_state;
|
||||
case '{':
|
||||
FAIL_IF( push_scope(continue_state) );
|
||||
FAIL_IF( start_object(continue_state) );
|
||||
return addresses.object_begin;
|
||||
case '[':
|
||||
FAIL_IF( push_scope(continue_state) );
|
||||
FAIL_IF( start_array(continue_state) );
|
||||
return addresses.array_begin;
|
||||
default:
|
||||
return addresses.error;
|
||||
}
|
||||
}
|
||||
|
||||
WARN_UNUSED really_inline int finish() {
|
||||
WARN_UNUSED really_inline ErrorValues finish() {
|
||||
// the string might not be NULL terminated.
|
||||
if ( i + 1 != pj.n_structural_indexes ) {
|
||||
return set_error_code(TAPE_ERROR);
|
||||
return pj.on_error(TAPE_ERROR);
|
||||
}
|
||||
pop_root_scope();
|
||||
end_document();
|
||||
if (depth != 0) {
|
||||
return set_error_code(TAPE_ERROR);
|
||||
return pj.on_error(TAPE_ERROR);
|
||||
}
|
||||
if (pj.containing_scope_offset[depth] != 0) {
|
||||
return set_error_code(TAPE_ERROR);
|
||||
return pj.on_error(TAPE_ERROR);
|
||||
}
|
||||
|
||||
pj.valid = true;
|
||||
return set_error_code(SUCCESS);
|
||||
return pj.on_success(SUCCESS);
|
||||
}
|
||||
|
||||
WARN_UNUSED really_inline int error() {
|
||||
WARN_UNUSED really_inline ErrorValues error() {
|
||||
/* We do not need the next line because this is done by pj.init(),
|
||||
* pessimistically.
|
||||
* pj.is_valid = false;
|
||||
|
@ -228,11 +226,11 @@ struct structural_parser {
|
|||
* carefully,
|
||||
* all without any added cost. */
|
||||
if (depth >= pj.depth_capacity) {
|
||||
return set_error_code(DEPTH_ERROR);
|
||||
return pj.on_error(DEPTH_ERROR);
|
||||
}
|
||||
switch (c) {
|
||||
case '"':
|
||||
return set_error_code(STRING_ERROR);
|
||||
return pj.on_error(STRING_ERROR);
|
||||
case '0':
|
||||
case '1':
|
||||
case '2':
|
||||
|
@ -244,19 +242,19 @@ struct structural_parser {
|
|||
case '8':
|
||||
case '9':
|
||||
case '-':
|
||||
return set_error_code(NUMBER_ERROR);
|
||||
return pj.on_error(NUMBER_ERROR);
|
||||
case 't':
|
||||
return set_error_code(T_ATOM_ERROR);
|
||||
return pj.on_error(T_ATOM_ERROR);
|
||||
case 'n':
|
||||
return set_error_code(N_ATOM_ERROR);
|
||||
return pj.on_error(N_ATOM_ERROR);
|
||||
case 'f':
|
||||
return set_error_code(F_ATOM_ERROR);
|
||||
return pj.on_error(F_ATOM_ERROR);
|
||||
default:
|
||||
return set_error_code(TAPE_ERROR);
|
||||
return pj.on_error(TAPE_ERROR);
|
||||
}
|
||||
}
|
||||
|
||||
WARN_UNUSED really_inline int start(ret_address finish_state) {
|
||||
WARN_UNUSED really_inline ErrorValues start(ret_address finish_state) {
|
||||
pj.init(); // sets is_valid to false
|
||||
if (len > pj.byte_capacity) {
|
||||
return CAPACITY;
|
||||
|
@ -264,8 +262,8 @@ struct structural_parser {
|
|||
// Advance to the first character as soon as possible
|
||||
advance_char();
|
||||
// Push the root scope (there is always at least one scope)
|
||||
if (push_start_scope(finish_state, 'r')) {
|
||||
return DEPTH_ERROR;
|
||||
if (start_document(finish_state)) {
|
||||
return pj.on_error(DEPTH_ERROR);
|
||||
}
|
||||
return SUCCESS;
|
||||
}
|
||||
|
@ -291,10 +289,10 @@ unified_machine(const uint8_t *buf, size_t len, ParsedJson &pj) {
|
|||
//
|
||||
switch (parser.c) {
|
||||
case '{':
|
||||
FAIL_IF( parser.push_start_scope(addresses.finish) );
|
||||
FAIL_IF( parser.start_object(addresses.finish) );
|
||||
goto object_begin;
|
||||
case '[':
|
||||
FAIL_IF( parser.push_start_scope(addresses.finish) );
|
||||
FAIL_IF( parser.start_array(addresses.finish) );
|
||||
goto array_begin;
|
||||
case '"':
|
||||
FAIL_IF( parser.parse_string() );
|
||||
|
@ -336,7 +334,8 @@ object_begin:
|
|||
goto object_key_state;
|
||||
}
|
||||
case '}':
|
||||
goto scope_end; // could also go to object_continue
|
||||
parser.end_object();
|
||||
goto scope_end;
|
||||
default:
|
||||
goto error;
|
||||
}
|
||||
|
@ -353,20 +352,22 @@ object_continue:
|
|||
FAIL_IF( parser.parse_string() );
|
||||
goto object_key_state;
|
||||
case '}':
|
||||
parser.end_object();
|
||||
goto scope_end;
|
||||
default:
|
||||
goto error;
|
||||
}
|
||||
|
||||
scope_end:
|
||||
CONTINUE( parser.pop_scope() );
|
||||
CONTINUE( parser.pj.ret_address[parser.depth] );
|
||||
|
||||
//
|
||||
// Array parser states
|
||||
//
|
||||
array_begin:
|
||||
if (parser.advance_char() == ']') {
|
||||
goto scope_end; // could also go to array_continue
|
||||
parser.end_array();
|
||||
goto scope_end;
|
||||
}
|
||||
|
||||
main_array_switch:
|
||||
|
@ -380,6 +381,7 @@ array_continue:
|
|||
parser.advance_char();
|
||||
goto main_array_switch;
|
||||
case ']':
|
||||
parser.end_array();
|
||||
goto scope_end;
|
||||
default:
|
||||
goto error;
|
||||
|
|
|
@ -4,33 +4,32 @@ struct streaming_structural_parser: structural_parser {
|
|||
really_inline streaming_structural_parser(const uint8_t *_buf, size_t _len, ParsedJson &_pj, size_t _i) : structural_parser(_buf, _len, _pj, _i) {}
|
||||
|
||||
// override to add streaming
|
||||
WARN_UNUSED really_inline int start(ret_address finish_parser) {
|
||||
WARN_UNUSED really_inline ErrorValues start(ret_address finish_parser) {
|
||||
pj.init(); // sets is_valid to false
|
||||
// Capacity ain't no thang for streaming, so we don't check it.
|
||||
// Advance to the first character as soon as possible
|
||||
advance_char();
|
||||
// Push the root scope (there is always at least one scope)
|
||||
if (push_start_scope(finish_parser, 'r')) {
|
||||
return DEPTH_ERROR;
|
||||
if (start_document(finish_parser)) {
|
||||
return pj.on_error(DEPTH_ERROR);
|
||||
}
|
||||
return SUCCESS;
|
||||
}
|
||||
|
||||
// override to add streaming
|
||||
WARN_UNUSED really_inline int finish() {
|
||||
WARN_UNUSED really_inline ErrorValues finish() {
|
||||
if ( i + 1 > pj.n_structural_indexes ) {
|
||||
return set_error_code(TAPE_ERROR);
|
||||
return pj.on_error(TAPE_ERROR);
|
||||
}
|
||||
pop_root_scope();
|
||||
end_document();
|
||||
if (depth != 0) {
|
||||
return set_error_code(TAPE_ERROR);
|
||||
return pj.on_error(TAPE_ERROR);
|
||||
}
|
||||
if (pj.containing_scope_offset[depth] != 0) {
|
||||
return set_error_code(TAPE_ERROR);
|
||||
return pj.on_error(TAPE_ERROR);
|
||||
}
|
||||
bool finished = i + 1 == pj.n_structural_indexes;
|
||||
pj.valid = true;
|
||||
return set_error_code(finished ? SUCCESS : SUCCESS_AND_HAS_MORE);
|
||||
return pj.on_success(finished ? SUCCESS : SUCCESS_AND_HAS_MORE);
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -49,10 +48,10 @@ unified_machine(const uint8_t *buf, size_t len, ParsedJson &pj, size_t &next_jso
|
|||
//
|
||||
switch (parser.c) {
|
||||
case '{':
|
||||
FAIL_IF( parser.push_start_scope(addresses.finish) );
|
||||
FAIL_IF( parser.start_object(addresses.finish) );
|
||||
goto object_begin;
|
||||
case '[':
|
||||
FAIL_IF( parser.push_start_scope(addresses.finish) );
|
||||
FAIL_IF( parser.start_array(addresses.finish) );
|
||||
goto array_begin;
|
||||
case '"':
|
||||
FAIL_IF( parser.parse_string() );
|
||||
|
@ -94,7 +93,8 @@ object_begin:
|
|||
goto object_key_parser;
|
||||
}
|
||||
case '}':
|
||||
goto scope_end; // could also go to object_continue
|
||||
parser.end_object();
|
||||
goto scope_end;
|
||||
default:
|
||||
goto error;
|
||||
}
|
||||
|
@ -111,20 +111,22 @@ object_continue:
|
|||
FAIL_IF( parser.parse_string() );
|
||||
goto object_key_parser;
|
||||
case '}':
|
||||
parser.end_object();
|
||||
goto scope_end;
|
||||
default:
|
||||
goto error;
|
||||
}
|
||||
|
||||
scope_end:
|
||||
CONTINUE( parser.pop_scope() );
|
||||
CONTINUE( parser.pj.ret_address[parser.depth] );
|
||||
|
||||
//
|
||||
// Array parser states
|
||||
//
|
||||
array_begin:
|
||||
if (parser.advance_char() == ']') {
|
||||
goto scope_end; // could also go to array_continue
|
||||
parser.end_array();
|
||||
goto scope_end;
|
||||
}
|
||||
|
||||
main_array_switch:
|
||||
|
@ -138,6 +140,7 @@ array_continue:
|
|||
parser.advance_char();
|
||||
goto main_array_switch;
|
||||
case ']':
|
||||
parser.end_array();
|
||||
goto scope_end;
|
||||
default:
|
||||
goto error;
|
||||
|
|
|
@ -3,6 +3,8 @@
|
|||
// We assume the file in which it is included already includes
|
||||
// "stringparsing.h" (this simplifies amalgamation)
|
||||
|
||||
namespace stringparsing {
|
||||
|
||||
// begin copypasta
|
||||
// These chars yield themselves: " \ /
|
||||
// b -> backspace, f -> formfeed, n -> newline, r -> cr, t -> horizontal tab
|
||||
|
@ -69,14 +71,10 @@ really_inline bool handle_unicode_codepoint(const uint8_t **src_ptr,
|
|||
return offset > 0;
|
||||
}
|
||||
|
||||
WARN_UNUSED really_inline bool parse_string(UNUSED const uint8_t *buf,
|
||||
UNUSED size_t len, ParsedJson &pj,
|
||||
UNUSED const uint32_t depth,
|
||||
UNUSED uint32_t offset) {
|
||||
pj.write_tape(pj.current_string_buf_loc - pj.string_buf.get(), '"');
|
||||
WARN_UNUSED really_inline uint8_t *parse_string(const uint8_t *buf,
|
||||
uint32_t offset,
|
||||
uint8_t *dst) {
|
||||
const uint8_t *src = &buf[offset + 1]; /* we know that buf at offset is a " */
|
||||
uint8_t *dst = pj.current_string_buf_loc + sizeof(uint32_t);
|
||||
const uint8_t *const start_of_string = dst;
|
||||
while (1) {
|
||||
parse_string_helper helper = find_bs_bits_and_quote_bits(src, dst);
|
||||
if (((helper.bs_bits - 1) & helper.quote_bits) != 0) {
|
||||
|
@ -86,26 +84,7 @@ WARN_UNUSED really_inline bool parse_string(UNUSED const uint8_t *buf,
|
|||
/* find out where the quote is... */
|
||||
auto quote_dist = trailing_zeroes(helper.quote_bits);
|
||||
|
||||
/* NULL termination is still handy if you expect all your strings to
|
||||
* be NULL terminated? */
|
||||
/* It comes at a small cost */
|
||||
dst[quote_dist] = 0;
|
||||
|
||||
uint32_t str_length = (dst - start_of_string) + quote_dist;
|
||||
memcpy(pj.current_string_buf_loc, &str_length, sizeof(str_length));
|
||||
/*****************************
|
||||
* Above, check for overflow in case someone has a crazy string
|
||||
* (>=4GB?) _
|
||||
* But only add the overflow check when the document itself exceeds
|
||||
* 4GB
|
||||
* Currently unneeded because we refuse to parse docs larger or equal
|
||||
* to 4GB.
|
||||
****************************/
|
||||
|
||||
/* we advance the point, accounting for the fact that we have a NULL
|
||||
* termination */
|
||||
pj.current_string_buf_loc = dst + quote_dist + 1;
|
||||
return true;
|
||||
return dst + quote_dist;
|
||||
}
|
||||
if (((helper.quote_bits - 1) & helper.bs_bits) != 0) {
|
||||
/* find out where the backslash is */
|
||||
|
@ -118,7 +97,7 @@ WARN_UNUSED really_inline bool parse_string(UNUSED const uint8_t *buf,
|
|||
src += bs_dist;
|
||||
dst += bs_dist;
|
||||
if (!handle_unicode_codepoint(&src, &dst)) {
|
||||
return false;
|
||||
return nullptr;
|
||||
}
|
||||
} else {
|
||||
/* simple 1:1 conversion. Will eat bs_dist+2 characters in input and
|
||||
|
@ -127,7 +106,7 @@ WARN_UNUSED really_inline bool parse_string(UNUSED const uint8_t *buf,
|
|||
* seen. I think this is ok */
|
||||
uint8_t escape_result = escape_map[escape_char];
|
||||
if (escape_result == 0u) {
|
||||
return false; /* bogus escape value is an error */
|
||||
return nullptr; /* bogus escape value is an error */
|
||||
}
|
||||
dst[bs_dist] = escape_result;
|
||||
src += bs_dist + 2;
|
||||
|
@ -141,5 +120,7 @@ WARN_UNUSED really_inline bool parse_string(UNUSED const uint8_t *buf,
|
|||
}
|
||||
}
|
||||
/* can't be reached */
|
||||
return true;
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
} // namespace stringparsing
|
|
@ -29,7 +29,7 @@ unified_machine<Architecture::HASWELL>(const uint8_t *buf, size_t len, ParsedJso
|
|||
|
||||
template <>
|
||||
WARN_UNUSED int
|
||||
unified_machine<Architecture::HASWELL>(const uint8_t *buf, size_t len, ParsedJson &pj, size_t &next_json) {
|
||||
unified_machine<Architecture::HASWELL>(const uint8_t *buf, size_t len, ParsedJson &pj, UNUSED size_t &next_json) {
|
||||
return haswell::stage2::unified_machine(buf, len, pj, next_json);
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue