From 165e23773f168bcecc3ffabf3c5951ec112e37d2 Mon Sep 17 00:00:00 2001 From: John Keiser Date: Tue, 31 Dec 2019 16:38:19 -0700 Subject: [PATCH] Refactor stage 2 into structural_parser class --- .github/workflows/fuzzers.yml | 3 +- include/simdjson/stage2_build_tape.h | 3 +- src/arm64/stage2_build_tape.h | 4 +- src/generic/stage2_build_tape.h | 791 +++++++++------------- src/generic/stage2_streaming_build_tape.h | 622 ++++------------- src/haswell/stage2_build_tape.h | 4 +- src/westmere/stage2_build_tape.h | 4 +- 7 files changed, 488 insertions(+), 943 deletions(-) diff --git a/.github/workflows/fuzzers.yml b/.github/workflows/fuzzers.yml index 8e2c83f5..71ec8ba1 100644 --- a/.github/workflows/fuzzers.yml +++ b/.github/workflows/fuzzers.yml @@ -64,7 +64,8 @@ jobs: done - name: Run the other fuzzer variants for $fuzzer, with sanitizers etc run: | - for fuzzer in $allfuzzers; do + set -x + for fuzzer in $allfuzzers; do build-ossfuzz-withavx/fuzz/fuzz_$fuzzer out/$fuzzer -max_total_time=20 $artifactsprefix || touch failed build-ossfuzz-noavx/fuzz/fuzz_$fuzzer out/$fuzzer -max_total_time=10 $artifactsprefix || touch failed build-ossfuzz-noavx8/fuzz/fuzz_$fuzzer out/$fuzzer -max_total_time=10 $artifactsprefix || touch failed diff --git a/include/simdjson/stage2_build_tape.h b/include/simdjson/stage2_build_tape.h index dbcf7f85..95010a8b 100644 --- a/include/simdjson/stage2_build_tape.h +++ b/include/simdjson/stage2_build_tape.h @@ -14,7 +14,8 @@ WARN_UNUSED int unified_machine(const uint8_t *buf, size_t len, ParsedJson &pj); template -int unified_machine(const char *buf, size_t len, ParsedJson &pj) { +WARN_UNUSED int +unified_machine(const char *buf, size_t len, ParsedJson &pj) { return unified_machine(reinterpret_cast(buf), len, pj); } diff --git a/src/arm64/stage2_build_tape.h b/src/arm64/stage2_build_tape.h index dc8f630c..b118b436 100644 --- a/src/arm64/stage2_build_tape.h +++ b/src/arm64/stage2_build_tape.h @@ -21,13 +21,13 @@ namespace simdjson { template <> 
WARN_UNUSED int unified_machine(const uint8_t *buf, size_t len, ParsedJson &pj) { - return arm64::unified_machine(buf, len, pj); + return arm64::stage2::unified_machine(buf, len, pj); } template <> WARN_UNUSED int unified_machine(const uint8_t *buf, size_t len, ParsedJson &pj, size_t &next_json) { - return arm64::unified_machine(buf, len, pj, next_json); + return arm64::stage2::unified_machine(buf, len, pj, next_json); } } // namespace simdjson diff --git a/src/generic/stage2_build_tape.h b/src/generic/stage2_build_tape.h index 99c34328..f096e51e 100644 --- a/src/generic/stage2_build_tape.h +++ b/src/generic/stage2_build_tape.h @@ -3,515 +3,394 @@ // We assume the file in which it is include already includes // "simdjson/stage2_build_tape.h" (this simplifies amalgation) -// this macro reads the next structural character, updating idx, i and c. -#define UPDATE_CHAR() \ - { \ - idx = pj.structural_indexes[i++]; \ - c = buf[idx]; \ - } +namespace stage2 { #ifdef SIMDJSON_USE_COMPUTED_GOTO -#define SET_GOTO_ARRAY_CONTINUE() pj.ret_address[depth] = &&array_continue; -#define SET_GOTO_OBJECT_CONTINUE() pj.ret_address[depth] = &&object_continue; -#define SET_GOTO_START_CONTINUE() pj.ret_address[depth] = &&start_continue; -#define GOTO_CONTINUE() goto *pj.ret_address[depth]; +typedef void* ret_address; +#define INIT_ADDRESSES() { &&array_begin, &&array_continue, &&error, &&finish, &&object_begin, &&object_continue } +#define GOTO(address) { goto *(address); } +#define CONTINUE(address) { goto *(address); } #else -#define SET_GOTO_ARRAY_CONTINUE() pj.ret_address[depth] = 'a'; -#define SET_GOTO_OBJECT_CONTINUE() pj.ret_address[depth] = 'o'; -#define SET_GOTO_START_CONTINUE() pj.ret_address[depth] = 's'; -#define GOTO_CONTINUE() \ - { \ - if (pj.ret_address[depth] == 'a') { \ - goto array_continue; \ - } else if (pj.ret_address[depth] == 'o') { \ - goto object_continue; \ - } else { \ - goto start_continue; \ - } \ +typedef char ret_address; +#define INIT_ADDRESSES() { '[', 
'a', 'e', 'f', '{', 'o' }; +#define GOTO(address) \ + { \ + switch(address) { \ + case '[': goto array_begin; \ + case 'a': goto array_continue; \ + case 'e': goto error; \ + case 'f': goto finish; \ + case '{': goto object_begin; \ + case 'o': goto object_continue; \ + } \ + } +// For the more constrained pop_scope() situation +#define CONTINUE(address) \ + { \ + switch(address) { \ + case 'a': goto array_continue; \ + case 'o': goto object_continue; \ + case 'f': goto finish; \ + } \ } #endif +struct unified_machine_addresses { + ret_address array_begin; + ret_address array_continue; + ret_address error; + ret_address finish; + ret_address object_begin; + ret_address object_continue; +}; + +#undef FAIL_IF +#define FAIL_IF(EXPR) { if (EXPR) { return addresses.error; } } + +// This is just so we can call parse_string() from parser.parse_string() without conflict. +WARN_UNUSED really_inline bool +really_parse_string(const uint8_t *buf, size_t len, ParsedJson &pj, uint32_t depth, uint32_t idx) { + return parse_string(buf, len, pj, depth, idx); +} +WARN_UNUSED really_inline bool +really_parse_number(const uint8_t *const buf, ParsedJson &pj, const uint32_t offset, bool found_minus) { + return parse_number(buf, pj, offset, found_minus); +} + +struct structural_parser { + const uint8_t* const buf; + const size_t len; + ParsedJson &pj; + uint32_t i; // next structural index + uint32_t idx; // location of the structural character in the input (buf) + uint8_t c; // used to track the (structural) character we are looking at + uint32_t depth = 0; // could have an arbitrary starting depth + + really_inline structural_parser(const uint8_t *_buf, size_t _len, ParsedJson &_pj, uint32_t _i = 0) : buf{_buf}, len{_len}, pj{_pj}, i{_i} {} + + WARN_UNUSED really_inline int set_error_code(ErrorValues error_code) { + pj.error_code = error_code; + return error_code; + } + + really_inline char advance_char() { + idx = pj.structural_indexes[i++]; + c = buf[idx]; + return c; + } + + 
template + really_inline bool with_space_terminated_copy(const F& f) { + /** + * We need to make a copy to make sure that the string is space terminated. + * This is not about padding the input, which should already padded up + * to len + SIMDJSON_PADDING. However, we have no control at this stage + * on how the padding was done. What if the input string was padded with nulls? + * It is quite common for an input string to have an extra null character (C string). + * We do not want to allow 9\0 (where \0 is the null character) inside a JSON + * document, but the string "9\0" by itself is fine. So we make a copy and + * pad the input with spaces when we know that there is just one input element. + * This copy is relatively expensive, but it will almost never be called in + * practice unless you are in the strange scenario where you have many JSON + * documents made of single numbers. + */ + char *copy = static_cast(malloc(len + SIMDJSON_PADDING)); + if (copy == nullptr) { + return true; + } + memcpy(copy, buf, len); + memset(copy + len, ' ', SIMDJSON_PADDING); + bool result = f(reinterpret_cast(copy), idx); + free(copy); + return result; + } + + WARN_UNUSED really_inline bool push_start_scope(ret_address continue_state, char type) { + pj.containing_scope_offset[depth] = pj.get_current_loc(); + pj.ret_address[depth] = continue_state; + depth++; + pj.write_tape(0, type); + return depth >= pj.depth_capacity; + } + + WARN_UNUSED really_inline bool push_start_scope(ret_address continue_state) { + return push_start_scope(continue_state, c); + } + + WARN_UNUSED really_inline bool push_scope(ret_address continue_state) { + pj.containing_scope_offset[depth] = pj.get_current_loc(); + pj.write_tape(0, c); // Do this as early as possible + pj.ret_address[depth] = continue_state; + depth++; + return depth >= pj.depth_capacity; + } + + WARN_UNUSED really_inline ret_address pop_scope() { + // write our tape location to the header scope + depth--; + 
pj.write_tape(pj.containing_scope_offset[depth], c); + pj.annotate_previous_loc(pj.containing_scope_offset[depth], pj.get_current_loc()); + return pj.ret_address[depth]; + } + really_inline void pop_root_scope() { + // write our tape location to the header scope + // The root scope gets written *at* the previous location. + depth--; + pj.annotate_previous_loc(pj.containing_scope_offset[depth], pj.get_current_loc()); + pj.write_tape(pj.containing_scope_offset[depth], 'r'); + } + + WARN_UNUSED really_inline bool parse_string() { + return !really_parse_string(buf, len, pj, depth, idx); + } + + WARN_UNUSED really_inline bool parse_number(const uint8_t *copy, uint32_t offset, bool found_minus) { + return !really_parse_number(copy, pj, offset, found_minus); + } + WARN_UNUSED really_inline bool parse_number(bool found_minus) { + return parse_number(buf, idx, found_minus); + } + + WARN_UNUSED really_inline bool parse_atom(const uint8_t *copy, uint32_t offset) { + switch (c) { + case 't': + if (!is_valid_true_atom(copy + offset)) { return true; }; + break; + case 'f': + if (!is_valid_false_atom(copy + offset)) { return true; } + break; + case 'n': + if (!is_valid_null_atom(copy + offset)) { return true; } + break; + default: + return false; + } + pj.write_tape(0, c); + return false; + } + + WARN_UNUSED really_inline bool parse_atom() { + return parse_atom(buf, idx); + } + + WARN_UNUSED really_inline ret_address parse_value(const unified_machine_addresses &addresses, ret_address continue_state) { + switch (c) { + case '"': + FAIL_IF( parse_string() ); + return continue_state; + case 't': case 'f': case 'n': + FAIL_IF( parse_atom() ); + return continue_state; + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + FAIL_IF( parse_number(false) ); + return continue_state; + case '-': + FAIL_IF( parse_number(true) ); + return continue_state; + case '{': + FAIL_IF( push_scope(continue_state) ); + return addresses.object_begin; + 
case '[': + FAIL_IF( push_scope(continue_state) ); + return addresses.array_begin; + default: + return addresses.error; + } + } + + WARN_UNUSED really_inline int finish() { + // the string might not be NULL terminated. + if ( i + 1 != pj.n_structural_indexes ) { + return set_error_code(TAPE_ERROR); + } + pop_root_scope(); + if (depth != 0) { + return set_error_code(TAPE_ERROR); + } + if (pj.containing_scope_offset[depth] != 0) { + return set_error_code(TAPE_ERROR); + } + + pj.valid = true; + return set_error_code(SUCCESS); + } + + WARN_UNUSED really_inline int error() { + /* we do not need the next line because this is done by pj.init(), + * pessimistically. + * pj.is_valid = false; + * At this point in the code, we have all the time in the world. + * Note that we know exactly where we are in the document so we could, + * without any overhead on the processing code, report a specific + * location. + * We could even trigger special code paths to assess what happened + * carefully, + * all without any added cost. 
*/ + if (depth >= pj.depth_capacity) { + return set_error_code(DEPTH_ERROR); + } + switch (c) { + case '"': + return set_error_code(STRING_ERROR); + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + case '-': + return set_error_code(NUMBER_ERROR); + case 't': + return set_error_code(T_ATOM_ERROR); + case 'n': + return set_error_code(N_ATOM_ERROR); + case 'f': + return set_error_code(F_ATOM_ERROR); + default: + return set_error_code(TAPE_ERROR); + } + } + + WARN_UNUSED really_inline int start(ret_address finish_state) { + pj.init(); // sets is_valid to false + if (len > pj.byte_capacity) { + return CAPACITY; + } + // Advance to the first character as soon as possible + advance_char(); + // Push the root scope (there is always at least one scope) + if (push_start_scope(finish_state, 'r')) { + return DEPTH_ERROR; + } + return SUCCESS; + } +}; + +// Redefine FAIL_IF to use goto since it'll be used inside the function now +#undef FAIL_IF +#define FAIL_IF(EXPR) { if (EXPR) { goto error; } } + /************ * The JSON is parsed to a tape, see the accompanying tape.md file * for documentation. ***********/ WARN_UNUSED int unified_machine(const uint8_t *buf, size_t len, ParsedJson &pj) { - uint32_t i = 0; /* index of the structural character (0,1,2,3...) 
*/ - uint32_t idx; /* location of the structural character in the input (buf) */ - uint8_t c; /* used to track the (structural) character we are looking at, - updated */ - /* by UPDATE_CHAR macro */ - uint32_t depth = 0; /* could have an arbitrary starting depth */ - pj.init(); /* sets is_valid to false */ - if (pj.byte_capacity < len) { - pj.error_code = simdjson::CAPACITY; - return pj.error_code; - } + static constexpr unified_machine_addresses addresses = INIT_ADDRESSES(); + structural_parser parser(buf, len, pj); + int result = parser.start(addresses.finish); + if (result) { return result; } - /*//////////////////////////// START STATE ///////////////////////////// - */ - SET_GOTO_START_CONTINUE() - pj.containing_scope_offset[depth] = pj.get_current_loc(); - pj.write_tape(0, 'r'); /* r for root, 0 is going to get overwritten */ - /* the root is used, if nothing else, to capture the size of the tape */ - depth++; /* everything starts at depth = 1, depth = 0 is just for the - root, the root may contain an object, an array or something - else. */ - if (depth >= pj.depth_capacity) { - goto fail; - } - - UPDATE_CHAR(); - switch (c) { + // + // Read first value + // + switch (parser.c) { case '{': - pj.containing_scope_offset[depth] = pj.get_current_loc(); - SET_GOTO_START_CONTINUE(); - depth++; - if (depth >= pj.depth_capacity) { - goto fail; - } - pj.write_tape( - 0, c); /* strangely, moving this to object_begin slows things down */ + FAIL_IF( parser.push_start_scope(addresses.finish) ); goto object_begin; case '[': - pj.containing_scope_offset[depth] = pj.get_current_loc(); - SET_GOTO_START_CONTINUE(); - depth++; - if (depth >= pj.depth_capacity) { - goto fail; - } - pj.write_tape(0, c); + FAIL_IF( parser.push_start_scope(addresses.finish) ); goto array_begin; - /* #define SIMDJSON_ALLOWANYTHINGINROOT - * A JSON text is a serialized value. Note that certain previous - * specifications of JSON constrained a JSON text to be an object or an - * array. 
Implementations that generate only objects or arrays where a - * JSON text is called for will be interoperable in the sense that all - * implementations will accept these as conforming JSON texts. - * https://tools.ietf.org/html/rfc8259 - * #ifdef SIMDJSON_ALLOWANYTHINGINROOT */ - case '"': { - if (!parse_string(buf, len, pj, depth, idx)) { - goto fail; - } - break; - } - case 't': { - /* we need to make a copy to make sure that the string is space - * terminated. - * this only applies to the JSON document made solely of the true value. - * this will almost never be called in practice */ - char *copy = static_cast(malloc(len + SIMDJSON_PADDING)); - if (copy == nullptr) { - goto fail; - } - memcpy(copy, buf, len); - memset(copy + len, ' ', sizeof(uint64_t)); - if (!is_valid_true_atom(reinterpret_cast(copy) + idx)) { - free(copy); - goto fail; - } - free(copy); - pj.write_tape(0, c); - break; - } - case 'f': { - /* we need to make a copy to make sure that the string is space - * terminated. - * this only applies to the JSON document made solely of the false - * value. - * this will almost never be called in practice */ - char *copy = static_cast(malloc(len + SIMDJSON_PADDING)); - if (copy == nullptr) { - goto fail; - } - memcpy(copy, buf, len); - memset(copy + len, ' ', sizeof(uint64_t)); - if (!is_valid_false_atom(reinterpret_cast(copy) + idx)) { - free(copy); - goto fail; - } - free(copy); - pj.write_tape(0, c); - break; - } - case 'n': { - /* we need to make a copy to make sure that the string is space - * terminated. - * this only applies to the JSON document made solely of the null value. 
- * this will almost never be called in practice */ - char *copy = static_cast(malloc(len + SIMDJSON_PADDING)); - if (copy == nullptr) { - goto fail; - } - memcpy(copy, buf, len); - memset(copy + len, ' ', sizeof(uint64_t)); - if (!is_valid_null_atom(reinterpret_cast(copy) + idx)) { - free(copy); - goto fail; - } - free(copy); - pj.write_tape(0, c); - break; - } - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': { - /* we need to make a copy to make sure that the string is space - * terminated. - * this is done only for JSON documents made of a sole number - * this will almost never be called in practice. We terminate with a - * space - * because we do not want to allow NULLs in the middle of a number - * (whereas a - * space in the middle of a number would be identified in stage 1). */ - char *copy = static_cast(malloc(len + SIMDJSON_PADDING)); - if (copy == nullptr) { - goto fail; - } - memcpy(copy, buf, len); - memset(copy + len, ' ', SIMDJSON_PADDING); - if (!parse_number(reinterpret_cast(copy), pj, idx, - false)) { - free(copy); - goto fail; - } - free(copy); - break; - } - case '-': { - /* we need to make a copy to make sure that the string is NULL - * terminated. 
- * this is done only for JSON documents made of a sole number - * this will almost never be called in practice */ - char *copy = static_cast(malloc(len + SIMDJSON_PADDING)); - if (copy == nullptr) { - goto fail; - } - memcpy(copy, buf, len); - memset(copy + len, ' ', SIMDJSON_PADDING); - if (!parse_number(reinterpret_cast(copy), pj, idx, true)) { - free(copy); - goto fail; - } - free(copy); - break; - } + case '"': + FAIL_IF( parser.parse_string() ); + goto finish; + case 't': case 'f': case 'n': + FAIL_IF( + parser.with_space_terminated_copy([&](auto copy, auto idx) { + return parser.parse_atom(copy, idx); + }) + ); + goto finish; + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + FAIL_IF( + parser.with_space_terminated_copy([&](auto copy, auto idx) { + return parser.parse_number(copy, idx, false); + }) + ); + goto finish; + case '-': + FAIL_IF( + parser.with_space_terminated_copy([&](auto copy, auto idx) { + return parser.parse_number(copy, idx, true); + }) + ); + goto finish; default: - goto fail; + goto error; } -start_continue: - /* the string might not be NULL terminated. 
*/ - if (i + 1 == pj.n_structural_indexes) { - goto succeed; - } else { - goto fail; - } - /*//////////////////////////// OBJECT STATES ///////////////////////////*/ +// +// Object parser states +// object_begin: - UPDATE_CHAR(); - switch (c) { + parser.advance_char(); + switch (parser.c) { case '"': { - if (!parse_string(buf, len, pj, depth, idx)) { - goto fail; - } + FAIL_IF( parser.parse_string() ); goto object_key_state; } case '}': - goto scope_end; /* could also go to object_continue */ + goto scope_end; // could also go to object_continue default: - goto fail; + goto error; } object_key_state: - UPDATE_CHAR(); - if (c != ':') { - goto fail; - } - UPDATE_CHAR(); - switch (c) { - case '"': { - if (!parse_string(buf, len, pj, depth, idx)) { - goto fail; - } - break; - } - case 't': - if (!is_valid_true_atom(buf + idx)) { - goto fail; - } - pj.write_tape(0, c); - break; - case 'f': - if (!is_valid_false_atom(buf + idx)) { - goto fail; - } - pj.write_tape(0, c); - break; - case 'n': - if (!is_valid_null_atom(buf + idx)) { - goto fail; - } - pj.write_tape(0, c); - break; - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': { - if (!parse_number(buf, pj, idx, false)) { - goto fail; - } - break; - } - case '-': { - if (!parse_number(buf, pj, idx, true)) { - goto fail; - } - break; - } - case '{': { - pj.containing_scope_offset[depth] = pj.get_current_loc(); - pj.write_tape(0, c); /* here the compilers knows what c is so this gets - optimized */ - /* we have not yet encountered } so we need to come back for it */ - SET_GOTO_OBJECT_CONTINUE() - /* we found an object inside an object, so we need to increment the - * depth */ - depth++; - if (depth >= pj.depth_capacity) { - goto fail; - } + FAIL_IF( parser.advance_char() != ':' ); - goto object_begin; - } - case '[': { - pj.containing_scope_offset[depth] = pj.get_current_loc(); - pj.write_tape(0, c); /* here the compilers knows what c is so this gets - 
optimized */ - /* we have not yet encountered } so we need to come back for it */ - SET_GOTO_OBJECT_CONTINUE() - /* we found an array inside an object, so we need to increment the depth - */ - depth++; - if (depth >= pj.depth_capacity) { - goto fail; - } - goto array_begin; - } - default: - goto fail; - } + parser.advance_char(); + GOTO( parser.parse_value(addresses, addresses.object_continue) ); object_continue: - UPDATE_CHAR(); - switch (c) { + switch (parser.advance_char()) { case ',': - UPDATE_CHAR(); - if (c != '"') { - goto fail; - } else { - if (!parse_string(buf, len, pj, depth, idx)) { - goto fail; - } - goto object_key_state; - } + FAIL_IF( parser.advance_char() != '"' ); + FAIL_IF( parser.parse_string() ); + goto object_key_state; case '}': goto scope_end; default: - goto fail; + goto error; } - /*//////////////////////////// COMMON STATE ///////////////////////////*/ - scope_end: - /* write our tape location to the header scope */ - depth--; - pj.write_tape(pj.containing_scope_offset[depth], c); - pj.annotate_previous_loc(pj.containing_scope_offset[depth], - pj.get_current_loc()); - /* goto saved_state */ - GOTO_CONTINUE() + CONTINUE( parser.pop_scope() ); - /*//////////////////////////// ARRAY STATES ///////////////////////////*/ +// +// Array parser states +// array_begin: - UPDATE_CHAR(); - if (c == ']') { - goto scope_end; /* could also go to array_continue */ + if (parser.advance_char() == ']') { + goto scope_end; // could also go to array_continue } main_array_switch: - /* we call update char on all paths in, so we can peek at c on the + /* we call update char on all paths in, so we can peek at parser.c on the * on paths that can accept a close square brace (post-, and at start) */ - switch (c) { - case '"': { - if (!parse_string(buf, len, pj, depth, idx)) { - goto fail; - } - break; - } - case 't': - if (!is_valid_true_atom(buf + idx)) { - goto fail; - } - pj.write_tape(0, c); - break; - case 'f': - if (!is_valid_false_atom(buf + idx)) { - goto 
fail; - } - pj.write_tape(0, c); - break; - case 'n': - if (!is_valid_null_atom(buf + idx)) { - goto fail; - } - pj.write_tape(0, c); - break; /* goto array_continue; */ - - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': { - if (!parse_number(buf, pj, idx, false)) { - goto fail; - } - break; /* goto array_continue; */ - } - case '-': { - if (!parse_number(buf, pj, idx, true)) { - goto fail; - } - break; /* goto array_continue; */ - } - case '{': { - /* we have not yet encountered ] so we need to come back for it */ - pj.containing_scope_offset[depth] = pj.get_current_loc(); - pj.write_tape(0, c); /* here the compilers knows what c is so this gets - optimized */ - SET_GOTO_ARRAY_CONTINUE() - /* we found an object inside an array, so we need to increment the depth - */ - depth++; - if (depth >= pj.depth_capacity) { - goto fail; - } - - goto object_begin; - } - case '[': { - /* we have not yet encountered ] so we need to come back for it */ - pj.containing_scope_offset[depth] = pj.get_current_loc(); - pj.write_tape(0, c); /* here the compilers knows what c is so this gets - optimized */ - SET_GOTO_ARRAY_CONTINUE() - /* we found an array inside an array, so we need to increment the depth - */ - depth++; - if (depth >= pj.depth_capacity) { - goto fail; - } - goto array_begin; - } - default: - goto fail; - } + GOTO( parser.parse_value(addresses, addresses.array_continue) ); array_continue: - UPDATE_CHAR(); - switch (c) { + switch (parser.advance_char()) { case ',': - UPDATE_CHAR(); + parser.advance_char(); goto main_array_switch; case ']': goto scope_end; default: - goto fail; + goto error; } - /*//////////////////////////// FINAL STATES ///////////////////////////*/ +finish: + return parser.finish(); -succeed: - depth--; - if (depth != 0) { - fprintf(stderr, "internal bug\n"); - abort(); - } - if (pj.containing_scope_offset[depth] != 0) { - fprintf(stderr, "internal bug\n"); - abort(); - } - 
pj.annotate_previous_loc(pj.containing_scope_offset[depth], - pj.get_current_loc()); - pj.write_tape(pj.containing_scope_offset[depth], 'r'); /* r is root */ - - pj.valid = true; - pj.error_code = simdjson::SUCCESS; - return pj.error_code; -fail: - /* we do not need the next line because this is done by pj.init(), - * pessimistically. - * pj.is_valid = false; - * At this point in the code, we have all the time in the world. - * Note that we know exactly where we are in the document so we could, - * without any overhead on the processing code, report a specific - * location. - * We could even trigger special code paths to assess what happened - * carefully, - * all without any added cost. */ - if (depth >= pj.depth_capacity) { - pj.error_code = simdjson::DEPTH_ERROR; - return pj.error_code; - } - switch (c) { - case '"': - pj.error_code = simdjson::STRING_ERROR; - return pj.error_code; - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - case '-': - pj.error_code = simdjson::NUMBER_ERROR; - return pj.error_code; - case 't': - pj.error_code = simdjson::T_ATOM_ERROR; - return pj.error_code; - case 'n': - pj.error_code = simdjson::N_ATOM_ERROR; - return pj.error_code; - case 'f': - pj.error_code = simdjson::F_ATOM_ERROR; - return pj.error_code; - default: - break; - } - pj.error_code = simdjson::TAPE_ERROR; - return pj.error_code; +error: + return parser.error(); } + +} // namespace stage2 \ No newline at end of file diff --git a/src/generic/stage2_streaming_build_tape.h b/src/generic/stage2_streaming_build_tape.h index 0b43a7fc..4af13d90 100755 --- a/src/generic/stage2_streaming_build_tape.h +++ b/src/generic/stage2_streaming_build_tape.h @@ -1,497 +1,161 @@ +namespace stage2 { + +struct streaming_structural_parser: structural_parser { + really_inline streaming_structural_parser(const uint8_t *_buf, size_t _len, ParsedJson &_pj, size_t _i) : structural_parser(_buf, _len, _pj, _i) {} + + // override 
to add streaming + WARN_UNUSED really_inline int start(ret_address finish_parser) { + pj.init(); // sets is_valid to false + // Capacity ain't no thang for streaming, so we don't check it. + // Advance to the first character as soon as possible + advance_char(); + // Push the root scope (there is always at least one scope) + if (push_start_scope(finish_parser, 'r')) { + return DEPTH_ERROR; + } + return SUCCESS; + } + + // override to add streaming + WARN_UNUSED really_inline int finish() { + /* the string might not be NULL terminated. */ + if ( i + 1 > pj.n_structural_indexes ) { + return set_error_code(TAPE_ERROR); + } + bool finished = i + 1 == pj.n_structural_indexes; + if (finished && buf[idx+2] != '\0') { + return set_error_code(TAPE_ERROR); + } + pop_root_scope(); + if (depth != 0) { + return set_error_code(TAPE_ERROR); + } + if (pj.containing_scope_offset[depth] != 0) { + return set_error_code(TAPE_ERROR); + } + + pj.valid = true; + return set_error_code(finished ? SUCCESS : SUCCESS_AND_HAS_MORE); + } +}; + /************ * The JSON is parsed to a tape, see the accompanying tape.md file * for documentation. ***********/ WARN_UNUSED int unified_machine(const uint8_t *buf, size_t len, ParsedJson &pj, size_t &next_json) { - size_t i{next_json}; /* index of the structural character (0,1,2,3...) 
*/ - size_t idx; /* location of the structural character in the input (buf) */ - uint8_t c; /* used to track the (structural) character we are looking at, - updated */ - /* by UPDATE_CHAR macro */ - size_t depth = 0; /* could have an arbitrary starting depth */ - pj.init(); /* sets is_valid to false */ - /*//////////////////////////// START STATE ///////////////////////////// - */ - SET_GOTO_START_CONTINUE() - pj.containing_scope_offset[depth] = pj.get_current_loc(); - pj.write_tape(0, 'r'); /* r for root, 0 is going to get overwritten */ - /* the root is used, if nothing else, to capture the size of the tape */ - depth++; /* everything starts at depth = 1, depth = 0 is just for the - root, the root may contain an object, an array or something - else. */ - if (depth >= pj.depth_capacity) { - goto fail; - } + static constexpr unified_machine_addresses addresses = INIT_ADDRESSES(); + streaming_structural_parser parser(buf, len, pj, next_json); + int result = parser.start(addresses.finish); + if (result) { return result; } - UPDATE_CHAR(); - switch (c) { - case '{': - pj.containing_scope_offset[depth] = pj.get_current_loc(); - SET_GOTO_START_CONTINUE(); - depth++; - if (depth >= pj.depth_capacity) { - goto fail; - } - pj.write_tape( - 0, c); /* strangely, moving this to object_begin slows things down */ - goto object_begin; - case '[': - pj.containing_scope_offset[depth] = pj.get_current_loc(); - SET_GOTO_START_CONTINUE(); - depth++; - if (depth >= pj.depth_capacity) { - goto fail; - } - pj.write_tape(0, c); - goto array_begin; - /* #define SIMDJSON_ALLOWANYTHINGINROOT - * A JSON text is a serialized value. Note that certain previous - * specifications of JSON constrained a JSON text to be an object or an - * array. Implementations that generate only objects or arrays where a - * JSON text is called for will be interoperable in the sense that all - * implementations will accept these as conforming JSON texts. 
- * https://tools.ietf.org/html/rfc8259 - * #ifdef SIMDJSON_ALLOWANYTHINGINROOT */ - case '"': { - if (!parse_string(buf, len, pj, depth, idx)) { - goto fail; - } - break; - } - case 't': { - /* we need to make a copy to make sure that the string is space - * terminated. - * this only applies to the JSON document made solely of the true value. - * this will almost never be called in practice */ - char *copy = static_cast(malloc(len + SIMDJSON_PADDING)); - if (copy == nullptr) { - goto fail; - } - memcpy(copy, buf, len); - copy[len] = ' '; - if (!is_valid_true_atom(reinterpret_cast(copy) + idx)) { - free(copy); - goto fail; - } - free(copy); - pj.write_tape(0, c); - break; - } - case 'f': { - /* we need to make a copy to make sure that the string is space - * terminated. - * this only applies to the JSON document made solely of the false - * value. - * this will almost never be called in practice */ - char *copy = static_cast(malloc(len + SIMDJSON_PADDING)); - if (copy == nullptr) { - goto fail; - } - memcpy(copy, buf, len); - copy[len] = ' '; - if (!is_valid_false_atom(reinterpret_cast(copy) + idx)) { - free(copy); - goto fail; - } - free(copy); - pj.write_tape(0, c); - break; - } - case 'n': { - /* we need to make a copy to make sure that the string is space - * terminated. - * this only applies to the JSON document made solely of the null value. - * this will almost never be called in practice */ - char *copy = static_cast(malloc(len + SIMDJSON_PADDING)); - if (copy == nullptr) { - goto fail; - } - memcpy(copy, buf, len); - copy[len] = ' '; - if (!is_valid_null_atom(reinterpret_cast(copy) + idx)) { - free(copy); - goto fail; - } - free(copy); - pj.write_tape(0, c); - break; - } - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': { - /* we need to make a copy to make sure that the string is space - * terminated. 
- * this is done only for JSON documents made of a sole number - * this will almost never be called in practice. We terminate with a - * space - * because we do not want to allow NULLs in the middle of a number - * (whereas a - * space in the middle of a number would be identified in stage 1). */ - char *copy = static_cast(malloc(len + SIMDJSON_PADDING)); - if (copy == nullptr) { - goto fail; - } - memcpy(copy, buf, len); - copy[len] = ' '; - if (!parse_number(reinterpret_cast(copy), pj, idx, - false)) { - free(copy); - goto fail; - } - free(copy); - break; - } - case '-': { - /* we need to make a copy to make sure that the string is NULL - * terminated. - * this is done only for JSON documents made of a sole number - * this will almost never be called in practice */ - char *copy = static_cast(malloc(len + SIMDJSON_PADDING)); - if (copy == nullptr) { - goto fail; - } - memcpy(copy, buf, len); - copy[len] = ' '; - if (!parse_number(reinterpret_cast(copy), pj, idx, true)) { - free(copy); - goto fail; - } - free(copy); - break; - } - default: - goto fail; - } - start_continue: - /* the string might not be NULL terminated. 
*/ - if (i + 1 == pj.n_structural_indexes && buf[idx+2] == '\0') { - goto succeed; - } else if(depth == 1 && i<=pj.n_structural_indexes) { - goto succeedAndHasMore; - } else { - goto fail; - } - /*//////////////////////////// OBJECT STATES ///////////////////////////*/ + // + // Read first value + // + switch (parser.c) { + case '{': + FAIL_IF( parser.push_start_scope(addresses.finish) ); + goto object_begin; + case '[': + FAIL_IF( parser.push_start_scope(addresses.finish) ); + goto array_begin; + case '"': + FAIL_IF( parser.parse_string() ); + goto finish; + case 't': case 'f': case 'n': + FAIL_IF( + parser.with_space_terminated_copy([&](auto copy, auto idx) { + return parser.parse_atom(copy, idx); + }) + ); + goto finish; + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + FAIL_IF( + parser.with_space_terminated_copy([&](auto copy, auto idx) { + return parser.parse_number(copy, idx, false); + }) + ); + goto finish; + case '-': + FAIL_IF( + parser.with_space_terminated_copy([&](auto copy, auto idx) { + return parser.parse_number(copy, idx, true); + }) + ); + goto finish; + default: + goto error; + } - object_begin: - UPDATE_CHAR(); - switch (c) { - case '"': { - if (!parse_string(buf, len, pj, depth, idx)) { - goto fail; - } - goto object_key_state; - } - case '}': - goto scope_end; /* could also go to object_continue */ - default: - goto fail; - } +// +// Object parser states +// +object_begin: + parser.advance_char(); + switch (parser.c) { + case '"': { + FAIL_IF( parser.parse_string() ); + goto object_key_parser; + } + case '}': + goto scope_end; // could also go to object_continue + default: + goto error; + } - object_key_state: - UPDATE_CHAR(); - if (c != ':') { - goto fail; - } - UPDATE_CHAR(); - switch (c) { - case '"': { - if (!parse_string(buf, len, pj, depth, idx)) { - goto fail; - } - break; - } - case 't': - if (!is_valid_true_atom(buf + idx)) { - goto fail; - } - pj.write_tape(0, c); - break; -
case 'f': - if (!is_valid_false_atom(buf + idx)) { - goto fail; - } - pj.write_tape(0, c); - break; - case 'n': - if (!is_valid_null_atom(buf + idx)) { - goto fail; - } - pj.write_tape(0, c); - break; - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': { - if (!parse_number(buf, pj, idx, false)) { - goto fail; - } - break; - } - case '-': { - if (!parse_number(buf, pj, idx, true)) { - goto fail; - } - break; - } - case '{': { - pj.containing_scope_offset[depth] = pj.get_current_loc(); - pj.write_tape(0, c); /* here the compilers knows what c is so this gets - optimized */ - /* we have not yet encountered } so we need to come back for it */ - SET_GOTO_OBJECT_CONTINUE() - /* we found an object inside an object, so we need to increment the - * depth */ - depth++; - if (depth >= pj.depth_capacity) { - goto fail; - } +object_key_parser: + FAIL_IF( parser.advance_char() != ':' ); - goto object_begin; - } - case '[': { - pj.containing_scope_offset[depth] = pj.get_current_loc(); - pj.write_tape(0, c); /* here the compilers knows what c is so this gets - optimized */ - /* we have not yet encountered } so we need to come back for it */ - SET_GOTO_OBJECT_CONTINUE() - /* we found an array inside an object, so we need to increment the depth - */ - depth++; - if (depth >= pj.depth_capacity) { - goto fail; - } - goto array_begin; - } - default: - goto fail; - } + parser.advance_char(); + GOTO( parser.parse_value(addresses, addresses.object_continue) ); - object_continue: - UPDATE_CHAR(); - switch (c) { - case ',': - UPDATE_CHAR(); - if (c != '"') { - goto fail; - } else { - if (!parse_string(buf, len, pj, depth, idx)) { - goto fail; - } - goto object_key_state; - } - case '}': - goto scope_end; - default: - goto fail; - } +object_continue: + switch (parser.advance_char()) { + case ',': + FAIL_IF( parser.advance_char() != '"' ); + FAIL_IF( parser.parse_string() ); + goto object_key_parser; + case '}': + goto 
scope_end; + default: + goto error; + } - /*//////////////////////////// COMMON STATE ///////////////////////////*/ +scope_end: + CONTINUE( parser.pop_scope() ); - scope_end: - /* write our tape location to the header scope */ - depth--; - pj.write_tape(pj.containing_scope_offset[depth], c); - pj.annotate_previous_loc(pj.containing_scope_offset[depth], - pj.get_current_loc()); - /* goto saved_state */ - GOTO_CONTINUE() +// +// Array parser states +// +array_begin: + if (parser.advance_char() == ']') { + goto scope_end; // could also go to array_continue + } - /*//////////////////////////// ARRAY STATES ///////////////////////////*/ - array_begin: - UPDATE_CHAR(); - if (c == ']') { - goto scope_end; /* could also go to array_continue */ - } +main_array_switch: + /* we call advance_char() on all paths in, so we can peek at parser.c on the + * paths that can accept a close square brace (post-, and at start) */ + GOTO( parser.parse_value(addresses, addresses.array_continue) ); - main_array_switch: - /* we call update char on all paths in, so we can peek at c on the - * on paths that can accept a close square brace (post-, and at start) */ - switch (c) { - case '"': { - if (!parse_string(buf, len, pj, depth, idx)) { - goto fail; - } - break; - } - case 't': - if (!is_valid_true_atom(buf + idx)) { - goto fail; - } - pj.write_tape(0, c); - break; - case 'f': - if (!is_valid_false_atom(buf + idx)) { - goto fail; - } - pj.write_tape(0, c); - break; - case 'n': - if (!is_valid_null_atom(buf + idx)) { - goto fail; - } - pj.write_tape(0, c); - break; /* goto array_continue; */ +array_continue: + switch (parser.advance_char()) { + case ',': + parser.advance_char(); + goto main_array_switch; + case ']': + goto scope_end; + default: + goto error; + } - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': { - if (!parse_number(buf, pj, idx, false)) { - goto fail; - } - break; /* goto array_continue; */ - } - case
'-': { - if (!parse_number(buf, pj, idx, true)) { - goto fail; - } - break; /* goto array_continue; */ - } - case '{': { - /* we have not yet encountered ] so we need to come back for it */ - pj.containing_scope_offset[depth] = pj.get_current_loc(); - pj.write_tape(0, c); /* here the compilers knows what c is so this gets - optimized */ - SET_GOTO_ARRAY_CONTINUE() - /* we found an object inside an array, so we need to increment the depth - */ - depth++; - if (depth >= pj.depth_capacity) { - goto fail; - } +finish: + next_json = parser.i; + return parser.finish(); - goto object_begin; - } - case '[': { - /* we have not yet encountered ] so we need to come back for it */ - pj.containing_scope_offset[depth] = pj.get_current_loc(); - pj.write_tape(0, c); /* here the compilers knows what c is so this gets - optimized */ - SET_GOTO_ARRAY_CONTINUE() - /* we found an array inside an array, so we need to increment the depth - */ - depth++; - if (depth >= pj.depth_capacity) { - goto fail; - } - goto array_begin; - } - default: - goto fail; - } - - array_continue: - UPDATE_CHAR(); - switch (c) { - case ',': - UPDATE_CHAR(); - goto main_array_switch; - case ']': - goto scope_end; - default: - goto fail; - } - - /*//////////////////////////// FINAL STATES ///////////////////////////*/ - succeedAndHasMore: - depth--; - if (pj.containing_scope_offset[depth] != 0) { - fprintf(stderr, "internal bug\n"); - abort(); - } - pj.annotate_previous_loc(pj.containing_scope_offset[depth], - pj.get_current_loc()); - pj.write_tape(pj.containing_scope_offset[depth], 'r'); /* r is root */ - - - next_json = i; - - pj.valid = true; - pj.error_code = simdjson::SUCCESS_AND_HAS_MORE; - return pj.error_code; - - succeed: - depth--; - if (depth != 0) { - fprintf(stderr, "internal bug\n"); - abort(); - } - if (pj.containing_scope_offset[depth] != 0) { - fprintf(stderr, "internal bug\n"); - abort(); - } - pj.annotate_previous_loc(pj.containing_scope_offset[depth], - pj.get_current_loc()); - 
pj.write_tape(pj.containing_scope_offset[depth], 'r'); /* r is root */ - - pj.valid = true; - pj.error_code = simdjson::SUCCESS; - return pj.error_code; - fail: - /* we do not need the next line because this is done by pj.init(), - * pessimistically. - * pj.is_valid = false; - * At this point in the code, we have all the time in the world. - * Note that we know exactly where we are in the document so we could, - * without any overhead on the processing code, report a specific - * location. - * We could even trigger special code paths to assess what happened - * carefully, - * all without any added cost. */ - if (depth >= pj.depth_capacity) { - pj.error_code = simdjson::DEPTH_ERROR; - return pj.error_code; - } - switch (c) { - case '"': - pj.error_code = simdjson::STRING_ERROR; - return pj.error_code; - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - case '-': - pj.error_code = simdjson::NUMBER_ERROR; - return pj.error_code; - case 't': - pj.error_code = simdjson::T_ATOM_ERROR; - return pj.error_code; - case 'n': - pj.error_code = simdjson::N_ATOM_ERROR; - return pj.error_code; - case 'f': - pj.error_code = simdjson::F_ATOM_ERROR; - return pj.error_code; - default: - break; - } - pj.error_code = simdjson::TAPE_ERROR; - return pj.error_code; +error: + return parser.error(); } + +} // namespace stage2 \ No newline at end of file diff --git a/src/haswell/stage2_build_tape.h b/src/haswell/stage2_build_tape.h index 706da4af..badebfbf 100644 --- a/src/haswell/stage2_build_tape.h +++ b/src/haswell/stage2_build_tape.h @@ -24,13 +24,13 @@ namespace simdjson { template <> WARN_UNUSED int unified_machine(const uint8_t *buf, size_t len, ParsedJson &pj) { - return haswell::unified_machine(buf, len, pj); + return haswell::stage2::unified_machine(buf, len, pj); } template <> WARN_UNUSED int unified_machine(const uint8_t *buf, size_t len, ParsedJson &pj, size_t &next_json) { - return 
haswell::unified_machine(buf, len, pj, next_json); + return haswell::stage2::unified_machine(buf, len, pj, next_json); } } // namespace simdjson diff --git a/src/westmere/stage2_build_tape.h b/src/westmere/stage2_build_tape.h index aec3ae8b..31be99c0 100644 --- a/src/westmere/stage2_build_tape.h +++ b/src/westmere/stage2_build_tape.h @@ -24,13 +24,13 @@ namespace simdjson { template <> WARN_UNUSED int unified_machine(const uint8_t *buf, size_t len, ParsedJson &pj) { - return westmere::unified_machine(buf, len, pj); + return westmere::stage2::unified_machine(buf, len, pj); } template <> WARN_UNUSED int unified_machine(const uint8_t *buf, size_t len, ParsedJson &pj, size_t &next_json) { - return westmere::unified_machine(buf, len, pj, next_json); + return westmere::stage2::unified_machine(buf, len, pj, next_json); }