diff --git a/src/generic/stage2/logger.h b/src/generic/stage2/logger.h index 7d6fb97a..c682fb0d 100644 --- a/src/generic/stage2/logger.h +++ b/src/generic/stage2/logger.h @@ -61,10 +61,10 @@ namespace logger { printf(" "); } printf("| %c ", printable_char(structurals.at_beginning() ? ' ' : structurals.current_char())); - printf("| %c ", printable_char(structurals.peek_char())); - printf("| %5u ", structurals.structural_indexes[structurals.next_structural]); + printf("| %c ", printable_char(structurals.peek_next_char())); + printf("| %5u ", structurals.parser.structural_indexes[*(structurals.current_structural+1)]); printf("| %-*s ", LOG_DETAIL_LEN, detail); - printf("| %*zu ", LOG_INDEX_LEN, structurals.idx); + printf("| %*u ", LOG_INDEX_LEN, *structurals.current_structural); printf("|\n"); } } diff --git a/src/generic/stage2/streaming_structural_parser.h b/src/generic/stage2/streaming_structural_parser.h new file mode 100755 index 00000000..8e63d028 --- /dev/null +++ b/src/generic/stage2/streaming_structural_parser.h @@ -0,0 +1,168 @@ +namespace stage2 { + +struct streaming_structural_parser: structural_parser { + really_inline streaming_structural_parser(dom_parser_implementation &_parser) : structural_parser(_parser, _parser.next_structural_index) {} + + // override to add streaming + WARN_UNUSED really_inline error_code start(ret_address_t finish_parser) { + // If there are no structurals left, return EMPTY + if (structurals.at_end(parser.n_structural_indexes)) { + return parser.error = EMPTY; + } + + log_start(); + init(); + + // Capacity ain't no thang for streaming, so we don't check it. 
+ // Advance to the first character as soon as possible + advance_char(); + // Push the root scope (there is always at least one scope) + if (start_document(finish_parser)) { + return parser.error = DEPTH_ERROR; + } + return SUCCESS; + } + + // override to add streaming + WARN_UNUSED really_inline error_code finish() { + if ( structurals.past_end(parser.n_structural_indexes) ) { + log_error("IMPOSSIBLE: past the end of the JSON!"); + return parser.error = TAPE_ERROR; + } + end_document(); + parser.next_structural_index = uint32_t(structurals.next_structural_index()); + if (depth != 0) { + log_error("Unclosed objects or arrays!"); + return parser.error = TAPE_ERROR; + } + if (parser.containing_scope[depth].tape_index != 0) { + log_error("IMPOSSIBLE: root scope tape index did not start at 0!"); + return parser.error = TAPE_ERROR; + } + return SUCCESS; + } +}; + +} // namespace stage2 + +/************ + * The JSON is parsed to a tape, see the accompanying tape.md file + * for documentation. 
+ ***********/ +WARN_UNUSED error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept { + this->doc = &_doc; + static constexpr stage2::unified_machine_addresses addresses = INIT_ADDRESSES(); + stage2::streaming_structural_parser parser(*this); + error_code result = parser.start(addresses.finish); + if (result) { return result; } + // + // Read first value + // + switch (parser.structurals.current_char()) { + case '{': + FAIL_IF( parser.start_object(addresses.finish) ); + goto object_begin; + case '[': + FAIL_IF( parser.start_array(addresses.finish) ); + goto array_begin; + case '"': + FAIL_IF( parser.parse_string() ); + goto finish; + case 't': case 'f': case 'n': + FAIL_IF( parser.parse_single_atom() ); + goto finish; + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + FAIL_IF( + parser.structurals.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) { + return parser.parse_number(&copy[idx], false); + }) + ); + goto finish; + case '-': + FAIL_IF( + parser.structurals.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) { + return parser.parse_number(&copy[idx], true); + }) + ); + goto finish; + default: + parser.log_error("Document starts with a non-value character"); + goto error; + } + +// +// Object parser parsers +// +object_begin: + switch (parser.advance_char()) { + case '"': { + FAIL_IF( parser.parse_string(true) ); + goto object_key_parser; + } + case '}': + parser.end_object(); + goto scope_end; + default: + parser.log_error("Object does not start with a key"); + goto error; + } + +object_key_parser: + if (parser.advance_char() != ':' ) { parser.log_error("Missing colon after key in object"); goto error; } + parser.increment_count(); + parser.advance_char(); + GOTO( parser.parse_value(addresses, addresses.object_continue) ); + +object_continue: + switch (parser.advance_char()) { + case ',': + if (parser.advance_char() != '"' ) { parser.log_error("Key string missing 
at beginning of field in object"); goto error; } + FAIL_IF( parser.parse_string(true) ); + goto object_key_parser; + case '}': + parser.end_object(); + goto scope_end; + default: + parser.log_error("No comma between object fields"); + goto error; + } + +scope_end: + CONTINUE( parser.parser.ret_address[parser.depth] ); + +// +// Array parser parsers +// +array_begin: + if (parser.advance_char() == ']') { + parser.end_array(); + goto scope_end; + } + parser.increment_count(); + +main_array_switch: + /* we call update char on all paths in, so we can peek at parser.c on the + * on paths that can accept a close square brace (post-, and at start) */ + GOTO( parser.parse_value(addresses, addresses.array_continue) ); + +array_continue: + switch (parser.advance_char()) { + case ',': + parser.increment_count(); + parser.advance_char(); + goto main_array_switch; + case ']': + parser.end_array(); + goto scope_end; + default: + parser.log_error("Missing comma between array values"); + goto error; + } + +finish: + return parser.finish(); + +error: + return parser.error(); +} diff --git a/src/generic/stage2/structural_iterator.h b/src/generic/stage2/structural_iterator.h index 92a990b2..ae47ec91 100644 --- a/src/generic/stage2/structural_iterator.h +++ b/src/generic/stage2/structural_iterator.h @@ -2,29 +2,34 @@ namespace stage2 { class structural_iterator { public: - really_inline structural_iterator(const uint8_t* _buf, size_t _len, const uint32_t *_structural_indexes, size_t next_structural_index) - : buf{_buf}, - len{_len}, - structural_indexes{_structural_indexes}, - next_structural{next_structural_index} - {} - really_inline char advance_char() { - idx = structural_indexes[next_structural]; - next_structural++; - c = *current(); - return c; - } - really_inline char current_char() { - return c; - } - really_inline char peek_char() { - return buf[structural_indexes[next_structural]]; + const uint8_t* const buf; + uint32_t *current_structural; + dom_parser_implementation 
&parser; + + // Start a structural + really_inline structural_iterator(dom_parser_implementation &_parser, size_t start_structural_index) + : buf{_parser.buf}, + current_structural{&_parser.structural_indexes[start_structural_index]}, + parser{_parser} { } + // Get the buffer position of the current structural character really_inline const uint8_t* current() { - return &buf[idx]; + return &buf[*current_structural]; + } + // Get the current structural character + really_inline char current_char() { + return buf[*current_structural]; + } + // Get the next structural character without advancing + really_inline char peek_next_char() { + return buf[*(current_structural+1)]; + } + really_inline char advance_char() { + current_structural++; + return buf[*current_structural]; } really_inline size_t remaining_len() { - return len - idx; + return parser.len - *current_structural; } template<typename F> really_inline bool with_space_terminated_copy(const F& f) { @@ -41,35 +46,25 @@ public: * practice unless you are in the strange scenario where you have many JSON * documents made of single atoms. 
*/ - char *copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING)); + char *copy = static_cast<char *>(malloc(parser.len + SIMDJSON_PADDING)); if (copy == nullptr) { return true; } - memcpy(copy, buf, len); - memset(copy + len, ' ', SIMDJSON_PADDING); - bool result = f(reinterpret_cast<const uint8_t *>(copy), idx); + memcpy(copy, buf, parser.len); + memset(copy + parser.len, ' ', SIMDJSON_PADDING); + bool result = f(reinterpret_cast<const uint8_t *>(copy), *current_structural); free(copy); return result; } really_inline bool past_end(uint32_t n_structural_indexes) { - return next_structural > n_structural_indexes; + return current_structural >= &parser.structural_indexes[n_structural_indexes]; } really_inline bool at_end(uint32_t n_structural_indexes) { - return next_structural == n_structural_indexes; + return current_structural == &parser.structural_indexes[n_structural_indexes]; } really_inline bool at_beginning() { - return next_structural == 0; + return current_structural == parser.structural_indexes.get(); } - really_inline size_t next_structural_index() { - return next_structural; - } - - const uint8_t* const buf; - const size_t len; - const uint32_t* const structural_indexes; - size_t next_structural; // next structural index - size_t idx{0}; // location of the structural character in the input (buf) - uint8_t c{0}; // used to track the (structural) character we are looking at }; } // namespace stage2 diff --git a/src/generic/stage2/structural_parser.h b/src/generic/stage2/structural_parser.h index 4c112b67..6fabcd88 100644 --- a/src/generic/stage2/structural_parser.h +++ b/src/generic/stage2/structural_parser.h @@ -69,17 +69,15 @@ struct number_writer { } }; // struct number_writer -struct structural_parser { - structural_iterator structurals; - dom_parser_implementation &parser; +struct structural_parser : structural_iterator { /** Next write location in the string buf for stage 2 parsing */ uint8_t *current_string_buf_loc{}; + /** Current depth (nested objects and arrays) */ uint32_t depth; // For 
non-streaming, to pass an explicit 0 as next_structural, which enables optimizations - really_inline structural_parser(dom_parser_implementation &_parser, uint32_t next_structural) - : structurals(_parser.buf, _parser.len, _parser.structural_indexes.get(), next_structural), - parser{_parser}, + really_inline structural_parser(dom_parser_implementation &_parser, uint32_t start_structural_index) + : structural_iterator(_parser, start_structural_index), depth{0} { } @@ -174,7 +172,7 @@ struct structural_parser { WARN_UNUSED really_inline bool parse_string(bool key = false) { log_value(key ? "key" : "string"); uint8_t *dst = on_start_string(); - dst = stringparsing::parse_string(structurals.current(), dst); + dst = stringparsing::parse_string(current(), dst); if (dst == nullptr) { log_error("Invalid escape in string"); return true; @@ -191,64 +189,28 @@ struct structural_parser { return !succeeded; } WARN_UNUSED really_inline bool parse_number(bool found_minus) { - return parse_number(structurals.current(), found_minus); - } - - WARN_UNUSED really_inline bool parse_atom() { - switch (structurals.current_char()) { - case 't': - log_value("true"); - if (!atomparsing::is_valid_true_atom(structurals.current())) { return true; } - append_tape(0, internal::tape_type::TRUE_VALUE); - break; - case 'f': - log_value("false"); - if (!atomparsing::is_valid_false_atom(structurals.current())) { return true; } - append_tape(0, internal::tape_type::FALSE_VALUE); - break; - case 'n': - log_value("null"); - if (!atomparsing::is_valid_null_atom(structurals.current())) { return true; } - append_tape(0, internal::tape_type::NULL_VALUE); - break; - default: - log_error("IMPOSSIBLE: unrecognized parse_atom structural character"); - return true; - } - return false; - } - - WARN_UNUSED really_inline bool parse_single_atom() { - switch (structurals.current_char()) { - case 't': - log_value("true"); - if (!atomparsing::is_valid_true_atom(structurals.current(), structurals.remaining_len())) { 
return true; } - append_tape(0, internal::tape_type::TRUE_VALUE); - break; - case 'f': - log_value("false"); - if (!atomparsing::is_valid_false_atom(structurals.current(), structurals.remaining_len())) { return true; } - append_tape(0, internal::tape_type::FALSE_VALUE); - break; - case 'n': - log_value("null"); - if (!atomparsing::is_valid_null_atom(structurals.current(), structurals.remaining_len())) { return true; } - append_tape(0, internal::tape_type::NULL_VALUE); - break; - default: - log_error("IMPOSSIBLE: unrecognized parse_atom structural character"); - return true; - } - return false; + return parse_number(current(), found_minus); } WARN_UNUSED really_inline ret_address_t parse_value(const unified_machine_addresses &addresses, ret_address_t continue_state) { - switch (structurals.current_char()) { + switch (advance_char()) { case '"': FAIL_IF( parse_string() ); return continue_state; - case 't': case 'f': case 'n': - FAIL_IF( parse_atom() ); + case 't': + log_value("true"); + FAIL_IF( !atomparsing::is_valid_true_atom(current()) ); + append_tape(0, internal::tape_type::TRUE_VALUE); + return continue_state; + case 'f': + log_value("false"); + FAIL_IF( !atomparsing::is_valid_false_atom(current()) ); + append_tape(0, internal::tape_type::FALSE_VALUE); + return continue_state; + case 'n': + log_value("null"); + FAIL_IF( !atomparsing::is_valid_null_atom(current()) ); + append_tape(0, internal::tape_type::NULL_VALUE); return continue_state; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': @@ -271,7 +233,7 @@ struct structural_parser { WARN_UNUSED really_inline error_code finish() { end_document(); - parser.next_structural_index = uint32_t(structurals.next_structural_index()); + parser.next_structural_index = uint32_t(current_structural + 1 - &parser.structural_indexes[0]); if (depth != 0) { log_error("Unclosed objects or arrays!"); @@ -295,7 +257,7 @@ struct structural_parser { if (depth >= parser.max_depth()) { 
return parser.error = DEPTH_ERROR; } - switch (structurals.current_char()) { + switch (current_char()) { case '"': return parser.error = STRING_ERROR; case '0': @@ -322,6 +284,7 @@ struct structural_parser { } really_inline void init() { + log_start(); current_string_buf_loc = parser.doc->string_buf.get(); parser.current_loc = 0; parser.error = UNINITIALIZED; @@ -329,14 +292,11 @@ struct structural_parser { WARN_UNUSED really_inline error_code start(ret_address_t finish_state) { // If there are no structurals left, return EMPTY - if (structurals.at_end(parser.n_structural_indexes)) { + if (at_end(parser.n_structural_indexes)) { return parser.error = EMPTY; } - log_start(); init(); - // Advance to the first character as soon as possible - structurals.advance_char(); // Push the root scope (there is always at least one scope) if (start_document(finish_state)) { return parser.error = DEPTH_ERROR; @@ -344,12 +304,8 @@ struct structural_parser { return SUCCESS; } - really_inline char advance_char() { - return structurals.advance_char(); - } - really_inline void log_value(const char *type) { - logger::log_line(structurals, "", type, ""); + logger::log_line(*this, "", type, ""); } static really_inline void log_start() { @@ -357,17 +313,17 @@ struct structural_parser { } really_inline void log_start_value(const char *type) { - logger::log_line(structurals, "+", type, ""); + logger::log_line(*this, "+", type, ""); if (logger::LOG_ENABLED) { logger::log_depth++; } } really_inline void log_end_value(const char *type) { if (logger::LOG_ENABLED) { logger::log_depth--; } - logger::log_line(structurals, "-", type, ""); + logger::log_line(*this, "-", type, ""); } really_inline void log_error(const char *error) { - logger::log_line(structurals, "", "ERROR", error); + logger::log_line(*this, "", "ERROR", error); } }; // struct structural_parser @@ -386,7 +342,7 @@ WARN_UNUSED static error_code parse_structurals(dom_parser_implementation &dom_p // // Read first value // - switch 
(parser.structurals.current_char()) { + switch (parser.current_char()) { case '{': FAIL_IF( parser.start_object(addresses.finish) ); goto object_begin; @@ -394,27 +350,41 @@ WARN_UNUSED static error_code parse_structurals(dom_parser_implementation &dom_p FAIL_IF( parser.start_array(addresses.finish) ); // Make sure the outer array is closed before continuing; otherwise, there are ways we could get // into memory corruption. See https://github.com/simdjson/simdjson/issues/906 - if (parser.structurals.buf[parser.structurals.structural_indexes[dom_parser.n_structural_indexes - 1]] != ']') { - goto error; + if (!STREAMING) { + if (parser.buf[dom_parser.structural_indexes[dom_parser.n_structural_indexes - 1]] != ']') { + goto error; + } } goto array_begin; case '"': FAIL_IF( parser.parse_string() ); goto finish; - case 't': case 'f': case 'n': - FAIL_IF( parser.parse_single_atom() ); + case 't': + parser.log_value("true"); + FAIL_IF( !atomparsing::is_valid_true_atom(parser.current(), parser.remaining_len()) ); + parser.append_tape(0, internal::tape_type::TRUE_VALUE); + goto finish; + case 'f': + parser.log_value("false"); + FAIL_IF( !atomparsing::is_valid_false_atom(parser.current(), parser.remaining_len()) ); + parser.append_tape(0, internal::tape_type::FALSE_VALUE); + goto finish; + case 'n': + parser.log_value("null"); + FAIL_IF( !atomparsing::is_valid_null_atom(parser.current(), parser.remaining_len()) ); + parser.append_tape(0, internal::tape_type::NULL_VALUE); goto finish; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': FAIL_IF( - parser.structurals.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) { + parser.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) { return parser.parse_number(&copy[idx], false); }) ); goto finish; case '-': FAIL_IF( - parser.structurals.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) { + parser.with_space_terminated_copy([&](const uint8_t *copy, 
size_t idx) { return parser.parse_number(&copy[idx], true); }) ); @@ -444,7 +414,6 @@ object_begin: object_key_state: if (parser.advance_char() != ':' ) { parser.log_error("Missing colon after key in object"); goto error; } - parser.advance_char(); GOTO( parser.parse_value(addresses, addresses.object_continue) ); object_continue: @@ -469,7 +438,8 @@ scope_end: // Array parser states // array_begin: - if (parser.advance_char() == ']') { + if (parser.peek_next_char() == ']') { + parser.advance_char(); parser.end_array(); goto scope_end; } @@ -484,7 +454,6 @@ array_continue: switch (parser.advance_char()) { case ',': parser.increment_count(); - parser.advance_char(); goto main_array_switch; case ']': parser.end_array();