diff --git a/src/arm64/dom_parser_implementation.cpp b/src/arm64/dom_parser_implementation.cpp index 1e706377..fcdc14d7 100644 --- a/src/arm64/dom_parser_implementation.cpp +++ b/src/arm64/dom_parser_implementation.cpp @@ -112,6 +112,7 @@ really_inline simd8 must_be_2_3_continuation(const simd8 prev2, c #include "arm64/stringparsing.h" #include "arm64/numberparsing.h" #include "generic/stage2/structural_parser.h" +#include "generic/stage2/tape_builder.h" // // Implementation-specific overrides @@ -144,19 +145,15 @@ WARN_UNUSED bool implementation::validate_utf8(const char *buf, size_t len) cons } WARN_UNUSED error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept { - if (auto error = stage2::parse_structurals(*this, _doc)) { return error; } - - // If we didn't make it to the end, it's an error - if ( next_structural_index != n_structural_indexes ) { - logger::log_string("More than one JSON value at the root of the document, or extra characters at the end of the JSON!"); - return TAPE_ERROR; - } - - return SUCCESS; + doc = &_doc; + stage2::tape_builder builder(*doc); + return stage2::structural_parser::parse(*this, builder); } WARN_UNUSED error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept { - return stage2::parse_structurals(*this, _doc); + doc = &_doc; + stage2::tape_builder builder(_doc); + return stage2::structural_parser::parse(*this, builder); } WARN_UNUSED error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept { diff --git a/src/fallback/dom_parser_implementation.cpp b/src/fallback/dom_parser_implementation.cpp index d00c3783..37d4ef2c 100644 --- a/src/fallback/dom_parser_implementation.cpp +++ b/src/fallback/dom_parser_implementation.cpp @@ -316,24 +316,21 @@ WARN_UNUSED bool implementation::validate_utf8(const char *buf, size_t len) cons #include "fallback/stringparsing.h" #include "fallback/numberparsing.h" #include "generic/stage2/structural_parser.h" 
+#include "generic/stage2/tape_builder.h" namespace { namespace SIMDJSON_IMPLEMENTATION { WARN_UNUSED error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept { - if (auto error = stage2::parse_structurals(*this, _doc)) { return error; } - - // If we didn't make it to the end, it's an error - if ( next_structural_index != n_structural_indexes ) { - logger::log_string("More than one JSON value at the root of the document, or extra characters at the end of the JSON!"); - return TAPE_ERROR; - } - - return SUCCESS; + doc = &_doc; + stage2::tape_builder builder(*doc); + return stage2::structural_parser::parse(*this, builder); } WARN_UNUSED error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept { - return stage2::parse_structurals(*this, _doc); + doc = &_doc; + stage2::tape_builder builder(_doc); + return stage2::structural_parser::parse(*this, builder); } WARN_UNUSED error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept { diff --git a/src/generic/stage2/logger.h b/src/generic/stage2/logger.h index c8edd6d4..7d981d9a 100644 --- a/src/generic/stage2/logger.h +++ b/src/generic/stage2/logger.h @@ -7,10 +7,10 @@ namespace logger { static constexpr const char * DASHES = "----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------"; static constexpr const bool LOG_ENABLED = false; - static constexpr const int LOG_EVENT_LEN = 30; - static constexpr const int LOG_BUFFER_LEN = 20; - static constexpr const int LOG_DETAIL_LEN = 50; - static constexpr const int LOG_INDEX_LEN = 10; + static constexpr const int LOG_EVENT_LEN = 20; + static constexpr const int LOG_BUFFER_LEN = 10; + static constexpr const int LOG_SMALL_BUFFER_LEN = 10; + static constexpr const int LOG_INDEX_LEN = 5; static int 
log_depth; // Not threadsafe. Log only. @@ -28,8 +28,8 @@ namespace logger { if (LOG_ENABLED) { log_depth = 0; printf("\n"); - printf("| %-*s | %-*s | %*s | %*s | %*s | %-*s | %-*s | %-*s |\n", LOG_EVENT_LEN, "Event", LOG_BUFFER_LEN, "Buffer", 4, "Curr", 4, "Next", 5, "Next#", 5, "Tape#", LOG_DETAIL_LEN, "Detail", LOG_INDEX_LEN, "index"); - printf("|%.*s|%.*s|%.*s|%.*s|%.*s|%.*s|%.*s|%.*s|\n", LOG_EVENT_LEN+2, DASHES, LOG_BUFFER_LEN+2, DASHES, 4+2, DASHES, 4+2, DASHES, 5+2, DASHES, 5+2, DASHES, LOG_DETAIL_LEN+2, DASHES, LOG_INDEX_LEN+2, DASHES); + printf("| %-*s | %-*s | %-*s | %-*s | Detail |\n", LOG_EVENT_LEN, "Event", LOG_BUFFER_LEN, "Buffer", LOG_SMALL_BUFFER_LEN, "Next", 5, "Next#"); + printf("|%.*s|%.*s|%.*s|%.*s|--------|\n", LOG_EVENT_LEN+2, DASHES, LOG_BUFFER_LEN+2, DASHES, LOG_SMALL_BUFFER_LEN+2, DASHES, 5+2, DASHES); } } @@ -44,22 +44,35 @@ namespace logger { static really_inline void log_line(S &structurals, const char *title_prefix, const char *title, const char *detail) { if (LOG_ENABLED) { printf("| %*s%s%-*s ", log_depth*2, "", title_prefix, LOG_EVENT_LEN - log_depth*2 - int(strlen(title_prefix)), title); + auto current_index = structurals.at_beginning() ? nullptr : structurals.next_structural-1; + auto next_index = structurals.next_structural; + auto current = current_index ? &structurals.buf[*current_index] : (const uint8_t*)" "; + auto next = &structurals.buf[*next_index]; { // Print the next N characters in the buffer. printf("| "); // Otherwise, print the characters starting from the buffer position. // Print spaces for unprintable or newline characters. for (int i=0;itape.get()}, - current_string_buf_loc{parser.doc->string_buf.get()} { + template + WARN_UNUSED really_inline error_code parse(T &builder) noexcept; + template + WARN_UNUSED static really_inline error_code parse(dom_parser_implementation &dom_parser, T &builder) noexcept { + structural_parser parser(dom_parser, STREAMING ? 
dom_parser.next_structural_index : 0); + return parser.parse(builder); } - WARN_UNUSED really_inline error_code start_scope(bool is_array) { - depth++; - if (depth >= parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; } - parser.containing_scope[depth].tape_index = next_tape_index(); - parser.containing_scope[depth].count = 0; - tape.skip(); // We don't actually *write* the start element until the end. - parser.is_array[depth] = is_array; - return SUCCESS; + // For non-streaming, to pass an explicit 0 as next_structural, which enables optimizations + really_inline structural_parser(dom_parser_implementation &_dom_parser, uint32_t start_structural_index) + : structural_iterator(_dom_parser, start_structural_index) { } WARN_UNUSED really_inline error_code start_document() { - log_start_value("document"); - parser.containing_scope[depth].tape_index = next_tape_index(); - parser.containing_scope[depth].count = 0; - tape.skip(); // We don't actually *write* the start element until the end. 
- parser.is_array[depth] = false; - if (depth >= parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; } + dom_parser.is_array[depth] = false; + return SUCCESS; + } + template + WARN_UNUSED really_inline error_code start_array(T &builder) { + depth++; + if (depth >= dom_parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; } + builder.start_array(*this); + dom_parser.is_array[depth] = true; return SUCCESS; } - WARN_UNUSED really_inline error_code start_object() { - log_start_value("object"); - return start_scope(false); - } - - WARN_UNUSED really_inline error_code start_array() { - log_start_value("array"); - return start_scope(true); - } - - // this function is responsible for annotating the start of the scope - really_inline void end_scope(internal::tape_type start, internal::tape_type end) noexcept { - // SIMDJSON_ASSUME(depth > 0); - // Write the ending tape element, pointing at the start location - const uint32_t start_tape_index = parser.containing_scope[depth].tape_index; - tape.append(start_tape_index, end); - // Write the start tape element, pointing at the end location (and including count) - // count can overflow if it exceeds 24 bits... so we saturate - // the convention being that a cnt of 0xffffff or more is undetermined in value (>= 0xffffff). - const uint32_t count = parser.containing_scope[depth].count; - const uint32_t cntsat = count > 0xFFFFFF ? 
0xFFFFFF : count; - tape_writer::write(parser.doc->tape[start_tape_index], next_tape_index() | (uint64_t(cntsat) << 32), start); - depth--; - } - - really_inline uint32_t next_tape_index() { - return uint32_t(tape.next_tape_loc - parser.doc->tape.get()); - } - - really_inline void end_object() { - log_end_value("object"); - end_scope(internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT); - } - really_inline void end_array() { - log_end_value("array"); - end_scope(internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY); - } - really_inline void end_document() { - log_end_value("document"); - constexpr uint32_t start_tape_index = 0; - tape.append(start_tape_index, internal::tape_type::ROOT); - tape_writer::write(parser.doc->tape[start_tape_index], next_tape_index(), internal::tape_type::ROOT); - } - - really_inline void empty_container(internal::tape_type start, internal::tape_type end) { - auto start_index = next_tape_index(); - tape.append(start_index+2, start); - tape.append(start_index, end); - } - WARN_UNUSED really_inline bool empty_object() { + template + WARN_UNUSED really_inline bool empty_object(T &builder) { if (peek_next_char() == '}') { advance_char(); - log_value("empty object"); - empty_container(internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT); + builder.empty_object(*this); return true; } return false; } - WARN_UNUSED really_inline bool empty_array() { + template + WARN_UNUSED really_inline bool empty_array(T &builder) { if (peek_next_char() == ']') { advance_char(); - log_value("empty array"); - empty_container(internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY); + builder.empty_array(*this); return true; } return false; } - // increment_count increments the count of keys in an object or values in an array. 
- really_inline void increment_count() { - parser.containing_scope[depth].count++; // we have a key value pair in the object at parser.depth - 1 - } - - really_inline uint8_t *on_start_string() noexcept { - // we advance the point, accounting for the fact that we have a NULL termination - tape.append(current_string_buf_loc - parser.doc->string_buf.get(), internal::tape_type::STRING); - return current_string_buf_loc + sizeof(uint32_t); - } - - really_inline void on_end_string(uint8_t *dst) noexcept { - uint32_t str_length = uint32_t(dst - (current_string_buf_loc + sizeof(uint32_t))); - // TODO check for overflow in case someone has a crazy string (>=4GB?) - // But only add the overflow check when the document itself exceeds 4GB - // Currently unneeded because we refuse to parse docs larger or equal to 4GB. - memcpy(current_string_buf_loc, &str_length, sizeof(uint32_t)); - // NULL termination is still handy if you expect all your strings to - // be NULL terminated? It comes at a small cost - *dst = 0; - current_string_buf_loc = dst + 1; - } - - WARN_UNUSED really_inline error_code parse_string(bool key = false) { - log_value(key ? "key" : "string"); - uint8_t *dst = on_start_string(); - dst = stringparsing::parse_string(current(), dst); - if (dst == nullptr) { - log_error("Invalid escape in string"); - return STRING_ERROR; - } - on_end_string(dst); - return SUCCESS; - } - - WARN_UNUSED really_inline error_code parse_number(const uint8_t *src) { - log_value("number"); - if (!numberparsing::parse_number(src, tape)) { log_error("Invalid number"); return NUMBER_ERROR; } - return SUCCESS; - } - WARN_UNUSED really_inline error_code parse_number() { - return parse_number(current()); - } - - really_inline error_code parse_root_number() { - /** - * We need to make a copy to make sure that the string is space terminated. - * This is not about padding the input, which should already padded up - * to len + SIMDJSON_PADDING. 
However, we have no control at this stage - * on how the padding was done. What if the input string was padded with nulls? - * It is quite common for an input string to have an extra null character (C string). - * We do not want to allow 9\0 (where \0 is the null character) inside a JSON - * document, but the string "9\0" by itself is fine. So we make a copy and - * pad the input with spaces when we know that there is just one input element. - * This copy is relatively expensive, but it will almost never be called in - * practice unless you are in the strange scenario where you have many JSON - * documents made of single atoms. - */ - uint8_t *copy = static_cast(malloc(parser.len + SIMDJSON_PADDING)); - if (copy == nullptr) { - return MEMALLOC; - } - memcpy(copy, buf, parser.len); - memset(copy + parser.len, ' ', SIMDJSON_PADDING); - size_t idx = *current_structural; - error_code error = parse_number(©[idx]); // parse_number does not throw - free(copy); - return error; - } - - WARN_UNUSED really_inline error_code parse_true_atom() { - log_value("true"); - if (!atomparsing::is_valid_true_atom(current())) { return T_ATOM_ERROR; } - tape.append(0, internal::tape_type::TRUE_VALUE); - return SUCCESS; - } - - WARN_UNUSED really_inline error_code parse_root_true_atom() { - log_value("true"); - if (!atomparsing::is_valid_true_atom(current(), remaining_len())) { return T_ATOM_ERROR; } - tape.append(0, internal::tape_type::TRUE_VALUE); - return SUCCESS; - } - - WARN_UNUSED really_inline error_code parse_false_atom() { - log_value("false"); - if (!atomparsing::is_valid_false_atom(current())) { return F_ATOM_ERROR; } - tape.append(0, internal::tape_type::FALSE_VALUE); - return SUCCESS; - } - - WARN_UNUSED really_inline error_code parse_root_false_atom() { - log_value("false"); - if (!atomparsing::is_valid_false_atom(current(), remaining_len())) { return F_ATOM_ERROR; } - tape.append(0, internal::tape_type::FALSE_VALUE); - return SUCCESS; - } - - WARN_UNUSED really_inline 
error_code parse_null_atom() { - log_value("null"); - if (!atomparsing::is_valid_null_atom(current())) { return N_ATOM_ERROR; } - tape.append(0, internal::tape_type::NULL_VALUE); - return SUCCESS; - } - - WARN_UNUSED really_inline error_code parse_root_null_atom() { - log_value("null"); - if (!atomparsing::is_valid_null_atom(current(), remaining_len())) { return N_ATOM_ERROR; } - tape.append(0, internal::tape_type::NULL_VALUE); - return SUCCESS; - } - - WARN_UNUSED really_inline error_code start() { - logger::log_start(); - - // If there are no structurals left, return EMPTY - if (at_end()) { return EMPTY; } - - // Push the root scope (there is always at least one scope) - return start_document(); - } - + template WARN_UNUSED really_inline error_code finish() { - end_document(); - parser.next_structural_index = uint32_t(current_structural + 1 - &parser.structural_indexes[0]); + dom_parser.next_structural_index = uint32_t(next_structural - &dom_parser.structural_indexes[0]); if (depth != 0) { log_error("Unclosed objects or arrays!"); return TAPE_ERROR; } + // If we didn't make it to the end, it's an error + if ( !STREAMING && dom_parser.next_structural_index != dom_parser.n_structural_indexes ) { + logger::log_string("More than one JSON value at the root of the document, or extra characters at the end of the JSON!"); + return TAPE_ERROR; + } + return SUCCESS; } @@ -268,152 +98,132 @@ struct structural_parser : structural_iterator { } }; // struct structural_parser -#define SIMDJSON_TRY(EXPR) { auto _err = (EXPR); if (_err) { return _err; } } +template +WARN_UNUSED really_inline error_code structural_parser::parse(T &builder) noexcept { + logger::log_start(); -template -WARN_UNUSED static really_inline error_code parse_structurals(dom_parser_implementation &dom_parser, dom::document &doc) noexcept { - dom_parser.doc = &doc; - stage2::structural_parser parser(dom_parser, STREAMING ? 
dom_parser.next_structural_index : 0); - SIMDJSON_TRY( parser.start() ); + // + // Start the document + // + if (at_end()) { return EMPTY; } + SIMDJSON_TRY( start_document() ); + builder.start_document(*this); // // Read first value // - switch (parser.current_char()) { - case '{': { - if (parser.empty_object()) { goto document_end; } - SIMDJSON_TRY( parser.start_object() ); - goto object_begin; - } - case '[': { - if (parser.empty_array()) { goto document_end; } - SIMDJSON_TRY( parser.start_array() ); - // Make sure the outer array is closed before continuing; otherwise, there are ways we could get - // into memory corruption. See https://github.com/simdjson/simdjson/issues/906 - if (!STREAMING) { - if (parser.buf[dom_parser.structural_indexes[dom_parser.n_structural_indexes - 1]] != ']') { - return TAPE_ERROR; + { + const uint8_t *value = advance(); + switch (*value) { + case '{': if (!empty_object(builder)) { goto object_begin; }; break; + case '[': { + // Make sure the outer array is closed before continuing; otherwise, there are ways we could get + // into memory corruption. 
See https://github.com/simdjson/simdjson/issues/906 + if (!STREAMING) { + if (buf[dom_parser.structural_indexes[dom_parser.n_structural_indexes - 1]] != ']') { + return TAPE_ERROR; + } + } + if (!empty_array(builder)) { goto array_begin; }; break; } + default: SIMDJSON_TRY( builder.parse_root_primitive(*this, value) ); } - goto array_begin; - } - case '"': SIMDJSON_TRY( parser.parse_string() ); goto document_end; - case 't': SIMDJSON_TRY( parser.parse_root_true_atom() ); goto document_end; - case 'f': SIMDJSON_TRY( parser.parse_root_false_atom() ); goto document_end; - case 'n': SIMDJSON_TRY( parser.parse_root_null_atom() ); goto document_end; - case '-': - case '0': case '1': case '2': case '3': case '4': - case '5': case '6': case '7': case '8': case '9': - SIMDJSON_TRY( parser.parse_root_number() ); goto document_end; - default: - parser.log_error("Document starts with a non-value character"); - return TAPE_ERROR; + goto document_end; } // // Object parser states // -object_begin: - if (parser.advance_char() != '"') { - parser.log_error("Object does not start with a key"); +object_begin: { + depth++; + if (depth >= dom_parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; } + builder.start_object(*this); + dom_parser.is_array[depth] = false; + + const uint8_t *key = advance(); + if (*key != '"') { + log_error("Object does not start with a key"); return TAPE_ERROR; } - parser.increment_count(); - SIMDJSON_TRY( parser.parse_string(true) ); + builder.increment_count(*this); + SIMDJSON_TRY( builder.parse_key(*this, key) ); goto object_field; +} // object_begin: -object_field: - if (unlikely( parser.advance_char() != ':' )) { parser.log_error("Missing colon after key in object"); return TAPE_ERROR; } - switch (parser.advance_char()) { - case '{': { - if (parser.empty_object()) { break; }; - SIMDJSON_TRY( parser.start_object() ); - goto object_begin; - } - case '[': { - if (parser.empty_array()) { break; }; - SIMDJSON_TRY( parser.start_array() ); 
- goto array_begin; - } - case '"': SIMDJSON_TRY( parser.parse_string() ); break; - case 't': SIMDJSON_TRY( parser.parse_true_atom() ); break; - case 'f': SIMDJSON_TRY( parser.parse_false_atom() ); break; - case 'n': SIMDJSON_TRY( parser.parse_null_atom() ); break; - case '-': - case '0': case '1': case '2': case '3': case '4': - case '5': case '6': case '7': case '8': case '9': - SIMDJSON_TRY( parser.parse_number() ); break; - default: - parser.log_error("Non-value found when value was expected!"); - return TAPE_ERROR; +object_field: { + if (unlikely( advance_char() != ':' )) { log_error("Missing colon after key in object"); return TAPE_ERROR; } + const uint8_t *value = advance(); + switch (*value) { + case '{': if (!empty_object(builder)) { goto object_begin; }; break; + case '[': if (!empty_array(builder)) { goto array_begin; }; break; + default: SIMDJSON_TRY( builder.parse_primitive(*this, value) ); } +} // object_field: -object_continue: - switch (parser.advance_char()) { - case ',': - parser.increment_count(); - if (unlikely( parser.advance_char() != '"' )) { parser.log_error("Key string missing at beginning of field in object"); return TAPE_ERROR; } - SIMDJSON_TRY( parser.parse_string(true) ); +object_continue: { + switch (advance_char()) { + case ',': { + builder.increment_count(*this); + const uint8_t *key = advance(); + if (unlikely( *key != '"' )) { log_error("Key string missing at beginning of field in object"); return TAPE_ERROR; } + SIMDJSON_TRY( builder.parse_key(*this, key) ); goto object_field; + } case '}': - parser.end_object(); + builder.end_object(*this); goto scope_end; default: - parser.log_error("No comma between object fields"); + log_error("No comma between object fields"); return TAPE_ERROR; } +} // object_continue: -scope_end: - if (parser.depth == 0) { goto document_end; } - if (parser.parser.is_array[parser.depth]) { goto array_continue; } +scope_end: { + depth--; + if (depth == 0) { goto document_end; } + if 
(dom_parser.is_array[depth]) { goto array_continue; } goto object_continue; +} // scope_end: // // Array parser states // -array_begin: - parser.increment_count(); +array_begin: { + depth++; + if (depth >= dom_parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; } + builder.start_array(*this); + dom_parser.is_array[depth] = true; -array_value: - switch (parser.advance_char()) { - case '{': { - if (parser.empty_object()) { break; }; - SIMDJSON_TRY( parser.start_object() ); - goto object_begin; - } - case '[': { - if (parser.empty_array()) { break; }; - SIMDJSON_TRY( parser.start_array() ); - goto array_begin; - } - case '"': SIMDJSON_TRY( parser.parse_string() ); break; - case 't': SIMDJSON_TRY( parser.parse_true_atom() ); break; - case 'f': SIMDJSON_TRY( parser.parse_false_atom() ); break; - case 'n': SIMDJSON_TRY( parser.parse_null_atom() ); break; - case '-': - case '0': case '1': case '2': case '3': case '4': - case '5': case '6': case '7': case '8': case '9': - SIMDJSON_TRY( parser.parse_number() ); break; - default: - parser.log_error("Non-value found when value was expected!"); - return TAPE_ERROR; + builder.increment_count(*this); +} // array_begin: + +array_value: { + const uint8_t *value = advance(); + switch (*value) { + case '{': if (!empty_object(builder)) { goto object_begin; }; break; + case '[': if (!empty_array(builder)) { goto array_begin; }; break; + default: SIMDJSON_TRY( builder.parse_primitive(*this, value) ); } +} // array_value: -array_continue: - switch (parser.advance_char()) { +array_continue: { + switch (advance_char()) { case ',': - parser.increment_count(); + builder.increment_count(*this); goto array_value; case ']': - parser.end_array(); + builder.end_array(*this); goto scope_end; default: - parser.log_error("Missing comma between array values"); + log_error("Missing comma between array values"); return TAPE_ERROR; } +} // array_continue: -document_end: - return parser.finish(); +document_end: { + 
builder.end_document(*this); + return finish(); +} // document_end: } // parse_structurals() diff --git a/src/generic/stage2/tape_builder.h b/src/generic/stage2/tape_builder.h new file mode 100644 index 00000000..bd1f8e4a --- /dev/null +++ b/src/generic/stage2/tape_builder.h @@ -0,0 +1,230 @@ +#include "generic/stage2/tape_writer.h" +#include "generic/stage2/atomparsing.h" + +namespace { +namespace SIMDJSON_IMPLEMENTATION { +namespace stage2 { + +struct tape_builder { + /** Next location to write to tape */ + tape_writer tape; + /** Next write location in the string buf for stage 2 parsing */ + uint8_t *current_string_buf_loc; + + really_inline tape_builder(dom::document &doc) noexcept : tape{doc.tape.get()}, current_string_buf_loc{doc.string_buf.get()} {} + +private: + friend struct structural_parser; + + really_inline error_code parse_root_primitive(structural_parser &parser, const uint8_t *value) { + switch (*value) { + case '"': return parse_string(parser, value); + case 't': return parse_root_true_atom(parser, value); + case 'f': return parse_root_false_atom(parser, value); + case 'n': return parse_root_null_atom(parser, value); + case '-': + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + return parse_root_number(parser, value); + default: + parser.log_error("Document starts with a non-value character"); + return TAPE_ERROR; + } + } + really_inline error_code parse_primitive(structural_parser &parser, const uint8_t *value) { + switch (*value) { + case '"': return parse_string(parser, value); + case 't': return parse_true_atom(parser, value); + case 'f': return parse_false_atom(parser, value); + case 'n': return parse_null_atom(parser, value); + case '-': + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + return parse_number(parser, value); + default: + parser.log_error("Non-value found when value was expected!"); + return TAPE_ERROR; + } + } + 
really_inline void empty_object(structural_parser &parser) { + parser.log_value("empty object"); + empty_container(parser, internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT); + } + really_inline void empty_array(structural_parser &parser) { + parser.log_value("empty array"); + empty_container(parser, internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY); + } + + really_inline void start_document(structural_parser &parser) { + parser.log_start_value("document"); + start_container(parser); + } + really_inline void start_object(structural_parser &parser) { + parser.log_start_value("object"); + start_container(parser); + } + really_inline void start_array(structural_parser &parser) { + parser.log_start_value("array"); + start_container(parser); + } + + really_inline void end_object(structural_parser &parser) { + parser.log_end_value("object"); + end_container(parser, internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT); + } + really_inline void end_array(structural_parser &parser) { + parser.log_end_value("array"); + end_container(parser, internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY); + } + really_inline void end_document(structural_parser &parser) { + parser.log_end_value("document"); + constexpr uint32_t start_tape_index = 0; + tape.append(start_tape_index, internal::tape_type::ROOT); + tape_writer::write(parser.dom_parser.doc->tape[start_tape_index], next_tape_index(parser), internal::tape_type::ROOT); + } + + WARN_UNUSED really_inline error_code parse_key(structural_parser &parser, const uint8_t *value) { + return parse_string(parser, value, true); + } + WARN_UNUSED really_inline error_code parse_string(structural_parser &parser, const uint8_t *value, bool key = false) { + parser.log_value(key ? 
"key" : "string"); + uint8_t *dst = on_start_string(parser); + dst = stringparsing::parse_string(value, dst); + if (dst == nullptr) { + parser.log_error("Invalid escape in string"); + return STRING_ERROR; + } + on_end_string(dst); + return SUCCESS; + } + + WARN_UNUSED really_inline error_code parse_number(structural_parser &parser, const uint8_t *value) { + parser.log_value("number"); + if (!numberparsing::parse_number(value, tape)) { parser.log_error("Invalid number"); return NUMBER_ERROR; } + return SUCCESS; + } + + really_inline error_code parse_root_number(structural_parser &parser, const uint8_t *value) { + // + // We need to make a copy to make sure that the string is space terminated. + // This is not about padding the input, which should already padded up + // to len + SIMDJSON_PADDING. However, we have no control at this stage + // on how the padding was done. What if the input string was padded with nulls? + // It is quite common for an input string to have an extra null character (C string). + // We do not want to allow 9\0 (where \0 is the null character) inside a JSON + // document, but the string "9\0" by itself is fine. So we make a copy and + // pad the input with spaces when we know that there is just one input element. + // This copy is relatively expensive, but it will almost never be called in + // practice unless you are in the strange scenario where you have many JSON + // documents made of single atoms. 
+ // + uint8_t *copy = static_cast(malloc(parser.remaining_len() + SIMDJSON_PADDING)); + if (copy == nullptr) { + return MEMALLOC; + } + memcpy(copy, value, parser.remaining_len()); + memset(copy + parser.remaining_len(), ' ', SIMDJSON_PADDING); + error_code error = parse_number(parser, copy); + free(copy); + return error; + } + + WARN_UNUSED really_inline error_code parse_true_atom(structural_parser &parser, const uint8_t *value) { + parser.log_value("true"); + if (!atomparsing::is_valid_true_atom(value)) { return T_ATOM_ERROR; } + tape.append(0, internal::tape_type::TRUE_VALUE); + return SUCCESS; + } + + WARN_UNUSED really_inline error_code parse_root_true_atom(structural_parser &parser, const uint8_t *value) { + parser.log_value("true"); + if (!atomparsing::is_valid_true_atom(value, parser.remaining_len())) { return T_ATOM_ERROR; } + tape.append(0, internal::tape_type::TRUE_VALUE); + return SUCCESS; + } + + WARN_UNUSED really_inline error_code parse_false_atom(structural_parser &parser, const uint8_t *value) { + parser.log_value("false"); + if (!atomparsing::is_valid_false_atom(value)) { return F_ATOM_ERROR; } + tape.append(0, internal::tape_type::FALSE_VALUE); + return SUCCESS; + } + + WARN_UNUSED really_inline error_code parse_root_false_atom(structural_parser &parser, const uint8_t *value) { + parser.log_value("false"); + if (!atomparsing::is_valid_false_atom(value, parser.remaining_len())) { return F_ATOM_ERROR; } + tape.append(0, internal::tape_type::FALSE_VALUE); + return SUCCESS; + } + + WARN_UNUSED really_inline error_code parse_null_atom(structural_parser &parser, const uint8_t *value) { + parser.log_value("null"); + if (!atomparsing::is_valid_null_atom(value)) { return N_ATOM_ERROR; } + tape.append(0, internal::tape_type::NULL_VALUE); + return SUCCESS; + } + + WARN_UNUSED really_inline error_code parse_root_null_atom(structural_parser &parser, const uint8_t *value) { + parser.log_value("null"); + if (!atomparsing::is_valid_null_atom(value, 
parser.remaining_len())) { return N_ATOM_ERROR; } + tape.append(0, internal::tape_type::NULL_VALUE); + return SUCCESS; + } + + // increment_count increments the count of keys in an object or values in an array. + really_inline void increment_count(structural_parser &parser) { + parser.dom_parser.containing_scope[parser.depth].count++; // we have a key value pair in the object at parser.dom_parser.depth - 1 + } + +// private: + + really_inline uint32_t next_tape_index(structural_parser &parser) { + return uint32_t(tape.next_tape_loc - parser.dom_parser.doc->tape.get()); + } + + really_inline void empty_container(structural_parser &parser, internal::tape_type start, internal::tape_type end) { + auto start_index = next_tape_index(parser); + tape.append(start_index+2, start); + tape.append(start_index, end); + } + + really_inline void start_container(structural_parser &parser) { + parser.dom_parser.containing_scope[parser.depth].tape_index = next_tape_index(parser); + parser.dom_parser.containing_scope[parser.depth].count = 0; + tape.skip(); // We don't actually *write* the start element until the end. + } + + really_inline void end_container(structural_parser &parser, internal::tape_type start, internal::tape_type end) noexcept { + // Write the ending tape element, pointing at the start location + const uint32_t start_tape_index = parser.dom_parser.containing_scope[parser.depth].tape_index; + tape.append(start_tape_index, end); + // Write the start tape element, pointing at the end location (and including count) + // count can overflow if it exceeds 24 bits... so we saturate + // the convention being that a cnt of 0xffffff or more is undetermined in value (>= 0xffffff). + const uint32_t count = parser.dom_parser.containing_scope[parser.depth].count; + const uint32_t cntsat = count > 0xFFFFFF ? 
0xFFFFFF : count; + tape_writer::write(parser.dom_parser.doc->tape[start_tape_index], next_tape_index(parser) | (uint64_t(cntsat) << 32), start); + } + + really_inline uint8_t *on_start_string(structural_parser &parser) noexcept { + // Append a STRING tape element holding the offset of current_string_buf_loc within doc->string_buf, then return a pointer just past the uint32_t slot that on_end_string() fills with the length. + tape.append(current_string_buf_loc - parser.dom_parser.doc->string_buf.get(), internal::tape_type::STRING); + return current_string_buf_loc + sizeof(uint32_t); + } + + really_inline void on_end_string(uint8_t *dst) noexcept { // dst: one past the last string byte written; the length is dst minus the start of the string bytes (current_string_buf_loc advanced by sizeof(uint32_t)). + uint32_t str_length = uint32_t(dst - (current_string_buf_loc + sizeof(uint32_t))); + // TODO check for overflow in case someone has a crazy string (>=4GB?) + // But only add the overflow check when the document itself exceeds 4GB + // Currently unneeded because we refuse to parse docs larger than or equal to 4GB. + memcpy(current_string_buf_loc, &str_length, sizeof(uint32_t)); + // NULL termination is still handy if you expect all your strings to + // be NULL terminated? 
It comes at a small cost + *dst = 0; + current_string_buf_loc = dst + 1; + } +}; // class tape_builder + +} // namespace stage2 +} // namespace SIMDJSON_IMPLEMENTATION +} // unnamed namespace diff --git a/src/haswell/dom_parser_implementation.cpp b/src/haswell/dom_parser_implementation.cpp index 1b5ca5bf..2b5af3bb 100644 --- a/src/haswell/dom_parser_implementation.cpp +++ b/src/haswell/dom_parser_implementation.cpp @@ -77,6 +77,7 @@ really_inline simd8 must_be_2_3_continuation(const simd8 prev2, c #include "haswell/stringparsing.h" #include "haswell/numberparsing.h" #include "generic/stage2/structural_parser.h" +#include "generic/stage2/tape_builder.h" // // Implementation-specific overrides @@ -107,19 +108,15 @@ WARN_UNUSED bool implementation::validate_utf8(const char *buf, size_t len) cons } WARN_UNUSED error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept { - if (auto error = stage2::parse_structurals(*this, _doc)) { return error; } - - // If we didn't make it to the end, it's an error - if ( next_structural_index != n_structural_indexes ) { - logger::log_string("More than one JSON value at the root of the document, or extra characters at the end of the JSON!"); - return TAPE_ERROR; - } - - return SUCCESS; + doc = &_doc; // Keeps a pointer to the target document in the parser's doc member. NOTE(review): the end-of-input check deleted above is not repeated in this function; confirm that structural_parser::parse() now performs it. + stage2::tape_builder builder(_doc); + return stage2::structural_parser::parse(*this, builder); } WARN_UNUSED error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept { - return stage2::parse_structurals(*this, _doc); + doc = &_doc; // NOTE(review): this body now reads the same as stage2 above, although the deleted versions differed; confirm that is intended. + stage2::tape_builder builder(_doc); + return stage2::structural_parser::parse(*this, builder); } WARN_UNUSED error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept { diff --git a/src/westmere/dom_parser_implementation.cpp b/src/westmere/dom_parser_implementation.cpp index 71981947..c3954b9d 100644 --- a/src/westmere/dom_parser_implementation.cpp +++ b/src/westmere/dom_parser_implementation.cpp @@ -82,6 +82,7 @@ 
really_inline simd8 must_be_2_3_continuation(const simd8 prev2, c #include "westmere/stringparsing.h" #include "westmere/numberparsing.h" #include "generic/stage2/structural_parser.h" +#include "generic/stage2/tape_builder.h" // // Implementation-specific overrides @@ -113,19 +114,15 @@ WARN_UNUSED bool implementation::validate_utf8(const char *buf, size_t len) cons } WARN_UNUSED error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept { - if (auto error = stage2::parse_structurals(*this, _doc)) { return error; } - - // If we didn't make it to the end, it's an error - if ( next_structural_index != n_structural_indexes ) { - logger::log_string("More than one JSON value at the root of the document, or extra characters at the end of the JSON!"); - return TAPE_ERROR; - } - - return SUCCESS; + doc = &_doc; // Keeps a pointer to the target document in the parser's doc member. NOTE(review): the end-of-input check deleted above is not repeated in this function; confirm that structural_parser::parse() now performs it. + stage2::tape_builder builder(*doc); // *doc is _doc itself, since doc was set to &_doc on the previous line + return stage2::structural_parser::parse(*this, builder); } WARN_UNUSED error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept { - return stage2::parse_structurals(*this, _doc); + doc = &_doc; // NOTE(review): this body now reads the same as stage2 above, although the deleted versions differed; confirm that is intended. + stage2::tape_builder builder(_doc); + return stage2::structural_parser::parse(*this, builder); } WARN_UNUSED error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept {