Use SAX model for stage 2

2020-08-03 15:05:30 -07:00 · 2020-08-03 15:05:30 -07:00 · 03d54f8f6e
parent 553e6d7549
commit 03d54f8f6e
3 changed files with 245 additions and 159 deletions
--- a/src/generic/stage2/logger.h
+++ b/src/generic/stage2/logger.h
@ -28,8 +28,8 @@ namespace logger {
    if (LOG_ENABLED) {
      log_depth = 0;
      printf("\n");
-      printf("| %-*s | %-*s | %-*s | %-*s | %-*s | Detail |\n", LOG_EVENT_LEN, "Event", LOG_BUFFER_LEN, "Buffer", LOG_SMALL_BUFFER_LEN, "Next", 5, "Next#", 5, "Tape#");
-      printf("|%.*s|%.*s|%.*s|%.*s|%.*s|--------|\n", LOG_EVENT_LEN+2, DASHES, LOG_BUFFER_LEN+2, DASHES, LOG_SMALL_BUFFER_LEN+2, DASHES, 5+2, DASHES, 5+2, DASHES);
+      printf("| %-*s | %-*s | %-*s | %-*s | Detail |\n", LOG_EVENT_LEN, "Event", LOG_BUFFER_LEN, "Buffer", LOG_SMALL_BUFFER_LEN, "Next", 5, "Next#");
+      printf("|%.*s|%.*s|%.*s|%.*s|--------|\n", LOG_EVENT_LEN+2, DASHES, LOG_BUFFER_LEN+2, DASHES, LOG_SMALL_BUFFER_LEN+2, DASHES, 5+2, DASHES);
    }
  }

@ -71,7 +71,7 @@ namespace logger {
      } else {
        printf("| %-*s ", LOG_INDEX_LEN, "");
      }
-      printf("| %*u ", LOG_INDEX_LEN, structurals.next_tape_index());
+      // Tape# column disabled: structural_parser no longer exposes next_tape_index().
+      // printf("| %*u ", LOG_INDEX_LEN, structurals.next_tape_index());
      printf("| %-s ", detail);
      printf("|\n");
    }
--- a/src/generic/stage2/structural_parser.h
+++ b/src/generic/stage2/structural_parser.h
@ -3,103 +3,63 @@
 // We assume the file in which it is included already includes
 // "simdjson/stage2.h" (this simplifies amalgamation)

-#include "generic/stage2/tape_writer.h"
 #include "generic/stage2/logger.h"
-#include "generic/stage2/atomparsing.h"
 #include "generic/stage2/structural_iterator.h"

 namespace { // Make everything here private
 namespace SIMDJSON_IMPLEMENTATION {
 namespace stage2 {

+#define SIMDJSON_TRY(EXPR) { auto _err = (EXPR); if (_err) { return _err; } }
+
+template<typename T>
 struct structural_parser : structural_iterator {
-  /** Lets you append to the tape */
-  tape_writer tape;
-  /** Next write location in the string buf for stage 2 parsing */
-  uint8_t *current_string_buf_loc;
+  /** Receiver that actually parses the strings and builds the tape */
+  T builder;
  /** Current depth (nested objects and arrays) */
  uint32_t depth{0};

  // For non-streaming, to pass an explicit 0 as next_structural, which enables optimizations
  really_inline structural_parser(dom_parser_implementation &_parser, uint32_t start_structural_index)
    : structural_iterator(_parser, start_structural_index),
-      tape{parser.doc->tape.get()},
-      current_string_buf_loc{parser.doc->string_buf.get()} {
-  }
-
-  WARN_UNUSED really_inline error_code start_scope(bool is_array) {
-    depth++;
-    if (depth >= parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; }
-    parser.containing_scope[depth].tape_index = next_tape_index();
-    parser.containing_scope[depth].count = 0;
-    tape.skip(); // We don't actually *write* the start element until the end.
-    parser.is_array[depth] = is_array;
-    return SUCCESS;
+      builder{parser.doc->tape.get(), parser.doc->string_buf.get()} {
  }

  WARN_UNUSED really_inline error_code start_document() {
-    log_start_value("document");
-    parser.containing_scope[depth].tape_index = next_tape_index();
-    parser.containing_scope[depth].count = 0;
-    tape.skip(); // We don't actually *write* the start element until the end.
+    builder.start_document(*this);
    parser.is_array[depth] = false;
    return SUCCESS;
  }
-
  WARN_UNUSED really_inline error_code start_object() {
-    log_start_value("object");
-    return start_scope(false);
+    depth++;
+    if (depth >= parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; }
+    builder.start_object(*this);
+    parser.is_array[depth] = false;
+    return SUCCESS;
  }
-
  WARN_UNUSED really_inline error_code start_array() {
-    log_start_value("array");
-    return start_scope(true);
+    depth++;
+    if (depth >= parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; }
+    builder.start_array(*this);
+    parser.is_array[depth] = true;
+    return SUCCESS;
  }
-
-  // this function is responsible for annotating the start of the scope
-  really_inline void end_scope(internal::tape_type start, internal::tape_type end) noexcept {
-    // SIMDJSON_ASSUME(depth > 0);
-    // Write the ending tape element, pointing at the start location
-    const uint32_t start_tape_index = parser.containing_scope[depth].tape_index;
-    tape.append(start_tape_index, end);
-    // Write the start tape element, pointing at the end location (and including count)
-    // count can overflow if it exceeds 24 bits... so we saturate
-    // the convention being that a cnt of 0xffffff or more is undetermined in value (>=  0xffffff).
-    const uint32_t count = parser.containing_scope[depth].count;
-    const uint32_t cntsat = count > 0xFFFFFF ? 0xFFFFFF : count;
-    tape_writer::write(parser.doc->tape[start_tape_index], next_tape_index() | (uint64_t(cntsat) << 32), start);
+  really_inline void end_object() {
+    builder.end_object(*this);
    depth--;
  }
-
-  really_inline uint32_t next_tape_index() {
-    return uint32_t(tape.next_tape_loc - parser.doc->tape.get());
-  }
-
-  really_inline void end_object() {
-    log_end_value("object");
-    end_scope(internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT);
-  }
  really_inline void end_array() {
-    log_end_value("array");
-    end_scope(internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY);
+    builder.end_array(*this);
+    depth--;
  }
  really_inline void end_document() {
-    log_end_value("document");
-    constexpr uint32_t start_tape_index = 0;
-    tape.append(start_tape_index, internal::tape_type::ROOT);
-    tape_writer::write(parser.doc->tape[start_tape_index], next_tape_index(), internal::tape_type::ROOT);
+    builder.end_document(*this);
  }

-  really_inline void empty_container(internal::tape_type start, internal::tape_type end) {
-    auto start_index = next_tape_index();
-    tape.append(start_index+2, start);
-    tape.append(start_index, end);
-  }
  WARN_UNUSED really_inline bool empty_object() {
    if (peek_next_char() == '}') {
      advance_char();
-      log_value("empty object");
-      empty_container(internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT);
+      builder.empty_object(*this);
      return true;
    }
    return false;
@ -107,122 +67,45 @@ struct structural_parser : structural_iterator {
  WARN_UNUSED really_inline bool empty_array() {
    if (peek_next_char() == ']') {
      advance_char();
-      log_value("empty array");
-      empty_container(internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY);
+      builder.empty_array(*this);
      return true;
    }
    return false;
  }

-  // increment_count increments the count of keys in an object or values in an array.
  really_inline void increment_count() {
-    parser.containing_scope[depth].count++; // we have a key value pair in the object at parser.depth - 1
-  }
-
-  really_inline uint8_t *on_start_string() noexcept {
-    // we advance the point, accounting for the fact that we have a NULL termination
-    tape.append(current_string_buf_loc - parser.doc->string_buf.get(), internal::tape_type::STRING);
-    return current_string_buf_loc + sizeof(uint32_t);
-  }
-
-  really_inline void on_end_string(uint8_t *dst) noexcept {
-    uint32_t str_length = uint32_t(dst - (current_string_buf_loc + sizeof(uint32_t)));
-    // TODO check for overflow in case someone has a crazy string (>=4GB?)
-    // But only add the overflow check when the document itself exceeds 4GB
-    // Currently unneeded because we refuse to parse docs larger or equal to 4GB.
-    memcpy(current_string_buf_loc, &str_length, sizeof(uint32_t));
-    // NULL termination is still handy if you expect all your strings to
-    // be NULL terminated? It comes at a small cost
-    *dst = 0;
-    current_string_buf_loc = dst + 1;
+    builder.increment_count(*this);
  }

  WARN_UNUSED really_inline error_code parse_key(const uint8_t *key) {
-    return parse_string(key, true);
+    return builder.parse_key(*this, key);
  }
-  WARN_UNUSED really_inline error_code parse_string(const uint8_t *value, bool key = false) {
-    log_value(key ? "key" : "string");
-    uint8_t *dst = on_start_string();
-    dst = stringparsing::parse_string(value, dst);
-    if (dst == nullptr) {
-      log_error("Invalid escape in string");
-      return STRING_ERROR;
-    }
-    on_end_string(dst);
-    return SUCCESS;
+  WARN_UNUSED really_inline error_code parse_string(const uint8_t *value) {
+    return builder.parse_string(*this, value);
  }
-
  WARN_UNUSED really_inline error_code parse_number(const uint8_t *value) {
-    log_value("number");
-    if (!numberparsing::parse_number(value, tape)) { log_error("Invalid number"); return NUMBER_ERROR; }
-    return SUCCESS;
+    return builder.parse_number(*this, value);
  }
-
-  really_inline error_code parse_root_number(const uint8_t *value) {
-    //
-    // We need to make a copy to make sure that the string is space terminated.
-    // This is not about padding the input, which should already padded up
-    // to len + SIMDJSON_PADDING. However, we have no control at this stage
-    // on how the padding was done. What if the input string was padded with nulls?
-    // It is quite common for an input string to have an extra null character (C string).
-    // We do not want to allow 9\0 (where \0 is the null character) inside a JSON
-    // document, but the string "9\0" by itself is fine. So we make a copy and
-    // pad the input with spaces when we know that there is just one input element.
-    // This copy is relatively expensive, but it will almost never be called in
-    // practice unless you are in the strange scenario where you have many JSON
-    // documents made of single atoms.
-    //
-    uint8_t *copy = static_cast<uint8_t *>(malloc(remaining_len() + SIMDJSON_PADDING));
-    if (copy == nullptr) {
-      return MEMALLOC;
-    }
-    memcpy(copy, value, remaining_len());
-    memset(copy + remaining_len(), ' ', SIMDJSON_PADDING);
-    error_code error = parse_number(copy);
-    free(copy);
-    return error;
+  WARN_UNUSED really_inline error_code parse_root_number(const uint8_t *value) {
+    return builder.parse_root_number(*this, value);
  }
-
  WARN_UNUSED really_inline error_code parse_true_atom(const uint8_t *value) {
-    log_value("true");
-    if (!atomparsing::is_valid_true_atom(value)) { return T_ATOM_ERROR; }
-    tape.append(0, internal::tape_type::TRUE_VALUE);
-    return SUCCESS;
+    return builder.parse_true_atom(*this, value);
  }
-
  WARN_UNUSED really_inline error_code parse_root_true_atom(const uint8_t *value) {
-    log_value("true");
-    if (!atomparsing::is_valid_true_atom(value, remaining_len())) { return T_ATOM_ERROR; }
-    tape.append(0, internal::tape_type::TRUE_VALUE);
-    return SUCCESS;
+    return builder.parse_root_true_atom(*this, value);
  }
-
  WARN_UNUSED really_inline error_code parse_false_atom(const uint8_t *value) {
-    log_value("false");
-    if (!atomparsing::is_valid_false_atom(value)) { return F_ATOM_ERROR; }
-    tape.append(0, internal::tape_type::FALSE_VALUE);
-    return SUCCESS;
+    return builder.parse_false_atom(*this, value);
  }
-
  WARN_UNUSED really_inline error_code parse_root_false_atom(const uint8_t *value) {
-    log_value("false");
-    if (!atomparsing::is_valid_false_atom(value, remaining_len())) { return F_ATOM_ERROR; }
-    tape.append(0, internal::tape_type::FALSE_VALUE);
-    return SUCCESS;
+    return builder.parse_root_false_atom(*this, value);
  }
-
  WARN_UNUSED really_inline error_code parse_null_atom(const uint8_t *value) {
-    log_value("null");
-    if (!atomparsing::is_valid_null_atom(value)) { return N_ATOM_ERROR; }
-    tape.append(0, internal::tape_type::NULL_VALUE);
-    return SUCCESS;
+    return builder.parse_null_atom(*this, value);
  }
-
  WARN_UNUSED really_inline error_code parse_root_null_atom(const uint8_t *value) {
-    log_value("null");
-    if (!atomparsing::is_valid_null_atom(value, remaining_len())) { return N_ATOM_ERROR; }
-    tape.append(0, internal::tape_type::NULL_VALUE);
-    return SUCCESS;
+    return builder.parse_root_null_atom(*this, value);
  }

  WARN_UNUSED really_inline error_code start() {
@ -266,12 +149,20 @@ struct structural_parser : structural_iterator {
  }
 }; // struct structural_parser

-#define SIMDJSON_TRY(EXPR) { auto _err = (EXPR); if (_err) { return _err; } }
+} // namespace stage2
+} // namespace SIMDJSON_IMPLEMENTATION
+} // unnamed namespace
+
+#include "generic/stage2/tape_builder.h"
+
+namespace { // Make everything here private
+namespace SIMDJSON_IMPLEMENTATION {
+namespace stage2 {

 template<bool STREAMING>
 WARN_UNUSED static really_inline error_code parse_structurals(dom_parser_implementation &dom_parser, dom::document &doc) noexcept {
  dom_parser.doc = &doc;
-  stage2::structural_parser parser(dom_parser, STREAMING ? dom_parser.next_structural_index : 0);
+  stage2::structural_parser<stage2::tape_builder> parser(dom_parser, STREAMING ? dom_parser.next_structural_index : 0);
  SIMDJSON_TRY( parser.start() );

  //
--- a/src/generic/stage2/tape_builder.h
+++ b/src/generic/stage2/tape_builder.h
@ -0,0 +1,195 @@
+#include "generic/stage2/tape_writer.h"
+#include "generic/stage2/atomparsing.h"
+
+namespace {
+namespace SIMDJSON_IMPLEMENTATION {
+namespace stage2 {
+
+/**
+ * Receiver for the events produced by structural_parser: it writes the DOM tape and copies
+ * parsed strings into the document's string buffer.
+ *
+ * Every handler takes the calling structural_parser, which supplies the logging helpers, the
+ * current depth and access to the shared parser state (parser.parser).
+ */
+struct tape_builder {
+  /** Next location to write to tape */
+  tape_writer tape;
+  /** Next write location in the string buf for stage 2 parsing */
+  uint8_t *current_string_buf_loc;
+
+  /** Logs and writes an empty object as an adjacent START_OBJECT/END_OBJECT tape pair. */
+  really_inline void empty_object(structural_parser<tape_builder> &parser) {
+    parser.log_value("empty object");
+    empty_container(parser, internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT);
+  }
+  /** Logs and writes an empty array as an adjacent START_ARRAY/END_ARRAY tape pair. */
+  really_inline void empty_array(structural_parser<tape_builder> &parser) {
+    parser.log_value("empty array");
+    empty_container(parser, internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY);
+  }
+
+  /** Logs the start of the document and reserves its start slot on the tape. */
+  really_inline void start_document(structural_parser<tape_builder> &parser) {
+    parser.log_start_value("document");
+    start_container(parser);
+  }
+  /** Logs the start of an object and reserves its start slot on the tape. */
+  really_inline void start_object(structural_parser<tape_builder> &parser) {
+    parser.log_start_value("object");
+    start_container(parser);
+  }
+  /** Logs the start of an array and reserves its start slot on the tape. */
+  really_inline void start_array(structural_parser<tape_builder> &parser) {
+    parser.log_start_value("array");
+    start_container(parser);
+  }
+
+  /** Logs the end of an object, writes END_OBJECT and fills in the reserved START_OBJECT slot. */
+  really_inline void end_object(structural_parser<tape_builder> &parser) {
+    parser.log_end_value("object");
+    end_container(parser, internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT);
+  }
+  /** Logs the end of an array, writes END_ARRAY and fills in the reserved START_ARRAY slot. */
+  really_inline void end_array(structural_parser<tape_builder> &parser) {
+    parser.log_end_value("array");
+    end_container(parser, internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY);
+  }
+  /**
+   * Logs the end of the document, appends the closing ROOT element (pointing at tape index 0)
+   * and writes the ROOT element at index 0 so that it points at the next tape index.
+   */
+  really_inline void end_document(structural_parser<tape_builder> &parser) {
+    parser.log_end_value("document");
+    constexpr uint32_t start_tape_index = 0;
+    tape.append(start_tape_index, internal::tape_type::ROOT);
+    tape_writer::write(parser.parser.doc->tape[start_tape_index], next_tape_index(parser), internal::tape_type::ROOT);
+  }
+
+  /** Parses an object key; identical to parse_string() except that it is logged as "key". */
+  WARN_UNUSED really_inline error_code parse_key(structural_parser<tape_builder> &parser, const uint8_t *value) {
+    return parse_string(parser, value, true);
+  }
+  /**
+   * Appends a STRING tape element and unescapes the string at value into the string buffer.
+   *
+   * @param key only selects the log label ("key" vs "string").
+   * @return STRING_ERROR if stringparsing::parse_string() rejects the input, SUCCESS otherwise.
+   */
+  WARN_UNUSED really_inline error_code parse_string(structural_parser<tape_builder> &parser, const uint8_t *value, bool key = false) {
+    parser.log_value(key ? "key" : "string");
+    uint8_t *dst = on_start_string(parser);
+    dst = stringparsing::parse_string(value, dst);
+    if (dst == nullptr) {
+      parser.log_error("Invalid escape in string");
+      return STRING_ERROR;
+    }
+    on_end_string(dst);
+    return SUCCESS;
+  }
+
+  /** Parses the number at value onto the tape; returns NUMBER_ERROR if it is not a valid number. */
+  WARN_UNUSED really_inline error_code parse_number(structural_parser<tape_builder> &parser, const uint8_t *value) {
+    parser.log_value("number");
+    if (!numberparsing::parse_number(value, tape)) { parser.log_error("Invalid number"); return NUMBER_ERROR; }
+    return SUCCESS;
+  }
+
+  /**
+   * Parses a number that is the whole document, using a space-padded copy of the remaining input.
+   *
+   * @return MEMALLOC if the temporary copy cannot be allocated, otherwise the result of parse_number().
+   */
+  WARN_UNUSED really_inline error_code parse_root_number(structural_parser<tape_builder> &parser, const uint8_t *value) {
+    //
+    // We need to make a copy to make sure that the string is space terminated.
+    // This is not about padding the input, which should already be padded up
+    // to len + SIMDJSON_PADDING. However, we have no control at this stage
+    // on how the padding was done. What if the input string was padded with nulls?
+    // It is quite common for an input string to have an extra null character (C string).
+    // We do not want to allow 9\0 (where \0 is the null character) inside a JSON
+    // document, but the string "9\0" by itself is fine. So we make a copy and
+    // pad the input with spaces when we know that there is just one input element.
+    // This copy is relatively expensive, but it will almost never be called in
+    // practice unless you are in the strange scenario where you have many JSON
+    // documents made of single atoms.
+    //
+    // Read the length once so the allocation, the copy and the padding all agree on it.
+    const auto len = parser.remaining_len();
+    uint8_t *copy = static_cast<uint8_t *>(malloc(len + SIMDJSON_PADDING));
+    if (copy == nullptr) {
+      return MEMALLOC;
+    }
+    memcpy(copy, value, len);
+    memset(copy + len, ' ', SIMDJSON_PADDING);
+    error_code error = parse_number(parser, copy);
+    free(copy);
+    return error;
+  }
+
+  /** Validates a "true" atom and appends TRUE_VALUE; returns T_ATOM_ERROR if it is invalid. */
+  WARN_UNUSED really_inline error_code parse_true_atom(structural_parser<tape_builder> &parser, const uint8_t *value) {
+    parser.log_value("true");
+    if (!atomparsing::is_valid_true_atom(value)) { return T_ATOM_ERROR; }
+    tape.append(0, internal::tape_type::TRUE_VALUE);
+    return SUCCESS;
+  }
+
+  /** Same as parse_true_atom(), but validation is also given the remaining input length. */
+  WARN_UNUSED really_inline error_code parse_root_true_atom(structural_parser<tape_builder> &parser, const uint8_t *value) {
+    parser.log_value("true");
+    if (!atomparsing::is_valid_true_atom(value, parser.remaining_len())) { return T_ATOM_ERROR; }
+    tape.append(0, internal::tape_type::TRUE_VALUE);
+    return SUCCESS;
+  }
+
+  /** Validates a "false" atom and appends FALSE_VALUE; returns F_ATOM_ERROR if it is invalid. */
+  WARN_UNUSED really_inline error_code parse_false_atom(structural_parser<tape_builder> &parser, const uint8_t *value) {
+    parser.log_value("false");
+    if (!atomparsing::is_valid_false_atom(value)) { return F_ATOM_ERROR; }
+    tape.append(0, internal::tape_type::FALSE_VALUE);
+    return SUCCESS;
+  }
+
+  /** Same as parse_false_atom(), but validation is also given the remaining input length. */
+  WARN_UNUSED really_inline error_code parse_root_false_atom(structural_parser<tape_builder> &parser, const uint8_t *value) {
+    parser.log_value("false");
+    if (!atomparsing::is_valid_false_atom(value, parser.remaining_len())) { return F_ATOM_ERROR; }
+    tape.append(0, internal::tape_type::FALSE_VALUE);
+    return SUCCESS;
+  }
+
+  /** Validates a "null" atom and appends NULL_VALUE; returns N_ATOM_ERROR if it is invalid. */
+  WARN_UNUSED really_inline error_code parse_null_atom(structural_parser<tape_builder> &parser, const uint8_t *value) {
+    parser.log_value("null");
+    if (!atomparsing::is_valid_null_atom(value)) { return N_ATOM_ERROR; }
+    tape.append(0, internal::tape_type::NULL_VALUE);
+    return SUCCESS;
+  }
+
+  /** Same as parse_null_atom(), but validation is also given the remaining input length. */
+  WARN_UNUSED really_inline error_code parse_root_null_atom(structural_parser<tape_builder> &parser, const uint8_t *value) {
+    parser.log_value("null");
+    if (!atomparsing::is_valid_null_atom(value, parser.remaining_len())) { return N_ATOM_ERROR; }
+    tape.append(0, internal::tape_type::NULL_VALUE);
+    return SUCCESS;
+  }
+
+  // increment_count increments the count of keys in an object or values in an array.
+  really_inline void increment_count(structural_parser<tape_builder> &parser) {
+    parser.parser.containing_scope[parser.depth].count++; // one more element in the scope at the current depth
+  }
+
+// private:
+
+  /** Index of the next tape slot to be written, relative to the start of the document's tape. */
+  really_inline uint32_t next_tape_index(structural_parser<tape_builder> &parser) {
+    return uint32_t(tape.next_tape_loc - parser.parser.doc->tape.get());
+  }
+
+  /** Appends a start element pointing two slots ahead, then an end element pointing back at it. */
+  really_inline void empty_container(structural_parser<tape_builder> &parser, internal::tape_type start, internal::tape_type end) {
+    auto start_index = next_tape_index(parser);
+    tape.append(start_index+2, start);
+    tape.append(start_index, end);
+  }
+
+  /** Records where the scope at parser.depth starts on the tape and resets its element count. */
+  really_inline void start_container(structural_parser<tape_builder> &parser) {
+    parser.parser.containing_scope[parser.depth].tape_index = next_tape_index(parser);
+    parser.parser.containing_scope[parser.depth].count = 0;
+    tape.skip(); // We don't actually *write* the start element until the end.
+  }
+
+  /** Closes the scope at parser.depth: appends the end element, then back-fills the start element. */
+  really_inline void end_container(structural_parser<tape_builder> &parser, internal::tape_type start, internal::tape_type end) noexcept {
+    // Write the ending tape element, pointing at the start location
+    const uint32_t start_tape_index = parser.parser.containing_scope[parser.depth].tape_index;
+    tape.append(start_tape_index, end);
+    // Write the start tape element, pointing at the end location (and including count)
+    // count can overflow if it exceeds 24 bits... so we saturate
+    // the convention being that a cnt of 0xffffff or more is undetermined in value (>=  0xffffff).
+    const uint32_t count = parser.parser.containing_scope[parser.depth].count;
+    const uint32_t cntsat = count > 0xFFFFFF ? 0xFFFFFF : count;
+    tape_writer::write(parser.parser.doc->tape[start_tape_index], next_tape_index(parser) | (uint64_t(cntsat) << 32), start);
+  }
+
+  /** Appends the STRING tape element and returns where the string's characters should be written. */
+  really_inline uint8_t *on_start_string(structural_parser<tape_builder> &parser) noexcept {
+    // The tape stores the string's offset in the string buffer; the characters themselves go
+    // after a uint32_t length prefix that on_end_string() fills in.
+    tape.append(current_string_buf_loc - parser.parser.doc->string_buf.get(), internal::tape_type::STRING);
+    return current_string_buf_loc + sizeof(uint32_t);
+  }
+
+  /** Writes the length prefix, null-terminates the string and advances the write location past it. */
+  really_inline void on_end_string(uint8_t *dst) noexcept {
+    uint32_t str_length = uint32_t(dst - (current_string_buf_loc + sizeof(uint32_t)));
+    // TODO check for overflow in case someone has a crazy string (>=4GB?)
+    // But only add the overflow check when the document itself exceeds 4GB
+    // Currently unneeded because we refuse to parse docs larger or equal to 4GB.
+    memcpy(current_string_buf_loc, &str_length, sizeof(uint32_t));
+    // NULL termination is still handy if you expect all your strings to
+    // be NULL terminated? It comes at a small cost
+    *dst = 0;
+    current_string_buf_loc = dst + 1;
+  }
+}; // struct tape_builder
+
+} // namespace stage2
+} // namespace SIMDJSON_IMPLEMENTATION
+} // unnamed namespace