Structural iterator

2020-03-12 14:16:15 -07:00 · 2020-03-12 14:16:15 -07:00 · 81c86d7090
parent d9a9fd387d
commit 81c86d7090
4 changed files with 117 additions and 97 deletions
--- a/src/generic/numberparsing.h
+++ b/src/generic/numberparsing.h
@ -145,9 +145,8 @@ really_inline double subnormal_power10(double base, int64_t negative_exponent) {
 //
 // Note: a redesign could avoid this function entirely.
 //
-never_inline bool parse_float(const uint8_t *const buf, document::parser &parser,
-                              const uint32_t offset, bool found_minus) {
-  const char *p = reinterpret_cast<const char *>(buf + offset);
+never_inline bool parse_float(const uint8_t *const src, document::parser &parser, bool found_minus) {
+  const char *p = reinterpret_cast<const char *>(src);
  bool negative = false;
  if (found_minus) {
    ++p;
@ -179,7 +178,7 @@ never_inline bool parse_float(const uint8_t *const buf, document::parser &parser
                                              : 0);
    } else {
 #ifdef JSON_TEST_NUMBERS // for unit testing
-      found_invalid_number(buf + offset);
+      found_invalid_number(src);
 #endif
      return false;
    }
@ -202,7 +201,7 @@ never_inline bool parse_float(const uint8_t *const buf, document::parser &parser
    }
    if (!is_integer(*p)) {
 #ifdef JSON_TEST_NUMBERS // for unit testing
-      found_invalid_number(buf + offset);
+      found_invalid_number(src);
 #endif
      return false;
    }
@ -228,7 +227,7 @@ never_inline bool parse_float(const uint8_t *const buf, document::parser &parser
      if (exp_number > 0x100000000) { // we need to check for overflows
 // we refuse to parse this
 #ifdef JSON_TEST_NUMBERS // for unit testing
-        found_invalid_number(buf + offset);
+        found_invalid_number(src);
 #endif
        return false;
      }
@ -246,7 +245,7 @@ never_inline bool parse_float(const uint8_t *const buf, document::parser &parser
 // We know for sure that we have a number that is too large,
 // we refuse to parse this
 #ifdef JSON_TEST_NUMBERS // for unit testing
-        found_invalid_number(buf + offset);
+        found_invalid_number(src);
 #endif
        return false;
      }
@ -264,14 +263,14 @@ never_inline bool parse_float(const uint8_t *const buf, document::parser &parser
  // check that we can go from long double to double safely.
  if(i > std::numeric_limits<double>::max()) {
 #ifdef JSON_TEST_NUMBERS // for unit testing
-        found_invalid_number(buf + offset);
+        found_invalid_number(src);
 #endif
        return false;
  }
  double d = negative ? -i : i;
  parser.on_number_double(d);
 #ifdef JSON_TEST_NUMBERS // for unit testing
-  found_float(d, buf + offset);
+  found_float(d, src);
 #endif
  return is_structural_or_whitespace(*p);
 }
@ -284,11 +283,8 @@ never_inline bool parse_float(const uint8_t *const buf, document::parser &parser
 //
 // This function will almost never be called!!!
 //
-never_inline bool parse_large_integer(const uint8_t *const buf,
-                                             document::parser &parser,
-                                             const uint32_t offset,
-                                             bool found_minus) {
-  const char *p = reinterpret_cast<const char *>(buf + offset);
+never_inline bool parse_large_integer(const uint8_t *const src, document::parser &parser, bool found_minus) {
+  const char *p = reinterpret_cast<const char *>(src);

  bool negative = false;
  if (found_minus) {
@ -309,13 +305,13 @@ never_inline bool parse_large_integer(const uint8_t *const buf,
      digit = *p - '0';
      if (mul_overflow(i, 10, &i)) {
 #ifdef JSON_TEST_NUMBERS // for unit testing
-        found_invalid_number(buf + offset);
+        found_invalid_number(src);
 #endif
        return false; // overflow
      }
      if (add_overflow(i, digit, &i)) {
 #ifdef JSON_TEST_NUMBERS // for unit testing
-        found_invalid_number(buf + offset);
+        found_invalid_number(src);
 #endif
        return false; // overflow
      }
@ -326,7 +322,7 @@ never_inline bool parse_large_integer(const uint8_t *const buf,
    if (i > 0x8000000000000000) {
       // overflows!
 #ifdef JSON_TEST_NUMBERS // for unit testing
-      found_invalid_number(buf + offset);
+      found_invalid_number(src);
 #endif
      return false; // overflow
    } else if (i == 0x8000000000000000) {
@ -336,14 +332,14 @@ never_inline bool parse_large_integer(const uint8_t *const buf,
      constexpr int64_t signed_answer = INT64_MIN;
      parser.on_number_s64(signed_answer);
 #ifdef JSON_TEST_NUMBERS // for unit testing
-      found_integer(signed_answer, buf + offset);
+      found_integer(signed_answer, src);
 #endif
    } else {
      // we can negate safely
      int64_t signed_answer = -static_cast<int64_t>(i);
      parser.on_number_s64(signed_answer);
 #ifdef JSON_TEST_NUMBERS // for unit testing
-      found_integer(signed_answer, buf + offset);
+      found_integer(signed_answer, src);
 #endif
    }
  } else {
@ -352,12 +348,12 @@ never_inline bool parse_large_integer(const uint8_t *const buf,
    // fallback on unsigned integers if absolutely necessary.
    if(i < 0x8000000000000000) {
 #ifdef JSON_TEST_NUMBERS // for unit testing
-      found_integer(i, buf + offset);
+      found_integer(i, src);
 #endif
      parser.on_number_s64(i);
    } else {
 #ifdef JSON_TEST_NUMBERS // for unit testing
-      found_unsigned_integer(i, buf + offset);
+      found_unsigned_integer(i, src);
 #endif
      parser.on_number_u64(i);
    }
@ -365,7 +361,7 @@ never_inline bool parse_large_integer(const uint8_t *const buf,
  return is_structural_or_whitespace(*p);
 }

-// parse the number at buf + offset
+// parse the number at src
 // define JSON_TEST_NUMBERS for unit testing
 //
 // It is assumed that the number is followed by a structural ({,},],[) character
@ -374,8 +370,7 @@ never_inline bool parse_large_integer(const uint8_t *const buf,
 // content and append a space before calling this function.
 //
 // Our objective is accurate parsing (ULP of 0 or 1) at high speed.
-really_inline bool parse_number(UNUSED const uint8_t *const buf,
-                                UNUSED const uint32_t offset,
+really_inline bool parse_number(UNUSED const uint8_t *const src,
                                UNUSED bool found_minus,
                                document::parser &parser) {
 #ifdef SIMDJSON_SKIPNUMBERPARSING // for performance analysis, it is sometimes
@ -383,14 +378,14 @@ really_inline bool parse_number(UNUSED const uint8_t *const buf,
  parser.on_number_s64(0);           // always write zero
  return true;                    // always succeeds
 #else
-  const char *p = reinterpret_cast<const char *>(buf + offset);
+  const char *p = reinterpret_cast<const char *>(src);
  bool negative = false;
  if (found_minus) {
    ++p;
    negative = true;
    if (!is_integer(*p)) { // a negative sign must be followed by an integer
 #ifdef JSON_TEST_NUMBERS // for unit testing
-      found_invalid_number(buf + offset);
+      found_invalid_number(src);
 #endif
      return false;
    }
@ -402,7 +397,7 @@ really_inline bool parse_number(UNUSED const uint8_t *const buf,
    ++p;
    if (is_not_structural_or_whitespace_or_exponent_or_decimal(*p)) {
 #ifdef JSON_TEST_NUMBERS // for unit testing
-      found_invalid_number(buf + offset);
+      found_invalid_number(src);
 #endif
      return false;
    }
@ -410,7 +405,7 @@ really_inline bool parse_number(UNUSED const uint8_t *const buf,
  } else {
    if (!(is_integer(*p))) { // must start with an integer
 #ifdef JSON_TEST_NUMBERS // for unit testing
-      found_invalid_number(buf + offset);
+      found_invalid_number(src);
 #endif
      return false;
    }
@ -445,7 +440,7 @@ really_inline bool parse_number(UNUSED const uint8_t *const buf,
      // we will handle the overflow later
    } else {
 #ifdef JSON_TEST_NUMBERS // for unit testing
-      found_invalid_number(buf + offset);
+      found_invalid_number(src);
 #endif
      return false;
    }
@ -480,7 +475,7 @@ really_inline bool parse_number(UNUSED const uint8_t *const buf,
    }
    if (!is_integer(*p)) {
 #ifdef JSON_TEST_NUMBERS // for unit testing
-      found_invalid_number(buf + offset);
+      found_invalid_number(src);
 #endif
      return false;
    }
@ -501,7 +496,7 @@ really_inline bool parse_number(UNUSED const uint8_t *const buf,
      if (exp_number > 0x100000000) { // we need to check for overflows
                                      // we refuse to parse this
 #ifdef JSON_TEST_NUMBERS // for unit testing
-        found_invalid_number(buf + offset);
+        found_invalid_number(src);
 #endif
        return false;
      }
@ -526,31 +521,31 @@ really_inline bool parse_number(UNUSED const uint8_t *const buf,
        // Ok, chances are good that we had an overflow!
        // this is almost never going to get called!!!
        // we start anew, going slowly!!!
-        return parse_float(buf, parser, offset, found_minus);
+        return parse_float(src, parser, found_minus);
      }
    }
    if (unlikely((power_index > 2 * 308))) { // this is uncommon!!!
      // this is almost never going to get called!!!
      // we start anew, going slowly!!!
-      return parse_float(buf, parser, offset, found_minus);
+      return parse_float(src, parser, found_minus);
    }
    double factor = power_of_ten[power_index];
    factor = negative ? -factor : factor;
    double d = i * factor;
    parser.on_number_double(d);
 #ifdef JSON_TEST_NUMBERS // for unit testing
-    found_float(d, buf + offset);
+    found_float(d, src);
 #endif
  } else {
    if (unlikely(digit_count >= 18)) { // this is uncommon!!!
      // there is a good chance that we had an overflow, so we need
      // to recover: we parse the whole thing again.
-      return parse_large_integer(buf, parser, offset, found_minus);
+      return parse_large_integer(src, parser, found_minus);
    }
    i = negative ? 0 - i : i;
    parser.on_number_s64(i);
 #ifdef JSON_TEST_NUMBERS // for unit testing
-    found_integer(i, buf + offset);
+    found_integer(i, src);
 #endif
  }
  return is_structural_or_whitespace(*p);
--- a/src/generic/stage2_build_tape.h
+++ b/src/generic/stage2_build_tape.h
@ -47,28 +47,22 @@ struct unified_machine_addresses {
 #undef FAIL_IF
 #define FAIL_IF(EXPR) { if (EXPR) { return addresses.error; } }

-struct structural_parser {
-  const uint8_t* const buf;
-  const size_t len;
-  document::parser &doc_parser;
-  size_t i; // next structural index
-  size_t idx; // location of the structural character in the input (buf)
-  uint8_t c;    // used to track the (structural) character we are looking at
-  uint32_t depth = 0; // could have an arbitrary starting depth
-
-  really_inline structural_parser(
-    const uint8_t *_buf,
-    size_t _len,
-    document::parser &_doc_parser,
-    uint32_t _i = 0
-  ) : buf{_buf}, len{_len}, doc_parser{_doc_parser}, i{_i} {}
-
+class structural_iterator {
+public:
+  really_inline structural_iterator(const uint8_t* _buf, size_t _len, const uint32_t *_structural_indexes, size_t next_structural_index)
+    : buf{_buf}, len{_len}, structural_indexes{_structural_indexes}, next_structural{next_structural_index} {}
  really_inline char advance_char() {
-    idx = doc_parser.structural_indexes[i++];
-    c = buf[idx];
+    idx = structural_indexes[next_structural];
+    next_structural++;
+    c = *current();
    return c;
  }
-
+  really_inline char current_char() {
+    return c;
+  }
+  really_inline const uint8_t* current() {
+    return &buf[idx];
+  }
  template<typename F>
  really_inline bool with_space_terminated_copy(const F& f) {
    /**
@ -94,6 +88,36 @@ struct structural_parser {
    free(copy);
    return result;
  }
+  really_inline bool past_end(uint32_t n_structural_indexes) {
+    return next_structural+1 > n_structural_indexes;
+  }
+  really_inline bool at_end(uint32_t n_structural_indexes) {
+    return next_structural+1 == n_structural_indexes;
+  }
+  really_inline size_t next_structural_index() {
+    return next_structural;
+  }
+
+private:
+  const uint8_t* const buf;
+  const size_t len;
+  const uint32_t* const structural_indexes;
+  size_t next_structural; // next structural index
+  size_t idx; // location of the structural character in the input (buf)
+  uint8_t c;  // used to track the (structural) character we are looking at
+};
+
+struct structural_parser {
+  structural_iterator structurals;
+  document::parser &doc_parser;
+  uint32_t depth;
+
+  really_inline structural_parser(
+    const uint8_t *buf,
+    size_t len,
+    document::parser &_doc_parser,
+    uint32_t next_structural = 0
+  ) : structurals(buf, len, _doc_parser.structural_indexes.get(), next_structural), doc_parser{_doc_parser}, depth{0} {}

  WARN_UNUSED really_inline bool start_document(ret_address continue_state) {
    doc_parser.on_start_document(depth);
@ -134,32 +158,32 @@ struct structural_parser {

  WARN_UNUSED really_inline bool parse_string() {
    uint8_t *dst = doc_parser.on_start_string();
-    dst = stringparsing::parse_string(buf, idx, dst);
+    dst = stringparsing::parse_string(structurals.current(), dst);
    if (dst == nullptr) {
      return true;
    }
    return !doc_parser.on_end_string(dst);
  }

-  WARN_UNUSED really_inline bool parse_number(const uint8_t *copy, uint32_t offset, bool found_minus) {
-    return !numberparsing::parse_number(copy, offset, found_minus, doc_parser);
+  WARN_UNUSED really_inline bool parse_number(const uint8_t *src, bool found_minus) {
+    return !numberparsing::parse_number(src, found_minus, doc_parser);
  }
  WARN_UNUSED really_inline bool parse_number(bool found_minus) {
-    return parse_number(buf, idx, found_minus);
+    return parse_number(structurals.current(), found_minus);
  }

-  WARN_UNUSED really_inline bool parse_atom(const uint8_t *copy, uint32_t offset) {
-    switch (c) {
+  WARN_UNUSED really_inline bool parse_atom(const uint8_t *src) {
+    switch (structurals.current_char()) {
      case 't':
-        if (!is_valid_true_atom(copy + offset)) { return true; }
+        if (!is_valid_true_atom(src)) { return true; }
        doc_parser.on_true_atom();
        break;
      case 'f':
-        if (!is_valid_false_atom(copy + offset)) { return true; }
+        if (!is_valid_false_atom(src)) { return true; }
        doc_parser.on_false_atom();
        break;
      case 'n':
-        if (!is_valid_null_atom(copy + offset)) { return true; }
+        if (!is_valid_null_atom(src)) { return true; }
        doc_parser.on_null_atom();
        break;
      default:
@ -169,11 +193,11 @@ struct structural_parser {
  }

  WARN_UNUSED really_inline bool parse_atom() {
-    return parse_atom(buf, idx);
+    return parse_atom(structurals.current());
  }

  WARN_UNUSED really_inline ret_address parse_value(const unified_machine_addresses &addresses, ret_address continue_state) {
-    switch (c) {
+    switch (structurals.current_char()) {
    case '"':
      FAIL_IF( parse_string() );
      return continue_state;
@ -200,7 +224,7 @@ struct structural_parser {

  WARN_UNUSED really_inline error_code finish() {
    // the string might not be NULL terminated.
-    if ( i + 1 != doc_parser.n_structural_indexes ) {
+    if ( !structurals.at_end(doc_parser.n_structural_indexes) ) {
      return doc_parser.on_error(TAPE_ERROR);
    }
    end_document();
@ -228,7 +252,7 @@ struct structural_parser {
    if (depth >= doc_parser.max_depth()) {
      return doc_parser.on_error(DEPTH_ERROR);
    }
-    switch (c) {
+    switch (structurals.current_char()) {
    case '"':
      return doc_parser.on_error(STRING_ERROR);
    case '0':
@ -254,19 +278,23 @@ struct structural_parser {
    }
  }

-  WARN_UNUSED really_inline error_code start(ret_address finish_state) {
+  WARN_UNUSED really_inline error_code start(size_t len, ret_address finish_state) {
    doc_parser.init_stage2(); // sets is_valid to false
    if (len > doc_parser.capacity()) {
      return CAPACITY;
    }
    // Advance to the first character as soon as possible
-    advance_char();
+    structurals.advance_char();
    // Push the root scope (there is always at least one scope)
    if (start_document(finish_state)) {
      return doc_parser.on_error(DEPTH_ERROR);
    }
    return SUCCESS;
  }
+
+  really_inline char advance_char() {
+    return structurals.advance_char();
+  }
 };

 // Redefine FAIL_IF to use goto since it'll be used inside the function now
@ -282,13 +310,13 @@ struct structural_parser {
 WARN_UNUSED error_code implementation::stage2(const uint8_t *buf, size_t len, document::parser &doc_parser) const noexcept {
  static constexpr stage2::unified_machine_addresses addresses = INIT_ADDRESSES();
  stage2::structural_parser parser(buf, len, doc_parser);
-  error_code result = parser.start(addresses.finish);
+  error_code result = parser.start(len, addresses.finish);
  if (result) { return result; }

  //
  // Read first value
  //
-  switch (parser.c) {
+  switch (parser.structurals.current_char()) {
  case '{':
    FAIL_IF( parser.start_object(addresses.finish) );
    goto object_begin;
@ -300,23 +328,23 @@ WARN_UNUSED error_code implementation::stage2(const uint8_t *buf, size_t len, do
    goto finish;
  case 't': case 'f': case 'n':
    FAIL_IF(
-      parser.with_space_terminated_copy([&](auto copy, auto idx) {
-        return parser.parse_atom(copy, idx);
+      parser.structurals.with_space_terminated_copy([&](auto copy, auto idx) {
+        return parser.parse_atom(&copy[idx]);
      })
    );
    goto finish;
  case '0': case '1': case '2': case '3': case '4':
  case '5': case '6': case '7': case '8': case '9':
    FAIL_IF(
-      parser.with_space_terminated_copy([&](auto copy, auto idx) {
-        return parser.parse_number(copy, idx, false);
+      parser.structurals.with_space_terminated_copy([&](auto copy, auto idx) {
+        return parser.parse_number(&copy[idx], false);
      })
    );
    goto finish;
  case '-':
    FAIL_IF(
-      parser.with_space_terminated_copy([&](auto copy, auto idx) {
-        return parser.parse_number(copy, idx, true);
+      parser.structurals.with_space_terminated_copy([&](auto copy, auto idx) {
+        return parser.parse_number(&copy[idx], true);
      })
    );
    goto finish;
@ -328,8 +356,7 @@ WARN_UNUSED error_code implementation::stage2(const uint8_t *buf, size_t len, do
 // Object parser states
 //
 object_begin:
-  parser.advance_char();
-  switch (parser.c) {
+  switch (parser.advance_char()) {
  case '"': {
    FAIL_IF( parser.parse_string() );
    goto object_key_state;
--- a/src/generic/stage2_streaming_build_tape.h
+++ b/src/generic/stage2_streaming_build_tape.h
@ -4,7 +4,7 @@ struct streaming_structural_parser: structural_parser {
  really_inline streaming_structural_parser(const uint8_t *_buf, size_t _len, document::parser &_doc_parser, size_t _i) : structural_parser(_buf, _len, _doc_parser, _i) {}

  // override to add streaming
-  WARN_UNUSED really_inline error_code start(ret_address finish_parser) {
+  WARN_UNUSED really_inline error_code start(UNUSED size_t len, ret_address finish_parser) {
    doc_parser.init_stage2(); // sets is_valid to false
    // Capacity is not a concern for streaming, so we don't check it.
    // Advance to the first character as soon as possible
@ -18,7 +18,7 @@ struct streaming_structural_parser: structural_parser {

  // override to add streaming
  WARN_UNUSED really_inline error_code finish() {
-    if ( i + 1 > doc_parser.n_structural_indexes ) {
+    if ( structurals.past_end(doc_parser.n_structural_indexes) ) {
      return doc_parser.on_error(TAPE_ERROR);
    }
    end_document();
@ -28,7 +28,7 @@ struct streaming_structural_parser: structural_parser {
    if (doc_parser.containing_scope_offset[depth] != 0) {
      return doc_parser.on_error(TAPE_ERROR);
    }
-    bool finished = i + 1 == doc_parser.n_structural_indexes;
+    bool finished = structurals.at_end(doc_parser.n_structural_indexes);
    return doc_parser.on_success(finished ? SUCCESS : SUCCESS_AND_HAS_MORE);
  }
 };
@ -42,12 +42,12 @@ struct streaming_structural_parser: structural_parser {
 WARN_UNUSED error_code implementation::stage2(const uint8_t *buf, size_t len, document::parser &doc_parser, size_t &next_json) const noexcept {
  static constexpr stage2::unified_machine_addresses addresses = INIT_ADDRESSES();
  stage2::streaming_structural_parser parser(buf, len, doc_parser, next_json);
-  error_code result = parser.start(addresses.finish);
+  error_code result = parser.start(len, addresses.finish);
  if (result) { return result; }
  //
  // Read first value
  //
-  switch (parser.c) {
+  switch (parser.structurals.current_char()) {
  case '{':
    FAIL_IF( parser.start_object(addresses.finish) );
    goto object_begin;
@ -59,23 +59,23 @@ WARN_UNUSED error_code implementation::stage2(const uint8_t *buf, size_t len, do
    goto finish;
  case 't': case 'f': case 'n':
    FAIL_IF(
-      parser.with_space_terminated_copy([&](auto copy, auto idx) {
-        return parser.parse_atom(copy, idx);
+      parser.structurals.with_space_terminated_copy([&](auto copy, auto idx) {
+        return parser.parse_atom(&copy[idx]);
      })
    );
    goto finish;
  case '0': case '1': case '2': case '3': case '4':
  case '5': case '6': case '7': case '8': case '9':
    FAIL_IF(
-      parser.with_space_terminated_copy([&](auto copy, auto idx) {
-        return parser.parse_number(copy, idx, false);
+      parser.structurals.with_space_terminated_copy([&](auto copy, auto idx) {
+        return parser.parse_number(&copy[idx], false);
      })
    );
    goto finish;
  case '-':
    FAIL_IF(
-      parser.with_space_terminated_copy([&](auto copy, auto idx) {
-        return parser.parse_number(copy, idx, true);
+      parser.structurals.with_space_terminated_copy([&](auto copy, auto idx) {
+        return parser.parse_number(&copy[idx], true);
      })
    );
    goto finish;
@ -87,8 +87,7 @@ WARN_UNUSED error_code implementation::stage2(const uint8_t *buf, size_t len, do
 // Object parser states
 //
 object_begin:
-  parser.advance_char();
-  switch (parser.c) {
+  switch (parser.advance_char()) {
  case '"': {
    FAIL_IF( parser.parse_string() );
    goto object_key_parser;
@ -148,7 +147,7 @@ array_continue:
  }

 finish:
-  next_json = parser.i;
+  next_json = parser.structurals.next_structural_index();
  return parser.finish();

 error:
--- a/src/generic/stringparsing.h
+++ b/src/generic/stringparsing.h
@ -71,10 +71,9 @@ really_inline bool handle_unicode_codepoint(const uint8_t **src_ptr,
  return offset > 0;
 }

-WARN_UNUSED really_inline uint8_t *parse_string(const uint8_t *buf,
-                                                uint32_t offset,
+WARN_UNUSED really_inline uint8_t *parse_string(const uint8_t *src,
                                                uint8_t *dst) {
-  const uint8_t *src = &buf[offset + 1]; /* we know that buf at offset is a " */
+  src++;
  while (1) {
    parse_string_helper helper = find_bs_bits_and_quote_bits(src, dst);
    if (((helper.bs_bits - 1) & helper.quote_bits) != 0) {