Merge structural_parser+iterator into json_iterator

This commit is contained in:
John Keiser 2020-08-06 12:53:36 -07:00
parent a67e83e24e
commit 6bb99aec3c
3 changed files with 76 additions and 46 deletions

View File

@ -1,27 +1,45 @@
// This file contains the common code every implementation uses for stage2
// It is intended to be included multiple times and compiled multiple times
// We assume the file in which it is included already includes
// "simdjson/stage2.h" (this simplifies amalgamation)
#include "generic/stage2/logger.h" #include "generic/stage2/logger.h"
#include "generic/stage2/structural_iterator.h"
namespace { // Make everything here private namespace {
namespace SIMDJSON_IMPLEMENTATION { namespace SIMDJSON_IMPLEMENTATION {
namespace stage2 { namespace stage2 {
#define SIMDJSON_TRY(EXPR) { auto _err = (EXPR); if (_err) { return _err; } } class json_iterator {
public:
struct structural_parser : structural_iterator { const uint8_t* const buf;
/** Current depth (nested objects and arrays) */ uint32_t *next_structural;
dom_parser_implementation &dom_parser;
uint32_t depth{0}; uint32_t depth{0};
template<bool STREAMING, typename T> template<bool STREAMING, typename T>
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code walk_document(T &visitor) noexcept; SIMDJSON_WARN_UNUSED simdjson_really_inline error_code walk_document(T &visitor) noexcept;
// For non-streaming, to pass an explicit 0 as next_structural, which enables optimizations // Start a structural
simdjson_really_inline structural_parser(dom_parser_implementation &_dom_parser, uint32_t start_structural_index) simdjson_really_inline json_iterator(dom_parser_implementation &_dom_parser, size_t start_structural_index)
: structural_iterator(_dom_parser, start_structural_index) { : buf{_dom_parser.buf},
next_structural{&_dom_parser.structural_indexes[start_structural_index]},
dom_parser{_dom_parser} {
}
// Look at the current structural character without consuming it: returns the
// byte of buf at the index next_structural currently points to (converted from
// uint8_t to char). next_structural itself is left unchanged.
simdjson_really_inline char peek_next_char() {
return buf[*(next_structural)];
}
// Consume one structural: returns a pointer into buf at the index that
// next_structural currently points to, then moves next_structural forward by
// one entry (post-increment).
simdjson_really_inline const uint8_t* advance() {
return &buf[*(next_structural++)];
}
// Consume one structural: returns the byte of buf at the index that
// next_structural currently points to (as a char), then moves next_structural
// forward by one entry (post-increment).
simdjson_really_inline char advance_char() {
return buf[*(next_structural++)];
}
// Returns dom_parser.len minus the buffer index stored in the entry just
// before next_structural, i.e. measured from the most recently consumed
// structural rather than from the one next_structural points to.
// NOTE(review): this reads next_structural[-1], so it presumably must only be
// called after at least one advance()/advance_char() -- confirm against callers.
simdjson_really_inline size_t remaining_len() {
return dom_parser.len - *(next_structural-1);
}
// True when next_structural points at
// dom_parser.structural_indexes[dom_parser.n_structural_indexes].
// NOTE(review): presumably that slot is one past the last structural of the
// input -- confirm against how n_structural_indexes is set.
simdjson_really_inline bool at_end() {
return next_structural == &dom_parser.structural_indexes[dom_parser.n_structural_indexes];
}
// True when next_structural still points at the first entry of
// dom_parser.structural_indexes (the pointer returned by .get()).
simdjson_really_inline bool at_beginning() {
return next_structural == dom_parser.structural_indexes.get();
} }
template<typename T> template<typename T>
@ -64,11 +82,11 @@ struct structural_parser : structural_iterator {
simdjson_really_inline void log_error(const char *error) { simdjson_really_inline void log_error(const char *error) {
logger::log_line(*this, "", "ERROR", error); logger::log_line(*this, "", "ERROR", error);
} }
}; // struct structural_parser };
template<bool STREAMING, typename T> template<bool STREAMING, typename T>
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code structural_parser::walk_document(T &visitor) noexcept { SIMDJSON_WARN_UNUSED simdjson_really_inline error_code json_iterator::walk_document(T &visitor) noexcept {
const uint8_t *value; const uint8_t *value; // Used to keep a value around between states
logger::log_start(); logger::log_start();

View File

@ -1,4 +1,4 @@
#include "generic/stage2/structural_parser.h" #include "generic/stage2/json_iterator.h"
#include "generic/stage2/tape_writer.h" #include "generic/stage2/tape_writer.h"
#include "generic/stage2/atomparsing.h" #include "generic/stage2/atomparsing.h"
@ -12,12 +12,12 @@ struct tape_builder {
dom_parser_implementation &dom_parser, dom_parser_implementation &dom_parser,
dom::document &doc) noexcept { dom::document &doc) noexcept {
dom_parser.doc = &doc; dom_parser.doc = &doc;
structural_parser iter(dom_parser, STREAMING ? dom_parser.next_structural_index : 0); json_iterator iter(dom_parser, STREAMING ? dom_parser.next_structural_index : 0);
tape_builder builder(doc); tape_builder builder(doc);
return iter.walk_document<STREAMING>(builder); return iter.walk_document<STREAMING>(builder);
} }
simdjson_really_inline error_code root_primitive(structural_parser &iter, const uint8_t *value) { simdjson_really_inline error_code root_primitive(json_iterator &iter, const uint8_t *value) {
switch (*value) { switch (*value) {
case '"': return parse_string(iter, value); case '"': return parse_string(iter, value);
case 't': return parse_root_true_atom(iter, value); case 't': return parse_root_true_atom(iter, value);
@ -32,7 +32,7 @@ struct tape_builder {
return TAPE_ERROR; return TAPE_ERROR;
} }
} }
simdjson_really_inline error_code primitive(structural_parser &iter, const uint8_t *value) { simdjson_really_inline error_code primitive(json_iterator &iter, const uint8_t *value) {
switch (*value) { switch (*value) {
case '"': return parse_string(iter, value); case '"': return parse_string(iter, value);
case 't': return parse_true_atom(iter, value); case 't': return parse_true_atom(iter, value);
@ -47,54 +47,64 @@ struct tape_builder {
return TAPE_ERROR; return TAPE_ERROR;
} }
} }
simdjson_really_inline void empty_object(structural_parser &iter) { simdjson_really_inline void empty_object(json_iterator &iter) {
iter.log_value("empty object"); iter.log_value("empty object");
empty_container(iter, internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT); empty_container(iter, internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT);
} }
simdjson_really_inline void empty_array(structural_parser &iter) { simdjson_really_inline void empty_array(json_iterator &iter) {
iter.log_value("empty array"); iter.log_value("empty array");
empty_container(iter, internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY); empty_container(iter, internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY);
} }
simdjson_really_inline void start_document(structural_parser &iter) { simdjson_really_inline void start_document(json_iterator &iter) {
iter.log_start_value("document"); iter.log_start_value("document");
start_container(iter); start_container(iter);
iter.dom_parser.is_array[iter.depth] = false; iter.dom_parser.is_array[iter.depth] = false;
} }
simdjson_really_inline void start_object(structural_parser &iter) { simdjson_really_inline void start_object(json_iterator &iter) {
iter.log_start_value("object"); iter.log_start_value("object");
start_container(iter); start_container(iter);
iter.dom_parser.is_array[iter.depth] = false; iter.dom_parser.is_array[iter.depth] = false;
} }
simdjson_really_inline void start_array(structural_parser &iter) { simdjson_really_inline void start_array(json_iterator &iter) {
iter.log_start_value("array"); iter.log_start_value("array");
start_container(iter); start_container(iter);
iter.dom_parser.is_array[iter.depth] = true; iter.dom_parser.is_array[iter.depth] = true;
} }
simdjson_really_inline void end_object(structural_parser &iter) { simdjson_really_inline void end_object(json_iterator &iter) {
iter.log_end_value("object"); iter.log_end_value("object");
end_container(iter, internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT); end_container(iter, internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT);
} }
simdjson_really_inline void end_array(structural_parser &iter) { simdjson_really_inline void end_array(json_iterator &iter) {
iter.log_end_value("array"); iter.log_end_value("array");
end_container(iter, internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY); end_container(iter, internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY);
} }
simdjson_really_inline void end_document(structural_parser &iter) { simdjson_really_inline void end_document(json_iterator &iter) {
iter.log_end_value("document"); iter.log_end_value("document");
constexpr uint32_t start_tape_index = 0; constexpr uint32_t start_tape_index = 0;
tape.append(start_tape_index, internal::tape_type::ROOT); tape.append(start_tape_index, internal::tape_type::ROOT);
tape_writer::write(iter.dom_parser.doc->tape[start_tape_index], next_tape_index(iter), internal::tape_type::ROOT); tape_writer::write(iter.dom_parser.doc->tape[start_tape_index], next_tape_index(iter), internal::tape_type::ROOT);
} }
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code key(structural_parser &iter, const uint8_t *key) { SIMDJSON_WARN_UNUSED simdjson_really_inline error_code key(json_iterator &iter, const uint8_t *key) {
return parse_string(iter, key, true); return parse_string(iter, key, true);
} }
// Called after end_object/end_array. Not called after empty_object/empty_array,
// as the parent is already known in those cases.
//
// The object returned from end_container() should support the in_container(),
// in_array() and in_object() methods, allowing the iterator to branch to the
// correct place.
simdjson_really_inline tape_builder &end_container(json_iterator &iter) {
// Step the iterator's nesting depth back out by one level.
iter.depth--;
// Return this builder itself so the caller can use its container queries
// (e.g. in_array()) on the returned object.
return *this;
}
// increment_count increments the count of keys in an object or values in an array. // increment_count increments the count of keys in an object or values in an array.
simdjson_really_inline void increment_count(structural_parser &iter) { simdjson_really_inline void increment_count(json_iterator &iter) {
iter.dom_parser.open_containers[iter.depth].count++; // we have a key value pair in the object at parser.dom_parser.depth - 1 iter.dom_parser.open_containers[iter.depth].count++; // we have a key value pair in the object at parser.dom_parser.depth - 1
} }
simdjson_really_inline bool in_array(structural_parser &iter) noexcept { simdjson_really_inline bool in_array(json_iterator &iter) noexcept {
return iter.dom_parser.is_array[iter.depth]; return iter.dom_parser.is_array[iter.depth];
} }
@ -106,7 +116,7 @@ private:
simdjson_really_inline tape_builder(dom::document &doc) noexcept : tape{doc.tape.get()}, current_string_buf_loc{doc.string_buf.get()} {} simdjson_really_inline tape_builder(dom::document &doc) noexcept : tape{doc.tape.get()}, current_string_buf_loc{doc.string_buf.get()} {}
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code parse_string(structural_parser &iter, const uint8_t *value, bool key = false) { SIMDJSON_WARN_UNUSED simdjson_really_inline error_code parse_string(json_iterator &iter, const uint8_t *value, bool key = false) {
iter.log_value(key ? "key" : "string"); iter.log_value(key ? "key" : "string");
uint8_t *dst = on_start_string(iter); uint8_t *dst = on_start_string(iter);
dst = stringparsing::parse_string(value, dst); dst = stringparsing::parse_string(value, dst);
@ -118,13 +128,13 @@ private:
return SUCCESS; return SUCCESS;
} }
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code parse_number(structural_parser &iter, const uint8_t *value) { SIMDJSON_WARN_UNUSED simdjson_really_inline error_code parse_number(json_iterator &iter, const uint8_t *value) {
iter.log_value("number"); iter.log_value("number");
if (!numberparsing::parse_number(value, tape)) { iter.log_error("Invalid number"); return NUMBER_ERROR; } if (!numberparsing::parse_number(value, tape)) { iter.log_error("Invalid number"); return NUMBER_ERROR; }
return SUCCESS; return SUCCESS;
} }
simdjson_really_inline error_code parse_root_number(structural_parser &iter, const uint8_t *value) { simdjson_really_inline error_code parse_root_number(json_iterator &iter, const uint8_t *value) {
// //
// We need to make a copy to make sure that the string is space terminated. // We need to make a copy to make sure that the string is space terminated.
// This is not about padding the input, which should already padded up // This is not about padding the input, which should already padded up
@ -149,42 +159,42 @@ private:
return error; return error;
} }
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code parse_true_atom(structural_parser &iter, const uint8_t *value) { SIMDJSON_WARN_UNUSED simdjson_really_inline error_code parse_true_atom(json_iterator &iter, const uint8_t *value) {
iter.log_value("true"); iter.log_value("true");
if (!atomparsing::is_valid_true_atom(value)) { return T_ATOM_ERROR; } if (!atomparsing::is_valid_true_atom(value)) { return T_ATOM_ERROR; }
tape.append(0, internal::tape_type::TRUE_VALUE); tape.append(0, internal::tape_type::TRUE_VALUE);
return SUCCESS; return SUCCESS;
} }
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code parse_root_true_atom(structural_parser &iter, const uint8_t *value) { SIMDJSON_WARN_UNUSED simdjson_really_inline error_code parse_root_true_atom(json_iterator &iter, const uint8_t *value) {
iter.log_value("true"); iter.log_value("true");
if (!atomparsing::is_valid_true_atom(value, iter.remaining_len())) { return T_ATOM_ERROR; } if (!atomparsing::is_valid_true_atom(value, iter.remaining_len())) { return T_ATOM_ERROR; }
tape.append(0, internal::tape_type::TRUE_VALUE); tape.append(0, internal::tape_type::TRUE_VALUE);
return SUCCESS; return SUCCESS;
} }
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code parse_false_atom(structural_parser &iter, const uint8_t *value) { SIMDJSON_WARN_UNUSED simdjson_really_inline error_code parse_false_atom(json_iterator &iter, const uint8_t *value) {
iter.log_value("false"); iter.log_value("false");
if (!atomparsing::is_valid_false_atom(value)) { return F_ATOM_ERROR; } if (!atomparsing::is_valid_false_atom(value)) { return F_ATOM_ERROR; }
tape.append(0, internal::tape_type::FALSE_VALUE); tape.append(0, internal::tape_type::FALSE_VALUE);
return SUCCESS; return SUCCESS;
} }
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code parse_root_false_atom(structural_parser &iter, const uint8_t *value) { SIMDJSON_WARN_UNUSED simdjson_really_inline error_code parse_root_false_atom(json_iterator &iter, const uint8_t *value) {
iter.log_value("false"); iter.log_value("false");
if (!atomparsing::is_valid_false_atom(value, iter.remaining_len())) { return F_ATOM_ERROR; } if (!atomparsing::is_valid_false_atom(value, iter.remaining_len())) { return F_ATOM_ERROR; }
tape.append(0, internal::tape_type::FALSE_VALUE); tape.append(0, internal::tape_type::FALSE_VALUE);
return SUCCESS; return SUCCESS;
} }
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code parse_null_atom(structural_parser &iter, const uint8_t *value) { SIMDJSON_WARN_UNUSED simdjson_really_inline error_code parse_null_atom(json_iterator &iter, const uint8_t *value) {
iter.log_value("null"); iter.log_value("null");
if (!atomparsing::is_valid_null_atom(value)) { return N_ATOM_ERROR; } if (!atomparsing::is_valid_null_atom(value)) { return N_ATOM_ERROR; }
tape.append(0, internal::tape_type::NULL_VALUE); tape.append(0, internal::tape_type::NULL_VALUE);
return SUCCESS; return SUCCESS;
} }
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code parse_root_null_atom(structural_parser &iter, const uint8_t *value) { SIMDJSON_WARN_UNUSED simdjson_really_inline error_code parse_root_null_atom(json_iterator &iter, const uint8_t *value) {
iter.log_value("null"); iter.log_value("null");
if (!atomparsing::is_valid_null_atom(value, iter.remaining_len())) { return N_ATOM_ERROR; } if (!atomparsing::is_valid_null_atom(value, iter.remaining_len())) { return N_ATOM_ERROR; }
tape.append(0, internal::tape_type::NULL_VALUE); tape.append(0, internal::tape_type::NULL_VALUE);
@ -193,23 +203,23 @@ private:
// private: // private:
simdjson_really_inline uint32_t next_tape_index(structural_parser &iter) { simdjson_really_inline uint32_t next_tape_index(json_iterator &iter) {
return uint32_t(tape.next_tape_loc - iter.dom_parser.doc->tape.get()); return uint32_t(tape.next_tape_loc - iter.dom_parser.doc->tape.get());
} }
simdjson_really_inline void empty_container(structural_parser &iter, internal::tape_type start, internal::tape_type end) { simdjson_really_inline void empty_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) {
auto start_index = next_tape_index(iter); auto start_index = next_tape_index(iter);
tape.append(start_index+2, start); tape.append(start_index+2, start);
tape.append(start_index, end); tape.append(start_index, end);
} }
simdjson_really_inline void start_container(structural_parser &iter) { simdjson_really_inline void start_container(json_iterator &iter) {
iter.dom_parser.open_containers[iter.depth].tape_index = next_tape_index(iter); iter.dom_parser.open_containers[iter.depth].tape_index = next_tape_index(iter);
iter.dom_parser.open_containers[iter.depth].count = 0; iter.dom_parser.open_containers[iter.depth].count = 0;
tape.skip(); // We don't actually *write* the start element until the end. tape.skip(); // We don't actually *write* the start element until the end.
} }
simdjson_really_inline void end_container(structural_parser &iter, internal::tape_type start, internal::tape_type end) noexcept { simdjson_really_inline void end_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept {
// Write the ending tape element, pointing at the start location // Write the ending tape element, pointing at the start location
const uint32_t start_tape_index = iter.dom_parser.open_containers[iter.depth].tape_index; const uint32_t start_tape_index = iter.dom_parser.open_containers[iter.depth].tape_index;
tape.append(start_tape_index, end); tape.append(start_tape_index, end);
@ -221,7 +231,7 @@ private:
tape_writer::write(iter.dom_parser.doc->tape[start_tape_index], next_tape_index(iter) | (uint64_t(cntsat) << 32), start); tape_writer::write(iter.dom_parser.doc->tape[start_tape_index], next_tape_index(iter) | (uint64_t(cntsat) << 32), start);
} }
simdjson_really_inline uint8_t *on_start_string(structural_parser &iter) noexcept { simdjson_really_inline uint8_t *on_start_string(json_iterator &iter) noexcept {
// we advance the point, accounting for the fact that we have a NULL termination // we advance the point, accounting for the fact that we have a NULL termination
tape.append(current_string_buf_loc - iter.dom_parser.doc->string_buf.get(), internal::tape_type::STRING); tape.append(current_string_buf_loc - iter.dom_parser.doc->string_buf.get(), internal::tape_type::STRING);
return current_string_buf_loc + sizeof(uint32_t); return current_string_buf_loc + sizeof(uint32_t);

View File

@ -4,6 +4,8 @@
#include <initializer_list> #include <initializer_list>
// Evaluates EXPR exactly once; if the result is truthy (an error), returns it
// from the enclosing function, otherwise execution continues after the macro.
// NOTE: this expands to a bare { ... } block, so writing
// `if (c) SIMDJSON_TRY(x); else ...` will not parse -- wrap such uses in braces.
#define SIMDJSON_TRY(EXPR) { auto _err = (EXPR); if (_err) { return _err; } }
// Static array of known implementations. We're hoping these get baked into the executable // Static array of known implementations. We're hoping these get baked into the executable
// without requiring a static initializer. // without requiring a static initializer.