Use SAX model for stage 2

This commit is contained in:
John Keiser 2020-08-03 15:05:30 -07:00
parent 553e6d7549
commit 03d54f8f6e
3 changed files with 245 additions and 159 deletions

View File

@ -28,8 +28,8 @@ namespace logger {
log_depth = 0;
printf("| %-*s | %-*s | %-*s | %-*s | %-*s | Detail |\n", LOG_EVENT_LEN, "Event", LOG_BUFFER_LEN, "Buffer", LOG_SMALL_BUFFER_LEN, "Next", 5, "Next#", 5, "Tape#");
printf("|%.*s|%.*s|%.*s|%.*s|%.*s|--------|\n", LOG_EVENT_LEN+2, DASHES, LOG_BUFFER_LEN+2, DASHES, LOG_SMALL_BUFFER_LEN+2, DASHES, 5+2, DASHES, 5+2, DASHES);
printf("| %-*s | %-*s | %-*s | %-*s | Detail |\n", LOG_EVENT_LEN, "Event", LOG_BUFFER_LEN, "Buffer", LOG_SMALL_BUFFER_LEN, "Next", 5, "Next#");
printf("|%.*s|%.*s|%.*s|%.*s|--------|\n", LOG_EVENT_LEN+2, DASHES, LOG_BUFFER_LEN+2, DASHES, LOG_SMALL_BUFFER_LEN+2, DASHES, 5+2, DASHES);
@ -71,7 +71,7 @@ namespace logger {
} else {
printf("| %-*s ", LOG_INDEX_LEN, "");
printf("| %*u ", LOG_INDEX_LEN, structurals.next_tape_index());
// printf("| %*u ", LOG_INDEX_LEN, structurals.next_tape_index());
printf("| %-s ", detail);

View File

@ -3,103 +3,63 @@
// We assume the file in which this is included already includes
// "simdjson/stage2.h" (this simplifies amalgamation)
#include "generic/stage2/tape_writer.h"
#include "generic/stage2/logger.h"
#include "generic/stage2/atomparsing.h"
#include "generic/stage2/structural_iterator.h"
namespace { // Make everything here private
namespace stage2 {
#define SIMDJSON_TRY(EXPR) { auto _err = (EXPR); if (_err) { return _err; } }
template<typename T>
struct structural_parser : structural_iterator {
/** Lets you append to the tape */
tape_writer tape;
/** Next write location in the string buf for stage 2 parsing */
uint8_t *current_string_buf_loc;
/** Receiver that actually parses the strings and builds the tape */
T builder;
/** Current depth (nested objects and arrays) */
uint32_t depth{0};
// For non-streaming, to pass an explicit 0 as next_structural, which enables optimizations
really_inline structural_parser(dom_parser_implementation &_parser, uint32_t start_structural_index)
: structural_iterator(_parser, start_structural_index),
current_string_buf_loc{parser.doc->string_buf.get()} {
WARN_UNUSED really_inline error_code start_scope(bool is_array) {
if (depth >= parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; }
parser.containing_scope[depth].tape_index = next_tape_index();
parser.containing_scope[depth].count = 0;
tape.skip(); // We don't actually *write* the start element until the end.
parser.is_array[depth] = is_array;
return SUCCESS;
builder{parser.doc->tape.get(), parser.doc->string_buf.get()} {
WARN_UNUSED really_inline error_code start_document() {
parser.containing_scope[depth].tape_index = next_tape_index();
parser.containing_scope[depth].count = 0;
tape.skip(); // We don't actually *write* the start element until the end.
parser.is_array[depth] = false;
return SUCCESS;
WARN_UNUSED really_inline error_code start_object() {
return start_scope(false);
if (depth >= parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; }
parser.is_array[depth] = false;
return SUCCESS;
WARN_UNUSED really_inline error_code start_array() {
return start_scope(true);
if (depth >= parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; }
parser.is_array[depth] = true;
return SUCCESS;
// this function is responsible for annotating the start of the scope
really_inline void end_scope(internal::tape_type start, internal::tape_type end) noexcept {
// SIMDJSON_ASSUME(depth > 0);
// Write the ending tape element, pointing at the start location
const uint32_t start_tape_index = parser.containing_scope[depth].tape_index;
tape.append(start_tape_index, end);
// Write the start tape element, pointing at the end location (and including count)
// count can overflow if it exceeds 24 bits... so we saturate
// the convention being that a cnt of 0xffffff or more is undetermined in value (>= 0xffffff).
const uint32_t count = parser.containing_scope[depth].count;
const uint32_t cntsat = count > 0xFFFFFF ? 0xFFFFFF : count;
tape_writer::write(parser.doc->tape[start_tape_index], next_tape_index() | (uint64_t(cntsat) << 32), start);
really_inline void end_object() {
really_inline uint32_t next_tape_index() {
return uint32_t(tape.next_tape_loc - parser.doc->tape.get());
really_inline void end_object() {
end_scope(internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT);
really_inline void end_array() {
end_scope(internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY);
really_inline void end_document() {
constexpr uint32_t start_tape_index = 0;
tape.append(start_tape_index, internal::tape_type::ROOT);
tape_writer::write(parser.doc->tape[start_tape_index], next_tape_index(), internal::tape_type::ROOT);
really_inline void empty_container(internal::tape_type start, internal::tape_type end) {
auto start_index = next_tape_index();
tape.append(start_index+2, start);
tape.append(start_index, end);
WARN_UNUSED really_inline bool empty_object() {
if (peek_next_char() == '}') {
log_value("empty object");
empty_container(internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT);
return true;
return false;
@ -107,122 +67,45 @@ struct structural_parser : structural_iterator {
WARN_UNUSED really_inline bool empty_array() {
if (peek_next_char() == ']') {
log_value("empty array");
empty_container(internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY);
return true;
return false;
// increment_count increments the count of keys in an object or values in an array.
really_inline void increment_count() {
parser.containing_scope[depth].count++; // we have a key value pair in the object at parser.depth - 1
really_inline uint8_t *on_start_string() noexcept {
// we advance the point, accounting for the fact that we have a NULL termination
tape.append(current_string_buf_loc - parser.doc->string_buf.get(), internal::tape_type::STRING);
return current_string_buf_loc + sizeof(uint32_t);
really_inline void on_end_string(uint8_t *dst) noexcept {
uint32_t str_length = uint32_t(dst - (current_string_buf_loc + sizeof(uint32_t)));
// TODO check for overflow in case someone has a crazy string (>=4GB?)
// But only add the overflow check when the document itself exceeds 4GB
// Currently unneeded because we refuse to parse docs larger or equal to 4GB.
memcpy(current_string_buf_loc, &str_length, sizeof(uint32_t));
// NULL termination is still handy if you expect all your strings to
// be NULL terminated? It comes at a small cost
*dst = 0;
current_string_buf_loc = dst + 1;
WARN_UNUSED really_inline error_code parse_key(const uint8_t *key) {
return parse_string(key, true);
return builder.parse_key(*this, key);
WARN_UNUSED really_inline error_code parse_string(const uint8_t *value, bool key = false) {
log_value(key ? "key" : "string");
uint8_t *dst = on_start_string();
dst = stringparsing::parse_string(value, dst);
if (dst == nullptr) {
log_error("Invalid escape in string");
return SUCCESS;
WARN_UNUSED really_inline error_code parse_string(const uint8_t *value) {
return builder.parse_string(*this, value);
WARN_UNUSED really_inline error_code parse_number(const uint8_t *value) {
if (!numberparsing::parse_number(value, tape)) { log_error("Invalid number"); return NUMBER_ERROR; }
return SUCCESS;
return builder.parse_number(*this, value);
really_inline error_code parse_root_number(const uint8_t *value) {
// We need to make a copy to make sure that the string is space terminated.
// This is not about padding the input, which should already be padded up
// to len + SIMDJSON_PADDING. However, we have no control at this stage
// on how the padding was done. What if the input string was padded with nulls?
// It is quite common for an input string to have an extra null character (C string).
// We do not want to allow 9\0 (where \0 is the null character) inside a JSON
// document, but the string "9\0" by itself is fine. So we make a copy and
// pad the input with spaces when we know that there is just one input element.
// This copy is relatively expensive, but it will almost never be called in
// practice unless you are in the strange scenario where you have many JSON
// documents made of single atoms.
uint8_t *copy = static_cast<uint8_t *>(malloc(remaining_len() + SIMDJSON_PADDING));
if (copy == nullptr) {
return MEMALLOC;
memcpy(copy, value, remaining_len());
memset(copy + remaining_len(), ' ', SIMDJSON_PADDING);
error_code error = parse_number(copy);
return error;
WARN_UNUSED really_inline error_code parse_root_number(const uint8_t *value) {
return builder.parse_root_number(*this, value);
WARN_UNUSED really_inline error_code parse_true_atom(const uint8_t *value) {
if (!atomparsing::is_valid_true_atom(value)) { return T_ATOM_ERROR; }
tape.append(0, internal::tape_type::TRUE_VALUE);
return SUCCESS;
return builder.parse_true_atom(*this, value);
WARN_UNUSED really_inline error_code parse_root_true_atom(const uint8_t *value) {
if (!atomparsing::is_valid_true_atom(value, remaining_len())) { return T_ATOM_ERROR; }
tape.append(0, internal::tape_type::TRUE_VALUE);
return SUCCESS;
return builder.parse_root_true_atom(*this, value);
WARN_UNUSED really_inline error_code parse_false_atom(const uint8_t *value) {
if (!atomparsing::is_valid_false_atom(value)) { return F_ATOM_ERROR; }
tape.append(0, internal::tape_type::FALSE_VALUE);
return SUCCESS;
return builder.parse_false_atom(*this, value);
WARN_UNUSED really_inline error_code parse_root_false_atom(const uint8_t *value) {
if (!atomparsing::is_valid_false_atom(value, remaining_len())) { return F_ATOM_ERROR; }
tape.append(0, internal::tape_type::FALSE_VALUE);
return SUCCESS;
return builder.parse_root_false_atom(*this, value);
WARN_UNUSED really_inline error_code parse_null_atom(const uint8_t *value) {
if (!atomparsing::is_valid_null_atom(value)) { return N_ATOM_ERROR; }
tape.append(0, internal::tape_type::NULL_VALUE);
return SUCCESS;
return builder.parse_null_atom(*this, value);
WARN_UNUSED really_inline error_code parse_root_null_atom(const uint8_t *value) {
if (!atomparsing::is_valid_null_atom(value, remaining_len())) { return N_ATOM_ERROR; }
tape.append(0, internal::tape_type::NULL_VALUE);
return SUCCESS;
return builder.parse_root_null_atom(*this, value);
WARN_UNUSED really_inline error_code start() {
@ -266,12 +149,20 @@ struct structural_parser : structural_iterator {
}; // struct structural_parser
#define SIMDJSON_TRY(EXPR) { auto _err = (EXPR); if (_err) { return _err; } }
} // namespace stage2
} // unnamed namespace
#include "generic/stage2/tape_builder.h"
namespace { // Make everything here private
namespace stage2 {
template<bool STREAMING>
WARN_UNUSED static really_inline error_code parse_structurals(dom_parser_implementation &dom_parser, dom::document &doc) noexcept {
dom_parser.doc = &doc;
stage2::structural_parser parser(dom_parser, STREAMING ? dom_parser.next_structural_index : 0);
stage2::structural_parser<stage2::tape_builder> parser(dom_parser, STREAMING ? dom_parser.next_structural_index : 0);
SIMDJSON_TRY( parser.start() );

View File

@ -0,0 +1,195 @@
#include "generic/stage2/tape_writer.h"
#include "generic/stage2/atomparsing.h"
namespace {
namespace stage2 {
// Stage 2 handler that turns parser events into tape elements and string-buffer
// contents. structural_parser<tape_builder> forwards its parse_* calls here,
// passing itself as the first argument so this struct can reach the underlying
// dom parser (parser.parser), the current depth and the logging helpers.
//
// NOTE(review): this listing appears to omit closing braces and a few
// statements; the notes below flag places where the visible code looks
// incomplete. Confirm against the full file before relying on them.
struct tape_builder {
/** Next location to write to tape */
tape_writer tape;
/** Next write location in the string buf for stage 2 parsing */
uint8_t *current_string_buf_loc;
// Logs "empty object" and emits the START_OBJECT/END_OBJECT pair via empty_container().
really_inline void empty_object(structural_parser<tape_builder> &parser) {
parser.log_value("empty object");
empty_container(parser, internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT);
// Logs "empty array" and emits the START_ARRAY/END_ARRAY pair via empty_container().
really_inline void empty_array(structural_parser<tape_builder> &parser) {
parser.log_value("empty array");
empty_container(parser, internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY);
// Scope-opening callbacks for the document root, objects and arrays.
// NOTE(review): their bodies are not visible in this listing; presumably each
// one calls start_container(parser) -- confirm.
really_inline void start_document(structural_parser<tape_builder> &parser) {
really_inline void start_object(structural_parser<tape_builder> &parser) {
really_inline void start_array(structural_parser<tape_builder> &parser) {
// Closes the object at parser.depth: see end_container().
really_inline void end_object(structural_parser<tape_builder> &parser) {
end_container(parser, internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT);
// Closes the array at parser.depth: see end_container().
really_inline void end_array(structural_parser<tape_builder> &parser) {
end_container(parser, internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY);
// Finishes the document: appends a trailing ROOT element carrying index 0, then
// rewrites tape[0] as a ROOT element carrying the tape index that follows the
// trailing element.
really_inline void end_document(structural_parser<tape_builder> &parser) {
constexpr uint32_t start_tape_index = 0;
tape.append(start_tape_index, internal::tape_type::ROOT);
tape_writer::write(parser.parser.doc->tape[start_tape_index], next_tape_index(parser), internal::tape_type::ROOT);
// Parses an object key. Identical to parse_string() except that the value is
// logged as "key" instead of "string".
WARN_UNUSED really_inline error_code parse_key(structural_parser<tape_builder> &parser, const uint8_t *value) {
return parse_string(parser, value, true);
// Parses the string at `value` into the string buffer and records it on the tape.
// `key` only selects the label passed to log_value().
// stringparsing::parse_string() returning nullptr is reported as
// "Invalid escape in string".
// NOTE(review): as shown, the nullptr branch falls through to `return SUCCESS`
// and on_end_string() is never called; an error return and the on_end_string(dst)
// call appear to be missing from this listing -- confirm.
WARN_UNUSED really_inline error_code parse_string(structural_parser<tape_builder> &parser, const uint8_t *value, bool key = false) {
parser.log_value(key ? "key" : "string");
uint8_t *dst = on_start_string(parser);
dst = stringparsing::parse_string(value, dst);
if (dst == nullptr) {
parser.log_error("Invalid escape in string");
return SUCCESS;
// Parses the number at `value`, handing `tape` to numberparsing::parse_number().
// Returns NUMBER_ERROR (after logging "Invalid number") when that call reports
// failure, SUCCESS otherwise.
WARN_UNUSED really_inline error_code parse_number(structural_parser<tape_builder> &parser, const uint8_t *value) {
if (!numberparsing::parse_number(value, tape)) { parser.log_error("Invalid number"); return NUMBER_ERROR; }
return SUCCESS;
// Parses a number that is the entire document, working on a space-padded copy.
// Returns MEMALLOC if the copy cannot be allocated, otherwise the result of
// parse_number() on the copy.
// NOTE(review): unlike the other parse_* members this one is not marked
// WARN_UNUSED, and no free(copy) is visible in this listing -- confirm the
// buffer is released in the full file.
really_inline error_code parse_root_number(structural_parser<tape_builder> &parser, const uint8_t *value) {
// We need to make a copy to make sure that the string is space terminated.
// This is not about padding the input, which should already be padded up
// to len + SIMDJSON_PADDING. However, we have no control at this stage
// on how the padding was done. What if the input string was padded with nulls?
// It is quite common for an input string to have an extra null character (C string).
// We do not want to allow 9\0 (where \0 is the null character) inside a JSON
// document, but the string "9\0" by itself is fine. So we make a copy and
// pad the input with spaces when we know that there is just one input element.
// This copy is relatively expensive, but it will almost never be called in
// practice unless you are in the strange scenario where you have many JSON
// documents made of single atoms.
uint8_t *copy = static_cast<uint8_t *>(malloc(parser.remaining_len() + SIMDJSON_PADDING));
if (copy == nullptr) {
return MEMALLOC;
memcpy(copy, value, parser.remaining_len());
memset(copy + parser.remaining_len(), ' ', SIMDJSON_PADDING);
error_code error = parse_number(parser, copy);
return error;
// Atom handlers. Each validates the literal at `value` with the matching
// atomparsing::is_valid_*_atom() check and, on success, appends a tape element
// of the corresponding type with payload 0. On failure the atom-specific error
// code is returned and nothing is appended. The parse_root_* variants
// additionally pass parser.remaining_len() to the validity check.
WARN_UNUSED really_inline error_code parse_true_atom(structural_parser<tape_builder> &parser, const uint8_t *value) {
if (!atomparsing::is_valid_true_atom(value)) { return T_ATOM_ERROR; }
tape.append(0, internal::tape_type::TRUE_VALUE);
return SUCCESS;
WARN_UNUSED really_inline error_code parse_root_true_atom(structural_parser<tape_builder> &parser, const uint8_t *value) {
if (!atomparsing::is_valid_true_atom(value, parser.remaining_len())) { return T_ATOM_ERROR; }
tape.append(0, internal::tape_type::TRUE_VALUE);
return SUCCESS;
WARN_UNUSED really_inline error_code parse_false_atom(structural_parser<tape_builder> &parser, const uint8_t *value) {
if (!atomparsing::is_valid_false_atom(value)) { return F_ATOM_ERROR; }
tape.append(0, internal::tape_type::FALSE_VALUE);
return SUCCESS;
WARN_UNUSED really_inline error_code parse_root_false_atom(structural_parser<tape_builder> &parser, const uint8_t *value) {
if (!atomparsing::is_valid_false_atom(value, parser.remaining_len())) { return F_ATOM_ERROR; }
tape.append(0, internal::tape_type::FALSE_VALUE);
return SUCCESS;
WARN_UNUSED really_inline error_code parse_null_atom(structural_parser<tape_builder> &parser, const uint8_t *value) {
if (!atomparsing::is_valid_null_atom(value)) { return N_ATOM_ERROR; }
tape.append(0, internal::tape_type::NULL_VALUE);
return SUCCESS;
WARN_UNUSED really_inline error_code parse_root_null_atom(structural_parser<tape_builder> &parser, const uint8_t *value) {
if (!atomparsing::is_valid_null_atom(value, parser.remaining_len())) { return N_ATOM_ERROR; }
tape.append(0, internal::tape_type::NULL_VALUE);
return SUCCESS;
// increment_count increments the count of keys in an object or values in an array.
really_inline void increment_count(structural_parser<tape_builder> &parser) {
parser.parser.containing_scope[parser.depth].count++; // we have a key value pair in the object at parser.parser.depth - 1
// private:
// Index of the next tape slot to be written, i.e. the distance from the start of
// the document tape to tape.next_tape_loc, narrowed to 32 bits.
really_inline uint32_t next_tape_index(structural_parser<tape_builder> &parser) {
return uint32_t(tape.next_tape_loc - parser.parser.doc->tape.get());
// Emits a start/end pair for a container with no elements: the start element
// carries start_index+2 and the end element carries start_index.
really_inline void empty_container(structural_parser<tape_builder> &parser, internal::tape_type start, internal::tape_type end) {
auto start_index = next_tape_index(parser);
tape.append(start_index+2, start);
tape.append(start_index, end);
// Opens a scope at parser.depth: remembers the tape index where the start
// element belongs, resets the element count, and reserves the slot.
really_inline void start_container(structural_parser<tape_builder> &parser) {
parser.parser.containing_scope[parser.depth].tape_index = next_tape_index(parser);
parser.parser.containing_scope[parser.depth].count = 0;
tape.skip(); // We don't actually *write* the start element until the end.
// Closes the scope at parser.depth: appends the `end` element and back-fills the
// `start` element in the slot recorded in containing_scope[parser.depth].
really_inline void end_container(structural_parser<tape_builder> &parser, internal::tape_type start, internal::tape_type end) noexcept {
// Write the ending tape element, pointing at the start location
const uint32_t start_tape_index = parser.parser.containing_scope[parser.depth].tape_index;
tape.append(start_tape_index, end);
// Write the start tape element, pointing at the end location (and including count)
// count can overflow if it exceeds 24 bits... so we saturate
// the convention being that a cnt of 0xffffff or more is undetermined in value (>= 0xffffff).
const uint32_t count = parser.parser.containing_scope[parser.depth].count;
const uint32_t cntsat = count > 0xFFFFFF ? 0xFFFFFF : count;
tape_writer::write(parser.parser.doc->tape[start_tape_index], next_tape_index(parser) | (uint64_t(cntsat) << 32), start);
// Records a STRING tape element holding the offset of current_string_buf_loc
// within the document's string buffer, and returns the address where the string
// bytes should be written: sizeof(uint32_t) bytes further on, leaving room for
// the length prefix filled in by on_end_string().
really_inline uint8_t *on_start_string(structural_parser<tape_builder> &parser) noexcept {
// we advance the point, accounting for the fact that we have a NULL termination
tape.append(current_string_buf_loc - parser.parser.doc->string_buf.get(), internal::tape_type::STRING);
return current_string_buf_loc + sizeof(uint32_t);
// Completes the string whose bytes end at `dst`: stores the 32-bit length in the
// prefix slot at current_string_buf_loc, writes a terminating 0 at `dst`, and
// moves current_string_buf_loc just past that terminator.
really_inline void on_end_string(uint8_t *dst) noexcept {
uint32_t str_length = uint32_t(dst - (current_string_buf_loc + sizeof(uint32_t)));
// TODO check for overflow in case someone has a crazy string (>=4GB?)
// But only add the overflow check when the document itself exceeds 4GB
// Currently unneeded because we refuse to parse docs larger or equal to 4GB.
memcpy(current_string_buf_loc, &str_length, sizeof(uint32_t));
// NULL termination is still handy if you expect all your strings to
// be NULL terminated? It comes at a small cost
*dst = 0;
current_string_buf_loc = dst + 1;
}; // class tape_builder
} // namespace stage2
} // unnamed namespace