Merge pull request #915 from simdjson/jkeiser/stage2-common

[2/4] Use same state machine for stage 2 streaming and non-streaming
This commit is contained in:
John Keiser 2020-06-10 08:37:08 -07:00 committed by GitHub
commit b4837f2e2f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 56 additions and 198 deletions

View File

@ -108,7 +108,6 @@ namespace arm64 {
#include "generic/stage2/atomparsing.h"
#include "generic/stage2/structural_iterator.h"
#include "generic/stage2/structural_parser.h"
#include "generic/stage2/streaming_structural_parser.h"
WARN_UNUSED error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept {
error_code err = stage1(_buf, _len, false);

View File

@ -260,7 +260,6 @@ namespace fallback {
#include "generic/stage2/atomparsing.h"
#include "generic/stage2/structural_iterator.h"
#include "generic/stage2/structural_parser.h"
#include "generic/stage2/streaming_structural_parser.h"
WARN_UNUSED error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept {
error_code err = stage1(_buf, _len, false);

View File

@ -30,6 +30,12 @@ namespace logger {
}
}
// Writes `message` followed by a newline via printf.
// Does nothing when LOG_ENABLED is false.
static really_inline void log_string(const char *message) {
  if (!(LOG_ENABLED)) { return; }
  printf("%s\n", message);
}
// Logs a single line with the given title (prefixed by title_prefix) and detail
template<typename S>
static really_inline void log_line(S &structurals, const char *title_prefix, const char *title, const char *detail) {

View File

@ -1,168 +0,0 @@
namespace stage2 {
/**
 * structural_parser variant used when several documents are parsed out of one
 * set of structural indexes: it begins at the dom parser's saved
 * next_structural_index and, on finish(), stores the updated index back so the
 * following call can continue from there.
 *
 * start() and finish() are non-virtual; they hide the base-class versions.
 */
struct streaming_structural_parser: structural_parser {
  // Begin at the structural index recorded in the dom parser.
  really_inline streaming_structural_parser(dom_parser_implementation &_parser)
    : structural_parser(_parser, _parser.next_structural_index) {}

  /**
   * Prepares the parser for one document.
   *
   * @param finish_parser return address handed to start_document() for the root scope.
   * @return EMPTY when no structurals are left, DEPTH_ERROR when start_document()
   *         reports failure, SUCCESS otherwise. Errors are also stored in parser.error.
   */
  WARN_UNUSED really_inline error_code start(ret_address_t finish_parser) {
    const bool no_structurals_left = structurals.at_end(parser.n_structural_indexes);
    if (no_structurals_left) {
      parser.error = EMPTY;
      return parser.error;
    }

    log_start();
    init();

    // Unlike the non-streaming start(), no capacity check is performed here.
    // Move onto the first character right away.
    advance_char();

    // The root scope always exists; push it now.
    const bool root_scope_failed = start_document(finish_parser);
    if (root_scope_failed) {
      parser.error = DEPTH_ERROR;
      return parser.error;
    }
    return SUCCESS;
  }

  /**
   * Closes the current document and records where the next one starts.
   *
   * @return TAPE_ERROR (also stored in parser.error) when the iterator ran past
   *         the end, when scopes are still open, or when the root scope's tape
   *         index is not 0; SUCCESS otherwise.
   */
  WARN_UNUSED really_inline error_code finish() {
    const bool ran_past_end = structurals.past_end(parser.n_structural_indexes);
    if (ran_past_end) {
      log_error("IMPOSSIBLE: past the end of the JSON!");
      parser.error = TAPE_ERROR;
      return parser.error;
    }

    end_document();
    // Remember the position so the next streaming call resumes after this document.
    parser.next_structural_index = static_cast<uint32_t>(structurals.next_structural_index());

    if (depth != 0) {
      log_error("Unclosed objects or arrays!");
      parser.error = TAPE_ERROR;
      return parser.error;
    }

    const bool root_tape_misplaced = (parser.containing_scope[depth].tape_index != 0);
    if (root_tape_misplaced) {
      log_error("IMPOSSIBLE: root scope tape index did not start at 0!");
      parser.error = TAPE_ERROR;
      return parser.error;
    }

    return SUCCESS;
  }
};
} // namespace stage2
/************
 * Streaming stage 2: parses one JSON document into _doc, starting at the
 * structural index saved in this parser (streaming_structural_parser is
 * constructed from next_structural_index) and saving the updated index again
 * in parser.finish().
 *
 * The JSON is parsed to a tape, see the accompanying tape.md file
 * for documentation.
 *
 * Returns the error from parser.start() (EMPTY when no structurals are left,
 * DEPTH_ERROR when the root scope cannot be pushed), the result of
 * parser.finish() on the normal path, or parser.error() on the error path.
 *
 * The body is a goto-driven state machine. FAIL_IF(expr) jumps to the `error`
 * label when expr is truthy. GOTO and CONTINUE are macros defined outside this
 * view; presumably they transfer control to the state address they are given.
 *
 * NOTE(review): unlike the non-streaming state machine, the '[' case here does
 * not verify that the last structural is ']' before entering array_begin
 * (see simdjson issue #906) -- confirm whether that is intentional.
 ***********/
WARN_UNUSED error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept {
// Remember which document receives the parsed output.
this->doc = &_doc;
// Table of state targets; with SIMDJSON_USE_COMPUTED_GOTO these are the addresses of the labels below.
static constexpr stage2::unified_machine_addresses addresses = INIT_ADDRESSES();
stage2::streaming_structural_parser parser(*this);
// start() has already advanced to the first character when it succeeds.
error_code result = parser.start(addresses.finish);
if (result) { return result; }
//
// Read first value
//
switch (parser.structurals.current_char()) {
case '{':
FAIL_IF( parser.start_object(addresses.finish) );
goto object_begin;
case '[':
FAIL_IF( parser.start_array(addresses.finish) );
goto array_begin;
case '"':
FAIL_IF( parser.parse_string() );
goto finish;
case 't': case 'f': case 'n':
FAIL_IF( parser.parse_single_atom() );
goto finish;
case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':
// Root-level number: parsed through with_space_terminated_copy (helper defined elsewhere).
// The second argument to parse_number is false here and true in the '-' case below.
FAIL_IF(
parser.structurals.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) {
return parser.parse_number(&copy[idx], false);
})
);
goto finish;
case '-':
FAIL_IF(
parser.structurals.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) {
return parser.parse_number(&copy[idx], true);
})
);
goto finish;
default:
parser.log_error("Document starts with a non-value character");
goto error;
}
//
// Object parser states
//
// object_begin: just after '{'; expects either a key string or an immediate '}'.
object_begin:
switch (parser.advance_char()) {
case '"': {
// parse_string(true): the argument presumably marks the string as a key -- confirm in structural_parser.
FAIL_IF( parser.parse_string(true) );
goto object_key_parser;
}
case '}':
parser.end_object();
goto scope_end;
default:
parser.log_error("Object does not start with a key");
goto error;
}
// object_key_parser: a key was just parsed; require ':' and then parse the field's value.
object_key_parser:
if (parser.advance_char() != ':' ) { parser.log_error("Missing colon after key in object"); goto error; }
parser.increment_count();
// Step onto the first character of the value before dispatching on it.
parser.advance_char();
GOTO( parser.parse_value(addresses, addresses.object_continue) );
// object_continue: a field value was completed; expects ',' followed by another key, or '}'.
object_continue:
switch (parser.advance_char()) {
case ',':
if (parser.advance_char() != '"' ) { parser.log_error("Key string missing at beginning of field in object"); goto error; }
FAIL_IF( parser.parse_string(true) );
goto object_key_parser;
case '}':
parser.end_object();
goto scope_end;
default:
parser.log_error("No comma between object fields");
goto error;
}
// scope_end: an object or array was closed; continue at the return address stored for the current depth.
scope_end:
CONTINUE( parser.parser.ret_address[parser.depth] );
//
// Array parser states
//
// array_begin: just after '['; an immediate ']' closes an empty array.
array_begin:
if (parser.advance_char() == ']') {
parser.end_array();
goto scope_end;
}
parser.increment_count();
main_array_switch:
/* Every path into this label has already called advance_char() (at
 * array_begin, or after a ',' in array_continue), so the current character
 * is the start of the next value. */
GOTO( parser.parse_value(addresses, addresses.array_continue) );
// array_continue: an element was completed; expects ',' and another value, or ']'.
array_continue:
switch (parser.advance_char()) {
case ',':
parser.increment_count();
parser.advance_char();
goto main_array_switch;
case ']':
parser.end_array();
goto scope_end;
default:
parser.log_error("Missing comma between array values");
goto error;
}
// finish: the root value is complete; finish() validates the state and saves next_structural_index.
finish:
return parser.finish();
// error: common exit for every failure detected after start() succeeded.
error:
return parser.error();
}

View File

@ -4,6 +4,7 @@
// "simdjson/stage2.h" (this simplifies amalgamation)
namespace stage2 {
namespace { // Make everything here private
#ifdef SIMDJSON_USE_COMPUTED_GOTO
#define INIT_ADDRESSES() { &&array_begin, &&array_continue, &&error, &&finish, &&object_begin, &&object_continue }
@ -75,7 +76,12 @@ struct structural_parser {
uint8_t *current_string_buf_loc{};
uint32_t depth;
really_inline structural_parser(dom_parser_implementation &_parser, uint32_t next_structural = 0) : structurals(_parser.buf, _parser.len, _parser.structural_indexes.get(), next_structural), parser{_parser}, depth{0} {}
// Builds a parser over _parser's buffer (_parser.buf, _parser.len) and its
// structural index array. next_structural is forwarded to the structural
// iterator (presumably the index of the first structural to read -- confirm in
// structural_iterator). The depth starts at 0.
// Non-streaming callers pass an explicit 0 as next_structural, which enables optimizations.
really_inline structural_parser(dom_parser_implementation &_parser, uint32_t next_structural)
: structurals(_parser.buf, _parser.len, _parser.structural_indexes.get(), next_structural),
parser{_parser},
depth{0} {
}
WARN_UNUSED really_inline bool start_scope(ret_address_t continue_state) {
parser.containing_scope[depth].tape_index = parser.current_loc;
@ -264,20 +270,13 @@ struct structural_parser {
}
WARN_UNUSED really_inline error_code finish() {
// the string might not be NULL terminated.
if ( !structurals.at_end(parser.n_structural_indexes) ) {
log_error("More than one JSON value at the root of the document, or extra characters at the end of the JSON!");
return parser.error = TAPE_ERROR;
}
end_document();
parser.next_structural_index = uint32_t(structurals.next_structural_index());
if (depth != 0) {
log_error("Unclosed objects or arrays!");
return parser.error = TAPE_ERROR;
}
if (parser.containing_scope[depth].tape_index != 0) {
log_error("IMPOSSIBLE: root scope tape index did not start at 0!");
return parser.error = TAPE_ERROR;
}
return SUCCESS;
}
@ -328,12 +327,14 @@ struct structural_parser {
parser.error = UNINITIALIZED;
}
WARN_UNUSED really_inline error_code start(size_t len, ret_address_t finish_state) {
WARN_UNUSED really_inline error_code start(ret_address_t finish_state) {
// If there are no structurals left, return EMPTY
if (structurals.at_end(parser.n_structural_indexes)) {
return parser.error = EMPTY;
}
log_start();
init();
if (len > parser.capacity()) {
return parser.error = CAPACITY;
}
// Advance to the first character as soon as possible
structurals.advance_char();
// Push the root scope (there is always at least one scope)
@ -368,23 +369,18 @@ struct structural_parser {
// Sends an "ERROR" line to the logger with an empty title prefix and `error` as the detail text.
really_inline void log_error(const char *error) {
  const char *const no_prefix = "";
  logger::log_line(structurals, no_prefix, "ERROR", error);
}
};
}; // struct structural_parser
// Redefine FAIL_IF to use goto since it'll be used inside the function now
#undef FAIL_IF
#define FAIL_IF(EXPR) { if (EXPR) { goto error; } }
} // namespace stage2
/************
* The JSON is parsed to a tape, see the accompanying tape.md file
* for documentation.
***********/
WARN_UNUSED error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept {
this->doc = &_doc;
template<bool STREAMING>
WARN_UNUSED static error_code parse_structurals(dom_parser_implementation &dom_parser, dom::document &doc) noexcept {
dom_parser.doc = &doc;
static constexpr stage2::unified_machine_addresses addresses = INIT_ADDRESSES();
stage2::structural_parser parser(*this);
error_code result = parser.start(len, addresses.finish);
stage2::structural_parser parser(dom_parser, STREAMING ? dom_parser.next_structural_index : 0);
error_code result = parser.start(addresses.finish);
if (result) { return result; }
//
@ -398,7 +394,7 @@ WARN_UNUSED error_code dom_parser_implementation::stage2(dom::document &_doc) no
FAIL_IF( parser.start_array(addresses.finish) );
// Make sure the outer array is closed before continuing; otherwise, there are ways we could get
// into memory corruption. See https://github.com/simdjson/simdjson/issues/906
if (buf[structural_indexes[n_structural_indexes - 1]] != ']') {
if (parser.structurals.buf[parser.structurals.structural_indexes[dom_parser.n_structural_indexes - 1]] != ']') {
goto error;
}
goto array_begin;
@ -504,3 +500,31 @@ finish:
error:
return parser.error();
}
} // namespace {}
} // namespace stage2
/************
 * Non-streaming stage 2: runs the shared state machine
 * (parse_structurals<false>) and then requires that every structural index
 * was consumed.
 *
 * The JSON is parsed to a tape, see the accompanying tape.md file
 * for documentation.
 *
 * Returns the state machine's error if it fails, TAPE_ERROR (also stored in
 * `error`) if structurals remain after the root value, SUCCESS otherwise.
 ***********/
WARN_UNUSED error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept {
  const error_code parse_status = stage2::parse_structurals<false>(*this, _doc);
  if (parse_status) { return parse_status; }

  // Everything was consumed: exactly one root value and nothing after it.
  if ( next_structural_index == n_structural_indexes ) { return SUCCESS; }

  logger::log_string("More than one JSON value at the root of the document, or extra characters at the end of the JSON!");
  error = TAPE_ERROR;
  return error;
}
/************
 * Streaming stage 2: runs the shared state machine in streaming mode
 * (parse_structurals<true>), which starts from this parser's
 * next_structural_index instead of 0. Unlike stage2(), it does not require
 * that all structural indexes were consumed.
 *
 * The JSON is parsed to a tape, see the accompanying tape.md file
 * for documentation.
 ***********/
WARN_UNUSED error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept {
  const error_code parse_status = stage2::parse_structurals<true>(*this, _doc);
  return parse_status;
}

View File

@ -97,7 +97,6 @@ namespace haswell {
#include "generic/stage2/atomparsing.h"
#include "generic/stage2/structural_iterator.h"
#include "generic/stage2/structural_parser.h"
#include "generic/stage2/streaming_structural_parser.h"
WARN_UNUSED error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept {
error_code err = stage1(_buf, _len, false);

View File

@ -98,7 +98,6 @@ namespace westmere {
#include "generic/stage2/atomparsing.h"
#include "generic/stage2/structural_iterator.h"
#include "generic/stage2/structural_parser.h"
#include "generic/stage2/streaming_structural_parser.h"
WARN_UNUSED error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept {
error_code err = stage1(_buf, _len, false);