Merge pull request #918 from simdjson/jkeiser/remove-iterator-variables
[3/4] Remove unneeded structural_iterator variables
This commit is contained in:
commit
fe69928764
|
@ -61,10 +61,10 @@ namespace logger {
|
|||
printf(" ");
|
||||
}
|
||||
printf("| %c ", printable_char(structurals.at_beginning() ? ' ' : structurals.current_char()));
|
||||
printf("| %c ", printable_char(structurals.peek_char()));
|
||||
printf("| %5u ", structurals.structural_indexes[structurals.next_structural]);
|
||||
printf("| %c ", printable_char(structurals.peek_next_char()));
|
||||
printf("| %5u ", structurals.parser.structural_indexes[*(structurals.current_structural+1)]);
|
||||
printf("| %-*s ", LOG_DETAIL_LEN, detail);
|
||||
printf("| %*zu ", LOG_INDEX_LEN, structurals.idx);
|
||||
printf("| %*u ", LOG_INDEX_LEN, *structurals.current_structural);
|
||||
printf("|\n");
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,168 @@
|
|||
namespace stage2 {
|
||||
|
||||
struct streaming_structural_parser: structural_parser {
|
||||
really_inline streaming_structural_parser(dom_parser_implementation &_parser) : structural_parser(_parser, _parser.next_structural_index) {}
|
||||
|
||||
// override to add streaming
|
||||
WARN_UNUSED really_inline error_code start(ret_address_t finish_parser) {
|
||||
// If there are no structurals left, return EMPTY
|
||||
if (structurals.at_end(parser.n_structural_indexes)) {
|
||||
return parser.error = EMPTY;
|
||||
}
|
||||
|
||||
log_start();
|
||||
init();
|
||||
|
||||
// Capacity ain't no thang for streaming, so we don't check it.
|
||||
// Advance to the first character as soon as possible
|
||||
advance_char();
|
||||
// Push the root scope (there is always at least one scope)
|
||||
if (start_document(finish_parser)) {
|
||||
return parser.error = DEPTH_ERROR;
|
||||
}
|
||||
return SUCCESS;
|
||||
}
|
||||
|
||||
// override to add streaming
|
||||
WARN_UNUSED really_inline error_code finish() {
|
||||
if ( structurals.past_end(parser.n_structural_indexes) ) {
|
||||
log_error("IMPOSSIBLE: past the end of the JSON!");
|
||||
return parser.error = TAPE_ERROR;
|
||||
}
|
||||
end_document();
|
||||
parser.next_structural_index = uint32_t(structurals.next_structural_index());
|
||||
if (depth != 0) {
|
||||
log_error("Unclosed objects or arrays!");
|
||||
return parser.error = TAPE_ERROR;
|
||||
}
|
||||
if (parser.containing_scope[depth].tape_index != 0) {
|
||||
log_error("IMPOSSIBLE: root scope tape index did not start at 0!");
|
||||
return parser.error = TAPE_ERROR;
|
||||
}
|
||||
return SUCCESS;
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace stage2
|
||||
|
||||
/************
|
||||
* The JSON is parsed to a tape, see the accompanying tape.md file
|
||||
* for documentation.
|
||||
***********/
|
||||
WARN_UNUSED error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept {
|
||||
this->doc = &_doc;
|
||||
static constexpr stage2::unified_machine_addresses addresses = INIT_ADDRESSES();
|
||||
stage2::streaming_structural_parser parser(*this);
|
||||
error_code result = parser.start(addresses.finish);
|
||||
if (result) { return result; }
|
||||
//
|
||||
// Read first value
|
||||
//
|
||||
switch (parser.structurals.current_char()) {
|
||||
case '{':
|
||||
FAIL_IF( parser.start_object(addresses.finish) );
|
||||
goto object_begin;
|
||||
case '[':
|
||||
FAIL_IF( parser.start_array(addresses.finish) );
|
||||
goto array_begin;
|
||||
case '"':
|
||||
FAIL_IF( parser.parse_string() );
|
||||
goto finish;
|
||||
case 't': case 'f': case 'n':
|
||||
FAIL_IF( parser.parse_single_atom() );
|
||||
goto finish;
|
||||
case '0': case '1': case '2': case '3': case '4':
|
||||
case '5': case '6': case '7': case '8': case '9':
|
||||
FAIL_IF(
|
||||
parser.structurals.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) {
|
||||
return parser.parse_number(&copy[idx], false);
|
||||
})
|
||||
);
|
||||
goto finish;
|
||||
case '-':
|
||||
FAIL_IF(
|
||||
parser.structurals.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) {
|
||||
return parser.parse_number(&copy[idx], true);
|
||||
})
|
||||
);
|
||||
goto finish;
|
||||
default:
|
||||
parser.log_error("Document starts with a non-value character");
|
||||
goto error;
|
||||
}
|
||||
|
||||
//
|
||||
// Object parser parsers
|
||||
//
|
||||
object_begin:
|
||||
switch (parser.advance_char()) {
|
||||
case '"': {
|
||||
FAIL_IF( parser.parse_string(true) );
|
||||
goto object_key_parser;
|
||||
}
|
||||
case '}':
|
||||
parser.end_object();
|
||||
goto scope_end;
|
||||
default:
|
||||
parser.log_error("Object does not start with a key");
|
||||
goto error;
|
||||
}
|
||||
|
||||
object_key_parser:
|
||||
if (parser.advance_char() != ':' ) { parser.log_error("Missing colon after key in object"); goto error; }
|
||||
parser.increment_count();
|
||||
parser.advance_char();
|
||||
GOTO( parser.parse_value(addresses, addresses.object_continue) );
|
||||
|
||||
object_continue:
|
||||
switch (parser.advance_char()) {
|
||||
case ',':
|
||||
if (parser.advance_char() != '"' ) { parser.log_error("Key string missing at beginning of field in object"); goto error; }
|
||||
FAIL_IF( parser.parse_string(true) );
|
||||
goto object_key_parser;
|
||||
case '}':
|
||||
parser.end_object();
|
||||
goto scope_end;
|
||||
default:
|
||||
parser.log_error("No comma between object fields");
|
||||
goto error;
|
||||
}
|
||||
|
||||
scope_end:
|
||||
CONTINUE( parser.parser.ret_address[parser.depth] );
|
||||
|
||||
//
|
||||
// Array parser parsers
|
||||
//
|
||||
array_begin:
|
||||
if (parser.advance_char() == ']') {
|
||||
parser.end_array();
|
||||
goto scope_end;
|
||||
}
|
||||
parser.increment_count();
|
||||
|
||||
main_array_switch:
|
||||
/* we call update char on all paths in, so we can peek at parser.c
|
||||
* on paths that can accept a close square brace (post-, and at start) */
|
||||
GOTO( parser.parse_value(addresses, addresses.array_continue) );
|
||||
|
||||
array_continue:
|
||||
switch (parser.advance_char()) {
|
||||
case ',':
|
||||
parser.increment_count();
|
||||
parser.advance_char();
|
||||
goto main_array_switch;
|
||||
case ']':
|
||||
parser.end_array();
|
||||
goto scope_end;
|
||||
default:
|
||||
parser.log_error("Missing comma between array values");
|
||||
goto error;
|
||||
}
|
||||
|
||||
finish:
|
||||
return parser.finish();
|
||||
|
||||
error:
|
||||
return parser.error();
|
||||
}
|
|
@ -2,29 +2,34 @@ namespace stage2 {
|
|||
|
||||
class structural_iterator {
|
||||
public:
|
||||
really_inline structural_iterator(const uint8_t* _buf, size_t _len, const uint32_t *_structural_indexes, size_t next_structural_index)
|
||||
: buf{_buf},
|
||||
len{_len},
|
||||
structural_indexes{_structural_indexes},
|
||||
next_structural{next_structural_index}
|
||||
{}
|
||||
really_inline char advance_char() {
|
||||
idx = structural_indexes[next_structural];
|
||||
next_structural++;
|
||||
c = *current();
|
||||
return c;
|
||||
}
|
||||
really_inline char current_char() {
|
||||
return c;
|
||||
}
|
||||
really_inline char peek_char() {
|
||||
return buf[structural_indexes[next_structural]];
|
||||
const uint8_t* const buf;
|
||||
uint32_t *current_structural;
|
||||
dom_parser_implementation &parser;
|
||||
|
||||
// Start a structural
|
||||
really_inline structural_iterator(dom_parser_implementation &_parser, size_t start_structural_index)
|
||||
: buf{_parser.buf},
|
||||
current_structural{&_parser.structural_indexes[start_structural_index]},
|
||||
parser{_parser} {
|
||||
}
|
||||
// Get the buffer position of the current structural character
|
||||
really_inline const uint8_t* current() {
|
||||
return &buf[idx];
|
||||
return &buf[*current_structural];
|
||||
}
|
||||
// Get the current structural character
|
||||
really_inline char current_char() {
|
||||
return buf[*current_structural];
|
||||
}
|
||||
// Get the next structural character without advancing
|
||||
really_inline char peek_next_char() {
|
||||
return buf[*(current_structural+1)];
|
||||
}
|
||||
really_inline char advance_char() {
|
||||
current_structural++;
|
||||
return buf[*current_structural];
|
||||
}
|
||||
really_inline size_t remaining_len() {
|
||||
return len - idx;
|
||||
return parser.len - *current_structural;
|
||||
}
|
||||
template<typename F>
|
||||
really_inline bool with_space_terminated_copy(const F& f) {
|
||||
|
@ -41,35 +46,25 @@ public:
|
|||
* practice unless you are in the strange scenario where you have many JSON
|
||||
* documents made of single atoms.
|
||||
*/
|
||||
char *copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING));
|
||||
char *copy = static_cast<char *>(malloc(parser.len + SIMDJSON_PADDING));
|
||||
if (copy == nullptr) {
|
||||
return true;
|
||||
}
|
||||
memcpy(copy, buf, len);
|
||||
memset(copy + len, ' ', SIMDJSON_PADDING);
|
||||
bool result = f(reinterpret_cast<const uint8_t*>(copy), idx);
|
||||
memcpy(copy, buf, parser.len);
|
||||
memset(copy + parser.len, ' ', SIMDJSON_PADDING);
|
||||
bool result = f(reinterpret_cast<const uint8_t*>(copy), *current_structural);
|
||||
free(copy);
|
||||
return result;
|
||||
}
|
||||
really_inline bool past_end(uint32_t n_structural_indexes) {
|
||||
return next_structural > n_structural_indexes;
|
||||
return current_structural >= &parser.structural_indexes[n_structural_indexes];
|
||||
}
|
||||
really_inline bool at_end(uint32_t n_structural_indexes) {
|
||||
return next_structural == n_structural_indexes;
|
||||
return current_structural == &parser.structural_indexes[n_structural_indexes];
|
||||
}
|
||||
really_inline bool at_beginning() {
|
||||
return next_structural == 0;
|
||||
return current_structural == parser.structural_indexes.get();
|
||||
}
|
||||
really_inline size_t next_structural_index() {
|
||||
return next_structural;
|
||||
}
|
||||
|
||||
const uint8_t* const buf;
|
||||
const size_t len;
|
||||
const uint32_t* const structural_indexes;
|
||||
size_t next_structural; // next structural index
|
||||
size_t idx{0}; // location of the structural character in the input (buf)
|
||||
uint8_t c{0}; // used to track the (structural) character we are looking at
|
||||
};
|
||||
|
||||
} // namespace stage2
|
||||
|
|
|
@ -69,17 +69,15 @@ struct number_writer {
|
|||
}
|
||||
}; // struct number_writer
|
||||
|
||||
struct structural_parser {
|
||||
structural_iterator structurals;
|
||||
dom_parser_implementation &parser;
|
||||
struct structural_parser : structural_iterator {
|
||||
/** Next write location in the string buf for stage 2 parsing */
|
||||
uint8_t *current_string_buf_loc{};
|
||||
/** Current depth (nested objects and arrays) */
|
||||
uint32_t depth;
|
||||
|
||||
// For non-streaming, to pass an explicit 0 as next_structural, which enables optimizations
|
||||
really_inline structural_parser(dom_parser_implementation &_parser, uint32_t next_structural)
|
||||
: structurals(_parser.buf, _parser.len, _parser.structural_indexes.get(), next_structural),
|
||||
parser{_parser},
|
||||
really_inline structural_parser(dom_parser_implementation &_parser, uint32_t start_structural_index)
|
||||
: structural_iterator(_parser, start_structural_index),
|
||||
depth{0} {
|
||||
}
|
||||
|
||||
|
@ -174,7 +172,7 @@ struct structural_parser {
|
|||
WARN_UNUSED really_inline bool parse_string(bool key = false) {
|
||||
log_value(key ? "key" : "string");
|
||||
uint8_t *dst = on_start_string();
|
||||
dst = stringparsing::parse_string(structurals.current(), dst);
|
||||
dst = stringparsing::parse_string(current(), dst);
|
||||
if (dst == nullptr) {
|
||||
log_error("Invalid escape in string");
|
||||
return true;
|
||||
|
@ -191,64 +189,28 @@ struct structural_parser {
|
|||
return !succeeded;
|
||||
}
|
||||
WARN_UNUSED really_inline bool parse_number(bool found_minus) {
|
||||
return parse_number(structurals.current(), found_minus);
|
||||
}
|
||||
|
||||
WARN_UNUSED really_inline bool parse_atom() {
|
||||
switch (structurals.current_char()) {
|
||||
case 't':
|
||||
log_value("true");
|
||||
if (!atomparsing::is_valid_true_atom(structurals.current())) { return true; }
|
||||
append_tape(0, internal::tape_type::TRUE_VALUE);
|
||||
break;
|
||||
case 'f':
|
||||
log_value("false");
|
||||
if (!atomparsing::is_valid_false_atom(structurals.current())) { return true; }
|
||||
append_tape(0, internal::tape_type::FALSE_VALUE);
|
||||
break;
|
||||
case 'n':
|
||||
log_value("null");
|
||||
if (!atomparsing::is_valid_null_atom(structurals.current())) { return true; }
|
||||
append_tape(0, internal::tape_type::NULL_VALUE);
|
||||
break;
|
||||
default:
|
||||
log_error("IMPOSSIBLE: unrecognized parse_atom structural character");
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
WARN_UNUSED really_inline bool parse_single_atom() {
|
||||
switch (structurals.current_char()) {
|
||||
case 't':
|
||||
log_value("true");
|
||||
if (!atomparsing::is_valid_true_atom(structurals.current(), structurals.remaining_len())) { return true; }
|
||||
append_tape(0, internal::tape_type::TRUE_VALUE);
|
||||
break;
|
||||
case 'f':
|
||||
log_value("false");
|
||||
if (!atomparsing::is_valid_false_atom(structurals.current(), structurals.remaining_len())) { return true; }
|
||||
append_tape(0, internal::tape_type::FALSE_VALUE);
|
||||
break;
|
||||
case 'n':
|
||||
log_value("null");
|
||||
if (!atomparsing::is_valid_null_atom(structurals.current(), structurals.remaining_len())) { return true; }
|
||||
append_tape(0, internal::tape_type::NULL_VALUE);
|
||||
break;
|
||||
default:
|
||||
log_error("IMPOSSIBLE: unrecognized parse_atom structural character");
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
return parse_number(current(), found_minus);
|
||||
}
|
||||
|
||||
WARN_UNUSED really_inline ret_address_t parse_value(const unified_machine_addresses &addresses, ret_address_t continue_state) {
|
||||
switch (structurals.current_char()) {
|
||||
switch (advance_char()) {
|
||||
case '"':
|
||||
FAIL_IF( parse_string() );
|
||||
return continue_state;
|
||||
case 't': case 'f': case 'n':
|
||||
FAIL_IF( parse_atom() );
|
||||
case 't':
|
||||
log_value("true");
|
||||
FAIL_IF( !atomparsing::is_valid_true_atom(current()) );
|
||||
append_tape(0, internal::tape_type::TRUE_VALUE);
|
||||
return continue_state;
|
||||
case 'f':
|
||||
log_value("false");
|
||||
FAIL_IF( !atomparsing::is_valid_false_atom(current()) );
|
||||
append_tape(0, internal::tape_type::FALSE_VALUE);
|
||||
return continue_state;
|
||||
case 'n':
|
||||
log_value("null");
|
||||
FAIL_IF( !atomparsing::is_valid_null_atom(current()) );
|
||||
append_tape(0, internal::tape_type::NULL_VALUE);
|
||||
return continue_state;
|
||||
case '0': case '1': case '2': case '3': case '4':
|
||||
case '5': case '6': case '7': case '8': case '9':
|
||||
|
@ -271,7 +233,7 @@ struct structural_parser {
|
|||
|
||||
WARN_UNUSED really_inline error_code finish() {
|
||||
end_document();
|
||||
parser.next_structural_index = uint32_t(structurals.next_structural_index());
|
||||
parser.next_structural_index = uint32_t(current_structural + 1 - &parser.structural_indexes[0]);
|
||||
|
||||
if (depth != 0) {
|
||||
log_error("Unclosed objects or arrays!");
|
||||
|
@ -295,7 +257,7 @@ struct structural_parser {
|
|||
if (depth >= parser.max_depth()) {
|
||||
return parser.error = DEPTH_ERROR;
|
||||
}
|
||||
switch (structurals.current_char()) {
|
||||
switch (current_char()) {
|
||||
case '"':
|
||||
return parser.error = STRING_ERROR;
|
||||
case '0':
|
||||
|
@ -322,6 +284,7 @@ struct structural_parser {
|
|||
}
|
||||
|
||||
really_inline void init() {
|
||||
log_start();
|
||||
current_string_buf_loc = parser.doc->string_buf.get();
|
||||
parser.current_loc = 0;
|
||||
parser.error = UNINITIALIZED;
|
||||
|
@ -329,14 +292,11 @@ struct structural_parser {
|
|||
|
||||
WARN_UNUSED really_inline error_code start(ret_address_t finish_state) {
|
||||
// If there are no structurals left, return EMPTY
|
||||
if (structurals.at_end(parser.n_structural_indexes)) {
|
||||
if (at_end(parser.n_structural_indexes)) {
|
||||
return parser.error = EMPTY;
|
||||
}
|
||||
|
||||
log_start();
|
||||
init();
|
||||
// Advance to the first character as soon as possible
|
||||
structurals.advance_char();
|
||||
// Push the root scope (there is always at least one scope)
|
||||
if (start_document(finish_state)) {
|
||||
return parser.error = DEPTH_ERROR;
|
||||
|
@ -344,12 +304,8 @@ struct structural_parser {
|
|||
return SUCCESS;
|
||||
}
|
||||
|
||||
really_inline char advance_char() {
|
||||
return structurals.advance_char();
|
||||
}
|
||||
|
||||
really_inline void log_value(const char *type) {
|
||||
logger::log_line(structurals, "", type, "");
|
||||
logger::log_line(*this, "", type, "");
|
||||
}
|
||||
|
||||
static really_inline void log_start() {
|
||||
|
@ -357,17 +313,17 @@ struct structural_parser {
|
|||
}
|
||||
|
||||
really_inline void log_start_value(const char *type) {
|
||||
logger::log_line(structurals, "+", type, "");
|
||||
logger::log_line(*this, "+", type, "");
|
||||
if (logger::LOG_ENABLED) { logger::log_depth++; }
|
||||
}
|
||||
|
||||
really_inline void log_end_value(const char *type) {
|
||||
if (logger::LOG_ENABLED) { logger::log_depth--; }
|
||||
logger::log_line(structurals, "-", type, "");
|
||||
logger::log_line(*this, "-", type, "");
|
||||
}
|
||||
|
||||
really_inline void log_error(const char *error) {
|
||||
logger::log_line(structurals, "", "ERROR", error);
|
||||
logger::log_line(*this, "", "ERROR", error);
|
||||
}
|
||||
}; // struct structural_parser
|
||||
|
||||
|
@ -386,7 +342,7 @@ WARN_UNUSED static error_code parse_structurals(dom_parser_implementation &dom_p
|
|||
//
|
||||
// Read first value
|
||||
//
|
||||
switch (parser.structurals.current_char()) {
|
||||
switch (parser.current_char()) {
|
||||
case '{':
|
||||
FAIL_IF( parser.start_object(addresses.finish) );
|
||||
goto object_begin;
|
||||
|
@ -394,27 +350,41 @@ WARN_UNUSED static error_code parse_structurals(dom_parser_implementation &dom_p
|
|||
FAIL_IF( parser.start_array(addresses.finish) );
|
||||
// Make sure the outer array is closed before continuing; otherwise, there are ways we could get
|
||||
// into memory corruption. See https://github.com/simdjson/simdjson/issues/906
|
||||
if (parser.structurals.buf[parser.structurals.structural_indexes[dom_parser.n_structural_indexes - 1]] != ']') {
|
||||
goto error;
|
||||
if (!STREAMING) {
|
||||
if (parser.buf[dom_parser.structural_indexes[dom_parser.n_structural_indexes - 1]] != ']') {
|
||||
goto error;
|
||||
}
|
||||
}
|
||||
goto array_begin;
|
||||
case '"':
|
||||
FAIL_IF( parser.parse_string() );
|
||||
goto finish;
|
||||
case 't': case 'f': case 'n':
|
||||
FAIL_IF( parser.parse_single_atom() );
|
||||
case 't':
|
||||
parser.log_value("true");
|
||||
FAIL_IF( !atomparsing::is_valid_true_atom(parser.current(), parser.remaining_len()) );
|
||||
parser.append_tape(0, internal::tape_type::TRUE_VALUE);
|
||||
goto finish;
|
||||
case 'f':
|
||||
parser.log_value("false");
|
||||
FAIL_IF( !atomparsing::is_valid_false_atom(parser.current(), parser.remaining_len()) );
|
||||
parser.append_tape(0, internal::tape_type::FALSE_VALUE);
|
||||
goto finish;
|
||||
case 'n':
|
||||
parser.log_value("null");
|
||||
FAIL_IF( !atomparsing::is_valid_null_atom(parser.current(), parser.remaining_len()) );
|
||||
parser.append_tape(0, internal::tape_type::NULL_VALUE);
|
||||
goto finish;
|
||||
case '0': case '1': case '2': case '3': case '4':
|
||||
case '5': case '6': case '7': case '8': case '9':
|
||||
FAIL_IF(
|
||||
parser.structurals.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) {
|
||||
parser.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) {
|
||||
return parser.parse_number(&copy[idx], false);
|
||||
})
|
||||
);
|
||||
goto finish;
|
||||
case '-':
|
||||
FAIL_IF(
|
||||
parser.structurals.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) {
|
||||
parser.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) {
|
||||
return parser.parse_number(&copy[idx], true);
|
||||
})
|
||||
);
|
||||
|
@ -444,7 +414,6 @@ object_begin:
|
|||
|
||||
object_key_state:
|
||||
if (parser.advance_char() != ':' ) { parser.log_error("Missing colon after key in object"); goto error; }
|
||||
parser.advance_char();
|
||||
GOTO( parser.parse_value(addresses, addresses.object_continue) );
|
||||
|
||||
object_continue:
|
||||
|
@ -469,7 +438,8 @@ scope_end:
|
|||
// Array parser states
|
||||
//
|
||||
array_begin:
|
||||
if (parser.advance_char() == ']') {
|
||||
if (parser.peek_next_char() == ']') {
|
||||
parser.advance_char();
|
||||
parser.end_array();
|
||||
goto scope_end;
|
||||
}
|
||||
|
@ -484,7 +454,6 @@ array_continue:
|
|||
switch (parser.advance_char()) {
|
||||
case ',':
|
||||
parser.increment_count();
|
||||
parser.advance_char();
|
||||
goto main_array_switch;
|
||||
case ']':
|
||||
parser.end_array();
|
||||
|
|
Loading…
Reference in New Issue