Merge pull request #900 from simdjson/jkeiser/delay-start-element

Don't write the start element at all until the end
This commit is contained in:
John Keiser 2020-05-21 14:57:54 -07:00 committed by GitHub
commit 219b02c1e5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed file with 27 additions and 23 deletions

View File

@ -51,21 +51,21 @@ struct number_writer {
parser &doc_parser; parser &doc_parser;
really_inline void write_s64(int64_t value) noexcept { really_inline void write_s64(int64_t value) noexcept {
write_tape(0, internal::tape_type::INT64); append_tape(0, internal::tape_type::INT64);
std::memcpy(&doc_parser.doc.tape[doc_parser.current_loc], &value, sizeof(value)); std::memcpy(&doc_parser.doc.tape[doc_parser.current_loc], &value, sizeof(value));
++doc_parser.current_loc; ++doc_parser.current_loc;
} }
really_inline void write_u64(uint64_t value) noexcept { really_inline void write_u64(uint64_t value) noexcept {
write_tape(0, internal::tape_type::UINT64); append_tape(0, internal::tape_type::UINT64);
doc_parser.doc.tape[doc_parser.current_loc++] = value; doc_parser.doc.tape[doc_parser.current_loc++] = value;
} }
really_inline void write_double(double value) noexcept { really_inline void write_double(double value) noexcept {
write_tape(0, internal::tape_type::DOUBLE); append_tape(0, internal::tape_type::DOUBLE);
static_assert(sizeof(value) == sizeof(doc_parser.doc.tape[doc_parser.current_loc]), "mismatch size"); static_assert(sizeof(value) == sizeof(doc_parser.doc.tape[doc_parser.current_loc]), "mismatch size");
memcpy(&doc_parser.doc.tape[doc_parser.current_loc++], &value, sizeof(double)); memcpy(&doc_parser.doc.tape[doc_parser.current_loc++], &value, sizeof(double));
// doc.tape[doc.current_loc++] = *((uint64_t *)&d); // doc.tape[doc.current_loc++] = *((uint64_t *)&d);
} }
really_inline void write_tape(uint64_t val, internal::tape_type t) noexcept { really_inline void append_tape(uint64_t val, internal::tape_type t) noexcept {
doc_parser.doc.tape[doc_parser.current_loc++] = val | ((uint64_t(char(t))) << 56); doc_parser.doc.tape[doc_parser.current_loc++] = val | ((uint64_t(char(t))) << 56);
} }
}; // struct number_writer }; // struct number_writer
@ -84,10 +84,10 @@ struct structural_parser {
uint32_t next_structural = 0 uint32_t next_structural = 0
) : structurals(buf, len, _doc_parser.structural_indexes.get(), next_structural), doc_parser{_doc_parser}, depth{0} {} ) : structurals(buf, len, _doc_parser.structural_indexes.get(), next_structural), doc_parser{_doc_parser}, depth{0} {}
WARN_UNUSED really_inline bool start_scope(internal::tape_type type, ret_address continue_state) { WARN_UNUSED really_inline bool start_scope(ret_address continue_state) {
doc_parser.containing_scope[depth].tape_index = doc_parser.current_loc; doc_parser.containing_scope[depth].tape_index = doc_parser.current_loc;
doc_parser.containing_scope[depth].count = 0; doc_parser.containing_scope[depth].count = 0;
write_tape(0, type); // if the document is correct, this gets rewritten later doc_parser.current_loc++; // We don't actually *write* the start element until the end.
doc_parser.ret_address[depth] = continue_state; doc_parser.ret_address[depth] = continue_state;
depth++; depth++;
bool exceeded_max_depth = depth >= doc_parser.max_depth(); bool exceeded_max_depth = depth >= doc_parser.max_depth();
@ -97,51 +97,55 @@ struct structural_parser {
WARN_UNUSED really_inline bool start_document(ret_address continue_state) { WARN_UNUSED really_inline bool start_document(ret_address continue_state) {
log_start_value("document"); log_start_value("document");
return start_scope(internal::tape_type::ROOT, continue_state); return start_scope(continue_state);
} }
WARN_UNUSED really_inline bool start_object(ret_address continue_state) { WARN_UNUSED really_inline bool start_object(ret_address continue_state) {
log_start_value("object"); log_start_value("object");
return start_scope(internal::tape_type::START_OBJECT, continue_state); return start_scope(continue_state);
} }
WARN_UNUSED really_inline bool start_array(ret_address continue_state) { WARN_UNUSED really_inline bool start_array(ret_address continue_state) {
log_start_value("array"); log_start_value("array");
return start_scope(internal::tape_type::START_ARRAY, continue_state); return start_scope(continue_state);
} }
// this function is responsible for annotating the start of the scope // this function is responsible for annotating the start of the scope
really_inline void end_scope(internal::tape_type type) noexcept { really_inline void end_scope(internal::tape_type start, internal::tape_type end) noexcept {
depth--; depth--;
// write our doc.tape location to the header scope // write our doc.tape location to the header scope
// The root scope gets written *at* the previous location. // The root scope gets written *at* the previous location.
write_tape(doc_parser.containing_scope[depth].tape_index, type); append_tape(doc_parser.containing_scope[depth].tape_index, end);
// count can overflow if it exceeds 24 bits... so we saturate // count can overflow if it exceeds 24 bits... so we saturate
// the convention being that a cnt of 0xffffff or more is undetermined in value (>= 0xffffff). // the convention being that a cnt of 0xffffff or more is undetermined in value (>= 0xffffff).
const uint32_t start_tape_index = doc_parser.containing_scope[depth].tape_index; const uint32_t start_tape_index = doc_parser.containing_scope[depth].tape_index;
const uint32_t count = doc_parser.containing_scope[depth].count; const uint32_t count = doc_parser.containing_scope[depth].count;
const uint32_t cntsat = count > 0xFFFFFF ? 0xFFFFFF : count; const uint32_t cntsat = count > 0xFFFFFF ? 0xFFFFFF : count;
// This is a load and an OR. It would be possible to just write once at doc.tape[d.tape_index] // This is a load and an OR. It would be possible to just write once at doc.tape[d.tape_index]
doc_parser.doc.tape[start_tape_index] |= doc_parser.current_loc | (uint64_t(cntsat) << 32); write_tape(start_tape_index, doc_parser.current_loc | (uint64_t(cntsat) << 32), start);
} }
really_inline void end_object() { really_inline void end_object() {
log_end_value("object"); log_end_value("object");
end_scope(internal::tape_type::END_OBJECT); end_scope(internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT);
} }
really_inline void end_array() { really_inline void end_array() {
log_end_value("array"); log_end_value("array");
end_scope(internal::tape_type::END_ARRAY); end_scope(internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY);
} }
really_inline void end_document() { really_inline void end_document() {
log_end_value("document"); log_end_value("document");
end_scope(internal::tape_type::ROOT); end_scope(internal::tape_type::ROOT, internal::tape_type::ROOT);
} }
really_inline void write_tape(uint64_t val, internal::tape_type t) noexcept { really_inline void append_tape(uint64_t val, internal::tape_type t) noexcept {
doc_parser.doc.tape[doc_parser.current_loc++] = val | ((uint64_t(char(t))) << 56); doc_parser.doc.tape[doc_parser.current_loc++] = val | ((uint64_t(char(t))) << 56);
} }
really_inline void write_tape(uint32_t loc, uint64_t val, internal::tape_type t) noexcept {
doc_parser.doc.tape[loc] = val | ((uint64_t(char(t))) << 56);
}
// increment_count increments the count of keys in an object or values in an array. // increment_count increments the count of keys in an object or values in an array.
// Note that if you are at the level of the values or elements, the count // Note that if you are at the level of the values or elements, the count
// must be increment in the preceding depth (depth-1) where the array or // must be increment in the preceding depth (depth-1) where the array or
@ -152,7 +156,7 @@ struct structural_parser {
really_inline uint8_t *on_start_string() noexcept { really_inline uint8_t *on_start_string() noexcept {
// we advance the point, accounting for the fact that we have a NULL termination // we advance the point, accounting for the fact that we have a NULL termination
write_tape(current_string_buf_loc - doc_parser.doc.string_buf.get(), internal::tape_type::STRING); append_tape(current_string_buf_loc - doc_parser.doc.string_buf.get(), internal::tape_type::STRING);
return current_string_buf_loc + sizeof(uint32_t); return current_string_buf_loc + sizeof(uint32_t);
} }
@ -196,17 +200,17 @@ struct structural_parser {
case 't': case 't':
log_value("true"); log_value("true");
if (!atomparsing::is_valid_true_atom(structurals.current())) { return true; } if (!atomparsing::is_valid_true_atom(structurals.current())) { return true; }
write_tape(0, internal::tape_type::TRUE_VALUE); append_tape(0, internal::tape_type::TRUE_VALUE);
break; break;
case 'f': case 'f':
log_value("false"); log_value("false");
if (!atomparsing::is_valid_false_atom(structurals.current())) { return true; } if (!atomparsing::is_valid_false_atom(structurals.current())) { return true; }
write_tape(0, internal::tape_type::FALSE_VALUE); append_tape(0, internal::tape_type::FALSE_VALUE);
break; break;
case 'n': case 'n':
log_value("null"); log_value("null");
if (!atomparsing::is_valid_null_atom(structurals.current())) { return true; } if (!atomparsing::is_valid_null_atom(structurals.current())) { return true; }
write_tape(0, internal::tape_type::NULL_VALUE); append_tape(0, internal::tape_type::NULL_VALUE);
break; break;
default: default:
log_error("IMPOSSIBLE: unrecognized parse_atom structural character"); log_error("IMPOSSIBLE: unrecognized parse_atom structural character");
@ -220,17 +224,17 @@ struct structural_parser {
case 't': case 't':
log_value("true"); log_value("true");
if (!atomparsing::is_valid_true_atom(structurals.current(), structurals.remaining_len())) { return true; } if (!atomparsing::is_valid_true_atom(structurals.current(), structurals.remaining_len())) { return true; }
write_tape(0, internal::tape_type::TRUE_VALUE); append_tape(0, internal::tape_type::TRUE_VALUE);
break; break;
case 'f': case 'f':
log_value("false"); log_value("false");
if (!atomparsing::is_valid_false_atom(structurals.current(), structurals.remaining_len())) { return true; } if (!atomparsing::is_valid_false_atom(structurals.current(), structurals.remaining_len())) { return true; }
write_tape(0, internal::tape_type::FALSE_VALUE); append_tape(0, internal::tape_type::FALSE_VALUE);
break; break;
case 'n': case 'n':
log_value("null"); log_value("null");
if (!atomparsing::is_valid_null_atom(structurals.current(), structurals.remaining_len())) { return true; } if (!atomparsing::is_valid_null_atom(structurals.current(), structurals.remaining_len())) { return true; }
write_tape(0, internal::tape_type::NULL_VALUE); append_tape(0, internal::tape_type::NULL_VALUE);
break; break;
default: default:
log_error("IMPOSSIBLE: unrecognized parse_atom structural character"); log_error("IMPOSSIBLE: unrecognized parse_atom structural character");