diff --git a/singleheader/amalgamate_demo.cpp b/singleheader/amalgamate_demo.cpp index a7cfbf8f..37a54b13 100644 --- a/singleheader/amalgamate_demo.cpp +++ b/singleheader/amalgamate_demo.cpp @@ -1,35 +1,70 @@ -#include -#include "simdjson.h" #include "simdjson.cpp" +#include "simdjson.h" +#include int main(int argc, char *argv[]) { - if(argc < 2) { - std::cerr << "Please specify at least one file name. " << std::endl; + if (argc < 2) { + std::cerr << "Please specify at least one file name and" << std::endl; + std::cerr << "up to two files." << std::endl; + std::cerr << "The first file should be a JSON document." << std::endl; + std::cerr << "The secod file should container many JSON documents." + << std::endl; + std::cerr << "Try the test files: jsonexamples/twitter.json " + "jsonexamples/amazon_cellphones.ndjson" + << std::endl; return EXIT_FAILURE; } - const char * filename = argv[1]; - simdjson::dom::parser parser; - simdjson::dom::element elem; - auto error = parser.load(filename).get(elem); // do the parsing + const char *filename = argv[1]; + + simdjson::padded_string json; + std::cout << "loading: " << filename << std::endl; + auto error = simdjson::padded_string::load(filename).get(json); if (error) { - std::cout << "parse failed" << std::endl; + std::cout << "could not load the file " << filename << std::endl; std::cout << "error code: " << error << std::endl; - std::cout << error << std::endl; return EXIT_FAILURE; } else { - std::cout << "parse valid: " << elem << std::endl; + std::cout << "loaded: " << json.size() << " bytes." 
<< std::endl; } - if(argc == 2) { + simdjson::ondemand::parser parser; + simdjson::ondemand::document doc; + error = parser.iterate(json).get(doc); + if (error) { + std::cout << error << std::endl; + return EXIT_FAILURE; + } + simdjson::ondemand::json_type type; + error = doc.type().get(type); + if (error) { + std::cout << error << std::endl; + return EXIT_FAILURE; + } + std::cout << "document has the following type at the root: " << type + << std::endl; + + if (argc == 2) { return EXIT_SUCCESS; } - // parse_many - const char * filename2 = argv[2]; - simdjson::dom::document_stream stream; - error = parser.load_many(filename2).get(stream); + // iterate_many + const char *filename2 = argv[2]; + std::cout << "loading: " << filename2 << std::endl; + simdjson::padded_string json2; + error = simdjson::padded_string::load(filename2).get(json2); + if (error) { + std::cout << "could not load the file " << filename2 << std::endl; + std::cout << "error code: " << error << std::endl; + return EXIT_FAILURE; + } else { + std::cout << "loaded: " << json2.size() << " bytes." << std::endl; + } + simdjson::ondemand::document_stream stream; + error = parser.iterate_many(json2).get(stream); + size_t counter{0}; if (!error) { for (auto result : stream) { error = result.error(); + counter++; } } if (error) { @@ -38,8 +73,8 @@ int main(int argc, char *argv[]) { std::cout << error << std::endl; return EXIT_FAILURE; } else { - std::cout << "parse_many valid" << std::endl; + std::cout << "iterate_many valid" << std::endl; + std::cout << "found " << counter << " documents" << std::endl; } return EXIT_SUCCESS; } - diff --git a/singleheader/simdjson.cpp b/singleheader/simdjson.cpp index 16450c53..b84ac74b 100644 --- a/singleheader/simdjson.cpp +++ b/singleheader/simdjson.cpp @@ -1,4 +1,4 @@ -/* auto-generated on 2021-03-18 11:30:40 -0400. Do not edit! */ +/* auto-generated on 2021-08-06 20:25:05 -0400. Do not edit! 
*/ /* begin file src/simdjson.cpp */ #include "simdjson.h" @@ -9,6 +9,8 @@ SIMDJSON_DISABLE_UNDESIRED_WARNINGS #include #include #include +#include + namespace simdjson { namespace internal { /*! @@ -866,9 +868,9 @@ inline char *format_buffer(char *buf, int len, int decimal_exponent, std::memset(buf + k, '0', static_cast(n) - static_cast(k)); // Make it look like a floating-point number (#362, #378) - buf[n + 0] = '.'; - buf[n + 1] = '0'; - return buf + (static_cast(n) + 2); + // buf[n + 0] = '.'; + // buf[n + 1] = '0'; + return buf + (static_cast(n)); } if (0 < n && n <= max_exp) { @@ -921,7 +923,8 @@ format. Returns an iterator pointing past-the-end of the decimal representation. */ char *to_chars(char *first, const char *last, double value) { static_cast(last); // maybe unused - fix warning - if (value <= -0) { + bool negative = std::signbit(value); + if (negative) { value = -value; *first++ = '-'; } @@ -930,8 +933,10 @@ char *to_chars(char *first, const char *last, double value) { { *first++ = '0'; // Make it look like a floating-point number (#362, #378) - *first++ = '.'; - *first++ = '0'; + if(negative) { + *first++ = '.'; + *first++ = '0'; + } return first; } // Compute v = buffer * 10^decimal_exponent. @@ -1088,6 +1093,86 @@ decimal parse_decimal(const char *&p) noexcept { return answer; } +// This should always succeed since it follows a call to parse_number. +// Will not read at or beyond the "end" pointer. 
+decimal parse_decimal(const char *&p, const char * end) noexcept { + decimal answer; + answer.num_digits = 0; + answer.decimal_point = 0; + answer.truncated = false; + if(p == end) { return answer; } // should never happen + answer.negative = (*p == '-'); + if ((*p == '-') || (*p == '+')) { + ++p; + } + + while ((p != end) && (*p == '0')) { + ++p; + } + while ((p != end) && is_integer(*p)) { + if (answer.num_digits < max_digits) { + answer.digits[answer.num_digits] = uint8_t(*p - '0'); + } + answer.num_digits++; + ++p; + } + if ((p != end) && (*p == '.')) { + ++p; + if(p == end) { return answer; } // should never happen + const char *first_after_period = p; + // if we have not yet encountered a zero, we have to skip it as well + if (answer.num_digits == 0) { + // skip zeros + while (*p == '0') { + ++p; + } + } + while ((p != end) && is_integer(*p)) { + if (answer.num_digits < max_digits) { + answer.digits[answer.num_digits] = uint8_t(*p - '0'); + } + answer.num_digits++; + ++p; + } + answer.decimal_point = int32_t(first_after_period - p); + } + if(answer.num_digits > 0) { + const char *preverse = p - 1; + int32_t trailing_zeros = 0; + while ((*preverse == '0') || (*preverse == '.')) { + if(*preverse == '0') { trailing_zeros++; }; + --preverse; + } + answer.decimal_point += int32_t(answer.num_digits); + answer.num_digits -= uint32_t(trailing_zeros); + } + if(answer.num_digits > max_digits ) { + answer.num_digits = max_digits; + answer.truncated = true; + } + if ((p != end) && (('e' == *p) || ('E' == *p))) { + ++p; + if(p == end) { return answer; } // should never happen + bool neg_exp = false; + if ('-' == *p) { + neg_exp = true; + ++p; + } else if ('+' == *p) { + ++p; + } + int32_t exp_number = 0; // exponential part + while ((p != end) && is_integer(*p)) { + uint8_t digit = uint8_t(*p - '0'); + if (exp_number < 0x10000) { + exp_number = 10 * exp_number + digit; + } + ++p; + } + answer.decimal_point += (neg_exp ? 
-exp_number : exp_number); + } + return answer; +} + namespace { // remove all final zeroes @@ -1427,6 +1512,12 @@ adjusted_mantissa parse_long_mantissa(const char *first) { return compute_float(d); } +template +adjusted_mantissa parse_long_mantissa(const char *first, const char *end) { + decimal d = parse_decimal(first, end); + return compute_float(d); +} + double from_chars(const char *first) noexcept { bool negative = first[0] == '-'; if (negative) { @@ -1443,6 +1534,23 @@ double from_chars(const char *first) noexcept { return value; } + +double from_chars(const char *first, const char *end) noexcept { + bool negative = first[0] == '-'; + if (negative) { + first++; + } + adjusted_mantissa am = parse_long_mantissa>(first, end); + uint64_t word = am.mantissa; + word |= uint64_t(am.power2) + << binary_format::mantissa_explicit_bits(); + word = negative ? word | (uint64_t(1) << binary_format::sign_index()) + : word; + double value; + std::memcpy(&value, &word, sizeof(double)); + return value; +} + } // internal } // simdjson /* end file src/from_chars.cpp */ @@ -1478,7 +1586,8 @@ namespace internal { { UNEXPECTED_ERROR, "Unexpected error, consider reporting this problem as you may have found a bug in simdjson" }, { PARSER_IN_USE, "Cannot parse a new document while a document is still in use." }, { OUT_OF_ORDER_ITERATION, "Objects and arrays can only be iterated when they are first encountered." }, - { INSUFFICIENT_PADDING, "simdjson requires the input JSON string to have at least SIMDJSON_PADDING extra bytes allocated, beyond the string's length." } + { INSUFFICIENT_PADDING, "simdjson requires the input JSON string to have at least SIMDJSON_PADDING extra bytes allocated, beyond the string's length. Consider using the simdjson::padded_string class if needed." }, + { INCOMPLETE_ARRAY_OR_OBJECT, "JSON document ended early in the middle of an object or array." 
} }; // error_messages[] } // namespace internal @@ -2674,8 +2783,10 @@ simdjson_warn_unused error_code implementation::create_dom_parser_implementation ) const noexcept { dst.reset( new (std::nothrow) dom_parser_implementation() ); if (!dst) { return MEMALLOC; } - dst->set_capacity(capacity); - dst->set_max_depth(max_depth); + if (auto err = dst->set_capacity(capacity)) + return err; + if (auto err = dst->set_max_depth(max_depth)) + return err; return SUCCESS; } @@ -2962,7 +3073,6 @@ using namespace simd; } this->prev_incomplete = is_incomplete(input.chunks[simd8x64::NUM_CHUNKS-1]); this->prev_input_block = input.chunks[simd8x64::NUM_CHUNKS-1]; - } } // do not forget to call check_eof! @@ -3505,8 +3615,8 @@ namespace { * * Simply put, we iterate over the structural characters, starting from * the end. We consider that we found the end of a JSON document when the - * first element of the pair is NOT one of these characters: '{' '[' ';' ',' - * and when the second element is NOT one of these characters: '}' '}' ';' ','. + * first element of the pair is NOT one of these characters: '{' '[' ':' ',' + * and when the second element is NOT one of these characters: '}' ']' ':' ','. * * This simple comparison works most of the time, but it does not cover cases * where the batch's structural indexes contain a perfect amount of documents. @@ -3520,7 +3630,8 @@ namespace { * batch. 
*/ simdjson_really_inline uint32_t find_next_document_index(dom_parser_implementation &parser) { - // TODO don't count separately, just figure out depth + // Variant: do not count separately, just figure out depth + if(parser.n_structural_indexes == 0) { return 0; } auto arr_cnt = 0; auto obj_cnt = 0; for (auto i = parser.n_structural_indexes - 1; i > 0; i--) { @@ -3557,6 +3668,25 @@ simdjson_really_inline uint32_t find_next_document_index(dom_parser_implementati // Last document is incomplete; mark the document at i + 1 as the next one return i; } + // If we made it to the end, we want to finish counting to see if we have a full document. + switch (parser.buf[parser.structural_indexes[0]]) { + case '}': + obj_cnt--; + break; + case ']': + arr_cnt--; + break; + case '{': + obj_cnt++; + break; + case '[': + arr_cnt++; + break; + } + if (!arr_cnt && !obj_cnt) { + // We have a complete document. + return parser.n_structural_indexes; + } return 0; } @@ -3587,8 +3717,62 @@ public: // it helps tremendously. if (bits == 0) return; - int cnt = static_cast(count_ones(bits)); +#if defined(SIMDJSON_PREFER_REVERSE_BITS) + /** + * ARM lacks a fast trailing zero instruction, but it has a fast + * bit reversal instruction and a fast leading zero instruction. + * Thus it may be profitable to reverse the bits (once) and then + * to rely on a sequence of instructions that call the leading + * zero instruction. + * + * Performance notes: + * The chosen routine is not optimal in terms of data dependency + * since zero_leading_bit might require two instructions. However, + * it tends to minimize the total number of instructions which is + * beneficial. 
+ */ + uint64_t rev_bits = reverse_bits(bits); + int cnt = static_cast(count_ones(bits)); + int i = 0; + // Do the first 8 all together + for (; i<8; i++) { + int lz = leading_zeroes(rev_bits); + this->tail[i] = static_cast(idx) + lz; + rev_bits = zero_leading_bit(rev_bits, lz); + } + // Do the next 8 all together (we hope in most cases it won't happen at all + // and the branch is easily predicted). + if (simdjson_unlikely(cnt > 8)) { + i = 8; + for (; i<16; i++) { + int lz = leading_zeroes(rev_bits); + this->tail[i] = static_cast(idx) + lz; + rev_bits = zero_leading_bit(rev_bits, lz); + } + + + // Most files don't have 16+ structurals per block, so we take several basically guaranteed + // branch mispredictions here. 16+ structurals per block means either punctuation ({} [] , :) + // or the start of a value ("abc" true 123) every four characters. + if (simdjson_unlikely(cnt > 16)) { + i = 16; + while (rev_bits != 0) { + int lz = leading_zeroes(rev_bits); + this->tail[i++] = static_cast(idx) + lz; + rev_bits = zero_leading_bit(rev_bits, lz); + } + } + } + this->tail += cnt; +#else // SIMDJSON_PREFER_REVERSE_BITS + /** + * Under recent x64 systems, we often have both a fast trailing zero + * instruction and a fast 'clear-lower-bit' instruction so the following + * algorithm can be competitive. + */ + + int cnt = static_cast(count_ones(bits)); // Do the first 8 all together for (int i=0; i<8; i++) { this->tail[i] = idx + trailing_zeroes(bits); @@ -3617,6 +3801,7 @@ public: } this->tail += cnt; +#endif } }; @@ -3630,14 +3815,14 @@ public: * you are processing substrings, you may want to call on a function like trimmed_length_safe_utf8. 
*/ template - static error_code index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, bool partial) noexcept; + static error_code index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, stage1_mode partial) noexcept; private: simdjson_really_inline json_structural_indexer(uint32_t *structural_indexes); template simdjson_really_inline void step(const uint8_t *block, buf_block_reader &reader) noexcept; simdjson_really_inline void next(const simd::simd8x64& in, const json_block& block, size_t idx); - simdjson_really_inline error_code finish(dom_parser_implementation &parser, size_t idx, size_t len, bool partial); + simdjson_really_inline error_code finish(dom_parser_implementation &parser, size_t idx, size_t len, stage1_mode partial); json_scanner scanner{}; utf8_checker checker{}; @@ -3687,10 +3872,17 @@ simdjson_really_inline size_t trim_partial_utf8(const uint8_t *buf, size_t len) // workout. // template -error_code json_structural_indexer::index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, bool partial) noexcept { +error_code json_structural_indexer::index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, stage1_mode partial) noexcept { if (simdjson_unlikely(len > parser.capacity())) { return CAPACITY; } - if (partial) { len = trim_partial_utf8(buf, len); } - + // We guard the rest of the code so that we can assume that len > 0 throughout. + if (len == 0) { return EMPTY; } + if (is_streaming(partial)) { + len = trim_partial_utf8(buf, len); + // If you end up with an empty window after trimming + // the partial UTF-8 bytes, then chances are good that you + // have an UTF-8 formatting error. 
+ if(len == 0) { return UTF8_ERROR; } + } buf_block_reader reader(buf, len); json_structural_indexer indexer(parser.structural_indexes.get()); @@ -3698,12 +3890,11 @@ error_code json_structural_indexer::index(const uint8_t *buf, size_t len, dom_pa while (reader.has_full_block()) { indexer.step(reader.full_block(), reader); } - - // Take care of the last block (will always be there unless file is empty) + // Take care of the last block (will always be there unless file is empty which is + // not supposed to happen.) uint8_t block[STEP_SIZE]; - if (simdjson_unlikely(reader.get_remainder(block) == 0)) { return EMPTY; } + if (simdjson_unlikely(reader.get_remainder(block) == 0)) { return UNEXPECTED_ERROR; } indexer.step(block, reader); - return indexer.finish(parser, reader.block_index(), len, partial); } @@ -3734,14 +3925,13 @@ simdjson_really_inline void json_structural_indexer::next(const simd::simd8x64 len)) { return UNEXPECTED_ERROR; } - if (partial) { + if (partial == stage1_mode::streaming_partial) { // If we have an unclosed string, then the last structural // will be the quote and we want to make sure to omit it. if(have_unclosed_string) { @@ -3785,11 +3979,48 @@ simdjson_really_inline error_code json_structural_indexer::finish(dom_parser_imp // a valid JSON file cannot have zero structural indexes - we should have found something if (simdjson_unlikely(parser.n_structural_indexes == 0u)) { return CAPACITY; } } + // We truncate the input to the end of the last complete document (or zero). auto new_structural_indexes = find_next_document_index(parser); if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) { - return CAPACITY; // If the buffer is partial but the document is incomplete, it's too big to parse. + if(parser.structural_indexes[0] == 0) { + // If the buffer is partial and we started at index 0 but the document is + // incomplete, it's too big to parse. 
+ return CAPACITY; + } else { + // It is possible that the document could be parsed, we just had a lot + // of white space. + parser.n_structural_indexes = 0; + return EMPTY; + } } + parser.n_structural_indexes = new_structural_indexes; + } else if (partial == stage1_mode::streaming_final) { + if(have_unclosed_string) { parser.n_structural_indexes--; } + // We truncate the input to the end of the last complete document (or zero). + // Because partial == stage1_mode::streaming_final, it means that we may + // silently ignore trailing garbage. Though it sounds bad, we do it + // deliberately because many people who have streams of JSON documents + // will truncate them for processing. E.g., imagine that you are uncompressing + // the data from a size file or receiving it in chunks from the network. You + // may not know where exactly the last document will be. Meanwhile the + // document_stream instances allow people to know the JSON documents they are + // parsing (see the iterator.source() method). + parser.n_structural_indexes = find_next_document_index(parser); + // We store the initial n_structural_indexes so that the client can see + // whether we used truncation. If initial_n_structural_indexes == parser.n_structural_indexes, + // then this will query parser.structural_indexes[parser.n_structural_indexes] which is len, + // otherwise, it will copy some prior index. + parser.structural_indexes[parser.n_structural_indexes + 1] = parser.structural_indexes[parser.n_structural_indexes]; + // This next line is critical, do not change it unless you understand what you are + // doing. + parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len); + if (simdjson_unlikely(parser.n_structural_indexes == 0u)) { + // We tolerate an unclosed string at the very end of the stream. Indeed, users + // often load their data in bulk without being careful and they want us to ignore + // the trailing garbage. 
+ return EMPTY; + } } checker.check_eof(); return checker.errors(); @@ -4059,12 +4290,12 @@ simdjson_warn_unused simdjson_really_inline error_code json_iterator::walk_docum { auto value = advance(); - // Make sure the outer hash or array is closed before continuing; otherwise, there are ways we + // Make sure the outer object or array is closed before continuing; otherwise, there are ways we // could get into memory corruption. See https://github.com/simdjson/simdjson/issues/906 if (!STREAMING) { switch (*value) { - case '{': if (last_structural() != '}') { return TAPE_ERROR; }; break; - case '[': if (last_structural() != ']') { return TAPE_ERROR; }; break; + case '{': if (last_structural() != '}') { log_value("starting brace unmatched"); return TAPE_ERROR; }; break; + case '[': if (last_structural() != ']') { log_value("starting bracket unmatched"); return TAPE_ERROR; }; break; } } @@ -4662,7 +4893,7 @@ simdjson_warn_unused error_code implementation::minify(const uint8_t *buf, size_ return arm64::stage1::json_minifier::minify<64>(buf, len, dst, dst_len); } -simdjson_warn_unused error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, bool streaming) noexcept { +simdjson_warn_unused error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, stage1_mode streaming) noexcept { this->buf = _buf; this->len = _len; return arm64::stage1::json_structural_indexer::index<64>(buf, len, *this, streaming); @@ -4681,7 +4912,7 @@ simdjson_warn_unused error_code dom_parser_implementation::stage2_next(dom::docu } simdjson_warn_unused error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept { - auto error = stage1(_buf, _len, false); + auto error = stage1(_buf, _len, stage1_mode::regular); if (error) { return error; } return stage2(_doc); } @@ -4710,8 +4941,10 @@ simdjson_warn_unused error_code implementation::create_dom_parser_implementation ) const noexcept { dst.reset( new (std::nothrow) 
dom_parser_implementation() ); if (!dst) { return MEMALLOC; } - dst->set_capacity(capacity); - dst->set_max_depth(max_depth); + if (auto err = dst->set_capacity(capacity)) + return err; + if (auto err = dst->set_max_depth(max_depth)) + return err; return SUCCESS; } @@ -4746,8 +4979,8 @@ namespace { * * Simply put, we iterate over the structural characters, starting from * the end. We consider that we found the end of a JSON document when the - * first element of the pair is NOT one of these characters: '{' '[' ';' ',' - * and when the second element is NOT one of these characters: '}' '}' ';' ','. + * first element of the pair is NOT one of these characters: '{' '[' ':' ',' + * and when the second element is NOT one of these characters: '}' ']' ':' ','. * * This simple comparison works most of the time, but it does not cover cases * where the batch's structural indexes contain a perfect amount of documents. @@ -4761,7 +4994,8 @@ namespace { * batch. */ simdjson_really_inline uint32_t find_next_document_index(dom_parser_implementation &parser) { - // TODO don't count separately, just figure out depth + // Variant: do not count separately, just figure out depth + if(parser.n_structural_indexes == 0) { return 0; } auto arr_cnt = 0; auto obj_cnt = 0; for (auto i = parser.n_structural_indexes - 1; i > 0; i--) { @@ -4798,6 +5032,25 @@ simdjson_really_inline uint32_t find_next_document_index(dom_parser_implementati // Last document is incomplete; mark the document at i + 1 as the next one return i; } + // If we made it to the end, we want to finish counting to see if we have a full document. + switch (parser.buf[parser.structural_indexes[0]]) { + case '}': + obj_cnt--; + break; + case ']': + arr_cnt--; + break; + case '{': + obj_cnt++; + break; + case '[': + arr_cnt++; + break; + } + if (!arr_cnt && !obj_cnt) { + // We have a complete document. 
+ return parser.n_structural_indexes; + } return 0; } @@ -4814,7 +5067,7 @@ namespace stage1 { class structural_scanner { public: -simdjson_really_inline structural_scanner(dom_parser_implementation &_parser, bool _partial) +simdjson_really_inline structural_scanner(dom_parser_implementation &_parser, stage1_mode _partial) : buf{_parser.buf}, next_structural_index{_parser.structural_indexes.get()}, parser{_parser}, @@ -4844,7 +5097,7 @@ simdjson_really_inline void validate_utf8_character() { if ((buf[idx] & 0b00100000) == 0) { // missing continuation if (simdjson_unlikely(idx+1 > len || !is_continuation(buf[idx+1]))) { - if (idx+1 > len && partial) { idx = len; return; } + if (idx+1 > len && is_streaming(partial)) { idx = len; return; } error = UTF8_ERROR; idx++; return; @@ -4859,7 +5112,7 @@ simdjson_really_inline void validate_utf8_character() { if ((buf[idx] & 0b00010000) == 0) { // missing continuation if (simdjson_unlikely(idx+2 > len || !is_continuation(buf[idx+1]) || !is_continuation(buf[idx+2]))) { - if (idx+2 > len && partial) { idx = len; return; } + if (idx+2 > len && is_streaming(partial)) { idx = len; return; } error = UTF8_ERROR; idx++; return; @@ -4875,7 +5128,7 @@ simdjson_really_inline void validate_utf8_character() { // 4-byte // missing continuation if (simdjson_unlikely(idx+3 > len || !is_continuation(buf[idx+1]) || !is_continuation(buf[idx+2]) || !is_continuation(buf[idx+3]))) { - if (idx+2 > len && partial) { idx = len; return; } + if (idx+2 > len && is_streaming(partial)) { idx = len; return; } error = UTF8_ERROR; idx++; return; @@ -4948,24 +5201,56 @@ simdjson_really_inline error_code scan() { break; } } - *next_structural_index = len; // We pad beyond. // https://github.com/simdjson/simdjson/issues/906 + // See json_structural_indexer.h for an explanation. 
+ *next_structural_index = len; // assumed later in partial == stage1_mode::streaming_final next_structural_index[1] = len; next_structural_index[2] = 0; parser.n_structural_indexes = uint32_t(next_structural_index - parser.structural_indexes.get()); if (simdjson_unlikely(parser.n_structural_indexes == 0)) { return EMPTY; } parser.next_structural_index = 0; - if (partial) { + if (partial == stage1_mode::streaming_partial) { if(unclosed_string) { parser.n_structural_indexes--; if (simdjson_unlikely(parser.n_structural_indexes == 0)) { return CAPACITY; } } + // We truncate the input to the end of the last complete document (or zero). auto new_structural_indexes = find_next_document_index(parser); if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) { - return CAPACITY; // If the buffer is partial but the document is incomplete, it's too big to parse. + if(parser.structural_indexes[0] == 0) { + // If the buffer is partial and we started at index 0 but the document is + // incomplete, it's too big to parse. + return CAPACITY; + } else { + // It is possible that the document could be parsed, we just had a lot + // of white space. + parser.n_structural_indexes = 0; + return EMPTY; + } } parser.n_structural_indexes = new_structural_indexes; + } else if(partial == stage1_mode::streaming_final) { + if(unclosed_string) { parser.n_structural_indexes--; } + // We truncate the input to the end of the last complete document (or zero). + // Because partial == stage1_mode::streaming_final, it means that we may + // silently ignore trailing garbage. Though it sounds bad, we do it + // deliberately because many people who have streams of JSON documents + // will truncate them for processing. E.g., imagine that you are uncompressing + // the data from a size file or receiving it in chunks from the network. You + // may not know where exactly the last document will be. 
Meanwhile the + // document_stream instances allow people to know the JSON documents they are + // parsing (see the iterator.source() method). + parser.n_structural_indexes = find_next_document_index(parser); + // We store the initial n_structural_indexes so that the client can see + // whether we used truncation. If initial_n_structural_indexes == parser.n_structural_indexes, + // then this will query parser.structural_indexes[parser.n_structural_indexes] which is len, + // otherwise, it will copy some prior index. + parser.structural_indexes[parser.n_structural_indexes + 1] = parser.structural_indexes[parser.n_structural_indexes]; + // This next line is critical, do not change it unless you understand what you are + // doing. + parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len); + if (parser.n_structural_indexes == 0) { return EMPTY; } } else if(unclosed_string) { error = UNCLOSED_STRING; } return error; } @@ -4977,13 +5262,13 @@ private: uint32_t len; uint32_t idx{0}; error_code error{SUCCESS}; - bool partial; + stage1_mode partial; }; // structural_scanner } // namespace stage1 } // unnamed namespace -simdjson_warn_unused error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, bool partial) noexcept { +simdjson_warn_unused error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, stage1_mode partial) noexcept { this->buf = _buf; this->len = _len; stage1::structural_scanner scanner(*this, partial); @@ -5333,12 +5618,12 @@ simdjson_warn_unused simdjson_really_inline error_code json_iterator::walk_docum { auto value = advance(); - // Make sure the outer hash or array is closed before continuing; otherwise, there are ways we + // Make sure the outer object or array is closed before continuing; otherwise, there are ways we // could get into memory corruption. 
See https://github.com/simdjson/simdjson/issues/906 if (!STREAMING) { switch (*value) { - case '{': if (last_structural() != '}') { return TAPE_ERROR; }; break; - case '[': if (last_structural() != ']') { return TAPE_ERROR; }; break; + case '{': if (last_structural() != '}') { log_value("starting brace unmatched"); return TAPE_ERROR; }; break; + case '[': if (last_structural() != ']') { log_value("starting bracket unmatched"); return TAPE_ERROR; }; break; } } @@ -5926,7 +6211,7 @@ simdjson_warn_unused error_code dom_parser_implementation::stage2_next(dom::docu } simdjson_warn_unused error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept { - auto error = stage1(_buf, _len, false); + auto error = stage1(_buf, _len, stage1_mode::regular); if (error) { return error; } return stage2(_doc); } @@ -5956,8 +6241,10 @@ simdjson_warn_unused error_code implementation::create_dom_parser_implementation ) const noexcept { dst.reset( new (std::nothrow) dom_parser_implementation() ); if (!dst) { return MEMALLOC; } - dst->set_capacity(capacity); - dst->set_max_depth(max_depth); + if (auto err = dst->set_capacity(capacity)) + return err; + if (auto err = dst->set_max_depth(max_depth)) + return err; return SUCCESS; } @@ -6251,7 +6538,6 @@ using namespace simd; } this->prev_incomplete = is_incomplete(input.chunks[simd8x64::NUM_CHUNKS-1]); this->prev_input_block = input.chunks[simd8x64::NUM_CHUNKS-1]; - } } // do not forget to call check_eof! @@ -6794,8 +7080,8 @@ namespace { * * Simply put, we iterate over the structural characters, starting from * the end. We consider that we found the end of a JSON document when the - * first element of the pair is NOT one of these characters: '{' '[' ';' ',' - * and when the second element is NOT one of these characters: '}' '}' ';' ','. + * first element of the pair is NOT one of these characters: '{' '[' ':' ',' + * and when the second element is NOT one of these characters: '}' ']' ':' ','. 
* * This simple comparison works most of the time, but it does not cover cases * where the batch's structural indexes contain a perfect amount of documents. @@ -6809,7 +7095,8 @@ namespace { * batch. */ simdjson_really_inline uint32_t find_next_document_index(dom_parser_implementation &parser) { - // TODO don't count separately, just figure out depth + // Variant: do not count separately, just figure out depth + if(parser.n_structural_indexes == 0) { return 0; } auto arr_cnt = 0; auto obj_cnt = 0; for (auto i = parser.n_structural_indexes - 1; i > 0; i--) { @@ -6846,6 +7133,25 @@ simdjson_really_inline uint32_t find_next_document_index(dom_parser_implementati // Last document is incomplete; mark the document at i + 1 as the next one return i; } + // If we made it to the end, we want to finish counting to see if we have a full document. + switch (parser.buf[parser.structural_indexes[0]]) { + case '}': + obj_cnt--; + break; + case ']': + arr_cnt--; + break; + case '{': + obj_cnt++; + break; + case '[': + arr_cnt++; + break; + } + if (!arr_cnt && !obj_cnt) { + // We have a complete document. + return parser.n_structural_indexes; + } return 0; } @@ -6876,8 +7182,62 @@ public: // it helps tremendously. if (bits == 0) return; - int cnt = static_cast(count_ones(bits)); +#if defined(SIMDJSON_PREFER_REVERSE_BITS) + /** + * ARM lacks a fast trailing zero instruction, but it has a fast + * bit reversal instruction and a fast leading zero instruction. + * Thus it may be profitable to reverse the bits (once) and then + * to rely on a sequence of instructions that call the leading + * zero instruction. + * + * Performance notes: + * The chosen routine is not optimal in terms of data dependency + * since zero_leading_bit might require two instructions. However, + * it tends to minimize the total number of instructions which is + * beneficial. 
+ */ + uint64_t rev_bits = reverse_bits(bits); + int cnt = static_cast(count_ones(bits)); + int i = 0; + // Do the first 8 all together + for (; i<8; i++) { + int lz = leading_zeroes(rev_bits); + this->tail[i] = static_cast(idx) + lz; + rev_bits = zero_leading_bit(rev_bits, lz); + } + // Do the next 8 all together (we hope in most cases it won't happen at all + // and the branch is easily predicted). + if (simdjson_unlikely(cnt > 8)) { + i = 8; + for (; i<16; i++) { + int lz = leading_zeroes(rev_bits); + this->tail[i] = static_cast(idx) + lz; + rev_bits = zero_leading_bit(rev_bits, lz); + } + + + // Most files don't have 16+ structurals per block, so we take several basically guaranteed + // branch mispredictions here. 16+ structurals per block means either punctuation ({} [] , :) + // or the start of a value ("abc" true 123) every four characters. + if (simdjson_unlikely(cnt > 16)) { + i = 16; + while (rev_bits != 0) { + int lz = leading_zeroes(rev_bits); + this->tail[i++] = static_cast(idx) + lz; + rev_bits = zero_leading_bit(rev_bits, lz); + } + } + } + this->tail += cnt; +#else // SIMDJSON_PREFER_REVERSE_BITS + /** + * Under recent x64 systems, we often have both a fast trailing zero + * instruction and a fast 'clear-lower-bit' instruction so the following + * algorithm can be competitive. + */ + + int cnt = static_cast(count_ones(bits)); // Do the first 8 all together for (int i=0; i<8; i++) { this->tail[i] = idx + trailing_zeroes(bits); @@ -6906,6 +7266,7 @@ public: } this->tail += cnt; +#endif } }; @@ -6919,14 +7280,14 @@ public: * you are processing substrings, you may want to call on a function like trimmed_length_safe_utf8. 
*/ template - static error_code index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, bool partial) noexcept; + static error_code index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, stage1_mode partial) noexcept; private: simdjson_really_inline json_structural_indexer(uint32_t *structural_indexes); template simdjson_really_inline void step(const uint8_t *block, buf_block_reader &reader) noexcept; simdjson_really_inline void next(const simd::simd8x64& in, const json_block& block, size_t idx); - simdjson_really_inline error_code finish(dom_parser_implementation &parser, size_t idx, size_t len, bool partial); + simdjson_really_inline error_code finish(dom_parser_implementation &parser, size_t idx, size_t len, stage1_mode partial); json_scanner scanner{}; utf8_checker checker{}; @@ -6976,10 +7337,17 @@ simdjson_really_inline size_t trim_partial_utf8(const uint8_t *buf, size_t len) // workout. // template -error_code json_structural_indexer::index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, bool partial) noexcept { +error_code json_structural_indexer::index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, stage1_mode partial) noexcept { if (simdjson_unlikely(len > parser.capacity())) { return CAPACITY; } - if (partial) { len = trim_partial_utf8(buf, len); } - + // We guard the rest of the code so that we can assume that len > 0 throughout. + if (len == 0) { return EMPTY; } + if (is_streaming(partial)) { + len = trim_partial_utf8(buf, len); + // If you end up with an empty window after trimming + // the partial UTF-8 bytes, then chances are good that you + // have an UTF-8 formatting error. 
+ if(len == 0) { return UTF8_ERROR; } + } buf_block_reader reader(buf, len); json_structural_indexer indexer(parser.structural_indexes.get()); @@ -6987,12 +7355,11 @@ error_code json_structural_indexer::index(const uint8_t *buf, size_t len, dom_pa while (reader.has_full_block()) { indexer.step(reader.full_block(), reader); } - - // Take care of the last block (will always be there unless file is empty) + // Take care of the last block (will always be there unless file is empty which is + // not supposed to happen.) uint8_t block[STEP_SIZE]; - if (simdjson_unlikely(reader.get_remainder(block) == 0)) { return EMPTY; } + if (simdjson_unlikely(reader.get_remainder(block) == 0)) { return UNEXPECTED_ERROR; } indexer.step(block, reader); - return indexer.finish(parser, reader.block_index(), len, partial); } @@ -7023,14 +7390,13 @@ simdjson_really_inline void json_structural_indexer::next(const simd::simd8x64 len)) { return UNEXPECTED_ERROR; } - if (partial) { + if (partial == stage1_mode::streaming_partial) { // If we have an unclosed string, then the last structural // will be the quote and we want to make sure to omit it. if(have_unclosed_string) { @@ -7074,11 +7444,48 @@ simdjson_really_inline error_code json_structural_indexer::finish(dom_parser_imp // a valid JSON file cannot have zero structural indexes - we should have found something if (simdjson_unlikely(parser.n_structural_indexes == 0u)) { return CAPACITY; } } + // We truncate the input to the end of the last complete document (or zero). auto new_structural_indexes = find_next_document_index(parser); if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) { - return CAPACITY; // If the buffer is partial but the document is incomplete, it's too big to parse. + if(parser.structural_indexes[0] == 0) { + // If the buffer is partial and we started at index 0 but the document is + // incomplete, it's too big to parse. 
+ return CAPACITY; + } else { + // It is possible that the document could be parsed, we just had a lot + // of white space. + parser.n_structural_indexes = 0; + return EMPTY; + } } + parser.n_structural_indexes = new_structural_indexes; + } else if (partial == stage1_mode::streaming_final) { + if(have_unclosed_string) { parser.n_structural_indexes--; } + // We truncate the input to the end of the last complete document (or zero). + // Because partial == stage1_mode::streaming_final, it means that we may + // silently ignore trailing garbage. Though it sounds bad, we do it + // deliberately because many people who have streams of JSON documents + // will truncate them for processing. E.g., imagine that you are uncompressing + // the data from a size file or receiving it in chunks from the network. You + // may not know where exactly the last document will be. Meanwhile the + // document_stream instances allow people to know the JSON documents they are + // parsing (see the iterator.source() method). + parser.n_structural_indexes = find_next_document_index(parser); + // We store the initial n_structural_indexes so that the client can see + // whether we used truncation. If initial_n_structural_indexes == parser.n_structural_indexes, + // then this will query parser.structural_indexes[parser.n_structural_indexes] which is len, + // otherwise, it will copy some prior index. + parser.structural_indexes[parser.n_structural_indexes + 1] = parser.structural_indexes[parser.n_structural_indexes]; + // This next line is critical, do not change it unless you understand what you are + // doing. + parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len); + if (simdjson_unlikely(parser.n_structural_indexes == 0u)) { + // We tolerate an unclosed string at the very end of the stream. Indeed, users + // often load their data in bulk without being careful and they want us to ignore + // the trailing garbage. 
+ return EMPTY; + } } checker.check_eof(); return checker.errors(); @@ -7347,12 +7754,12 @@ simdjson_warn_unused simdjson_really_inline error_code json_iterator::walk_docum { auto value = advance(); - // Make sure the outer hash or array is closed before continuing; otherwise, there are ways we + // Make sure the outer object or array is closed before continuing; otherwise, there are ways we // could get into memory corruption. See https://github.com/simdjson/simdjson/issues/906 if (!STREAMING) { switch (*value) { - case '{': if (last_structural() != '}') { return TAPE_ERROR; }; break; - case '[': if (last_structural() != ']') { return TAPE_ERROR; }; break; + case '{': if (last_structural() != '}') { log_value("starting brace unmatched"); return TAPE_ERROR; }; break; + case '[': if (last_structural() != ']') { log_value("starting bracket unmatched"); return TAPE_ERROR; }; break; } } @@ -7948,7 +8355,7 @@ simdjson_warn_unused error_code implementation::minify(const uint8_t *buf, size_ return haswell::stage1::json_minifier::minify<128>(buf, len, dst, dst_len); } -simdjson_warn_unused error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, bool streaming) noexcept { +simdjson_warn_unused error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, stage1_mode streaming) noexcept { this->buf = _buf; this->len = _len; return haswell::stage1::json_structural_indexer::index<128>(_buf, _len, *this, streaming); @@ -7967,7 +8374,7 @@ simdjson_warn_unused error_code dom_parser_implementation::stage2_next(dom::docu } simdjson_warn_unused error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept { - auto error = stage1(_buf, _len, false); + auto error = stage1(_buf, _len, stage1_mode::regular); if (error) { return error; } return stage2(_doc); } @@ -7997,8 +8404,10 @@ simdjson_warn_unused error_code implementation::create_dom_parser_implementation ) const noexcept { dst.reset( new 
(std::nothrow) dom_parser_implementation() ); if (!dst) { return MEMALLOC; } - dst->set_capacity(capacity); - dst->set_max_depth(max_depth); + if (auto err = dst->set_capacity(capacity)) + return err; + if (auto err = dst->set_max_depth(max_depth)) + return err; return SUCCESS; } @@ -8256,7 +8665,6 @@ using namespace simd; } this->prev_incomplete = is_incomplete(input.chunks[simd8x64::NUM_CHUNKS-1]); this->prev_input_block = input.chunks[simd8x64::NUM_CHUNKS-1]; - } } // do not forget to call check_eof! @@ -8799,8 +9207,8 @@ namespace { * * Simply put, we iterate over the structural characters, starting from * the end. We consider that we found the end of a JSON document when the - * first element of the pair is NOT one of these characters: '{' '[' ';' ',' - * and when the second element is NOT one of these characters: '}' '}' ';' ','. + * first element of the pair is NOT one of these characters: '{' '[' ':' ',' + * and when the second element is NOT one of these characters: '}' ']' ':' ','. * * This simple comparison works most of the time, but it does not cover cases * where the batch's structural indexes contain a perfect amount of documents. @@ -8814,7 +9222,8 @@ namespace { * batch. */ simdjson_really_inline uint32_t find_next_document_index(dom_parser_implementation &parser) { - // TODO don't count separately, just figure out depth + // Variant: do not count separately, just figure out depth + if(parser.n_structural_indexes == 0) { return 0; } auto arr_cnt = 0; auto obj_cnt = 0; for (auto i = parser.n_structural_indexes - 1; i > 0; i--) { @@ -8851,6 +9260,25 @@ simdjson_really_inline uint32_t find_next_document_index(dom_parser_implementati // Last document is incomplete; mark the document at i + 1 as the next one return i; } + // If we made it to the end, we want to finish counting to see if we have a full document. 
+ switch (parser.buf[parser.structural_indexes[0]]) { + case '}': + obj_cnt--; + break; + case ']': + arr_cnt--; + break; + case '{': + obj_cnt++; + break; + case '[': + arr_cnt++; + break; + } + if (!arr_cnt && !obj_cnt) { + // We have a complete document. + return parser.n_structural_indexes; + } return 0; } @@ -8881,8 +9309,62 @@ public: // it helps tremendously. if (bits == 0) return; - int cnt = static_cast(count_ones(bits)); +#if defined(SIMDJSON_PREFER_REVERSE_BITS) + /** + * ARM lacks a fast trailing zero instruction, but it has a fast + * bit reversal instruction and a fast leading zero instruction. + * Thus it may be profitable to reverse the bits (once) and then + * to rely on a sequence of instructions that call the leading + * zero instruction. + * + * Performance notes: + * The chosen routine is not optimal in terms of data dependency + * since zero_leading_bit might require two instructions. However, + * it tends to minimize the total number of instructions which is + * beneficial. + */ + uint64_t rev_bits = reverse_bits(bits); + int cnt = static_cast(count_ones(bits)); + int i = 0; + // Do the first 8 all together + for (; i<8; i++) { + int lz = leading_zeroes(rev_bits); + this->tail[i] = static_cast(idx) + lz; + rev_bits = zero_leading_bit(rev_bits, lz); + } + // Do the next 8 all together (we hope in most cases it won't happen at all + // and the branch is easily predicted). + if (simdjson_unlikely(cnt > 8)) { + i = 8; + for (; i<16; i++) { + int lz = leading_zeroes(rev_bits); + this->tail[i] = static_cast(idx) + lz; + rev_bits = zero_leading_bit(rev_bits, lz); + } + + + // Most files don't have 16+ structurals per block, so we take several basically guaranteed + // branch mispredictions here. 16+ structurals per block means either punctuation ({} [] , :) + // or the start of a value ("abc" true 123) every four characters. 
+ if (simdjson_unlikely(cnt > 16)) { + i = 16; + while (rev_bits != 0) { + int lz = leading_zeroes(rev_bits); + this->tail[i++] = static_cast(idx) + lz; + rev_bits = zero_leading_bit(rev_bits, lz); + } + } + } + this->tail += cnt; +#else // SIMDJSON_PREFER_REVERSE_BITS + /** + * Under recent x64 systems, we often have both a fast trailing zero + * instruction and a fast 'clear-lower-bit' instruction so the following + * algorithm can be competitive. + */ + + int cnt = static_cast(count_ones(bits)); // Do the first 8 all together for (int i=0; i<8; i++) { this->tail[i] = idx + trailing_zeroes(bits); @@ -8911,6 +9393,7 @@ public: } this->tail += cnt; +#endif } }; @@ -8924,14 +9407,14 @@ public: * you are processing substrings, you may want to call on a function like trimmed_length_safe_utf8. */ template - static error_code index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, bool partial) noexcept; + static error_code index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, stage1_mode partial) noexcept; private: simdjson_really_inline json_structural_indexer(uint32_t *structural_indexes); template simdjson_really_inline void step(const uint8_t *block, buf_block_reader &reader) noexcept; simdjson_really_inline void next(const simd::simd8x64& in, const json_block& block, size_t idx); - simdjson_really_inline error_code finish(dom_parser_implementation &parser, size_t idx, size_t len, bool partial); + simdjson_really_inline error_code finish(dom_parser_implementation &parser, size_t idx, size_t len, stage1_mode partial); json_scanner scanner{}; utf8_checker checker{}; @@ -8981,10 +9464,17 @@ simdjson_really_inline size_t trim_partial_utf8(const uint8_t *buf, size_t len) // workout. 
// template -error_code json_structural_indexer::index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, bool partial) noexcept { +error_code json_structural_indexer::index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, stage1_mode partial) noexcept { if (simdjson_unlikely(len > parser.capacity())) { return CAPACITY; } - if (partial) { len = trim_partial_utf8(buf, len); } - + // We guard the rest of the code so that we can assume that len > 0 throughout. + if (len == 0) { return EMPTY; } + if (is_streaming(partial)) { + len = trim_partial_utf8(buf, len); + // If you end up with an empty window after trimming + // the partial UTF-8 bytes, then chances are good that you + // have an UTF-8 formatting error. + if(len == 0) { return UTF8_ERROR; } + } buf_block_reader reader(buf, len); json_structural_indexer indexer(parser.structural_indexes.get()); @@ -8992,12 +9482,11 @@ error_code json_structural_indexer::index(const uint8_t *buf, size_t len, dom_pa while (reader.has_full_block()) { indexer.step(reader.full_block(), reader); } - - // Take care of the last block (will always be there unless file is empty) + // Take care of the last block (will always be there unless file is empty which is + // not supposed to happen.) uint8_t block[STEP_SIZE]; - if (simdjson_unlikely(reader.get_remainder(block) == 0)) { return EMPTY; } + if (simdjson_unlikely(reader.get_remainder(block) == 0)) { return UNEXPECTED_ERROR; } indexer.step(block, reader); - return indexer.finish(parser, reader.block_index(), len, partial); } @@ -9028,14 +9517,13 @@ simdjson_really_inline void json_structural_indexer::next(const simd::simd8x64 len)) { return UNEXPECTED_ERROR; } - if (partial) { + if (partial == stage1_mode::streaming_partial) { // If we have an unclosed string, then the last structural // will be the quote and we want to make sure to omit it. 
if(have_unclosed_string) { @@ -9079,11 +9571,48 @@ simdjson_really_inline error_code json_structural_indexer::finish(dom_parser_imp // a valid JSON file cannot have zero structural indexes - we should have found something if (simdjson_unlikely(parser.n_structural_indexes == 0u)) { return CAPACITY; } } + // We truncate the input to the end of the last complete document (or zero). auto new_structural_indexes = find_next_document_index(parser); if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) { - return CAPACITY; // If the buffer is partial but the document is incomplete, it's too big to parse. + if(parser.structural_indexes[0] == 0) { + // If the buffer is partial and we started at index 0 but the document is + // incomplete, it's too big to parse. + return CAPACITY; + } else { + // It is possible that the document could be parsed, we just had a lot + // of white space. + parser.n_structural_indexes = 0; + return EMPTY; + } } + parser.n_structural_indexes = new_structural_indexes; + } else if (partial == stage1_mode::streaming_final) { + if(have_unclosed_string) { parser.n_structural_indexes--; } + // We truncate the input to the end of the last complete document (or zero). + // Because partial == stage1_mode::streaming_final, it means that we may + // silently ignore trailing garbage. Though it sounds bad, we do it + // deliberately because many people who have streams of JSON documents + // will truncate them for processing. E.g., imagine that you are uncompressing + // the data from a size file or receiving it in chunks from the network. You + // may not know where exactly the last document will be. Meanwhile the + // document_stream instances allow people to know the JSON documents they are + // parsing (see the iterator.source() method). + parser.n_structural_indexes = find_next_document_index(parser); + // We store the initial n_structural_indexes so that the client can see + // whether we used truncation. 
If initial_n_structural_indexes == parser.n_structural_indexes, + // then this will query parser.structural_indexes[parser.n_structural_indexes] which is len, + // otherwise, it will copy some prior index. + parser.structural_indexes[parser.n_structural_indexes + 1] = parser.structural_indexes[parser.n_structural_indexes]; + // This next line is critical, do not change it unless you understand what you are + // doing. + parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len); + if (simdjson_unlikely(parser.n_structural_indexes == 0u)) { + // We tolerate an unclosed string at the very end of the stream. Indeed, users + // often load their data in bulk without being careful and they want us to ignore + // the trailing garbage. + return EMPTY; + } } checker.check_eof(); return checker.errors(); @@ -9353,12 +9882,12 @@ simdjson_warn_unused simdjson_really_inline error_code json_iterator::walk_docum { auto value = advance(); - // Make sure the outer hash or array is closed before continuing; otherwise, there are ways we + // Make sure the outer object or array is closed before continuing; otherwise, there are ways we // could get into memory corruption. 
See https://github.com/simdjson/simdjson/issues/906 if (!STREAMING) { switch (*value) { - case '{': if (last_structural() != '}') { return TAPE_ERROR; }; break; - case '[': if (last_structural() != ']') { return TAPE_ERROR; }; break; + case '{': if (last_structural() != '}') { log_value("starting brace unmatched"); return TAPE_ERROR; }; break; + case '[': if (last_structural() != ']') { log_value("starting bracket unmatched"); return TAPE_ERROR; }; break; } } @@ -9956,7 +10485,7 @@ simdjson_warn_unused error_code implementation::minify(const uint8_t *buf, size_ return ppc64::stage1::json_minifier::minify<64>(buf, len, dst, dst_len); } -simdjson_warn_unused error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, bool streaming) noexcept { +simdjson_warn_unused error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, stage1_mode streaming) noexcept { this->buf = _buf; this->len = _len; return ppc64::stage1::json_structural_indexer::index<64>(buf, len, *this, streaming); @@ -9975,7 +10504,7 @@ simdjson_warn_unused error_code dom_parser_implementation::stage2_next(dom::docu } simdjson_warn_unused error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept { - auto error = stage1(_buf, _len, false); + auto error = stage1(_buf, _len, stage1_mode::regular); if (error) { return error; } return stage2(_doc); } @@ -10005,8 +10534,10 @@ simdjson_warn_unused error_code implementation::create_dom_parser_implementation ) const noexcept { dst.reset( new (std::nothrow) dom_parser_implementation() ); if (!dst) { return MEMALLOC; } - dst->set_capacity(capacity); - dst->set_max_depth(max_depth); + if (auto err = dst->set_capacity(capacity)) + return err; + if (auto err = dst->set_max_depth(max_depth)) + return err; return SUCCESS; } @@ -10297,7 +10828,6 @@ using namespace simd; } this->prev_incomplete = is_incomplete(input.chunks[simd8x64::NUM_CHUNKS-1]); this->prev_input_block = 
input.chunks[simd8x64::NUM_CHUNKS-1]; - } } // do not forget to call check_eof! @@ -10840,8 +11370,8 @@ namespace { * * Simply put, we iterate over the structural characters, starting from * the end. We consider that we found the end of a JSON document when the - * first element of the pair is NOT one of these characters: '{' '[' ';' ',' - * and when the second element is NOT one of these characters: '}' '}' ';' ','. + * first element of the pair is NOT one of these characters: '{' '[' ':' ',' + * and when the second element is NOT one of these characters: '}' ']' ':' ','. * * This simple comparison works most of the time, but it does not cover cases * where the batch's structural indexes contain a perfect amount of documents. @@ -10855,7 +11385,8 @@ namespace { * batch. */ simdjson_really_inline uint32_t find_next_document_index(dom_parser_implementation &parser) { - // TODO don't count separately, just figure out depth + // Variant: do not count separately, just figure out depth + if(parser.n_structural_indexes == 0) { return 0; } auto arr_cnt = 0; auto obj_cnt = 0; for (auto i = parser.n_structural_indexes - 1; i > 0; i--) { @@ -10892,6 +11423,25 @@ simdjson_really_inline uint32_t find_next_document_index(dom_parser_implementati // Last document is incomplete; mark the document at i + 1 as the next one return i; } + // If we made it to the end, we want to finish counting to see if we have a full document. + switch (parser.buf[parser.structural_indexes[0]]) { + case '}': + obj_cnt--; + break; + case ']': + arr_cnt--; + break; + case '{': + obj_cnt++; + break; + case '[': + arr_cnt++; + break; + } + if (!arr_cnt && !obj_cnt) { + // We have a complete document. + return parser.n_structural_indexes; + } return 0; } @@ -10922,8 +11472,62 @@ public: // it helps tremendously. 
if (bits == 0) return; - int cnt = static_cast(count_ones(bits)); +#if defined(SIMDJSON_PREFER_REVERSE_BITS) + /** + * ARM lacks a fast trailing zero instruction, but it has a fast + * bit reversal instruction and a fast leading zero instruction. + * Thus it may be profitable to reverse the bits (once) and then + * to rely on a sequence of instructions that call the leading + * zero instruction. + * + * Performance notes: + * The chosen routine is not optimal in terms of data dependency + * since zero_leading_bit might require two instructions. However, + * it tends to minimize the total number of instructions which is + * beneficial. + */ + uint64_t rev_bits = reverse_bits(bits); + int cnt = static_cast(count_ones(bits)); + int i = 0; + // Do the first 8 all together + for (; i<8; i++) { + int lz = leading_zeroes(rev_bits); + this->tail[i] = static_cast(idx) + lz; + rev_bits = zero_leading_bit(rev_bits, lz); + } + // Do the next 8 all together (we hope in most cases it won't happen at all + // and the branch is easily predicted). + if (simdjson_unlikely(cnt > 8)) { + i = 8; + for (; i<16; i++) { + int lz = leading_zeroes(rev_bits); + this->tail[i] = static_cast(idx) + lz; + rev_bits = zero_leading_bit(rev_bits, lz); + } + + + // Most files don't have 16+ structurals per block, so we take several basically guaranteed + // branch mispredictions here. 16+ structurals per block means either punctuation ({} [] , :) + // or the start of a value ("abc" true 123) every four characters. + if (simdjson_unlikely(cnt > 16)) { + i = 16; + while (rev_bits != 0) { + int lz = leading_zeroes(rev_bits); + this->tail[i++] = static_cast(idx) + lz; + rev_bits = zero_leading_bit(rev_bits, lz); + } + } + } + this->tail += cnt; +#else // SIMDJSON_PREFER_REVERSE_BITS + /** + * Under recent x64 systems, we often have both a fast trailing zero + * instruction and a fast 'clear-lower-bit' instruction so the following + * algorithm can be competitive. 
+ */ + + int cnt = static_cast(count_ones(bits)); // Do the first 8 all together for (int i=0; i<8; i++) { this->tail[i] = idx + trailing_zeroes(bits); @@ -10952,6 +11556,7 @@ public: } this->tail += cnt; +#endif } }; @@ -10965,14 +11570,14 @@ public: * you are processing substrings, you may want to call on a function like trimmed_length_safe_utf8. */ template - static error_code index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, bool partial) noexcept; + static error_code index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, stage1_mode partial) noexcept; private: simdjson_really_inline json_structural_indexer(uint32_t *structural_indexes); template simdjson_really_inline void step(const uint8_t *block, buf_block_reader &reader) noexcept; simdjson_really_inline void next(const simd::simd8x64& in, const json_block& block, size_t idx); - simdjson_really_inline error_code finish(dom_parser_implementation &parser, size_t idx, size_t len, bool partial); + simdjson_really_inline error_code finish(dom_parser_implementation &parser, size_t idx, size_t len, stage1_mode partial); json_scanner scanner{}; utf8_checker checker{}; @@ -11022,10 +11627,17 @@ simdjson_really_inline size_t trim_partial_utf8(const uint8_t *buf, size_t len) // workout. // template -error_code json_structural_indexer::index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, bool partial) noexcept { +error_code json_structural_indexer::index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, stage1_mode partial) noexcept { if (simdjson_unlikely(len > parser.capacity())) { return CAPACITY; } - if (partial) { len = trim_partial_utf8(buf, len); } - + // We guard the rest of the code so that we can assume that len > 0 throughout. 
+ if (len == 0) { return EMPTY; } + if (is_streaming(partial)) { + len = trim_partial_utf8(buf, len); + // If you end up with an empty window after trimming + // the partial UTF-8 bytes, then chances are good that you + // have an UTF-8 formatting error. + if(len == 0) { return UTF8_ERROR; } + } buf_block_reader reader(buf, len); json_structural_indexer indexer(parser.structural_indexes.get()); @@ -11033,12 +11645,11 @@ error_code json_structural_indexer::index(const uint8_t *buf, size_t len, dom_pa while (reader.has_full_block()) { indexer.step(reader.full_block(), reader); } - - // Take care of the last block (will always be there unless file is empty) + // Take care of the last block (will always be there unless file is empty which is + // not supposed to happen.) uint8_t block[STEP_SIZE]; - if (simdjson_unlikely(reader.get_remainder(block) == 0)) { return EMPTY; } + if (simdjson_unlikely(reader.get_remainder(block) == 0)) { return UNEXPECTED_ERROR; } indexer.step(block, reader); - return indexer.finish(parser, reader.block_index(), len, partial); } @@ -11069,14 +11680,13 @@ simdjson_really_inline void json_structural_indexer::next(const simd::simd8x64 len)) { return UNEXPECTED_ERROR; } - if (partial) { + if (partial == stage1_mode::streaming_partial) { // If we have an unclosed string, then the last structural // will be the quote and we want to make sure to omit it. if(have_unclosed_string) { @@ -11120,11 +11734,48 @@ simdjson_really_inline error_code json_structural_indexer::finish(dom_parser_imp // a valid JSON file cannot have zero structural indexes - we should have found something if (simdjson_unlikely(parser.n_structural_indexes == 0u)) { return CAPACITY; } } + // We truncate the input to the end of the last complete document (or zero). 
auto new_structural_indexes = find_next_document_index(parser); if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) { - return CAPACITY; // If the buffer is partial but the document is incomplete, it's too big to parse. + if(parser.structural_indexes[0] == 0) { + // If the buffer is partial and we started at index 0 but the document is + // incomplete, it's too big to parse. + return CAPACITY; + } else { + // It is possible that the document could be parsed, we just had a lot + // of white space. + parser.n_structural_indexes = 0; + return EMPTY; + } } + parser.n_structural_indexes = new_structural_indexes; + } else if (partial == stage1_mode::streaming_final) { + if(have_unclosed_string) { parser.n_structural_indexes--; } + // We truncate the input to the end of the last complete document (or zero). + // Because partial == stage1_mode::streaming_final, it means that we may + // silently ignore trailing garbage. Though it sounds bad, we do it + // deliberately because many people who have streams of JSON documents + // will truncate them for processing. E.g., imagine that you are uncompressing + // the data from a size file or receiving it in chunks from the network. You + // may not know where exactly the last document will be. Meanwhile the + // document_stream instances allow people to know the JSON documents they are + // parsing (see the iterator.source() method). + parser.n_structural_indexes = find_next_document_index(parser); + // We store the initial n_structural_indexes so that the client can see + // whether we used truncation. If initial_n_structural_indexes == parser.n_structural_indexes, + // then this will query parser.structural_indexes[parser.n_structural_indexes] which is len, + // otherwise, it will copy some prior index. 
+ parser.structural_indexes[parser.n_structural_indexes + 1] = parser.structural_indexes[parser.n_structural_indexes]; + // This next line is critical, do not change it unless you understand what you are + // doing. + parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len); + if (simdjson_unlikely(parser.n_structural_indexes == 0u)) { + // We tolerate an unclosed string at the very end of the stream. Indeed, users + // often load their data in bulk without being careful and they want us to ignore + // the trailing garbage. + return EMPTY; + } } checker.check_eof(); return checker.errors(); @@ -11393,12 +12044,12 @@ simdjson_warn_unused simdjson_really_inline error_code json_iterator::walk_docum { auto value = advance(); - // Make sure the outer hash or array is closed before continuing; otherwise, there are ways we + // Make sure the outer object or array is closed before continuing; otherwise, there are ways we // could get into memory corruption. See https://github.com/simdjson/simdjson/issues/906 if (!STREAMING) { switch (*value) { - case '{': if (last_structural() != '}') { return TAPE_ERROR; }; break; - case '[': if (last_structural() != ']') { return TAPE_ERROR; }; break; + case '{': if (last_structural() != '}') { log_value("starting brace unmatched"); return TAPE_ERROR; }; break; + case '[': if (last_structural() != ']') { log_value("starting bracket unmatched"); return TAPE_ERROR; }; break; } } @@ -11995,7 +12646,7 @@ simdjson_warn_unused error_code implementation::minify(const uint8_t *buf, size_ return westmere::stage1::json_minifier::minify<64>(buf, len, dst, dst_len); } -simdjson_warn_unused error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, bool streaming) noexcept { +simdjson_warn_unused error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, stage1_mode streaming) noexcept { this->buf = _buf; this->len = _len; return westmere::stage1::json_structural_indexer::index<64>(_buf, _len, 
*this, streaming); @@ -12014,7 +12665,7 @@ simdjson_warn_unused error_code dom_parser_implementation::stage2_next(dom::docu } simdjson_warn_unused error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept { - auto error = stage1(_buf, _len, false); + auto error = stage1(_buf, _len, stage1_mode::regular); if (error) { return error; } return stage2(_doc); } diff --git a/singleheader/simdjson.h b/singleheader/simdjson.h index b54de37d..fc114bdb 100644 --- a/singleheader/simdjson.h +++ b/singleheader/simdjson.h @@ -1,4 +1,4 @@ -/* auto-generated on 2021-03-18 11:30:40 -0400. Do not edit! */ +/* auto-generated on 2021-08-06 20:25:05 -0400. Do not edit! */ /* begin file include/simdjson.h */ #ifndef SIMDJSON_H #define SIMDJSON_H @@ -6,7 +6,7 @@ /** * @mainpage * - * Check the [README.md](https://github.com/lemire/simdjson/blob/master/README.md#simdjson--parsing-gigabytes-of-json-per-second). + * Check the [README.md](https://github.com/simdjson/simdjson/blob/master/README.md#simdjson--parsing-gigabytes-of-json-per-second). * * Sample code. See https://github.com/simdjson/simdjson/blob/master/doc/basics.md for more examples. 
@@ -283,6 +283,8 @@ char *to_chars(char *first, const char *last, double value); * Defined in src/from_chars */ double from_chars(const char *first) noexcept; +double from_chars(const char *first, const char* end) noexcept; + } #ifndef SIMDJSON_EXCEPTIONS @@ -302,7 +304,7 @@ constexpr size_t SIMDJSON_MAXSIZE_BYTES = 0xFFFFFFFF; * the input buf should be readable up to buf + SIMDJSON_PADDING * this is a stopgap; there should be a better description of the * main loop and its behavior that abstracts over this - * See https://github.com/lemire/simdjson/issues/174 + * See https://github.com/simdjson/simdjson/issues/174 */ constexpr size_t SIMDJSON_PADDING = 32; @@ -425,7 +427,7 @@ constexpr size_t DEFAULT_MAX_DEPTH = 1024; * the regular visual studio or clang under visual * studio, you still need to handle these issues. * - * Non-Windows sytems do not have this complexity. + * Non-Windows systems do not have this complexity. */ #if SIMDJSON_BUILDING_WINDOWS_DYNAMIC_LIBRARY // We set SIMDJSON_BUILDING_WINDOWS_DYNAMIC_LIBRARY when we build a DLL under Windows. @@ -479,7 +481,7 @@ constexpr size_t DEFAULT_MAX_DEPTH = 1024; // now it is safe to trigger the include #include // though the file is there, it does not follow that we got the implementation #if defined(_LIBCPP_STRING_VIEW) -// Ah! So we under libc++ which under its Library Fundamentals Technical Specification, which preceeded C++17, +// Ah! So we under libc++ which under its Library Fundamentals Technical Specification, which preceded C++17, // included string_view. // This means that we have string_view *even though* we may not have C++17. #define SIMDJSON_HAS_STRING_VIEW @@ -498,7 +500,7 @@ constexpr size_t DEFAULT_MAX_DEPTH = 1024; #ifndef SIMDJSON_HAS_STRING_VIEW SIMDJSON_PUSH_DISABLE_ALL_WARNINGS /* begin file include/simdjson/nonstd/string_view.hpp */ -// Copyright 2017-2019 by Martin Moene +// Copyright 2017-2020 by Martin Moene // // string-view lite, a C++17-like string_view for C++98 and later. 
// For more information see https://github.com/martinmoene/string-view-lite @@ -512,7 +514,7 @@ SIMDJSON_PUSH_DISABLE_ALL_WARNINGS #define NONSTD_SV_LITE_H_INCLUDED #define string_view_lite_MAJOR 1 -#define string_view_lite_MINOR 4 +#define string_view_lite_MINOR 6 #define string_view_lite_PATCH 0 #define string_view_lite_VERSION nssv_STRINGIFY(string_view_lite_MAJOR) "." nssv_STRINGIFY(string_view_lite_MINOR) "." nssv_STRINGIFY(string_view_lite_PATCH) @@ -526,12 +528,22 @@ SIMDJSON_PUSH_DISABLE_ALL_WARNINGS #define nssv_STRING_VIEW_NONSTD 1 #define nssv_STRING_VIEW_STD 2 -#if !defined( nssv_CONFIG_SELECT_STRING_VIEW ) -# define nssv_CONFIG_SELECT_STRING_VIEW ( nssv_HAVE_STD_STRING_VIEW ? nssv_STRING_VIEW_STD : nssv_STRING_VIEW_NONSTD ) +// tweak header support: + +#ifdef __has_include +# if __has_include() +# include +# endif +#define nssv_HAVE_TWEAK_HEADER 1 +#else +#define nssv_HAVE_TWEAK_HEADER 0 +//# pragma message("string_view.hpp: Note: Tweak header not supported.") #endif -#if defined( nssv_CONFIG_SELECT_STD_STRING_VIEW ) || defined( nssv_CONFIG_SELECT_NONSTD_STRING_VIEW ) -# error nssv_CONFIG_SELECT_STD_STRING_VIEW and nssv_CONFIG_SELECT_NONSTD_STRING_VIEW are deprecated and removed, please use nssv_CONFIG_SELECT_STRING_VIEW=nssv_STRING_VIEW_... +// string_view selection and configuration: + +#if !defined( nssv_CONFIG_SELECT_STRING_VIEW ) +# define nssv_CONFIG_SELECT_STRING_VIEW ( nssv_HAVE_STD_STRING_VIEW ? 
nssv_STRING_VIEW_STD : nssv_STRING_VIEW_NONSTD ) #endif #ifndef nssv_CONFIG_STD_SV_OPERATOR @@ -555,10 +567,17 @@ SIMDJSON_PUSH_DISABLE_ALL_WARNINGS # define nssv_CONFIG_CONVERSION_STD_STRING_FREE_FUNCTIONS 1 #endif +#ifndef nssv_CONFIG_NO_STREAM_INSERTION +# define nssv_CONFIG_NO_STREAM_INSERTION 0 +#endif + // Control presence of exception handling (try and auto discover): #ifndef nssv_CONFIG_NO_EXCEPTIONS -# if defined(__cpp_exceptions) || defined(__EXCEPTIONS) || defined(_CPPUNWIND) +# if _MSC_VER +# include // for _HAS_EXCEPTIONS +# endif +# if defined(__cpp_exceptions) || defined(__EXCEPTIONS) || (_HAS_EXCEPTIONS) # define nssv_CONFIG_NO_EXCEPTIONS 0 # else # define nssv_CONFIG_NO_EXCEPTIONS 1 @@ -721,16 +740,21 @@ using std::operator<<; #define nssv_COMPILER_VERSION( major, minor, patch ) ( 10 * ( 10 * (major) + (minor) ) + (patch) ) -#if defined(__clang__) -# define nssv_COMPILER_CLANG_VERSION nssv_COMPILER_VERSION(__clang_major__, __clang_minor__, __clang_patchlevel__) +#if defined( __apple_build_version__ ) +# define nssv_COMPILER_APPLECLANG_VERSION nssv_COMPILER_VERSION(__clang_major__, __clang_minor__, __clang_patchlevel__) +# define nssv_COMPILER_CLANG_VERSION 0 +#elif defined( __clang__ ) +# define nssv_COMPILER_APPLECLANG_VERSION 0 +# define nssv_COMPILER_CLANG_VERSION nssv_COMPILER_VERSION(__clang_major__, __clang_minor__, __clang_patchlevel__) #else -# define nssv_COMPILER_CLANG_VERSION 0 +# define nssv_COMPILER_APPLECLANG_VERSION 0 +# define nssv_COMPILER_CLANG_VERSION 0 #endif #if defined(__GNUC__) && !defined(__clang__) # define nssv_COMPILER_GNUC_VERSION nssv_COMPILER_VERSION(__GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__) #else -# define nssv_COMPILER_GNUC_VERSION 0 +# define nssv_COMPILER_GNUC_VERSION 0 #endif // half-open range [lo..hi): @@ -792,6 +816,45 @@ using std::operator<<; #define nssv_HAVE_STD_HASH nssv_CPP11_120 +// Presence of compiler intrinsics: + +// Providing char-type specializations for compare() and length() that +// use 
compiler intrinsics can improve compile- and run-time performance. +// +// The challenge is in using the right combinations of builtin availability +// and its constexpr-ness. +// +// | compiler | __builtin_memcmp (constexpr) | memcmp (constexpr) | +// |----------|------------------------------|---------------------| +// | clang | 4.0 (>= 4.0 ) | any (? ) | +// | clang-a | 9.0 (>= 9.0 ) | any (? ) | +// | gcc | any (constexpr) | any (? ) | +// | msvc | >= 14.2 C++17 (>= 14.2 ) | any (? ) | + +#define nssv_HAVE_BUILTIN_VER ( (nssv_CPP17_000 && nssv_COMPILER_MSVC_VERSION >= 142) || nssv_COMPILER_GNUC_VERSION > 0 || nssv_COMPILER_CLANG_VERSION >= 400 || nssv_COMPILER_APPLECLANG_VERSION >= 900 ) +#define nssv_HAVE_BUILTIN_CE ( nssv_HAVE_BUILTIN_VER ) + +#define nssv_HAVE_BUILTIN_MEMCMP ( (nssv_HAVE_CONSTEXPR_14 && nssv_HAVE_BUILTIN_CE) || !nssv_HAVE_CONSTEXPR_14 ) +#define nssv_HAVE_BUILTIN_STRLEN ( (nssv_HAVE_CONSTEXPR_11 && nssv_HAVE_BUILTIN_CE) || !nssv_HAVE_CONSTEXPR_11 ) + +#ifdef __has_builtin +# define nssv_HAVE_BUILTIN( x ) __has_builtin( x ) +#else +# define nssv_HAVE_BUILTIN( x ) 0 +#endif + +#if nssv_HAVE_BUILTIN(__builtin_memcmp) || nssv_HAVE_BUILTIN_VER +# define nssv_BUILTIN_MEMCMP __builtin_memcmp +#else +# define nssv_BUILTIN_MEMCMP memcmp +#endif + +#if nssv_HAVE_BUILTIN(__builtin_strlen) || nssv_HAVE_BUILTIN_VER +# define nssv_BUILTIN_STRLEN __builtin_strlen +#else +# define nssv_BUILTIN_STRLEN strlen +#endif + // C++ feature usage: #if nssv_HAVE_CONSTEXPR_11 @@ -850,9 +913,12 @@ using std::operator<<; #include #include #include -#include #include // std::char_traits<> +#if ! nssv_CONFIG_NO_STREAM_INSERTION +# include +#endif + #if ! 
nssv_CONFIG_NO_EXCEPTIONS # include #endif @@ -905,40 +971,75 @@ nssv_DISABLE_MSVC_WARNINGS( 4455 26481 26472 ) namespace nonstd { namespace sv_lite { -#if nssv_CPP11_OR_GREATER - namespace detail { -#if nssv_CPP14_OR_GREATER +// support constexpr comparison in C++14; +// for C++17 and later, use provided traits: template< typename CharT > -inline constexpr std::size_t length( CharT * s, std::size_t result = 0 ) +inline nssv_constexpr14 int compare( CharT const * s1, CharT const * s2, std::size_t count ) { - CharT * v = s; - std::size_t r = result; - while ( *v != '\0' ) { - ++v; - ++r; + while ( count-- != 0 ) + { + if ( *s1 < *s2 ) return -1; + if ( *s1 > *s2 ) return +1; + ++s1; ++s2; } - return r; + return 0; } -#else // nssv_CPP14_OR_GREATER +#if nssv_HAVE_BUILTIN_MEMCMP +// specialization of compare() for char, see also generic compare() above: + +inline nssv_constexpr14 int compare( char const * s1, char const * s2, std::size_t count ) +{ + return nssv_BUILTIN_MEMCMP( s1, s2, count ); +} + +#endif + +#if nssv_HAVE_BUILTIN_STRLEN + +// specialization of length() for char, see also generic length() further below: + +inline nssv_constexpr std::size_t length( char const * s ) +{ + return nssv_BUILTIN_STRLEN( s ); +} + +#endif + +#if defined(__OPTIMIZE__) + +// gcc, clang provide __OPTIMIZE__ // Expect tail call optimization to make length() non-recursive: template< typename CharT > -inline constexpr std::size_t length( CharT * s, std::size_t result = 0 ) +inline nssv_constexpr std::size_t length( CharT * s, std::size_t result = 0 ) { return *s == '\0' ? 
result : length( s + 1, result + 1 ); } -#endif // nssv_CPP14_OR_GREATER +#else // OPTIMIZE + +// non-recursive: + +template< typename CharT > +inline nssv_constexpr14 std::size_t length( CharT * s ) +{ + std::size_t result = 0; + while ( *s++ != '\0' ) + { + ++result; + } + return result; +} + +#endif // OPTIMIZE } // namespace detail -#endif // nssv_CPP11_OR_GREATER - template < class CharT, @@ -1089,9 +1190,9 @@ public: nssv_constexpr14 void swap( basic_string_view & other ) nssv_noexcept { - using std::swap; - swap( data_, other.data_ ); - swap( size_, other.size_ ); + const basic_string_view tmp(other); + other = *this; + *this = tmp; } // 24.4.2.6 String operations: @@ -1130,7 +1231,11 @@ public: nssv_constexpr14 int compare( basic_string_view other ) const nssv_noexcept // (1) { +#if nssv_CPP17_OR_GREATER if ( const int result = Traits::compare( data(), other.data(), (std::min)( size(), other.size() ) ) ) +#else + if ( const int result = detail::compare( data(), other.data(), (std::min)( size(), other.size() ) ) ) +#endif { return result; } @@ -1374,7 +1479,7 @@ private: { const basic_string_view v; - nssv_constexpr explicit not_in_view( basic_string_view v ) : v( v ) {} + nssv_constexpr explicit not_in_view( basic_string_view v_ ) : v( v_ ) {} nssv_constexpr bool operator()( CharT c ) const { @@ -1464,37 +1569,37 @@ template< class CharT, class Traits > nssv_constexpr bool operator== ( basic_string_view lhs, basic_string_view rhs ) nssv_noexcept -{ return lhs.compare( rhs ) == 0 ; } +{ return lhs.size() == rhs.size() && lhs.compare( rhs ) == 0; } template< class CharT, class Traits > nssv_constexpr bool operator!= ( basic_string_view lhs, basic_string_view rhs ) nssv_noexcept -{ return lhs.compare( rhs ) != 0 ; } +{ return !( lhs == rhs ); } template< class CharT, class Traits > nssv_constexpr bool operator< ( basic_string_view lhs, basic_string_view rhs ) nssv_noexcept -{ return lhs.compare( rhs ) < 0 ; } +{ return lhs.compare( rhs ) < 0; } template< class 
CharT, class Traits > nssv_constexpr bool operator<= ( basic_string_view lhs, basic_string_view rhs ) nssv_noexcept -{ return lhs.compare( rhs ) <= 0 ; } +{ return lhs.compare( rhs ) <= 0; } template< class CharT, class Traits > nssv_constexpr bool operator> ( basic_string_view lhs, basic_string_view rhs ) nssv_noexcept -{ return lhs.compare( rhs ) > 0 ; } +{ return lhs.compare( rhs ) > 0; } template< class CharT, class Traits > nssv_constexpr bool operator>= ( basic_string_view lhs, basic_string_view rhs ) nssv_noexcept -{ return lhs.compare( rhs ) >= 0 ; } +{ return lhs.compare( rhs ) >= 0; } // Let S be basic_string_view, and sv be an instance of S. // Implementations shall provide sufficient additional overloads marked @@ -1503,21 +1608,21 @@ nssv_constexpr bool operator>= ( #if ! nssv_CPP11_OR_GREATER || nssv_BETWEEN( nssv_COMPILER_MSVC_VERSION, 100, 141 ) -// accomodate for older compilers: +// accommodate for older compilers: // == template< class CharT, class Traits> nssv_constexpr bool operator==( basic_string_view lhs, - char const * rhs ) nssv_noexcept -{ return lhs.compare( rhs ) == 0; } + CharT const * rhs ) nssv_noexcept +{ return lhs.size() == detail::length( rhs ) && lhs.compare( rhs ) == 0; } template< class CharT, class Traits> nssv_constexpr bool operator==( - char const * lhs, + CharT const * lhs, basic_string_view rhs ) nssv_noexcept -{ return rhs.compare( lhs ) == 0; } +{ return detail::length( lhs ) == rhs.size() && rhs.compare( lhs ) == 0; } template< class CharT, class Traits> nssv_constexpr bool operator==( @@ -1536,38 +1641,38 @@ nssv_constexpr bool operator==( template< class CharT, class Traits> nssv_constexpr bool operator!=( basic_string_view lhs, - char const * rhs ) nssv_noexcept -{ return lhs.compare( rhs ) != 0; } + CharT const * rhs ) nssv_noexcept +{ return !( lhs == rhs ); } template< class CharT, class Traits> nssv_constexpr bool operator!=( - char const * lhs, + CharT const * lhs, basic_string_view rhs ) nssv_noexcept -{ 
return rhs.compare( lhs ) != 0; } +{ return !( lhs == rhs ); } template< class CharT, class Traits> nssv_constexpr bool operator!=( basic_string_view lhs, std::basic_string rhs ) nssv_noexcept -{ return lhs.size() != rhs.size() && lhs.compare( rhs ) != 0; } +{ return !( lhs == rhs ); } template< class CharT, class Traits> nssv_constexpr bool operator!=( std::basic_string rhs, basic_string_view lhs ) nssv_noexcept -{ return lhs.size() != rhs.size() || rhs.compare( lhs ) != 0; } +{ return !( lhs == rhs ); } // < template< class CharT, class Traits> nssv_constexpr bool operator<( basic_string_view lhs, - char const * rhs ) nssv_noexcept + CharT const * rhs ) nssv_noexcept { return lhs.compare( rhs ) < 0; } template< class CharT, class Traits> nssv_constexpr bool operator<( - char const * lhs, + CharT const * lhs, basic_string_view rhs ) nssv_noexcept { return rhs.compare( lhs ) > 0; } @@ -1588,12 +1693,12 @@ nssv_constexpr bool operator<( template< class CharT, class Traits> nssv_constexpr bool operator<=( basic_string_view lhs, - char const * rhs ) nssv_noexcept + CharT const * rhs ) nssv_noexcept { return lhs.compare( rhs ) <= 0; } template< class CharT, class Traits> nssv_constexpr bool operator<=( - char const * lhs, + CharT const * lhs, basic_string_view rhs ) nssv_noexcept { return rhs.compare( lhs ) >= 0; } @@ -1614,12 +1719,12 @@ nssv_constexpr bool operator<=( template< class CharT, class Traits> nssv_constexpr bool operator>( basic_string_view lhs, - char const * rhs ) nssv_noexcept + CharT const * rhs ) nssv_noexcept { return lhs.compare( rhs ) > 0; } template< class CharT, class Traits> nssv_constexpr bool operator>( - char const * lhs, + CharT const * lhs, basic_string_view rhs ) nssv_noexcept { return rhs.compare( lhs ) < 0; } @@ -1640,12 +1745,12 @@ nssv_constexpr bool operator>( template< class CharT, class Traits> nssv_constexpr bool operator>=( basic_string_view lhs, - char const * rhs ) nssv_noexcept + CharT const * rhs ) nssv_noexcept { return 
lhs.compare( rhs ) >= 0; } template< class CharT, class Traits> nssv_constexpr bool operator>=( - char const * lhs, + CharT const * lhs, basic_string_view rhs ) nssv_noexcept { return rhs.compare( lhs ) <= 0; } @@ -1665,7 +1770,7 @@ nssv_constexpr bool operator>=( #define nssv_BASIC_STRING_VIEW_I(T,U) typename std::decay< basic_string_view >::type -#if nssv_BETWEEN( nssv_COMPILER_MSVC_VERSION, 140, 150 ) +#if defined(_MSC_VER) // issue 40 # define nssv_MSVC_ORDER(x) , int=x #else # define nssv_MSVC_ORDER(x) /*, int=x*/ @@ -1677,7 +1782,7 @@ template< class CharT, class Traits nssv_MSVC_ORDER(1) > nssv_constexpr bool operator==( basic_string_view lhs, nssv_BASIC_STRING_VIEW_I(CharT, Traits) rhs ) nssv_noexcept -{ return lhs.compare( rhs ) == 0; } +{ return lhs.size() == rhs.size() && lhs.compare( rhs ) == 0; } template< class CharT, class Traits nssv_MSVC_ORDER(2) > nssv_constexpr bool operator==( @@ -1691,13 +1796,13 @@ template< class CharT, class Traits nssv_MSVC_ORDER(1) > nssv_constexpr bool operator!= ( basic_string_view < CharT, Traits > lhs, nssv_BASIC_STRING_VIEW_I( CharT, Traits ) rhs ) nssv_noexcept -{ return lhs.size() != rhs.size() || lhs.compare( rhs ) != 0 ; } +{ return !( lhs == rhs ); } template< class CharT, class Traits nssv_MSVC_ORDER(2) > nssv_constexpr bool operator!= ( nssv_BASIC_STRING_VIEW_I( CharT, Traits ) lhs, basic_string_view < CharT, Traits > rhs ) nssv_noexcept -{ return lhs.compare( rhs ) != 0 ; } +{ return !( lhs == rhs ); } // < @@ -1705,13 +1810,13 @@ template< class CharT, class Traits nssv_MSVC_ORDER(1) > nssv_constexpr bool operator< ( basic_string_view < CharT, Traits > lhs, nssv_BASIC_STRING_VIEW_I( CharT, Traits ) rhs ) nssv_noexcept -{ return lhs.compare( rhs ) < 0 ; } +{ return lhs.compare( rhs ) < 0; } template< class CharT, class Traits nssv_MSVC_ORDER(2) > nssv_constexpr bool operator< ( nssv_BASIC_STRING_VIEW_I( CharT, Traits ) lhs, basic_string_view < CharT, Traits > rhs ) nssv_noexcept -{ return lhs.compare( rhs ) < 
0 ; } +{ return lhs.compare( rhs ) < 0; } // <= @@ -1719,13 +1824,13 @@ template< class CharT, class Traits nssv_MSVC_ORDER(1) > nssv_constexpr bool operator<= ( basic_string_view < CharT, Traits > lhs, nssv_BASIC_STRING_VIEW_I( CharT, Traits ) rhs ) nssv_noexcept -{ return lhs.compare( rhs ) <= 0 ; } +{ return lhs.compare( rhs ) <= 0; } template< class CharT, class Traits nssv_MSVC_ORDER(2) > nssv_constexpr bool operator<= ( nssv_BASIC_STRING_VIEW_I( CharT, Traits ) lhs, basic_string_view < CharT, Traits > rhs ) nssv_noexcept -{ return lhs.compare( rhs ) <= 0 ; } +{ return lhs.compare( rhs ) <= 0; } // > @@ -1733,13 +1838,13 @@ template< class CharT, class Traits nssv_MSVC_ORDER(1) > nssv_constexpr bool operator> ( basic_string_view < CharT, Traits > lhs, nssv_BASIC_STRING_VIEW_I( CharT, Traits ) rhs ) nssv_noexcept -{ return lhs.compare( rhs ) > 0 ; } +{ return lhs.compare( rhs ) > 0; } template< class CharT, class Traits nssv_MSVC_ORDER(2) > nssv_constexpr bool operator> ( nssv_BASIC_STRING_VIEW_I( CharT, Traits ) lhs, basic_string_view < CharT, Traits > rhs ) nssv_noexcept -{ return lhs.compare( rhs ) > 0 ; } +{ return lhs.compare( rhs ) > 0; } // >= @@ -1747,13 +1852,13 @@ template< class CharT, class Traits nssv_MSVC_ORDER(1) > nssv_constexpr bool operator>= ( basic_string_view < CharT, Traits > lhs, nssv_BASIC_STRING_VIEW_I( CharT, Traits ) rhs ) nssv_noexcept -{ return lhs.compare( rhs ) >= 0 ; } +{ return lhs.compare( rhs ) >= 0; } template< class CharT, class Traits nssv_MSVC_ORDER(2) > nssv_constexpr bool operator>= ( nssv_BASIC_STRING_VIEW_I( CharT, Traits ) lhs, basic_string_view < CharT, Traits > rhs ) nssv_noexcept -{ return lhs.compare( rhs ) >= 0 ; } +{ return lhs.compare( rhs ) >= 0; } #undef nssv_MSVC_ORDER #undef nssv_BASIC_STRING_VIEW_I @@ -1762,6 +1867,8 @@ nssv_constexpr bool operator>= ( // 24.4.4 Inserters and extractors: +#if ! 
nssv_CONFIG_NO_STREAM_INSERTION + namespace detail { template< class Stream > @@ -1811,6 +1918,8 @@ operator<<( return detail::write_to_stream( os, sv ); } +#endif // nssv_CONFIG_NO_STREAM_INSERTION + // Several typedefs for common character types are provided: typedef basic_string_view string_view; @@ -1959,7 +2068,9 @@ using sv_lite::operator<=; using sv_lite::operator>; using sv_lite::operator>=; +#if ! nssv_CONFIG_NO_STREAM_INSERTION using sv_lite::operator<<; +#endif #if nssv_CONFIG_CONVERSION_STD_STRING_FREE_FUNCTIONS using sv_lite::to_string; @@ -2045,6 +2156,25 @@ namespace std { #endif #endif +// The SIMDJSON_CHECK_EOF macro is a feature flag for the "don't require padding" +// feature. + +#if SIMDJSON_CPLUSPLUS17 +// if we have C++, then fallthrough is a default attribute +# define simdjson_fallthrough [[fallthrough]] +// check if we have __attribute__ support +#elif defined(__has_attribute) +// check if we have the __fallthrough__ attribute +#if __has_attribute(__fallthrough__) +// we are good to go: +# define simdjson_fallthrough __attribute__((__fallthrough__)) +#endif // __has_attribute(__fallthrough__) +#endif // SIMDJSON_CPLUSPLUS17 +// on some systems, we simply do not have support for fallthrough, so use a default: +#ifndef simdjson_fallthrough +# define simdjson_fallthrough do {} while (0) /* fallthrough */ +#endif // simdjson_fallthrough + #endif // SIMDJSON_COMMON_DEFS_H /* end file include/simdjson/common_defs.h */ @@ -2092,33 +2222,34 @@ namespace simdjson { * All possible errors returned by simdjson. 
*/ enum error_code { - SUCCESS = 0, ///< No error - CAPACITY, ///< This parser can't support a document that big - MEMALLOC, ///< Error allocating memory, most likely out of memory - TAPE_ERROR, ///< Something went wrong while writing to the tape (stage 2), this is a generic error - DEPTH_ERROR, ///< Your document exceeds the user-specified depth limitation - STRING_ERROR, ///< Problem while parsing a string - T_ATOM_ERROR, ///< Problem while parsing an atom starting with the letter 't' - F_ATOM_ERROR, ///< Problem while parsing an atom starting with the letter 'f' - N_ATOM_ERROR, ///< Problem while parsing an atom starting with the letter 'n' - NUMBER_ERROR, ///< Problem while parsing a number - UTF8_ERROR, ///< the input is not valid UTF-8 - UNINITIALIZED, ///< unknown error, or uninitialized document - EMPTY, ///< no structural element found - UNESCAPED_CHARS, ///< found unescaped characters in a string. - UNCLOSED_STRING, ///< missing quote at the end - UNSUPPORTED_ARCHITECTURE, ///< unsupported architecture - INCORRECT_TYPE, ///< JSON element has a different type than user expected - NUMBER_OUT_OF_RANGE, ///< JSON number does not fit in 64 bits - INDEX_OUT_OF_BOUNDS, ///< JSON array index too large - NO_SUCH_FIELD, ///< JSON field not found in object - IO_ERROR, ///< Error reading a file - INVALID_JSON_POINTER, ///< Invalid JSON pointer reference - INVALID_URI_FRAGMENT, ///< Invalid URI fragment - UNEXPECTED_ERROR, ///< indicative of a bug in simdjson - PARSER_IN_USE, ///< parser is already in use. - OUT_OF_ORDER_ITERATION, ///< tried to iterate an array or object out of order - INSUFFICIENT_PADDING, ///< The JSON doesn't have enough padding for simdjson to safely parse it. 
+ SUCCESS = 0, ///< No error + CAPACITY, ///< This parser can't support a document that big + MEMALLOC, ///< Error allocating memory, most likely out of memory + TAPE_ERROR, ///< Something went wrong while writing to the tape (stage 2), this is a generic error + DEPTH_ERROR, ///< Your document exceeds the user-specified depth limitation + STRING_ERROR, ///< Problem while parsing a string + T_ATOM_ERROR, ///< Problem while parsing an atom starting with the letter 't' + F_ATOM_ERROR, ///< Problem while parsing an atom starting with the letter 'f' + N_ATOM_ERROR, ///< Problem while parsing an atom starting with the letter 'n' + NUMBER_ERROR, ///< Problem while parsing a number + UTF8_ERROR, ///< the input is not valid UTF-8 + UNINITIALIZED, ///< unknown error, or uninitialized document + EMPTY, ///< no structural element found + UNESCAPED_CHARS, ///< found unescaped characters in a string. + UNCLOSED_STRING, ///< missing quote at the end + UNSUPPORTED_ARCHITECTURE, ///< unsupported architecture + INCORRECT_TYPE, ///< JSON element has a different type than user expected + NUMBER_OUT_OF_RANGE, ///< JSON number does not fit in 64 bits + INDEX_OUT_OF_BOUNDS, ///< JSON array index too large + NO_SUCH_FIELD, ///< JSON field not found in object + IO_ERROR, ///< Error reading a file + INVALID_JSON_POINTER, ///< Invalid JSON pointer reference + INVALID_URI_FRAGMENT, ///< Invalid URI fragment + UNEXPECTED_ERROR, ///< indicative of a bug in simdjson + PARSER_IN_USE, ///< parser is already in use. + OUT_OF_ORDER_ITERATION, ///< tried to iterate an array or object out of order + INSUFFICIENT_PADDING, ///< The JSON doesn't have enough padding for simdjson to safely parse it. + INCOMPLETE_ARRAY_OR_OBJECT, ///< The document ends early. 
NUM_ERROR_CODES }; @@ -2368,8 +2499,7 @@ struct simdjson_result : public internal::simdjson_result_base { #if SIMDJSON_EXCEPTIONS template -inline std::ostream& operator<<(std::ostream& out, simdjson_result value) noexcept { return out << value.value(); } - +inline std::ostream& operator<<(std::ostream& out, simdjson_result value) { return out << value.value(); } #endif // SIMDJSON_EXCEPTIONS #ifndef SIMDJSON_DISABLE_DEPRECATED_API @@ -2503,7 +2633,7 @@ struct padded_string final { * * @param path the path to the file. **/ - inline static simdjson_result load(const std::string &path) noexcept; + inline static simdjson_result load(std::string_view path) noexcept; private: padded_string &operator=(const padded_string &o) = delete; @@ -2679,8 +2809,30 @@ namespace dom { class document; } // namespace dom +/** +* This enum is used with the dom_parser_implementation::stage1 function. +* 1) The regular mode expects a fully formed JSON document. +* 2) The streaming_partial mode expects a possibly truncated +* input within a stream on JSON documents. +* 3) The stream_final mode allows us to truncate final +* unterminated strings. It is useful in conjunction with streaming_partial. +*/ +enum class stage1_mode { regular, streaming_partial, streaming_final}; + +/** + * Returns true if mode == streaming_partial or mode == streaming_final + */ +inline bool is_streaming(stage1_mode mode) { + // performance note: it is probably faster to check that mode is different + // from regular than checking that it is either streaming_partial or streaming_final. + return (mode != stage1_mode::regular); + // return (mode == stage1_mode::streaming_partial || mode == stage1_mode::streaming_final); +} + + namespace internal { + /** * An implementation of simdjson's DOM parser for a particular CPU architecture. * @@ -2719,7 +2871,7 @@ public: * @param streaming Whether this is being called by parser::parse_many. * @return The error code, or SUCCESS if there was no error. 
*/ - simdjson_warn_unused virtual error_code stage1(const uint8_t *buf, size_t len, bool streaming) noexcept = 0; + simdjson_warn_unused virtual error_code stage1(const uint8_t *buf, size_t len, stage1_mode streaming) noexcept = 0; /** * @private For internal implementation use @@ -3596,11 +3748,11 @@ inline padded_string::operator padded_string_view() const noexcept { return padded_string_view(data(), length(), length() + SIMDJSON_PADDING); } -inline simdjson_result padded_string::load(const std::string &filename) noexcept { +inline simdjson_result padded_string::load(std::string_view filename) noexcept { // Open the file SIMDJSON_PUSH_DISABLE_WARNINGS SIMDJSON_DISABLE_DEPRECATED_WARNING // Disable CRT_SECURE warning on MSVC: manually verified this is safe - std::FILE *fp = std::fopen(filename.c_str(), "rb"); + std::FILE *fp = std::fopen(filename.data(), "rb"); SIMDJSON_POP_DISABLE_WARNINGS if (fp == nullptr) { @@ -4392,7 +4544,7 @@ public: * - other json errors if parsing fails. You should not rely on these errors to always the same for the * same document: they may vary under runtime dispatch (so they may vary depending on your system and hardware). */ - inline simdjson_result load_many(const std::string &path, size_t batch_size = DEFAULT_BATCH_SIZE) noexcept; + inline simdjson_result load_many(const std::string &path, size_t batch_size = dom::DEFAULT_BATCH_SIZE) noexcept; /** * Parse a buffer containing many JSON documents. @@ -4486,18 +4638,18 @@ public: * - other json errors if parsing fails. You should not rely on these errors to always the same for the * same document: they may vary under runtime dispatch (so they may vary depending on your system and hardware). 
*/ - inline simdjson_result parse_many(const uint8_t *buf, size_t len, size_t batch_size = DEFAULT_BATCH_SIZE) noexcept; + inline simdjson_result parse_many(const uint8_t *buf, size_t len, size_t batch_size = dom::DEFAULT_BATCH_SIZE) noexcept; /** @overload parse_many(const uint8_t *buf, size_t len, size_t batch_size) */ - inline simdjson_result parse_many(const char *buf, size_t len, size_t batch_size = DEFAULT_BATCH_SIZE) noexcept; + inline simdjson_result parse_many(const char *buf, size_t len, size_t batch_size = dom::DEFAULT_BATCH_SIZE) noexcept; /** @overload parse_many(const uint8_t *buf, size_t len, size_t batch_size) */ - inline simdjson_result parse_many(const std::string &s, size_t batch_size = DEFAULT_BATCH_SIZE) noexcept; + inline simdjson_result parse_many(const std::string &s, size_t batch_size = dom::DEFAULT_BATCH_SIZE) noexcept; inline simdjson_result parse_many(const std::string &&s, size_t batch_size) = delete;// unsafe /** @overload parse_many(const uint8_t *buf, size_t len, size_t batch_size) */ - inline simdjson_result parse_many(const padded_string &s, size_t batch_size = DEFAULT_BATCH_SIZE) noexcept; + inline simdjson_result parse_many(const padded_string &s, size_t batch_size = dom::DEFAULT_BATCH_SIZE) noexcept; inline simdjson_result parse_many(const padded_string &&s, size_t batch_size) = delete;// unsafe /** @private We do not want to allow implicit conversion from C string to std::string. 
*/ - simdjson_result parse_many(const char *buf, size_t batch_size = DEFAULT_BATCH_SIZE) noexcept = delete; + simdjson_result parse_many(const char *buf, size_t batch_size = dom::DEFAULT_BATCH_SIZE) noexcept = delete; /** * Ensure this parser has enough memory to process JSON documents up to `capacity` bytes in length @@ -4593,7 +4745,7 @@ public: /** * @private return an error code corresponding to the last parsing attempt, see - * simdjson.h will return UNITIALIZED if no parsing was attempted + * simdjson.h will return UNINITIALIZED if no parsing was attempted */ [[deprecated("Use the result of parser.parse() instead")]] inline int get_error_code() const noexcept; @@ -4750,7 +4902,29 @@ public: simdjson_really_inline document_stream &operator=(document_stream &&other) noexcept = default; simdjson_really_inline ~document_stream() noexcept; - + /** + * Returns the input size in bytes. + */ + inline size_t size_in_bytes() const noexcept; + /** + * After iterating through the stream, this method + * returns the number of bytes that were not parsed at the end + * of the stream. If truncated_bytes() differs from zero, + * then the input was truncated maybe because incomplete JSON + * documents were found at the end of the stream. You + * may need to process the bytes in the interval [size_in_bytes()-truncated_bytes(), size_in_bytes()). + * + * You should only call truncated_bytes() after streaming through all + * documents, like so: + * + * document_stream stream = parser.parse_many(json,window); + * for(auto doc : stream) { + * // do something with doc + * } + * size_t truncated = stream.truncated_bytes(); + * + */ + inline size_t truncated_bytes() const noexcept; /** * An iterator through a forward-only stream of documents. */ @@ -4764,7 +4938,7 @@ public: using iterator_category = std::input_iterator_tag; /** - * Default contructor. + * Default constructor. 
*/ simdjson_really_inline iterator() noexcept; /** @@ -4908,7 +5082,6 @@ private: error_code error; size_t batch_start{0}; size_t doc_index{}; - #ifdef SIMDJSON_THREADS_ENABLED /** Indicates whether we use threads. Note that this needs to be a constant during the execution of the parsing. */ bool use_thread; @@ -5320,7 +5493,7 @@ public: * The key will be matched against **unescaped** JSON: * * dom::parser parser; - * parser.parse(R"({ "a\n": 1 })"_padded)["a\n"].get_uint64().first == 1 + * int64_t(parser.parse(R"({ "a\n": 1 })"_padded)["a\n"]) == 1 * parser.parse(R"({ "a\n": 1 })"_padded)["a\\n"].get_uint64().error() == NO_SUCH_FIELD * * @return The value associated with this field, or: @@ -5335,7 +5508,7 @@ public: * The key will be matched against **unescaped** JSON: * * dom::parser parser; - * parser.parse(R"({ "a\n": 1 })"_padded)["a\n"].get_uint64().first == 1 + * int64_t(parser.parse(R"({ "a\n": 1 })"_padded)["a\n"]) == 1 * parser.parse(R"({ "a\n": 1 })"_padded)["a\\n"].get_uint64().error() == NO_SUCH_FIELD * * @return The value associated with this field, or: @@ -5407,7 +5580,7 @@ public: * The key will be matched against **unescaped** JSON: * * dom::parser parser; - * parser.parse(R"({ "a\n": 1 })"_padded)["a\n"].get_uint64().first == 1 + * int64_t(parser.parse(R"({ "a\n": 1 })"_padded)["a\n"]) == 1 * parser.parse(R"({ "a\n": 1 })"_padded)["a\\n"].get_uint64().error() == NO_SUCH_FIELD * * @return The value associated with this field, or: @@ -5630,7 +5803,7 @@ public: * The key will be matched against **unescaped** JSON: * * dom::parser parser; - * parser.parse(R"({ "a\n": 1 })"_padded)["a\n"].get_uint64().first == 1 + * int64_t(parser.parse(R"({ "a\n": 1 })"_padded)["a\n"]) == 1 * parser.parse(R"({ "a\n": 1 })"_padded)["a\\n"].get_uint64().error() == NO_SUCH_FIELD * * This function has linear-time complexity: the keys are checked one by one. 
@@ -5647,7 +5820,7 @@ public: * The key will be matched against **unescaped** JSON: * * dom::parser parser; - * parser.parse(R"({ "a\n": 1 })"_padded)["a\n"].get_uint64().first == 1 + * int64_t(parser.parse(R"({ "a\n": 1 })"_padded)["a\n"]) == 1 * parser.parse(R"({ "a\n": 1 })"_padded)["a\\n"].get_uint64().error() == NO_SUCH_FIELD * * This function has linear-time complexity: the keys are checked one by one. @@ -5689,7 +5862,7 @@ public: * The key will be matched against **unescaped** JSON: * * dom::parser parser; - * parser.parse(R"({ "a\n": 1 })"_padded)["a\n"].get_uint64().first == 1 + * int64_t(parser.parse(R"({ "a\n": 1 })"_padded)["a\n"]) == 1 * parser.parse(R"({ "a\n": 1 })"_padded)["a\\n"].get_uint64().error() == NO_SUCH_FIELD * * This function has linear-time complexity: the keys are checked one by one. @@ -5813,7 +5986,7 @@ public: inline void append(simdjson::dom::element value); /** Append an array to the builder (to be printed) **/ inline void append(simdjson::dom::array value); - /** Append an objet to the builder (to be printed) **/ + /** Append an object to the builder (to be printed) **/ inline void append(simdjson::dom::object value); /** Reset the builder (so that it would print the empty string) **/ simdjson_really_inline void clear(); @@ -5868,7 +6041,7 @@ public: /** Clears out the content. **/ simdjson_really_inline void clear(); /** - * Get access to the buffer, it is own by the instance, but + * Get access to the buffer, it is owned by the instance, but * the user can make a copy. 
**/ simdjson_really_inline std::string_view str() const; @@ -5925,7 +6098,7 @@ inline std::ostream& operator<<(std::ostream& out, simdjson::simdjson_result= len) { return; } @@ -7292,7 +7465,6 @@ inline void document_stream::start() noexcept { if (error) { return; } } #endif // SIMDJSON_THREADS_ENABLED - next(); } @@ -7301,13 +7473,20 @@ simdjson_really_inline size_t document_stream::iterator::current_index() const n } simdjson_really_inline std::string_view document_stream::iterator::source() const noexcept { - size_t next_doc_index = stream->batch_start + stream->parser->implementation->structural_indexes[stream->parser->implementation->next_structural_index]; - return std::string_view(reinterpret_cast(stream->buf) + current_index(), next_doc_index - current_index() - 1); + const char* start = reinterpret_cast(stream->buf) + current_index(); + bool object_or_array = ((*start == '[') || (*start == '{')); + if(object_or_array) { + size_t next_doc_index = stream->batch_start + stream->parser->implementation->structural_indexes[stream->parser->implementation->next_structural_index - 1]; + return std::string_view(start, next_doc_index - current_index() + 1); + } else { + size_t next_doc_index = stream->batch_start + stream->parser->implementation->structural_indexes[stream->parser->implementation->next_structural_index]; + return std::string_view(reinterpret_cast(stream->buf) + current_index(), next_doc_index - current_index() - 1); + } } inline void document_stream::next() noexcept { - // We always enter at once once in an error condition. + // We always exit at once, once in an error condition. 
if (error) { return; } // Load the next document from the batch @@ -7333,18 +7512,25 @@ inline void document_stream::next() noexcept { error = parser->implementation->stage2_next(parser->doc); } } +inline size_t document_stream::size_in_bytes() const noexcept { + return len; +} + +inline size_t document_stream::truncated_bytes() const noexcept { + if(error == CAPACITY) { return len - batch_start; } + return parser->implementation->structural_indexes[parser->implementation->n_structural_indexes] - parser->implementation->structural_indexes[parser->implementation->n_structural_indexes + 1]; +} inline size_t document_stream::next_batch_start() const noexcept { return batch_start + parser->implementation->structural_indexes[parser->implementation->n_structural_indexes]; } inline error_code document_stream::run_stage1(dom::parser &p, size_t _batch_start) noexcept { - // If this is the final batch, pass partial = false size_t remaining = len - _batch_start; if (remaining <= batch_size) { - return p.implementation->stage1(&buf[_batch_start], remaining, false); + return p.implementation->stage1(&buf[_batch_start], remaining, stage1_mode::streaming_final); } else { - return p.implementation->stage1(&buf[_batch_start], batch_size, true); + return p.implementation->stage1(&buf[_batch_start], batch_size, stage1_mode::streaming_partial); } } @@ -7450,7 +7636,7 @@ inline error_code document::allocate(size_t capacity) noexcept { // need a capacity of at least capacity + 1, but it is also possible to do // worse with "[7,7,7,7,6,7,7,7,6,7,7,6,[7,7,7,7,6,7,7,7,6,7,7,6,7,7,7,7,7,7,6" //where capacity + 1 tape elements are - // generated, see issue https://github.com/lemire/simdjson/issues/345 + // generated, see issue https://github.com/simdjson/simdjson/issues/345 size_t tape_capacity = SIMDJSON_ROUNDUP_N(capacity + 3, 64); // a document with only zero-length strings... 
could have capacity/3 string // and we would need capacity/3 * 5 bytes on the string buffer @@ -8408,13 +8594,18 @@ inline simdjson_result parser::parse_into_document(document& provided_d // Important: It is possible that provided_doc is actually the internal 'doc' within the parser!!! error_code _error = ensure_capacity(provided_doc, len); if (_error) { return _error; } - std::unique_ptr tmp_buf; if (realloc_if_needed) { - tmp_buf.reset(reinterpret_cast( internal::allocate_padded_buffer(len) )); - if (tmp_buf.get() == nullptr) { return MEMALLOC; } - std::memcpy(static_cast(tmp_buf.get()), buf, len); + // Make sure we have enough capacity to copy len bytes + if (!loaded_bytes || _loaded_bytes_capacity < len) { + loaded_bytes.reset( internal::allocate_padded_buffer(len) ); + if (!loaded_bytes) { + return MEMALLOC; + } + _loaded_bytes_capacity = len; + } + std::memcpy(static_cast(loaded_bytes.get()), buf, len); } - _error = implementation->parse(realloc_if_needed ? tmp_buf.get() : buf, len, provided_doc); + _error = implementation->parse(realloc_if_needed ? 
reinterpret_cast(loaded_bytes.get()): buf, len, provided_doc); if (_error) { return _error; } @@ -9332,7 +9523,7 @@ public: dom_parser_implementation &operator=(const dom_parser_implementation &) = delete; simdjson_warn_unused error_code parse(const uint8_t *buf, size_t len, dom::document &doc) noexcept final; - simdjson_warn_unused error_code stage1(const uint8_t *buf, size_t len, bool partial) noexcept final; + simdjson_warn_unused error_code stage1(const uint8_t *buf, size_t len, stage1_mode partial) noexcept final; simdjson_warn_unused error_code stage2(dom::document &doc) noexcept final; simdjson_warn_unused error_code stage2_next(dom::document &doc) noexcept final; inline simdjson_warn_unused error_code set_capacity(size_t capacity) noexcept final; @@ -9438,6 +9629,40 @@ simdjson_really_inline int count_ones(uint64_t input_num) { return vaddv_u8(vcnt_u8(vcreate_u8(input_num))); } + +#if defined(__GNUC__) // catches clang and gcc +/** + * ARM has a fast 64-bit "bit reversal function" that is handy. However, + * it is not generally available as an intrinsic function under Visual + * Studio (though this might be changing). Even under clang/gcc, we + * apparently need to invoke inline assembly. + */ +/* + * We use SIMDJSON_PREFER_REVERSE_BITS as a hint that algorithms that + * work well with bit reversal may use it. + */ +#define SIMDJSON_PREFER_REVERSE_BITS 1 + +/* reverse the bits */ +simdjson_really_inline uint64_t reverse_bits(uint64_t input_num) { + uint64_t rev_bits; + __asm("rbit %0, %1" : "=r"(rev_bits) : "r"(input_num)); + return rev_bits; +} + +/** + * Flips bit at index 63 - lz. Thus if you have 'leading_zeroes' leading zeroes, + * then this will set to zero the leading bit. It is possible for leading_zeroes to be + * greating or equal to 63 in which case we trigger undefined behavior, but the output + * of such undefined behavior is never used. 
+ **/ +NO_SANITIZE_UNDEFINED +simdjson_really_inline uint64_t zero_leading_bit(uint64_t rev_bits, int leading_zeroes) { + return rev_bits ^ (uint64_t(0x8000000000000000) >> leading_zeroes); +} + +#endif + simdjson_really_inline bool add_overflow(uint64_t value1, uint64_t value2, uint64_t *result) { #ifdef SIMDJSON_REGULAR_VISUAL_STUDIO *result = value1 + value2; @@ -9860,7 +10085,7 @@ simdjson_really_inline int8x16_t make_int8x16_t(int8_t x1, int8_t x2, int8_t x // Explicit conversion to/from unsigned // // Under Visual Studio/ARM64 uint8x16_t and int8x16_t are apparently the same type. - // In theory, we could check this occurence with std::same_as and std::enabled_if but it is C++14 + // In theory, we could check this occurrence with std::same_as and std::enabled_if but it is C++14 // and relatively ugly and hard to read. #ifndef SIMDJSON_REGULAR_VISUAL_STUDIO simdjson_really_inline explicit simd8(const uint8x16_t other): simd8(vreinterpretq_s8_u8(other)) {} @@ -10386,7 +10611,7 @@ static simdjson_really_inline uint32_t parse_eight_digits_unrolled(const uint8_t } // namespace arm64 } // namespace simdjson -#define SIMDJSON_SWAR_NUMBER_PARSING +#define SIMDJSON_SWAR_NUMBER_PARSING 1 /* begin file include/simdjson/generic/numberparsing.h */ #include @@ -10551,8 +10776,8 @@ simdjson_really_inline bool compute_float_64(int64_t power, uint64_t i, bool neg // Both i and power_of_five_128[index] have their most significant bit set to 1 which // implies that the either the most or the second most significant bit of the product // is 1. We pack values in this manner for efficiency reasons: it maximizes the use - // we make of the product. It also makes it easy to reason aboutthe product: there - // 0 or 1 leading zero in the product. + // we make of the product. It also makes it easy to reason about the product: there + // is 0 or 1 leading zero in the product. 
// Unless the least significant 9 bits of the high (64-bit) part of the full // product are all 1s, then we know that the most significant 55 bits are @@ -10667,7 +10892,7 @@ simdjson_really_inline bool compute_float_64(int64_t power, uint64_t i, bool neg mantissa &= ~(1ULL << 52); // we have to check that real_exponent is in range, otherwise we bail out if (simdjson_unlikely(real_exponent > 2046)) { - // We have an infinte value!!! We could actually throw an error here if we could. + // We have an infinite value!!! We could actually throw an error here if we could. return false; } d = to_double(mantissa, real_exponent, negative); @@ -10695,6 +10920,20 @@ static bool parse_float_fallback(const uint8_t *ptr, double *outDouble) { // to handle that max may be a macro on windows). return !(*outDouble > (std::numeric_limits::max)() || *outDouble < std::numeric_limits::lowest()); } +static bool parse_float_fallback(const uint8_t *ptr, const uint8_t *end_ptr, double *outDouble) { + *outDouble = simdjson::internal::from_chars(reinterpret_cast(ptr), reinterpret_cast(end_ptr)); + // We do not accept infinite values. + + // Detecting finite values in a portable manner is ridiculously hard, ideally + // we would want to do: + // return !std::isfinite(*outDouble); + // but that mysteriously fails under legacy/old libc++ libraries, see + // https://github.com/simdjson/simdjson/issues/1286 + // + // Therefore, fall back to this solution (the extra parens are there + // to handle that max may be a macro on windows). + return !(*outDouble > (std::numeric_limits::max)() || *outDouble < std::numeric_limits::lowest()); +} // check quickly whether the next 8 chars are made of digits // at a glance, it looks better than Mula's @@ -10744,13 +10983,15 @@ simdjson_really_inline error_code parse_decimal(simdjson_unused const uint8_t *c const uint8_t *const first_after_period = p; #ifdef SIMDJSON_SWAR_NUMBER_PARSING +#if SIMDJSON_SWAR_NUMBER_PARSING // this helps if we have lots of decimals! 
// this turns out to be frequent enough. if (is_made_of_eight_digits_fast(p)) { i = i * 100000000 + parse_eight_digits_unrolled(p); p += 8; } -#endif +#endif // SIMDJSON_SWAR_NUMBER_PARSING +#endif // #ifdef SIMDJSON_SWAR_NUMBER_PARSING // Unrolling the first digit makes a small difference on some implementations (e.g. westmere) if (parse_digit(*p, i)) { ++p; } while (parse_digit(*p, i)) { p++; } @@ -10817,9 +11058,7 @@ simdjson_really_inline size_t significant_digits(const uint8_t * start_digits, s // It is possible that the integer had an overflow. // We have to handle the case where we have 0.0000somenumber. const uint8_t *start = start_digits; - while ((*start == '0') || (*start == '.')) { - start++; - } + while ((*start == '0') || (*start == '.')) { ++start; } // we over-decrement by one when there is a '.' return digit_count - size_t(start - start_digits); } @@ -10830,7 +11069,7 @@ simdjson_really_inline error_code write_float(const uint8_t *const src, bool neg // we could extend our code by using a 128-bit integer instead // of a 64-bit integer. However, this is uncommon in practice. // - // 9999999999999999999 < 2**64 so we can accomodate 19 digits. + // 9999999999999999999 < 2**64 so we can accommodate 19 digits. // If we have a decimal separator, then digit_count - 1 is the number of digits, but we // may not have a decimal separator! 
if (simdjson_unlikely(digit_count > 19 && significant_digits(start_digits, digit_count) > 19)) { @@ -10889,6 +11128,9 @@ simdjson_really_inline error_code parse_number(const uint8_t *const, W &writer) simdjson_unused simdjson_really_inline simdjson_result parse_unsigned(const uint8_t * const src) noexcept { return 0; } simdjson_unused simdjson_really_inline simdjson_result parse_integer(const uint8_t * const src) noexcept { return 0; } simdjson_unused simdjson_really_inline simdjson_result parse_double(const uint8_t * const src) noexcept { return 0; } +simdjson_unused simdjson_really_inline simdjson_result parse_unsigned_in_string(const uint8_t * const src) noexcept { return 0; } +simdjson_unused simdjson_really_inline simdjson_result parse_integer_in_string(const uint8_t * const src) noexcept { return 0; } +simdjson_unused simdjson_really_inline simdjson_result parse_double_in_string(const uint8_t * const src) noexcept { return 0; } #else @@ -11099,6 +11341,104 @@ simdjson_unused simdjson_really_inline simdjson_result parse_unsigned( return i; } + +// Parse any number from 0 to 18,446,744,073,709,551,615 +// Never read at src_end or beyond +simdjson_unused simdjson_really_inline simdjson_result parse_unsigned(const uint8_t * const src, const uint8_t * const src_end) noexcept { + const uint8_t *p = src; + // + // Parse the integer part. + // + // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare + const uint8_t *const start_digits = p; + uint64_t i = 0; + while ((p != src_end) && parse_digit(*p, i)) { p++; } + + // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error. + // Optimization note: size_t is expected to be unsigned. + size_t digit_count = size_t(p - start_digits); + // The longest positive 64-bit number is 20 digits. + // We do it this way so we don't trigger this branch unless we must. 
+ // Optimization note: the compiler can probably merge + // ((digit_count == 0) || (digit_count > 20)) + // into a single branch since digit_count is unsigned. + if ((digit_count == 0) || (digit_count > 20)) { return INCORRECT_TYPE; } + // Here digit_count > 0. + if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; } + // We can do the following... + // if (!jsoncharutils::is_structural_or_whitespace(*p)) { + // return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR; + // } + // as a single table lookup: + if ((p != src_end) && integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); } + + if (digit_count == 20) { + // Positive overflow check: + // - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the + // biggest uint64_t. + // - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX. + // If we got here, it's a 20 digit number starting with the digit "1". + // - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller + // than 1,553,255,926,290,448,384. + // - That is smaller than the smallest possible 20-digit number the user could write: + // 10,000,000,000,000,000,000. + // - Therefore, if the number is positive and lower than that, it's overflow. + // - The value we are looking at is less than or equal to 9,223,372,036,854,775,808 (INT64_MAX). + // + if (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INCORRECT_TYPE; } + } + + return i; +} + +// Parse any number from 0 to 18,446,744,073,709,551,615 +simdjson_unused simdjson_really_inline simdjson_result parse_unsigned_in_string(const uint8_t * const src) noexcept { + const uint8_t *p = src + 1; + // + // Parse the integer part. 
+ // + // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare + const uint8_t *const start_digits = p; + uint64_t i = 0; + while (parse_digit(*p, i)) { p++; } + + // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error. + // Optimization note: size_t is expected to be unsigned. + size_t digit_count = size_t(p - start_digits); + // The longest positive 64-bit number is 20 digits. + // We do it this way so we don't trigger this branch unless we must. + // Optimization note: the compiler can probably merge + // ((digit_count == 0) || (digit_count > 20)) + // into a single branch since digit_count is unsigned. + if ((digit_count == 0) || (digit_count > 20)) { return INCORRECT_TYPE; } + // Here digit_count > 0. + if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; } + // We can do the following... + // if (!jsoncharutils::is_structural_or_whitespace(*p)) { + // return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR; + // } + // as a single table lookup: + if (*p != '"') { return NUMBER_ERROR; } + + if (digit_count == 20) { + // Positive overflow check: + // - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the + // biggest uint64_t. + // - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX. + // If we got here, it's a 20 digit number starting with the digit "1". + // - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller + // than 1,553,255,926,290,448,384. + // - That is smaller than the smallest possible 20-digit number the user could write: + // 10,000,000,000,000,000,000. + // - Therefore, if the number is positive and lower than that, it's overflow. + // - The value we are looking at is less than or equal to 9,223,372,036,854,775,808 (INT64_MAX). 
+ // + if (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INCORRECT_TYPE; } + } + + return i; +} + // Parse any number from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807 simdjson_unused simdjson_really_inline simdjson_result parse_integer(const uint8_t *src) noexcept { // @@ -11118,10 +11458,10 @@ simdjson_unused simdjson_really_inline simdjson_result parse_integer(co // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error. // Optimization note: size_t is expected to be unsigned. size_t digit_count = size_t(p - start_digits); - // The longest negative 64-bit number is 19 digits. - // The longest positive 64-bit number is 20 digits. - // We do it this way so we don't trigger this branch unless we must. - size_t longest_digit_count = negative ? 19 : 20; + // We go from + // -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807 + // so we can never represent numbers that have more than 19 digits. + size_t longest_digit_count = 19; // Optimization note: the compiler can probably merge // ((digit_count == 0) || (digit_count > longest_digit_count)) // into a single branch since digit_count is unsigned. @@ -11134,27 +11474,96 @@ simdjson_unused simdjson_really_inline simdjson_result parse_integer(co // } // as a single table lookup: if(integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); } - if (digit_count == longest_digit_count) { - if (negative) { - // Anything negative above INT64_MAX+1 is invalid - if (i > uint64_t(INT64_MAX)+1) { return INCORRECT_TYPE; } - return ~i+1; + // Negative numbers have can go down to - INT64_MAX - 1 whereas positive numbers are limited to INT64_MAX. + // Performance note: This check is only needed when digit_count == longest_digit_count but it is + // so cheap that we might as well always make it. + if(i > uint64_t(INT64_MAX) + uint64_t(negative)) { return INCORRECT_TYPE; } + return negative ? 
(~i+1) : i; +} - // Positive overflow check: - // - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the - // biggest uint64_t. - // - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX. - // If we got here, it's a 20 digit number starting with the digit "1". - // - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller - // than 1,553,255,926,290,448,384. - // - That is smaller than the smallest possible 20-digit number the user could write: - // 10,000,000,000,000,000,000. - // - Therefore, if the number is positive and lower than that, it's overflow. - // - The value we are looking at is less than or equal to 9,223,372,036,854,775,808 (INT64_MAX). - // - } else if (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INCORRECT_TYPE; } - } +// Parse any number from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807 +// Never read at src_end or beyond +simdjson_unused simdjson_really_inline simdjson_result parse_integer(const uint8_t * const src, const uint8_t * const src_end) noexcept { + // + // Check for minus sign + // + if(src == src_end) { return NUMBER_ERROR; } + bool negative = (*src == '-'); + const uint8_t *p = src + negative; + // + // Parse the integer part. + // + // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare + const uint8_t *const start_digits = p; + uint64_t i = 0; + while ((p != src_end) && parse_digit(*p, i)) { p++; } + + // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error. + // Optimization note: size_t is expected to be unsigned. + size_t digit_count = size_t(p - start_digits); + // We go from + // -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807 + // so we can never represent numbers that have more than 19 digits. 
+ size_t longest_digit_count = 19; + // Optimization note: the compiler can probably merge + // ((digit_count == 0) || (digit_count > longest_digit_count)) + // into a single branch since digit_count is unsigned. + if ((digit_count == 0) || (digit_count > longest_digit_count)) { return INCORRECT_TYPE; } + // Here digit_count > 0. + if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; } + // We can do the following... + // if (!jsoncharutils::is_structural_or_whitespace(*p)) { + // return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR; + // } + // as a single table lookup: + if((p != src_end) && integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); } + // Negative numbers have can go down to - INT64_MAX - 1 whereas positive numbers are limited to INT64_MAX. + // Performance note: This check is only needed when digit_count == longest_digit_count but it is + // so cheap that we might as well always make it. + if(i > uint64_t(INT64_MAX) + uint64_t(negative)) { return INCORRECT_TYPE; } + return negative ? (~i+1) : i; +} + +// Parse any number from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807 +simdjson_unused simdjson_really_inline simdjson_result parse_integer_in_string(const uint8_t *src) noexcept { + // + // Check for minus sign + // + bool negative = (*(src + 1) == '-'); + const uint8_t *p = src + negative + 1; + + // + // Parse the integer part. + // + // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare + const uint8_t *const start_digits = p; + uint64_t i = 0; + while (parse_digit(*p, i)) { p++; } + + // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error. + // Optimization note: size_t is expected to be unsigned. 
+ size_t digit_count = size_t(p - start_digits); + // We go from + // -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807 + // so we can never represent numbers that have more than 19 digits. + size_t longest_digit_count = 19; + // Optimization note: the compiler can probably merge + // ((digit_count == 0) || (digit_count > longest_digit_count)) + // into a single branch since digit_count is unsigned. + if ((digit_count == 0) || (digit_count > longest_digit_count)) { return INCORRECT_TYPE; } + // Here digit_count > 0. + if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; } + // We can do the following... + // if (!jsoncharutils::is_structural_or_whitespace(*p)) { + // return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR; + // } + // as a single table lookup: + if(*p != '"') { return NUMBER_ERROR; } + // Negative numbers have can go down to - INT64_MAX - 1 whereas positive numbers are limited to INT64_MAX. + // Performance note: This check is only needed when digit_count == longest_digit_count but it is + // so cheap that we might as well always make it. + if(i > uint64_t(INT64_MAX) + uint64_t(negative)) { return INCORRECT_TYPE; } return negative ? (~i+1) : i; } @@ -11235,6 +11644,167 @@ simdjson_unused simdjson_really_inline simdjson_result parse_double(cons } return d; } + + +// Never read at src_end or beyond +simdjson_unused simdjson_really_inline simdjson_result parse_double(const uint8_t * src, const uint8_t * const src_end) noexcept { + if(src == src_end) { return NUMBER_ERROR; } + // + // Check for minus sign + // + bool negative = (*src == '-'); + src += negative; + + // + // Parse the integer part. 
+ // + uint64_t i = 0; + const uint8_t *p = src; + if(p == src_end) { return NUMBER_ERROR; } + p += parse_digit(*p, i); + bool leading_zero = (i == 0); + while ((p != src_end) && parse_digit(*p, i)) { p++; } + // no integer digits, or 0123 (zero must be solo) + if ( p == src ) { return INCORRECT_TYPE; } + if ( (leading_zero && p != src+1)) { return NUMBER_ERROR; } + + // + // Parse the decimal part. + // + int64_t exponent = 0; + bool overflow; + if (simdjson_likely((p != src_end) && (*p == '.'))) { + p++; + const uint8_t *start_decimal_digits = p; + if ((p == src_end) || !parse_digit(*p, i)) { return NUMBER_ERROR; } // no decimal digits + p++; + while ((p != src_end) && parse_digit(*p, i)) { p++; } + exponent = -(p - start_decimal_digits); + + // Overflow check. More than 19 digits (minus the decimal) may be overflow. + overflow = p-src-1 > 19; + if (simdjson_unlikely(overflow && leading_zero)) { + // Skip leading 0.00000 and see if it still overflows + const uint8_t *start_digits = src + 2; + while (*start_digits == '0') { start_digits++; } + overflow = start_digits-src > 19; + } + } else { + overflow = p-src > 19; + } + + // + // Parse the exponent + // + if ((p != src_end) && (*p == 'e' || *p == 'E')) { + p++; + if(p == src_end) { return NUMBER_ERROR; } + bool exp_neg = *p == '-'; + p += exp_neg || *p == '+'; + + uint64_t exp = 0; + const uint8_t *start_exp_digits = p; + while ((p != src_end) && parse_digit(*p, exp)) { p++; } + // no exp digits, or 20+ exp digits + if (p-start_exp_digits == 0 || p-start_exp_digits > 19) { return NUMBER_ERROR; } + + exponent += exp_neg ? 
0-exp : exp; + } + + if ((p != src_end) && jsoncharutils::is_not_structural_or_whitespace(*p)) { return NUMBER_ERROR; } + + overflow = overflow || exponent < simdjson::internal::smallest_power || exponent > simdjson::internal::largest_power; + + // + // Assemble (or slow-parse) the float + // + double d; + if (simdjson_likely(!overflow)) { + if (compute_float_64(exponent, i, negative, d)) { return d; } + } + if (!parse_float_fallback(src-negative, src_end, &d)) { + return NUMBER_ERROR; + } + return d; +} + +simdjson_unused simdjson_really_inline simdjson_result parse_double_in_string(const uint8_t * src) noexcept { + // + // Check for minus sign + // + bool negative = (*(src + 1) == '-'); + src += negative + 1; + + // + // Parse the integer part. + // + uint64_t i = 0; + const uint8_t *p = src; + p += parse_digit(*p, i); + bool leading_zero = (i == 0); + while (parse_digit(*p, i)) { p++; } + // no integer digits, or 0123 (zero must be solo) + if ( p == src ) { return INCORRECT_TYPE; } + if ( (leading_zero && p != src+1)) { return NUMBER_ERROR; } + + // + // Parse the decimal part. + // + int64_t exponent = 0; + bool overflow; + if (simdjson_likely(*p == '.')) { + p++; + const uint8_t *start_decimal_digits = p; + if (!parse_digit(*p, i)) { return NUMBER_ERROR; } // no decimal digits + p++; + while (parse_digit(*p, i)) { p++; } + exponent = -(p - start_decimal_digits); + + // Overflow check. More than 19 digits (minus the decimal) may be overflow. 
+ overflow = p-src-1 > 19; + if (simdjson_unlikely(overflow && leading_zero)) { + // Skip leading 0.00000 and see if it still overflows + const uint8_t *start_digits = src + 2; + while (*start_digits == '0') { start_digits++; } + overflow = start_digits-src > 19; + } + } else { + overflow = p-src > 19; + } + + // + // Parse the exponent + // + if (*p == 'e' || *p == 'E') { + p++; + bool exp_neg = *p == '-'; + p += exp_neg || *p == '+'; + + uint64_t exp = 0; + const uint8_t *start_exp_digits = p; + while (parse_digit(*p, exp)) { p++; } + // no exp digits, or 20+ exp digits + if (p-start_exp_digits == 0 || p-start_exp_digits > 19) { return NUMBER_ERROR; } + + exponent += exp_neg ? 0-exp : exp; + } + + if (*p != '"') { return NUMBER_ERROR; } + + overflow = overflow || exponent < simdjson::internal::smallest_power || exponent > simdjson::internal::largest_power; + + // + // Assemble (or slow-parse) the float + // + double d; + if (simdjson_likely(!overflow)) { + if (compute_float_64(exponent, i, negative, d)) { return d; } + } + if (!parse_float_fallback(src-negative, &d)) { + return NUMBER_ERROR; + } + return d; +} } //namespace {} #endif // SIMDJSON_SKIPNUMBERPARSING @@ -11342,7 +11912,7 @@ public: dom_parser_implementation &operator=(const dom_parser_implementation &) = delete; simdjson_warn_unused error_code parse(const uint8_t *buf, size_t len, dom::document &doc) noexcept final; - simdjson_warn_unused error_code stage1(const uint8_t *buf, size_t len, bool partial) noexcept final; + simdjson_warn_unused error_code stage1(const uint8_t *buf, size_t len, stage1_mode partial) noexcept final; simdjson_warn_unused error_code stage2(dom::document &doc) noexcept final; simdjson_warn_unused error_code stage2_next(dom::document &doc) noexcept final; inline simdjson_warn_unused error_code set_capacity(size_t capacity) noexcept final; @@ -11826,7 +12396,8 @@ static simdjson_really_inline uint32_t parse_eight_digits_unrolled(const uint8_t } // namespace fallback } // 
namespace simdjson -#define SIMDJSON_SWAR_NUMBER_PARSING +#define SIMDJSON_SWAR_NUMBER_PARSING 1 + /* begin file include/simdjson/generic/numberparsing.h */ #include @@ -11990,8 +12561,8 @@ simdjson_really_inline bool compute_float_64(int64_t power, uint64_t i, bool neg // Both i and power_of_five_128[index] have their most significant bit set to 1 which // implies that the either the most or the second most significant bit of the product // is 1. We pack values in this manner for efficiency reasons: it maximizes the use - // we make of the product. It also makes it easy to reason aboutthe product: there - // 0 or 1 leading zero in the product. + // we make of the product. It also makes it easy to reason about the product: there + // is 0 or 1 leading zero in the product. // Unless the least significant 9 bits of the high (64-bit) part of the full // product are all 1s, then we know that the most significant 55 bits are @@ -12106,7 +12677,7 @@ simdjson_really_inline bool compute_float_64(int64_t power, uint64_t i, bool neg mantissa &= ~(1ULL << 52); // we have to check that real_exponent is in range, otherwise we bail out if (simdjson_unlikely(real_exponent > 2046)) { - // We have an infinte value!!! We could actually throw an error here if we could. + // We have an infinite value!!! We could actually throw an error here if we could. return false; } d = to_double(mantissa, real_exponent, negative); @@ -12134,6 +12705,20 @@ static bool parse_float_fallback(const uint8_t *ptr, double *outDouble) { // to handle that max may be a macro on windows). return !(*outDouble > (std::numeric_limits::max)() || *outDouble < std::numeric_limits::lowest()); } +static bool parse_float_fallback(const uint8_t *ptr, const uint8_t *end_ptr, double *outDouble) { + *outDouble = simdjson::internal::from_chars(reinterpret_cast(ptr), reinterpret_cast(end_ptr)); + // We do not accept infinite values. 
+ + // Detecting finite values in a portable manner is ridiculously hard, ideally + // we would want to do: + // return !std::isfinite(*outDouble); + // but that mysteriously fails under legacy/old libc++ libraries, see + // https://github.com/simdjson/simdjson/issues/1286 + // + // Therefore, fall back to this solution (the extra parens are there + // to handle that max may be a macro on windows). + return !(*outDouble > (std::numeric_limits::max)() || *outDouble < std::numeric_limits::lowest()); +} // check quickly whether the next 8 chars are made of digits // at a glance, it looks better than Mula's @@ -12183,13 +12768,15 @@ simdjson_really_inline error_code parse_decimal(simdjson_unused const uint8_t *c const uint8_t *const first_after_period = p; #ifdef SIMDJSON_SWAR_NUMBER_PARSING +#if SIMDJSON_SWAR_NUMBER_PARSING // this helps if we have lots of decimals! // this turns out to be frequent enough. if (is_made_of_eight_digits_fast(p)) { i = i * 100000000 + parse_eight_digits_unrolled(p); p += 8; } -#endif +#endif // SIMDJSON_SWAR_NUMBER_PARSING +#endif // #ifdef SIMDJSON_SWAR_NUMBER_PARSING // Unrolling the first digit makes a small difference on some implementations (e.g. westmere) if (parse_digit(*p, i)) { ++p; } while (parse_digit(*p, i)) { p++; } @@ -12256,9 +12843,7 @@ simdjson_really_inline size_t significant_digits(const uint8_t * start_digits, s // It is possible that the integer had an overflow. // We have to handle the case where we have 0.0000somenumber. const uint8_t *start = start_digits; - while ((*start == '0') || (*start == '.')) { - start++; - } + while ((*start == '0') || (*start == '.')) { ++start; } // we over-decrement by one when there is a '.' return digit_count - size_t(start - start_digits); } @@ -12269,7 +12854,7 @@ simdjson_really_inline error_code write_float(const uint8_t *const src, bool neg // we could extend our code by using a 128-bit integer instead // of a 64-bit integer. However, this is uncommon in practice. 
// - // 9999999999999999999 < 2**64 so we can accomodate 19 digits. + // 9999999999999999999 < 2**64 so we can accommodate 19 digits. // If we have a decimal separator, then digit_count - 1 is the number of digits, but we // may not have a decimal separator! if (simdjson_unlikely(digit_count > 19 && significant_digits(start_digits, digit_count) > 19)) { @@ -12328,6 +12913,9 @@ simdjson_really_inline error_code parse_number(const uint8_t *const, W &writer) simdjson_unused simdjson_really_inline simdjson_result parse_unsigned(const uint8_t * const src) noexcept { return 0; } simdjson_unused simdjson_really_inline simdjson_result parse_integer(const uint8_t * const src) noexcept { return 0; } simdjson_unused simdjson_really_inline simdjson_result parse_double(const uint8_t * const src) noexcept { return 0; } +simdjson_unused simdjson_really_inline simdjson_result parse_unsigned_in_string(const uint8_t * const src) noexcept { return 0; } +simdjson_unused simdjson_really_inline simdjson_result parse_integer_in_string(const uint8_t * const src) noexcept { return 0; } +simdjson_unused simdjson_really_inline simdjson_result parse_double_in_string(const uint8_t * const src) noexcept { return 0; } #else @@ -12538,6 +13126,104 @@ simdjson_unused simdjson_really_inline simdjson_result parse_unsigned( return i; } + +// Parse any number from 0 to 18,446,744,073,709,551,615 +// Never read at src_end or beyond +simdjson_unused simdjson_really_inline simdjson_result parse_unsigned(const uint8_t * const src, const uint8_t * const src_end) noexcept { + const uint8_t *p = src; + // + // Parse the integer part. + // + // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare + const uint8_t *const start_digits = p; + uint64_t i = 0; + while ((p != src_end) && parse_digit(*p, i)) { p++; } + + // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error. 
+ // Optimization note: size_t is expected to be unsigned. + size_t digit_count = size_t(p - start_digits); + // The longest positive 64-bit number is 20 digits. + // We do it this way so we don't trigger this branch unless we must. + // Optimization note: the compiler can probably merge + // ((digit_count == 0) || (digit_count > 20)) + // into a single branch since digit_count is unsigned. + if ((digit_count == 0) || (digit_count > 20)) { return INCORRECT_TYPE; } + // Here digit_count > 0. + if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; } + // We can do the following... + // if (!jsoncharutils::is_structural_or_whitespace(*p)) { + // return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR; + // } + // as a single table lookup: + if ((p != src_end) && integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); } + + if (digit_count == 20) { + // Positive overflow check: + // - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the + // biggest uint64_t. + // - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX. + // If we got here, it's a 20 digit number starting with the digit "1". + // - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller + // than 1,553,255,926,290,448,384. + // - That is smaller than the smallest possible 20-digit number the user could write: + // 10,000,000,000,000,000,000. + // - Therefore, if the number is positive and lower than that, it's overflow. + // - The value we are looking at is less than or equal to 9,223,372,036,854,775,808 (INT64_MAX). 
+ // + if (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INCORRECT_TYPE; } + } + + return i; +} + +// Parse any number from 0 to 18,446,744,073,709,551,615 +simdjson_unused simdjson_really_inline simdjson_result parse_unsigned_in_string(const uint8_t * const src) noexcept { + const uint8_t *p = src + 1; + // + // Parse the integer part. + // + // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare + const uint8_t *const start_digits = p; + uint64_t i = 0; + while (parse_digit(*p, i)) { p++; } + + // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error. + // Optimization note: size_t is expected to be unsigned. + size_t digit_count = size_t(p - start_digits); + // The longest positive 64-bit number is 20 digits. + // We do it this way so we don't trigger this branch unless we must. + // Optimization note: the compiler can probably merge + // ((digit_count == 0) || (digit_count > 20)) + // into a single branch since digit_count is unsigned. + if ((digit_count == 0) || (digit_count > 20)) { return INCORRECT_TYPE; } + // Here digit_count > 0. + if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; } + // We can do the following... + // if (!jsoncharutils::is_structural_or_whitespace(*p)) { + // return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR; + // } + // as a single table lookup: + if (*p != '"') { return NUMBER_ERROR; } + + if (digit_count == 20) { + // Positive overflow check: + // - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the + // biggest uint64_t. + // - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX. + // If we got here, it's a 20 digit number starting with the digit "1". + // - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller + // than 1,553,255,926,290,448,384. 
+ // - That is smaller than the smallest possible 20-digit number the user could write: + // 10,000,000,000,000,000,000. + // - Therefore, if the number is positive and lower than that, it's overflow. + // - The value we are looking at is less than or equal to 9,223,372,036,854,775,808 (INT64_MAX). + // + if (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INCORRECT_TYPE; } + } + + return i; +} + // Parse any number from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807 simdjson_unused simdjson_really_inline simdjson_result parse_integer(const uint8_t *src) noexcept { // @@ -12557,10 +13243,10 @@ simdjson_unused simdjson_really_inline simdjson_result parse_integer(co // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error. // Optimization note: size_t is expected to be unsigned. size_t digit_count = size_t(p - start_digits); - // The longest negative 64-bit number is 19 digits. - // The longest positive 64-bit number is 20 digits. - // We do it this way so we don't trigger this branch unless we must. - size_t longest_digit_count = negative ? 19 : 20; + // We go from + // -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807 + // so we can never represent numbers that have more than 19 digits. + size_t longest_digit_count = 19; // Optimization note: the compiler can probably merge // ((digit_count == 0) || (digit_count > longest_digit_count)) // into a single branch since digit_count is unsigned. 
@@ -12573,27 +13259,96 @@ simdjson_unused simdjson_really_inline simdjson_result parse_integer(co // } // as a single table lookup: if(integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); } - if (digit_count == longest_digit_count) { - if (negative) { - // Anything negative above INT64_MAX+1 is invalid - if (i > uint64_t(INT64_MAX)+1) { return INCORRECT_TYPE; } - return ~i+1; + // Negative numbers have can go down to - INT64_MAX - 1 whereas positive numbers are limited to INT64_MAX. + // Performance note: This check is only needed when digit_count == longest_digit_count but it is + // so cheap that we might as well always make it. + if(i > uint64_t(INT64_MAX) + uint64_t(negative)) { return INCORRECT_TYPE; } + return negative ? (~i+1) : i; +} - // Positive overflow check: - // - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the - // biggest uint64_t. - // - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX. - // If we got here, it's a 20 digit number starting with the digit "1". - // - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller - // than 1,553,255,926,290,448,384. - // - That is smaller than the smallest possible 20-digit number the user could write: - // 10,000,000,000,000,000,000. - // - Therefore, if the number is positive and lower than that, it's overflow. - // - The value we are looking at is less than or equal to 9,223,372,036,854,775,808 (INT64_MAX). 
- // - } else if (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INCORRECT_TYPE; } - } +// Parse any number from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807 +// Never read at src_end or beyond +simdjson_unused simdjson_really_inline simdjson_result parse_integer(const uint8_t * const src, const uint8_t * const src_end) noexcept { + // + // Check for minus sign + // + if(src == src_end) { return NUMBER_ERROR; } + bool negative = (*src == '-'); + const uint8_t *p = src + negative; + // + // Parse the integer part. + // + // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare + const uint8_t *const start_digits = p; + uint64_t i = 0; + while ((p != src_end) && parse_digit(*p, i)) { p++; } + + // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error. + // Optimization note: size_t is expected to be unsigned. + size_t digit_count = size_t(p - start_digits); + // We go from + // -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807 + // so we can never represent numbers that have more than 19 digits. + size_t longest_digit_count = 19; + // Optimization note: the compiler can probably merge + // ((digit_count == 0) || (digit_count > longest_digit_count)) + // into a single branch since digit_count is unsigned. + if ((digit_count == 0) || (digit_count > longest_digit_count)) { return INCORRECT_TYPE; } + // Here digit_count > 0. + if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; } + // We can do the following... + // if (!jsoncharutils::is_structural_or_whitespace(*p)) { + // return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR; + // } + // as a single table lookup: + if((p != src_end) && integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); } + // Negative numbers have can go down to - INT64_MAX - 1 whereas positive numbers are limited to INT64_MAX. 
+ // Performance note: This check is only needed when digit_count == longest_digit_count but it is + // so cheap that we might as well always make it. + if(i > uint64_t(INT64_MAX) + uint64_t(negative)) { return INCORRECT_TYPE; } + return negative ? (~i+1) : i; +} + +// Parse any number from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807 +simdjson_unused simdjson_really_inline simdjson_result parse_integer_in_string(const uint8_t *src) noexcept { + // + // Check for minus sign + // + bool negative = (*(src + 1) == '-'); + const uint8_t *p = src + negative + 1; + + // + // Parse the integer part. + // + // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare + const uint8_t *const start_digits = p; + uint64_t i = 0; + while (parse_digit(*p, i)) { p++; } + + // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error. + // Optimization note: size_t is expected to be unsigned. + size_t digit_count = size_t(p - start_digits); + // We go from + // -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807 + // so we can never represent numbers that have more than 19 digits. + size_t longest_digit_count = 19; + // Optimization note: the compiler can probably merge + // ((digit_count == 0) || (digit_count > longest_digit_count)) + // into a single branch since digit_count is unsigned. + if ((digit_count == 0) || (digit_count > longest_digit_count)) { return INCORRECT_TYPE; } + // Here digit_count > 0. + if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; } + // We can do the following... + // if (!jsoncharutils::is_structural_or_whitespace(*p)) { + // return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR; + // } + // as a single table lookup: + if(*p != '"') { return NUMBER_ERROR; } + // Negative numbers have can go down to - INT64_MAX - 1 whereas positive numbers are limited to INT64_MAX. 
+ // Performance note: This check is only needed when digit_count == longest_digit_count but it is + // so cheap that we might as well always make it. + if(i > uint64_t(INT64_MAX) + uint64_t(negative)) { return INCORRECT_TYPE; } return negative ? (~i+1) : i; } @@ -12674,6 +13429,167 @@ simdjson_unused simdjson_really_inline simdjson_result parse_double(cons } return d; } + + +// Never read at src_end or beyond +simdjson_unused simdjson_really_inline simdjson_result parse_double(const uint8_t * src, const uint8_t * const src_end) noexcept { + if(src == src_end) { return NUMBER_ERROR; } + // + // Check for minus sign + // + bool negative = (*src == '-'); + src += negative; + + // + // Parse the integer part. + // + uint64_t i = 0; + const uint8_t *p = src; + if(p == src_end) { return NUMBER_ERROR; } + p += parse_digit(*p, i); + bool leading_zero = (i == 0); + while ((p != src_end) && parse_digit(*p, i)) { p++; } + // no integer digits, or 0123 (zero must be solo) + if ( p == src ) { return INCORRECT_TYPE; } + if ( (leading_zero && p != src+1)) { return NUMBER_ERROR; } + + // + // Parse the decimal part. + // + int64_t exponent = 0; + bool overflow; + if (simdjson_likely((p != src_end) && (*p == '.'))) { + p++; + const uint8_t *start_decimal_digits = p; + if ((p == src_end) || !parse_digit(*p, i)) { return NUMBER_ERROR; } // no decimal digits + p++; + while ((p != src_end) && parse_digit(*p, i)) { p++; } + exponent = -(p - start_decimal_digits); + + // Overflow check. More than 19 digits (minus the decimal) may be overflow. 
+ overflow = p-src-1 > 19; + if (simdjson_unlikely(overflow && leading_zero)) { + // Skip leading 0.00000 and see if it still overflows + const uint8_t *start_digits = src + 2; + while (*start_digits == '0') { start_digits++; } + overflow = start_digits-src > 19; + } + } else { + overflow = p-src > 19; + } + + // + // Parse the exponent + // + if ((p != src_end) && (*p == 'e' || *p == 'E')) { + p++; + if(p == src_end) { return NUMBER_ERROR; } + bool exp_neg = *p == '-'; + p += exp_neg || *p == '+'; + + uint64_t exp = 0; + const uint8_t *start_exp_digits = p; + while ((p != src_end) && parse_digit(*p, exp)) { p++; } + // no exp digits, or 20+ exp digits + if (p-start_exp_digits == 0 || p-start_exp_digits > 19) { return NUMBER_ERROR; } + + exponent += exp_neg ? 0-exp : exp; + } + + if ((p != src_end) && jsoncharutils::is_not_structural_or_whitespace(*p)) { return NUMBER_ERROR; } + + overflow = overflow || exponent < simdjson::internal::smallest_power || exponent > simdjson::internal::largest_power; + + // + // Assemble (or slow-parse) the float + // + double d; + if (simdjson_likely(!overflow)) { + if (compute_float_64(exponent, i, negative, d)) { return d; } + } + if (!parse_float_fallback(src-negative, src_end, &d)) { + return NUMBER_ERROR; + } + return d; +} + +simdjson_unused simdjson_really_inline simdjson_result parse_double_in_string(const uint8_t * src) noexcept { + // + // Check for minus sign + // + bool negative = (*(src + 1) == '-'); + src += negative + 1; + + // + // Parse the integer part. + // + uint64_t i = 0; + const uint8_t *p = src; + p += parse_digit(*p, i); + bool leading_zero = (i == 0); + while (parse_digit(*p, i)) { p++; } + // no integer digits, or 0123 (zero must be solo) + if ( p == src ) { return INCORRECT_TYPE; } + if ( (leading_zero && p != src+1)) { return NUMBER_ERROR; } + + // + // Parse the decimal part. 
+ // + int64_t exponent = 0; + bool overflow; + if (simdjson_likely(*p == '.')) { + p++; + const uint8_t *start_decimal_digits = p; + if (!parse_digit(*p, i)) { return NUMBER_ERROR; } // no decimal digits + p++; + while (parse_digit(*p, i)) { p++; } + exponent = -(p - start_decimal_digits); + + // Overflow check. More than 19 digits (minus the decimal) may be overflow. + overflow = p-src-1 > 19; + if (simdjson_unlikely(overflow && leading_zero)) { + // Skip leading 0.00000 and see if it still overflows + const uint8_t *start_digits = src + 2; + while (*start_digits == '0') { start_digits++; } + overflow = start_digits-src > 19; + } + } else { + overflow = p-src > 19; + } + + // + // Parse the exponent + // + if (*p == 'e' || *p == 'E') { + p++; + bool exp_neg = *p == '-'; + p += exp_neg || *p == '+'; + + uint64_t exp = 0; + const uint8_t *start_exp_digits = p; + while (parse_digit(*p, exp)) { p++; } + // no exp digits, or 20+ exp digits + if (p-start_exp_digits == 0 || p-start_exp_digits > 19) { return NUMBER_ERROR; } + + exponent += exp_neg ? 
0-exp : exp; + } + + if (*p != '"') { return NUMBER_ERROR; } + + overflow = overflow || exponent < simdjson::internal::smallest_power || exponent > simdjson::internal::largest_power; + + // + // Assemble (or slow-parse) the float + // + double d; + if (simdjson_likely(!overflow)) { + if (compute_float_64(exponent, i, negative, d)) { return d; } + } + if (!parse_float_fallback(src-negative, &d)) { + return NUMBER_ERROR; + } + return d; +} } //namespace {} #endif // SIMDJSON_SKIPNUMBERPARSING @@ -12846,7 +13762,7 @@ public: dom_parser_implementation &operator=(const dom_parser_implementation &) = delete; simdjson_warn_unused error_code parse(const uint8_t *buf, size_t len, dom::document &doc) noexcept final; - simdjson_warn_unused error_code stage1(const uint8_t *buf, size_t len, bool partial) noexcept final; + simdjson_warn_unused error_code stage1(const uint8_t *buf, size_t len, stage1_mode partial) noexcept final; simdjson_warn_unused error_code stage2(dom::document &doc) noexcept final; simdjson_warn_unused error_code stage2_next(dom::document &doc) noexcept final; inline simdjson_warn_unused error_code set_capacity(size_t capacity) noexcept final; @@ -13749,7 +14665,7 @@ static simdjson_really_inline uint32_t parse_eight_digits_unrolled(const uint8_t } // namespace haswell } // namespace simdjson -#define SIMDJSON_SWAR_NUMBER_PARSING +#define SIMDJSON_SWAR_NUMBER_PARSING 1 /* begin file include/simdjson/generic/numberparsing.h */ #include @@ -13914,8 +14830,8 @@ simdjson_really_inline bool compute_float_64(int64_t power, uint64_t i, bool neg // Both i and power_of_five_128[index] have their most significant bit set to 1 which // implies that the either the most or the second most significant bit of the product // is 1. We pack values in this manner for efficiency reasons: it maximizes the use - // we make of the product. It also makes it easy to reason aboutthe product: there - // 0 or 1 leading zero in the product. + // we make of the product. 
It also makes it easy to reason about the product: there + // is 0 or 1 leading zero in the product. // Unless the least significant 9 bits of the high (64-bit) part of the full // product are all 1s, then we know that the most significant 55 bits are @@ -14030,7 +14946,7 @@ simdjson_really_inline bool compute_float_64(int64_t power, uint64_t i, bool neg mantissa &= ~(1ULL << 52); // we have to check that real_exponent is in range, otherwise we bail out if (simdjson_unlikely(real_exponent > 2046)) { - // We have an infinte value!!! We could actually throw an error here if we could. + // We have an infinite value!!! We could actually throw an error here if we could. return false; } d = to_double(mantissa, real_exponent, negative); @@ -14058,6 +14974,20 @@ static bool parse_float_fallback(const uint8_t *ptr, double *outDouble) { // to handle that max may be a macro on windows). return !(*outDouble > (std::numeric_limits::max)() || *outDouble < std::numeric_limits::lowest()); } +static bool parse_float_fallback(const uint8_t *ptr, const uint8_t *end_ptr, double *outDouble) { + *outDouble = simdjson::internal::from_chars(reinterpret_cast(ptr), reinterpret_cast(end_ptr)); + // We do not accept infinite values. + + // Detecting finite values in a portable manner is ridiculously hard, ideally + // we would want to do: + // return !std::isfinite(*outDouble); + // but that mysteriously fails under legacy/old libc++ libraries, see + // https://github.com/simdjson/simdjson/issues/1286 + // + // Therefore, fall back to this solution (the extra parens are there + // to handle that max may be a macro on windows). 
+ return !(*outDouble > (std::numeric_limits::max)() || *outDouble < std::numeric_limits::lowest()); +} // check quickly whether the next 8 chars are made of digits // at a glance, it looks better than Mula's @@ -14107,13 +15037,15 @@ simdjson_really_inline error_code parse_decimal(simdjson_unused const uint8_t *c const uint8_t *const first_after_period = p; #ifdef SIMDJSON_SWAR_NUMBER_PARSING +#if SIMDJSON_SWAR_NUMBER_PARSING // this helps if we have lots of decimals! // this turns out to be frequent enough. if (is_made_of_eight_digits_fast(p)) { i = i * 100000000 + parse_eight_digits_unrolled(p); p += 8; } -#endif +#endif // SIMDJSON_SWAR_NUMBER_PARSING +#endif // #ifdef SIMDJSON_SWAR_NUMBER_PARSING // Unrolling the first digit makes a small difference on some implementations (e.g. westmere) if (parse_digit(*p, i)) { ++p; } while (parse_digit(*p, i)) { p++; } @@ -14180,9 +15112,7 @@ simdjson_really_inline size_t significant_digits(const uint8_t * start_digits, s // It is possible that the integer had an overflow. // We have to handle the case where we have 0.0000somenumber. const uint8_t *start = start_digits; - while ((*start == '0') || (*start == '.')) { - start++; - } + while ((*start == '0') || (*start == '.')) { ++start; } // we over-decrement by one when there is a '.' return digit_count - size_t(start - start_digits); } @@ -14193,7 +15123,7 @@ simdjson_really_inline error_code write_float(const uint8_t *const src, bool neg // we could extend our code by using a 128-bit integer instead // of a 64-bit integer. However, this is uncommon in practice. // - // 9999999999999999999 < 2**64 so we can accomodate 19 digits. + // 9999999999999999999 < 2**64 so we can accommodate 19 digits. // If we have a decimal separator, then digit_count - 1 is the number of digits, but we // may not have a decimal separator! 
if (simdjson_unlikely(digit_count > 19 && significant_digits(start_digits, digit_count) > 19)) { @@ -14252,6 +15182,9 @@ simdjson_really_inline error_code parse_number(const uint8_t *const, W &writer) simdjson_unused simdjson_really_inline simdjson_result parse_unsigned(const uint8_t * const src) noexcept { return 0; } simdjson_unused simdjson_really_inline simdjson_result parse_integer(const uint8_t * const src) noexcept { return 0; } simdjson_unused simdjson_really_inline simdjson_result parse_double(const uint8_t * const src) noexcept { return 0; } +simdjson_unused simdjson_really_inline simdjson_result parse_unsigned_in_string(const uint8_t * const src) noexcept { return 0; } +simdjson_unused simdjson_really_inline simdjson_result parse_integer_in_string(const uint8_t * const src) noexcept { return 0; } +simdjson_unused simdjson_really_inline simdjson_result parse_double_in_string(const uint8_t * const src) noexcept { return 0; } #else @@ -14462,6 +15395,104 @@ simdjson_unused simdjson_really_inline simdjson_result parse_unsigned( return i; } + +// Parse any number from 0 to 18,446,744,073,709,551,615 +// Never read at src_end or beyond +simdjson_unused simdjson_really_inline simdjson_result parse_unsigned(const uint8_t * const src, const uint8_t * const src_end) noexcept { + const uint8_t *p = src; + // + // Parse the integer part. + // + // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare + const uint8_t *const start_digits = p; + uint64_t i = 0; + while ((p != src_end) && parse_digit(*p, i)) { p++; } + + // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error. + // Optimization note: size_t is expected to be unsigned. + size_t digit_count = size_t(p - start_digits); + // The longest positive 64-bit number is 20 digits. + // We do it this way so we don't trigger this branch unless we must. 
+ // Optimization note: the compiler can probably merge + // ((digit_count == 0) || (digit_count > 20)) + // into a single branch since digit_count is unsigned. + if ((digit_count == 0) || (digit_count > 20)) { return INCORRECT_TYPE; } + // Here digit_count > 0. + if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; } + // We can do the following... + // if (!jsoncharutils::is_structural_or_whitespace(*p)) { + // return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR; + // } + // as a single table lookup: + if ((p != src_end) && integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); } + + if (digit_count == 20) { + // Positive overflow check: + // - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the + // biggest uint64_t. + // - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX. + // If we got here, it's a 20 digit number starting with the digit "1". + // - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller + // than 1,553,255,926,290,448,384. + // - That is smaller than the smallest possible 20-digit number the user could write: + // 10,000,000,000,000,000,000. + // - Therefore, if the number is positive and lower than that, it's overflow. + // - The value we are looking at is less than or equal to 9,223,372,036,854,775,808 (INT64_MAX). + // + if (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INCORRECT_TYPE; } + } + + return i; +} + +// Parse any number from 0 to 18,446,744,073,709,551,615 +simdjson_unused simdjson_really_inline simdjson_result parse_unsigned_in_string(const uint8_t * const src) noexcept { + const uint8_t *p = src + 1; + // + // Parse the integer part. 
+ // + // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare + const uint8_t *const start_digits = p; + uint64_t i = 0; + while (parse_digit(*p, i)) { p++; } + + // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error. + // Optimization note: size_t is expected to be unsigned. + size_t digit_count = size_t(p - start_digits); + // The longest positive 64-bit number is 20 digits. + // We do it this way so we don't trigger this branch unless we must. + // Optimization note: the compiler can probably merge + // ((digit_count == 0) || (digit_count > 20)) + // into a single branch since digit_count is unsigned. + if ((digit_count == 0) || (digit_count > 20)) { return INCORRECT_TYPE; } + // Here digit_count > 0. + if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; } + // We can do the following... + // if (!jsoncharutils::is_structural_or_whitespace(*p)) { + // return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR; + // } + // as a single table lookup: + if (*p != '"') { return NUMBER_ERROR; } + + if (digit_count == 20) { + // Positive overflow check: + // - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the + // biggest uint64_t. + // - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX. + // If we got here, it's a 20 digit number starting with the digit "1". + // - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller + // than 1,553,255,926,290,448,384. + // - That is smaller than the smallest possible 20-digit number the user could write: + // 10,000,000,000,000,000,000. + // - Therefore, if the number is positive and lower than that, it's overflow. + // - The value we are looking at is less than or equal to 9,223,372,036,854,775,808 (INT64_MAX). 
+ // + if (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INCORRECT_TYPE; } + } + + return i; +} + // Parse any number from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807 simdjson_unused simdjson_really_inline simdjson_result parse_integer(const uint8_t *src) noexcept { // @@ -14481,10 +15512,10 @@ simdjson_unused simdjson_really_inline simdjson_result parse_integer(co // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error. // Optimization note: size_t is expected to be unsigned. size_t digit_count = size_t(p - start_digits); - // The longest negative 64-bit number is 19 digits. - // The longest positive 64-bit number is 20 digits. - // We do it this way so we don't trigger this branch unless we must. - size_t longest_digit_count = negative ? 19 : 20; + // We go from + // -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807 + // so we can never represent numbers that have more than 19 digits. + size_t longest_digit_count = 19; // Optimization note: the compiler can probably merge // ((digit_count == 0) || (digit_count > longest_digit_count)) // into a single branch since digit_count is unsigned. @@ -14497,27 +15528,96 @@ simdjson_unused simdjson_really_inline simdjson_result parse_integer(co // } // as a single table lookup: if(integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); } - if (digit_count == longest_digit_count) { - if (negative) { - // Anything negative above INT64_MAX+1 is invalid - if (i > uint64_t(INT64_MAX)+1) { return INCORRECT_TYPE; } - return ~i+1; + // Negative numbers have can go down to - INT64_MAX - 1 whereas positive numbers are limited to INT64_MAX. + // Performance note: This check is only needed when digit_count == longest_digit_count but it is + // so cheap that we might as well always make it. + if(i > uint64_t(INT64_MAX) + uint64_t(negative)) { return INCORRECT_TYPE; } + return negative ? 
(~i+1) : i; +} - // Positive overflow check: - // - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the - // biggest uint64_t. - // - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX. - // If we got here, it's a 20 digit number starting with the digit "1". - // - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller - // than 1,553,255,926,290,448,384. - // - That is smaller than the smallest possible 20-digit number the user could write: - // 10,000,000,000,000,000,000. - // - Therefore, if the number is positive and lower than that, it's overflow. - // - The value we are looking at is less than or equal to 9,223,372,036,854,775,808 (INT64_MAX). - // - } else if (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INCORRECT_TYPE; } - } +// Parse any number from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807 +// Never read at src_end or beyond +simdjson_unused simdjson_really_inline simdjson_result parse_integer(const uint8_t * const src, const uint8_t * const src_end) noexcept { + // + // Check for minus sign + // + if(src == src_end) { return NUMBER_ERROR; } + bool negative = (*src == '-'); + const uint8_t *p = src + negative; + // + // Parse the integer part. + // + // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare + const uint8_t *const start_digits = p; + uint64_t i = 0; + while ((p != src_end) && parse_digit(*p, i)) { p++; } + + // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error. + // Optimization note: size_t is expected to be unsigned. + size_t digit_count = size_t(p - start_digits); + // We go from + // -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807 + // so we can never represent numbers that have more than 19 digits. 
+ size_t longest_digit_count = 19; + // Optimization note: the compiler can probably merge + // ((digit_count == 0) || (digit_count > longest_digit_count)) + // into a single branch since digit_count is unsigned. + if ((digit_count == 0) || (digit_count > longest_digit_count)) { return INCORRECT_TYPE; } + // Here digit_count > 0. + if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; } + // We can do the following... + // if (!jsoncharutils::is_structural_or_whitespace(*p)) { + // return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR; + // } + // as a single table lookup: + if((p != src_end) && integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); } + // Negative numbers have can go down to - INT64_MAX - 1 whereas positive numbers are limited to INT64_MAX. + // Performance note: This check is only needed when digit_count == longest_digit_count but it is + // so cheap that we might as well always make it. + if(i > uint64_t(INT64_MAX) + uint64_t(negative)) { return INCORRECT_TYPE; } + return negative ? (~i+1) : i; +} + +// Parse any number from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807 +simdjson_unused simdjson_really_inline simdjson_result parse_integer_in_string(const uint8_t *src) noexcept { + // + // Check for minus sign + // + bool negative = (*(src + 1) == '-'); + const uint8_t *p = src + negative + 1; + + // + // Parse the integer part. + // + // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare + const uint8_t *const start_digits = p; + uint64_t i = 0; + while (parse_digit(*p, i)) { p++; } + + // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error. + // Optimization note: size_t is expected to be unsigned. 
+ size_t digit_count = size_t(p - start_digits); + // We go from + // -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807 + // so we can never represent numbers that have more than 19 digits. + size_t longest_digit_count = 19; + // Optimization note: the compiler can probably merge + // ((digit_count == 0) || (digit_count > longest_digit_count)) + // into a single branch since digit_count is unsigned. + if ((digit_count == 0) || (digit_count > longest_digit_count)) { return INCORRECT_TYPE; } + // Here digit_count > 0. + if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; } + // We can do the following... + // if (!jsoncharutils::is_structural_or_whitespace(*p)) { + // return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR; + // } + // as a single table lookup: + if(*p != '"') { return NUMBER_ERROR; } + // Negative numbers have can go down to - INT64_MAX - 1 whereas positive numbers are limited to INT64_MAX. + // Performance note: This check is only needed when digit_count == longest_digit_count but it is + // so cheap that we might as well always make it. + if(i > uint64_t(INT64_MAX) + uint64_t(negative)) { return INCORRECT_TYPE; } return negative ? (~i+1) : i; } @@ -14598,6 +15698,167 @@ simdjson_unused simdjson_really_inline simdjson_result parse_double(cons } return d; } + + +// Never read at src_end or beyond +simdjson_unused simdjson_really_inline simdjson_result parse_double(const uint8_t * src, const uint8_t * const src_end) noexcept { + if(src == src_end) { return NUMBER_ERROR; } + // + // Check for minus sign + // + bool negative = (*src == '-'); + src += negative; + + // + // Parse the integer part. 
+ // + uint64_t i = 0; + const uint8_t *p = src; + if(p == src_end) { return NUMBER_ERROR; } + p += parse_digit(*p, i); + bool leading_zero = (i == 0); + while ((p != src_end) && parse_digit(*p, i)) { p++; } + // no integer digits, or 0123 (zero must be solo) + if ( p == src ) { return INCORRECT_TYPE; } + if ( (leading_zero && p != src+1)) { return NUMBER_ERROR; } + + // + // Parse the decimal part. + // + int64_t exponent = 0; + bool overflow; + if (simdjson_likely((p != src_end) && (*p == '.'))) { + p++; + const uint8_t *start_decimal_digits = p; + if ((p == src_end) || !parse_digit(*p, i)) { return NUMBER_ERROR; } // no decimal digits + p++; + while ((p != src_end) && parse_digit(*p, i)) { p++; } + exponent = -(p - start_decimal_digits); + + // Overflow check. More than 19 digits (minus the decimal) may be overflow. + overflow = p-src-1 > 19; + if (simdjson_unlikely(overflow && leading_zero)) { + // Skip leading 0.00000 and see if it still overflows + const uint8_t *start_digits = src + 2; + while (*start_digits == '0') { start_digits++; } + overflow = start_digits-src > 19; + } + } else { + overflow = p-src > 19; + } + + // + // Parse the exponent + // + if ((p != src_end) && (*p == 'e' || *p == 'E')) { + p++; + if(p == src_end) { return NUMBER_ERROR; } + bool exp_neg = *p == '-'; + p += exp_neg || *p == '+'; + + uint64_t exp = 0; + const uint8_t *start_exp_digits = p; + while ((p != src_end) && parse_digit(*p, exp)) { p++; } + // no exp digits, or 20+ exp digits + if (p-start_exp_digits == 0 || p-start_exp_digits > 19) { return NUMBER_ERROR; } + + exponent += exp_neg ? 
0-exp : exp; + } + + if ((p != src_end) && jsoncharutils::is_not_structural_or_whitespace(*p)) { return NUMBER_ERROR; } + + overflow = overflow || exponent < simdjson::internal::smallest_power || exponent > simdjson::internal::largest_power; + + // + // Assemble (or slow-parse) the float + // + double d; + if (simdjson_likely(!overflow)) { + if (compute_float_64(exponent, i, negative, d)) { return d; } + } + if (!parse_float_fallback(src-negative, src_end, &d)) { + return NUMBER_ERROR; + } + return d; +} + +simdjson_unused simdjson_really_inline simdjson_result parse_double_in_string(const uint8_t * src) noexcept { + // + // Check for minus sign + // + bool negative = (*(src + 1) == '-'); + src += negative + 1; + + // + // Parse the integer part. + // + uint64_t i = 0; + const uint8_t *p = src; + p += parse_digit(*p, i); + bool leading_zero = (i == 0); + while (parse_digit(*p, i)) { p++; } + // no integer digits, or 0123 (zero must be solo) + if ( p == src ) { return INCORRECT_TYPE; } + if ( (leading_zero && p != src+1)) { return NUMBER_ERROR; } + + // + // Parse the decimal part. + // + int64_t exponent = 0; + bool overflow; + if (simdjson_likely(*p == '.')) { + p++; + const uint8_t *start_decimal_digits = p; + if (!parse_digit(*p, i)) { return NUMBER_ERROR; } // no decimal digits + p++; + while (parse_digit(*p, i)) { p++; } + exponent = -(p - start_decimal_digits); + + // Overflow check. More than 19 digits (minus the decimal) may be overflow. 
+ overflow = p-src-1 > 19; + if (simdjson_unlikely(overflow && leading_zero)) { + // Skip leading 0.00000 and see if it still overflows + const uint8_t *start_digits = src + 2; + while (*start_digits == '0') { start_digits++; } + overflow = start_digits-src > 19; + } + } else { + overflow = p-src > 19; + } + + // + // Parse the exponent + // + if (*p == 'e' || *p == 'E') { + p++; + bool exp_neg = *p == '-'; + p += exp_neg || *p == '+'; + + uint64_t exp = 0; + const uint8_t *start_exp_digits = p; + while (parse_digit(*p, exp)) { p++; } + // no exp digits, or 20+ exp digits + if (p-start_exp_digits == 0 || p-start_exp_digits > 19) { return NUMBER_ERROR; } + + exponent += exp_neg ? 0-exp : exp; + } + + if (*p != '"') { return NUMBER_ERROR; } + + overflow = overflow || exponent < simdjson::internal::smallest_power || exponent > simdjson::internal::largest_power; + + // + // Assemble (or slow-parse) the float + // + double d; + if (simdjson_likely(!overflow)) { + if (compute_float_64(exponent, i, negative, d)) { return d; } + } + if (!parse_float_fallback(src-negative, &d)) { + return NUMBER_ERROR; + } + return d; +} } //namespace {} #endif // SIMDJSON_SKIPNUMBERPARSING @@ -14705,7 +15966,7 @@ public: dom_parser_implementation &operator=(const dom_parser_implementation &) = delete; simdjson_warn_unused error_code parse(const uint8_t *buf, size_t len, dom::document &doc) noexcept final; - simdjson_warn_unused error_code stage1(const uint8_t *buf, size_t len, bool partial) noexcept final; + simdjson_warn_unused error_code stage1(const uint8_t *buf, size_t len, stage1_mode partial) noexcept final; simdjson_warn_unused error_code stage2(dom::document &doc) noexcept final; simdjson_warn_unused error_code stage2_next(dom::document &doc) noexcept final; inline simdjson_warn_unused error_code set_capacity(size_t capacity) noexcept final; @@ -15772,7 +17033,7 @@ parse_eight_digits_unrolled(const uint8_t *chars) { } // namespace ppc64 } // namespace simdjson -#define 
SIMDJSON_SWAR_NUMBER_PARSING +#define SIMDJSON_SWAR_NUMBER_PARSING 1 /* begin file include/simdjson/generic/numberparsing.h */ #include @@ -15937,8 +17198,8 @@ simdjson_really_inline bool compute_float_64(int64_t power, uint64_t i, bool neg // Both i and power_of_five_128[index] have their most significant bit set to 1 which // implies that the either the most or the second most significant bit of the product // is 1. We pack values in this manner for efficiency reasons: it maximizes the use - // we make of the product. It also makes it easy to reason aboutthe product: there - // 0 or 1 leading zero in the product. + // we make of the product. It also makes it easy to reason about the product: there + // is 0 or 1 leading zero in the product. // Unless the least significant 9 bits of the high (64-bit) part of the full // product are all 1s, then we know that the most significant 55 bits are @@ -16053,7 +17314,7 @@ simdjson_really_inline bool compute_float_64(int64_t power, uint64_t i, bool neg mantissa &= ~(1ULL << 52); // we have to check that real_exponent is in range, otherwise we bail out if (simdjson_unlikely(real_exponent > 2046)) { - // We have an infinte value!!! We could actually throw an error here if we could. + // We have an infinite value!!! We could actually throw an error here if we could. return false; } d = to_double(mantissa, real_exponent, negative); @@ -16081,6 +17342,20 @@ static bool parse_float_fallback(const uint8_t *ptr, double *outDouble) { // to handle that max may be a macro on windows). return !(*outDouble > (std::numeric_limits::max)() || *outDouble < std::numeric_limits::lowest()); } +static bool parse_float_fallback(const uint8_t *ptr, const uint8_t *end_ptr, double *outDouble) { + *outDouble = simdjson::internal::from_chars(reinterpret_cast(ptr), reinterpret_cast(end_ptr)); + // We do not accept infinite values. 
+ + // Detecting finite values in a portable manner is ridiculously hard, ideally + // we would want to do: + // return !std::isfinite(*outDouble); + // but that mysteriously fails under legacy/old libc++ libraries, see + // https://github.com/simdjson/simdjson/issues/1286 + // + // Therefore, fall back to this solution (the extra parens are there + // to handle that max may be a macro on windows). + return !(*outDouble > (std::numeric_limits::max)() || *outDouble < std::numeric_limits::lowest()); +} // check quickly whether the next 8 chars are made of digits // at a glance, it looks better than Mula's @@ -16130,13 +17405,15 @@ simdjson_really_inline error_code parse_decimal(simdjson_unused const uint8_t *c const uint8_t *const first_after_period = p; #ifdef SIMDJSON_SWAR_NUMBER_PARSING +#if SIMDJSON_SWAR_NUMBER_PARSING // this helps if we have lots of decimals! // this turns out to be frequent enough. if (is_made_of_eight_digits_fast(p)) { i = i * 100000000 + parse_eight_digits_unrolled(p); p += 8; } -#endif +#endif // SIMDJSON_SWAR_NUMBER_PARSING +#endif // #ifdef SIMDJSON_SWAR_NUMBER_PARSING // Unrolling the first digit makes a small difference on some implementations (e.g. westmere) if (parse_digit(*p, i)) { ++p; } while (parse_digit(*p, i)) { p++; } @@ -16203,9 +17480,7 @@ simdjson_really_inline size_t significant_digits(const uint8_t * start_digits, s // It is possible that the integer had an overflow. // We have to handle the case where we have 0.0000somenumber. const uint8_t *start = start_digits; - while ((*start == '0') || (*start == '.')) { - start++; - } + while ((*start == '0') || (*start == '.')) { ++start; } // we over-decrement by one when there is a '.' return digit_count - size_t(start - start_digits); } @@ -16216,7 +17491,7 @@ simdjson_really_inline error_code write_float(const uint8_t *const src, bool neg // we could extend our code by using a 128-bit integer instead // of a 64-bit integer. However, this is uncommon in practice. 
// - // 9999999999999999999 < 2**64 so we can accomodate 19 digits. + // 9999999999999999999 < 2**64 so we can accommodate 19 digits. // If we have a decimal separator, then digit_count - 1 is the number of digits, but we // may not have a decimal separator! if (simdjson_unlikely(digit_count > 19 && significant_digits(start_digits, digit_count) > 19)) { @@ -16275,6 +17550,9 @@ simdjson_really_inline error_code parse_number(const uint8_t *const, W &writer) simdjson_unused simdjson_really_inline simdjson_result parse_unsigned(const uint8_t * const src) noexcept { return 0; } simdjson_unused simdjson_really_inline simdjson_result parse_integer(const uint8_t * const src) noexcept { return 0; } simdjson_unused simdjson_really_inline simdjson_result parse_double(const uint8_t * const src) noexcept { return 0; } +simdjson_unused simdjson_really_inline simdjson_result parse_unsigned_in_string(const uint8_t * const src) noexcept { return 0; } +simdjson_unused simdjson_really_inline simdjson_result parse_integer_in_string(const uint8_t * const src) noexcept { return 0; } +simdjson_unused simdjson_really_inline simdjson_result parse_double_in_string(const uint8_t * const src) noexcept { return 0; } #else @@ -16485,6 +17763,104 @@ simdjson_unused simdjson_really_inline simdjson_result parse_unsigned( return i; } + +// Parse any number from 0 to 18,446,744,073,709,551,615 +// Never read at src_end or beyond +simdjson_unused simdjson_really_inline simdjson_result parse_unsigned(const uint8_t * const src, const uint8_t * const src_end) noexcept { + const uint8_t *p = src; + // + // Parse the integer part. + // + // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare + const uint8_t *const start_digits = p; + uint64_t i = 0; + while ((p != src_end) && parse_digit(*p, i)) { p++; } + + // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error. 
+ // Optimization note: size_t is expected to be unsigned. + size_t digit_count = size_t(p - start_digits); + // The longest positive 64-bit number is 20 digits. + // We do it this way so we don't trigger this branch unless we must. + // Optimization note: the compiler can probably merge + // ((digit_count == 0) || (digit_count > 20)) + // into a single branch since digit_count is unsigned. + if ((digit_count == 0) || (digit_count > 20)) { return INCORRECT_TYPE; } + // Here digit_count > 0. + if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; } + // We can do the following... + // if (!jsoncharutils::is_structural_or_whitespace(*p)) { + // return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR; + // } + // as a single table lookup: + if ((p != src_end) && integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); } + + if (digit_count == 20) { + // Positive overflow check: + // - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the + // biggest uint64_t. + // - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX. + // If we got here, it's a 20 digit number starting with the digit "1". + // - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller + // than 1,553,255,926,290,448,384. + // - That is smaller than the smallest possible 20-digit number the user could write: + // 10,000,000,000,000,000,000. + // - Therefore, if the number is positive and lower than that, it's overflow. + // - The value we are looking at is less than or equal to 9,223,372,036,854,775,808 (INT64_MAX). 
+ // + if (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INCORRECT_TYPE; } + } + + return i; +} + +// Parse any number from 0 to 18,446,744,073,709,551,615 +simdjson_unused simdjson_really_inline simdjson_result parse_unsigned_in_string(const uint8_t * const src) noexcept { + const uint8_t *p = src + 1; + // + // Parse the integer part. + // + // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare + const uint8_t *const start_digits = p; + uint64_t i = 0; + while (parse_digit(*p, i)) { p++; } + + // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error. + // Optimization note: size_t is expected to be unsigned. + size_t digit_count = size_t(p - start_digits); + // The longest positive 64-bit number is 20 digits. + // We do it this way so we don't trigger this branch unless we must. + // Optimization note: the compiler can probably merge + // ((digit_count == 0) || (digit_count > 20)) + // into a single branch since digit_count is unsigned. + if ((digit_count == 0) || (digit_count > 20)) { return INCORRECT_TYPE; } + // Here digit_count > 0. + if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; } + // We can do the following... + // if (!jsoncharutils::is_structural_or_whitespace(*p)) { + // return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR; + // } + // as a single table lookup: + if (*p != '"') { return NUMBER_ERROR; } + + if (digit_count == 20) { + // Positive overflow check: + // - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the + // biggest uint64_t. + // - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX. + // If we got here, it's a 20 digit number starting with the digit "1". + // - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller + // than 1,553,255,926,290,448,384. 
+ // - That is smaller than the smallest possible 20-digit number the user could write: + // 10,000,000,000,000,000,000. + // - Therefore, if the number is positive and lower than that, it's overflow. + // - The value we are looking at is less than or equal to 9,223,372,036,854,775,808 (INT64_MAX). + // + if (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INCORRECT_TYPE; } + } + + return i; +} + // Parse any number from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807 simdjson_unused simdjson_really_inline simdjson_result parse_integer(const uint8_t *src) noexcept { // @@ -16504,10 +17880,10 @@ simdjson_unused simdjson_really_inline simdjson_result parse_integer(co // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error. // Optimization note: size_t is expected to be unsigned. size_t digit_count = size_t(p - start_digits); - // The longest negative 64-bit number is 19 digits. - // The longest positive 64-bit number is 20 digits. - // We do it this way so we don't trigger this branch unless we must. - size_t longest_digit_count = negative ? 19 : 20; + // We go from + // -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807 + // so we can never represent numbers that have more than 19 digits. + size_t longest_digit_count = 19; // Optimization note: the compiler can probably merge // ((digit_count == 0) || (digit_count > longest_digit_count)) // into a single branch since digit_count is unsigned. 
@@ -16520,27 +17896,96 @@ simdjson_unused simdjson_really_inline simdjson_result parse_integer(co // } // as a single table lookup: if(integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); } - if (digit_count == longest_digit_count) { - if (negative) { - // Anything negative above INT64_MAX+1 is invalid - if (i > uint64_t(INT64_MAX)+1) { return INCORRECT_TYPE; } - return ~i+1; + // Negative numbers have can go down to - INT64_MAX - 1 whereas positive numbers are limited to INT64_MAX. + // Performance note: This check is only needed when digit_count == longest_digit_count but it is + // so cheap that we might as well always make it. + if(i > uint64_t(INT64_MAX) + uint64_t(negative)) { return INCORRECT_TYPE; } + return negative ? (~i+1) : i; +} - // Positive overflow check: - // - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the - // biggest uint64_t. - // - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX. - // If we got here, it's a 20 digit number starting with the digit "1". - // - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller - // than 1,553,255,926,290,448,384. - // - That is smaller than the smallest possible 20-digit number the user could write: - // 10,000,000,000,000,000,000. - // - Therefore, if the number is positive and lower than that, it's overflow. - // - The value we are looking at is less than or equal to 9,223,372,036,854,775,808 (INT64_MAX). 
- // - } else if (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INCORRECT_TYPE; } - } +// Parse any number from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807 +// Never read at src_end or beyond +simdjson_unused simdjson_really_inline simdjson_result parse_integer(const uint8_t * const src, const uint8_t * const src_end) noexcept { + // + // Check for minus sign + // + if(src == src_end) { return NUMBER_ERROR; } + bool negative = (*src == '-'); + const uint8_t *p = src + negative; + // + // Parse the integer part. + // + // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare + const uint8_t *const start_digits = p; + uint64_t i = 0; + while ((p != src_end) && parse_digit(*p, i)) { p++; } + + // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error. + // Optimization note: size_t is expected to be unsigned. + size_t digit_count = size_t(p - start_digits); + // We go from + // -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807 + // so we can never represent numbers that have more than 19 digits. + size_t longest_digit_count = 19; + // Optimization note: the compiler can probably merge + // ((digit_count == 0) || (digit_count > longest_digit_count)) + // into a single branch since digit_count is unsigned. + if ((digit_count == 0) || (digit_count > longest_digit_count)) { return INCORRECT_TYPE; } + // Here digit_count > 0. + if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; } + // We can do the following... + // if (!jsoncharutils::is_structural_or_whitespace(*p)) { + // return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR; + // } + // as a single table lookup: + if((p != src_end) && integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); } + // Negative numbers have can go down to - INT64_MAX - 1 whereas positive numbers are limited to INT64_MAX. 
+ // Performance note: This check is only needed when digit_count == longest_digit_count but it is + // so cheap that we might as well always make it. + if(i > uint64_t(INT64_MAX) + uint64_t(negative)) { return INCORRECT_TYPE; } + return negative ? (~i+1) : i; +} + +// Parse any number from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807 +simdjson_unused simdjson_really_inline simdjson_result parse_integer_in_string(const uint8_t *src) noexcept { + // + // Check for minus sign + // + bool negative = (*(src + 1) == '-'); + const uint8_t *p = src + negative + 1; + + // + // Parse the integer part. + // + // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare + const uint8_t *const start_digits = p; + uint64_t i = 0; + while (parse_digit(*p, i)) { p++; } + + // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error. + // Optimization note: size_t is expected to be unsigned. + size_t digit_count = size_t(p - start_digits); + // We go from + // -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807 + // so we can never represent numbers that have more than 19 digits. + size_t longest_digit_count = 19; + // Optimization note: the compiler can probably merge + // ((digit_count == 0) || (digit_count > longest_digit_count)) + // into a single branch since digit_count is unsigned. + if ((digit_count == 0) || (digit_count > longest_digit_count)) { return INCORRECT_TYPE; } + // Here digit_count > 0. + if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; } + // We can do the following... + // if (!jsoncharutils::is_structural_or_whitespace(*p)) { + // return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR; + // } + // as a single table lookup: + if(*p != '"') { return NUMBER_ERROR; } + // Negative numbers have can go down to - INT64_MAX - 1 whereas positive numbers are limited to INT64_MAX. 
+ // Performance note: This check is only needed when digit_count == longest_digit_count but it is + // so cheap that we might as well always make it. + if(i > uint64_t(INT64_MAX) + uint64_t(negative)) { return INCORRECT_TYPE; } return negative ? (~i+1) : i; } @@ -16621,6 +18066,167 @@ simdjson_unused simdjson_really_inline simdjson_result parse_double(cons } return d; } + + +// Never read at src_end or beyond +simdjson_unused simdjson_really_inline simdjson_result parse_double(const uint8_t * src, const uint8_t * const src_end) noexcept { + if(src == src_end) { return NUMBER_ERROR; } + // + // Check for minus sign + // + bool negative = (*src == '-'); + src += negative; + + // + // Parse the integer part. + // + uint64_t i = 0; + const uint8_t *p = src; + if(p == src_end) { return NUMBER_ERROR; } + p += parse_digit(*p, i); + bool leading_zero = (i == 0); + while ((p != src_end) && parse_digit(*p, i)) { p++; } + // no integer digits, or 0123 (zero must be solo) + if ( p == src ) { return INCORRECT_TYPE; } + if ( (leading_zero && p != src+1)) { return NUMBER_ERROR; } + + // + // Parse the decimal part. + // + int64_t exponent = 0; + bool overflow; + if (simdjson_likely((p != src_end) && (*p == '.'))) { + p++; + const uint8_t *start_decimal_digits = p; + if ((p == src_end) || !parse_digit(*p, i)) { return NUMBER_ERROR; } // no decimal digits + p++; + while ((p != src_end) && parse_digit(*p, i)) { p++; } + exponent = -(p - start_decimal_digits); + + // Overflow check. More than 19 digits (minus the decimal) may be overflow. 
+ overflow = p-src-1 > 19; + if (simdjson_unlikely(overflow && leading_zero)) { + // Skip leading 0.00000 and see if it still overflows + const uint8_t *start_digits = src + 2; + while (*start_digits == '0') { start_digits++; } + overflow = start_digits-src > 19; + } + } else { + overflow = p-src > 19; + } + + // + // Parse the exponent + // + if ((p != src_end) && (*p == 'e' || *p == 'E')) { + p++; + if(p == src_end) { return NUMBER_ERROR; } + bool exp_neg = *p == '-'; + p += exp_neg || *p == '+'; + + uint64_t exp = 0; + const uint8_t *start_exp_digits = p; + while ((p != src_end) && parse_digit(*p, exp)) { p++; } + // no exp digits, or 20+ exp digits + if (p-start_exp_digits == 0 || p-start_exp_digits > 19) { return NUMBER_ERROR; } + + exponent += exp_neg ? 0-exp : exp; + } + + if ((p != src_end) && jsoncharutils::is_not_structural_or_whitespace(*p)) { return NUMBER_ERROR; } + + overflow = overflow || exponent < simdjson::internal::smallest_power || exponent > simdjson::internal::largest_power; + + // + // Assemble (or slow-parse) the float + // + double d; + if (simdjson_likely(!overflow)) { + if (compute_float_64(exponent, i, negative, d)) { return d; } + } + if (!parse_float_fallback(src-negative, src_end, &d)) { + return NUMBER_ERROR; + } + return d; +} + +simdjson_unused simdjson_really_inline simdjson_result parse_double_in_string(const uint8_t * src) noexcept { + // + // Check for minus sign + // + bool negative = (*(src + 1) == '-'); + src += negative + 1; + + // + // Parse the integer part. + // + uint64_t i = 0; + const uint8_t *p = src; + p += parse_digit(*p, i); + bool leading_zero = (i == 0); + while (parse_digit(*p, i)) { p++; } + // no integer digits, or 0123 (zero must be solo) + if ( p == src ) { return INCORRECT_TYPE; } + if ( (leading_zero && p != src+1)) { return NUMBER_ERROR; } + + // + // Parse the decimal part. 
+ // + int64_t exponent = 0; + bool overflow; + if (simdjson_likely(*p == '.')) { + p++; + const uint8_t *start_decimal_digits = p; + if (!parse_digit(*p, i)) { return NUMBER_ERROR; } // no decimal digits + p++; + while (parse_digit(*p, i)) { p++; } + exponent = -(p - start_decimal_digits); + + // Overflow check. More than 19 digits (minus the decimal) may be overflow. + overflow = p-src-1 > 19; + if (simdjson_unlikely(overflow && leading_zero)) { + // Skip leading 0.00000 and see if it still overflows + const uint8_t *start_digits = src + 2; + while (*start_digits == '0') { start_digits++; } + overflow = start_digits-src > 19; + } + } else { + overflow = p-src > 19; + } + + // + // Parse the exponent + // + if (*p == 'e' || *p == 'E') { + p++; + bool exp_neg = *p == '-'; + p += exp_neg || *p == '+'; + + uint64_t exp = 0; + const uint8_t *start_exp_digits = p; + while (parse_digit(*p, exp)) { p++; } + // no exp digits, or 20+ exp digits + if (p-start_exp_digits == 0 || p-start_exp_digits > 19) { return NUMBER_ERROR; } + + exponent += exp_neg ? 
0-exp : exp; + } + + if (*p != '"') { return NUMBER_ERROR; } + + overflow = overflow || exponent < simdjson::internal::smallest_power || exponent > simdjson::internal::largest_power; + + // + // Assemble (or slow-parse) the float + // + double d; + if (simdjson_likely(!overflow)) { + if (compute_float_64(exponent, i, negative, d)) { return d; } + } + if (!parse_float_fallback(src-negative, &d)) { + return NUMBER_ERROR; + } + return d; +} } //namespace {} #endif // SIMDJSON_SKIPNUMBERPARSING @@ -16769,7 +18375,7 @@ public: dom_parser_implementation &operator=(const dom_parser_implementation &) = delete; simdjson_warn_unused error_code parse(const uint8_t *buf, size_t len, dom::document &doc) noexcept final; - simdjson_warn_unused error_code stage1(const uint8_t *buf, size_t len, bool partial) noexcept final; + simdjson_warn_unused error_code stage1(const uint8_t *buf, size_t len, stage1_mode partial) noexcept final; simdjson_warn_unused error_code stage2(dom::document &doc) noexcept final; simdjson_warn_unused error_code stage2_next(dom::document &doc) noexcept final; inline simdjson_warn_unused error_code set_capacity(size_t capacity) noexcept final; @@ -17653,7 +19259,7 @@ static simdjson_really_inline uint32_t parse_eight_digits_unrolled(const uint8_t } // namespace westmere } // namespace simdjson -#define SIMDJSON_SWAR_NUMBER_PARSING +#define SIMDJSON_SWAR_NUMBER_PARSING 1 /* begin file include/simdjson/generic/numberparsing.h */ #include @@ -17818,8 +19424,8 @@ simdjson_really_inline bool compute_float_64(int64_t power, uint64_t i, bool neg // Both i and power_of_five_128[index] have their most significant bit set to 1 which // implies that the either the most or the second most significant bit of the product // is 1. We pack values in this manner for efficiency reasons: it maximizes the use - // we make of the product. It also makes it easy to reason aboutthe product: there - // 0 or 1 leading zero in the product. + // we make of the product. 
It also makes it easy to reason about the product: there + // is 0 or 1 leading zero in the product. // Unless the least significant 9 bits of the high (64-bit) part of the full // product are all 1s, then we know that the most significant 55 bits are @@ -17934,7 +19540,7 @@ simdjson_really_inline bool compute_float_64(int64_t power, uint64_t i, bool neg mantissa &= ~(1ULL << 52); // we have to check that real_exponent is in range, otherwise we bail out if (simdjson_unlikely(real_exponent > 2046)) { - // We have an infinte value!!! We could actually throw an error here if we could. + // We have an infinite value!!! We could actually throw an error here if we could. return false; } d = to_double(mantissa, real_exponent, negative); @@ -17962,6 +19568,20 @@ static bool parse_float_fallback(const uint8_t *ptr, double *outDouble) { // to handle that max may be a macro on windows). return !(*outDouble > (std::numeric_limits::max)() || *outDouble < std::numeric_limits::lowest()); } +static bool parse_float_fallback(const uint8_t *ptr, const uint8_t *end_ptr, double *outDouble) { + *outDouble = simdjson::internal::from_chars(reinterpret_cast(ptr), reinterpret_cast(end_ptr)); + // We do not accept infinite values. + + // Detecting finite values in a portable manner is ridiculously hard, ideally + // we would want to do: + // return !std::isfinite(*outDouble); + // but that mysteriously fails under legacy/old libc++ libraries, see + // https://github.com/simdjson/simdjson/issues/1286 + // + // Therefore, fall back to this solution (the extra parens are there + // to handle that max may be a macro on windows). 
+ return !(*outDouble > (std::numeric_limits::max)() || *outDouble < std::numeric_limits::lowest()); +} // check quickly whether the next 8 chars are made of digits // at a glance, it looks better than Mula's @@ -18011,13 +19631,15 @@ simdjson_really_inline error_code parse_decimal(simdjson_unused const uint8_t *c const uint8_t *const first_after_period = p; #ifdef SIMDJSON_SWAR_NUMBER_PARSING +#if SIMDJSON_SWAR_NUMBER_PARSING // this helps if we have lots of decimals! // this turns out to be frequent enough. if (is_made_of_eight_digits_fast(p)) { i = i * 100000000 + parse_eight_digits_unrolled(p); p += 8; } -#endif +#endif // SIMDJSON_SWAR_NUMBER_PARSING +#endif // #ifdef SIMDJSON_SWAR_NUMBER_PARSING // Unrolling the first digit makes a small difference on some implementations (e.g. westmere) if (parse_digit(*p, i)) { ++p; } while (parse_digit(*p, i)) { p++; } @@ -18084,9 +19706,7 @@ simdjson_really_inline size_t significant_digits(const uint8_t * start_digits, s // It is possible that the integer had an overflow. // We have to handle the case where we have 0.0000somenumber. const uint8_t *start = start_digits; - while ((*start == '0') || (*start == '.')) { - start++; - } + while ((*start == '0') || (*start == '.')) { ++start; } // we over-decrement by one when there is a '.' return digit_count - size_t(start - start_digits); } @@ -18097,7 +19717,7 @@ simdjson_really_inline error_code write_float(const uint8_t *const src, bool neg // we could extend our code by using a 128-bit integer instead // of a 64-bit integer. However, this is uncommon in practice. // - // 9999999999999999999 < 2**64 so we can accomodate 19 digits. + // 9999999999999999999 < 2**64 so we can accommodate 19 digits. // If we have a decimal separator, then digit_count - 1 is the number of digits, but we // may not have a decimal separator! 
if (simdjson_unlikely(digit_count > 19 && significant_digits(start_digits, digit_count) > 19)) { @@ -18156,6 +19776,9 @@ simdjson_really_inline error_code parse_number(const uint8_t *const, W &writer) simdjson_unused simdjson_really_inline simdjson_result parse_unsigned(const uint8_t * const src) noexcept { return 0; } simdjson_unused simdjson_really_inline simdjson_result parse_integer(const uint8_t * const src) noexcept { return 0; } simdjson_unused simdjson_really_inline simdjson_result parse_double(const uint8_t * const src) noexcept { return 0; } +simdjson_unused simdjson_really_inline simdjson_result parse_unsigned_in_string(const uint8_t * const src) noexcept { return 0; } +simdjson_unused simdjson_really_inline simdjson_result parse_integer_in_string(const uint8_t * const src) noexcept { return 0; } +simdjson_unused simdjson_really_inline simdjson_result parse_double_in_string(const uint8_t * const src) noexcept { return 0; } #else @@ -18366,6 +19989,104 @@ simdjson_unused simdjson_really_inline simdjson_result parse_unsigned( return i; } + +// Parse any number from 0 to 18,446,744,073,709,551,615 +// Never read at src_end or beyond +simdjson_unused simdjson_really_inline simdjson_result parse_unsigned(const uint8_t * const src, const uint8_t * const src_end) noexcept { + const uint8_t *p = src; + // + // Parse the integer part. + // + // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare + const uint8_t *const start_digits = p; + uint64_t i = 0; + while ((p != src_end) && parse_digit(*p, i)) { p++; } + + // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error. + // Optimization note: size_t is expected to be unsigned. + size_t digit_count = size_t(p - start_digits); + // The longest positive 64-bit number is 20 digits. + // We do it this way so we don't trigger this branch unless we must. 
+ // Optimization note: the compiler can probably merge + // ((digit_count == 0) || (digit_count > 20)) + // into a single branch since digit_count is unsigned. + if ((digit_count == 0) || (digit_count > 20)) { return INCORRECT_TYPE; } + // Here digit_count > 0. + if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; } + // We can do the following... + // if (!jsoncharutils::is_structural_or_whitespace(*p)) { + // return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR; + // } + // as a single table lookup: + if ((p != src_end) && integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); } + + if (digit_count == 20) { + // Positive overflow check: + // - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the + // biggest uint64_t. + // - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX. + // If we got here, it's a 20 digit number starting with the digit "1". + // - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller + // than 1,553,255,926,290,448,384. + // - That is smaller than the smallest possible 20-digit number the user could write: + // 10,000,000,000,000,000,000. + // - Therefore, if the number is positive and lower than that, it's overflow. + // - The value we are looking at is less than or equal to 9,223,372,036,854,775,808 (INT64_MAX). + // + if (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INCORRECT_TYPE; } + } + + return i; +} + +// Parse any number from 0 to 18,446,744,073,709,551,615 +simdjson_unused simdjson_really_inline simdjson_result parse_unsigned_in_string(const uint8_t * const src) noexcept { + const uint8_t *p = src + 1; + // + // Parse the integer part. 
+ // + // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare + const uint8_t *const start_digits = p; + uint64_t i = 0; + while (parse_digit(*p, i)) { p++; } + + // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error. + // Optimization note: size_t is expected to be unsigned. + size_t digit_count = size_t(p - start_digits); + // The longest positive 64-bit number is 20 digits. + // We do it this way so we don't trigger this branch unless we must. + // Optimization note: the compiler can probably merge + // ((digit_count == 0) || (digit_count > 20)) + // into a single branch since digit_count is unsigned. + if ((digit_count == 0) || (digit_count > 20)) { return INCORRECT_TYPE; } + // Here digit_count > 0. + if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; } + // We can do the following... + // if (!jsoncharutils::is_structural_or_whitespace(*p)) { + // return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR; + // } + // as a single table lookup: + if (*p != '"') { return NUMBER_ERROR; } + + if (digit_count == 20) { + // Positive overflow check: + // - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the + // biggest uint64_t. + // - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX. + // If we got here, it's a 20 digit number starting with the digit "1". + // - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller + // than 1,553,255,926,290,448,384. + // - That is smaller than the smallest possible 20-digit number the user could write: + // 10,000,000,000,000,000,000. + // - Therefore, if the number is positive and lower than that, it's overflow. + // - The value we are looking at is less than or equal to 9,223,372,036,854,775,808 (INT64_MAX). 
+ // Note: src[0] is the opening quote here, so the leading digit to inspect is src[1]. + if (src[1] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INCORRECT_TYPE; } + } + + return i; +} + // Parse any number from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807 simdjson_unused simdjson_really_inline simdjson_result parse_integer(const uint8_t *src) noexcept { // @@ -18385,10 +20106,10 @@ simdjson_unused simdjson_really_inline simdjson_result parse_integer(co // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error. // Optimization note: size_t is expected to be unsigned. size_t digit_count = size_t(p - start_digits); - // The longest negative 64-bit number is 19 digits. - // The longest positive 64-bit number is 20 digits. - // We do it this way so we don't trigger this branch unless we must. - size_t longest_digit_count = negative ? 19 : 20; + // We go from + // -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807 + // so we can never represent numbers that have more than 19 digits. + size_t longest_digit_count = 19; // Optimization note: the compiler can probably merge // ((digit_count == 0) || (digit_count > longest_digit_count)) // into a single branch since digit_count is unsigned. @@ -18401,27 +20122,96 @@ simdjson_unused simdjson_really_inline simdjson_result parse_integer(co // } // as a single table lookup: if(integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); } - if (digit_count == longest_digit_count) { - if (negative) { - // Anything negative above INT64_MAX+1 is invalid - if (i > uint64_t(INT64_MAX)+1) { return INCORRECT_TYPE; } - return ~i+1; + // Negative numbers can go down to - INT64_MAX - 1 whereas positive numbers are limited to INT64_MAX. + // Performance note: This check is only needed when digit_count == longest_digit_count but it is + // so cheap that we might as well always make it. + if(i > uint64_t(INT64_MAX) + uint64_t(negative)) { return INCORRECT_TYPE; } + return negative ? 
(~i+1) : i; +} - // Positive overflow check: - // - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the - // biggest uint64_t. - // - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX. - // If we got here, it's a 20 digit number starting with the digit "1". - // - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller - // than 1,553,255,926,290,448,384. - // - That is smaller than the smallest possible 20-digit number the user could write: - // 10,000,000,000,000,000,000. - // - Therefore, if the number is positive and lower than that, it's overflow. - // - The value we are looking at is less than or equal to 9,223,372,036,854,775,808 (INT64_MAX). - // - } else if (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INCORRECT_TYPE; } - } +// Parse any number from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807 +// Never read at src_end or beyond +simdjson_unused simdjson_really_inline simdjson_result parse_integer(const uint8_t * const src, const uint8_t * const src_end) noexcept { + // + // Check for minus sign + // + if(src == src_end) { return NUMBER_ERROR; } + bool negative = (*src == '-'); + const uint8_t *p = src + negative; + // + // Parse the integer part. + // + // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare + const uint8_t *const start_digits = p; + uint64_t i = 0; + while ((p != src_end) && parse_digit(*p, i)) { p++; } + + // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error. + // Optimization note: size_t is expected to be unsigned. + size_t digit_count = size_t(p - start_digits); + // We go from + // -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807 + // so we can never represent numbers that have more than 19 digits. 
+ size_t longest_digit_count = 19; + // Optimization note: the compiler can probably merge + // ((digit_count == 0) || (digit_count > longest_digit_count)) + // into a single branch since digit_count is unsigned. + if ((digit_count == 0) || (digit_count > longest_digit_count)) { return INCORRECT_TYPE; } + // Here digit_count > 0. + if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; } + // We can do the following... + // if (!jsoncharutils::is_structural_or_whitespace(*p)) { + // return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR; + // } + // as a single table lookup: + if((p != src_end) && integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); } + // Negative numbers can go down to - INT64_MAX - 1 whereas positive numbers are limited to INT64_MAX. + // Performance note: This check is only needed when digit_count == longest_digit_count but it is + // so cheap that we might as well always make it. + if(i > uint64_t(INT64_MAX) + uint64_t(negative)) { return INCORRECT_TYPE; } + return negative ? (~i+1) : i; +} + +// Parse any number from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807 +simdjson_unused simdjson_really_inline simdjson_result parse_integer_in_string(const uint8_t *src) noexcept { + // + // Check for minus sign + // + bool negative = (*(src + 1) == '-'); + const uint8_t *p = src + negative + 1; + + // + // Parse the integer part. + // + // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare + const uint8_t *const start_digits = p; + uint64_t i = 0; + while (parse_digit(*p, i)) { p++; } + + // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error. + // Optimization note: size_t is expected to be unsigned. 
+ size_t digit_count = size_t(p - start_digits); + // We go from + // -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807 + // so we can never represent numbers that have more than 19 digits. + size_t longest_digit_count = 19; + // Optimization note: the compiler can probably merge + // ((digit_count == 0) || (digit_count > longest_digit_count)) + // into a single branch since digit_count is unsigned. + if ((digit_count == 0) || (digit_count > longest_digit_count)) { return INCORRECT_TYPE; } + // Here digit_count > 0. + if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; } + // Unlike the unquoted variant, there is no table lookup here: + // the number is enclosed in a string, so the only acceptable + // character right after the digits is the closing quote. + // Anything else (including '.', 'e' or 'E') is reported + // as NUMBER_ERROR: + if(*p != '"') { return NUMBER_ERROR; } + // Negative numbers can go down to - INT64_MAX - 1 whereas positive numbers are limited to INT64_MAX. + // Performance note: This check is only needed when digit_count == longest_digit_count but it is + // so cheap that we might as well always make it. + if(i > uint64_t(INT64_MAX) + uint64_t(negative)) { return INCORRECT_TYPE; } return negative ? (~i+1) : i; } @@ -18502,6 +20292,167 @@ simdjson_unused simdjson_really_inline simdjson_result parse_double(cons } return d; } + + +// Never read at src_end or beyond +simdjson_unused simdjson_really_inline simdjson_result parse_double(const uint8_t * src, const uint8_t * const src_end) noexcept { + if(src == src_end) { return NUMBER_ERROR; } + // + // Check for minus sign + // + bool negative = (*src == '-'); + src += negative; + + // + // Parse the integer part. 
+ // + uint64_t i = 0; + const uint8_t *p = src; + if(p == src_end) { return NUMBER_ERROR; } + p += parse_digit(*p, i); + bool leading_zero = (i == 0); + while ((p != src_end) && parse_digit(*p, i)) { p++; } + // no integer digits, or 0123 (zero must be solo) + if ( p == src ) { return INCORRECT_TYPE; } + if ( (leading_zero && p != src+1)) { return NUMBER_ERROR; } + + // + // Parse the decimal part. + // + int64_t exponent = 0; + bool overflow; + if (simdjson_likely((p != src_end) && (*p == '.'))) { + p++; + const uint8_t *start_decimal_digits = p; + if ((p == src_end) || !parse_digit(*p, i)) { return NUMBER_ERROR; } // no decimal digits + p++; + while ((p != src_end) && parse_digit(*p, i)) { p++; } + exponent = -(p - start_decimal_digits); + + // Overflow check. More than 19 digits (minus the decimal) may be overflow. + overflow = p-src-1 > 19; + if (simdjson_unlikely(overflow && leading_zero)) { + // Skip leading 0.00000 (never reading at src_end) and see if the significant digits still overflow + const uint8_t *start_digits = src + 2; + while ((start_digits != src_end) && (*start_digits == '0')) { start_digits++; } + overflow = p-start_digits > 19; + } + } else { + overflow = p-src > 19; + } + + // + // Parse the exponent + // + if ((p != src_end) && (*p == 'e' || *p == 'E')) { + p++; + if(p == src_end) { return NUMBER_ERROR; } + bool exp_neg = *p == '-'; + p += exp_neg || *p == '+'; + + uint64_t exp = 0; + const uint8_t *start_exp_digits = p; + while ((p != src_end) && parse_digit(*p, exp)) { p++; } + // no exp digits, or 20+ exp digits + if (p-start_exp_digits == 0 || p-start_exp_digits > 19) { return NUMBER_ERROR; } + + exponent += exp_neg ? 
0-exp : exp; + } + + if ((p != src_end) && jsoncharutils::is_not_structural_or_whitespace(*p)) { return NUMBER_ERROR; } + + overflow = overflow || exponent < simdjson::internal::smallest_power || exponent > simdjson::internal::largest_power; + + // + // Assemble (or slow-parse) the float + // + double d; + if (simdjson_likely(!overflow)) { + if (compute_float_64(exponent, i, negative, d)) { return d; } + } + if (!parse_float_fallback(src-negative, src_end, &d)) { + return NUMBER_ERROR; + } + return d; +} + +simdjson_unused simdjson_really_inline simdjson_result parse_double_in_string(const uint8_t * src) noexcept { + // + // Check for minus sign + // + bool negative = (*(src + 1) == '-'); + src += negative + 1; + + // + // Parse the integer part. + // + uint64_t i = 0; + const uint8_t *p = src; + p += parse_digit(*p, i); + bool leading_zero = (i == 0); + while (parse_digit(*p, i)) { p++; } + // no integer digits, or 0123 (zero must be solo) + if ( p == src ) { return INCORRECT_TYPE; } + if ( (leading_zero && p != src+1)) { return NUMBER_ERROR; } + + // + // Parse the decimal part. + // + int64_t exponent = 0; + bool overflow; + if (simdjson_likely(*p == '.')) { + p++; + const uint8_t *start_decimal_digits = p; + if (!parse_digit(*p, i)) { return NUMBER_ERROR; } // no decimal digits + p++; + while (parse_digit(*p, i)) { p++; } + exponent = -(p - start_decimal_digits); + + // Overflow check. More than 19 digits (minus the decimal) may be overflow. 
+ overflow = p-src-1 > 19; + if (simdjson_unlikely(overflow && leading_zero)) { + // Skip leading 0.00000 and see if the significant digits still overflow + const uint8_t *start_digits = src + 2; + while (*start_digits == '0') { start_digits++; } + overflow = p-start_digits > 19; + } + } else { + overflow = p-src > 19; + } + + // + // Parse the exponent + // + if (*p == 'e' || *p == 'E') { + p++; + bool exp_neg = *p == '-'; + p += exp_neg || *p == '+'; + + uint64_t exp = 0; + const uint8_t *start_exp_digits = p; + while (parse_digit(*p, exp)) { p++; } + // no exp digits, or 20+ exp digits + if (p-start_exp_digits == 0 || p-start_exp_digits > 19) { return NUMBER_ERROR; } + + exponent += exp_neg ? 0-exp : exp; + } + + if (*p != '"') { return NUMBER_ERROR; } + + overflow = overflow || exponent < simdjson::internal::smallest_power || exponent > simdjson::internal::largest_power; + + // + // Assemble (or slow-parse) the float + // + double d; + if (simdjson_likely(!overflow)) { + if (compute_float_64(exponent, i, negative, d)) { return d; } + } + if (!parse_float_fallback(src-negative, &d)) { + return NUMBER_ERROR; + } + return d; +} } //namespace {} #endif // SIMDJSON_SKIPNUMBERPARSING @@ -18651,22 +20602,28 @@ struct implementation_simdjson_result_base { */ simdjson_really_inline operator T&&() && noexcept(false); + +#endif // SIMDJSON_EXCEPTIONS + /** * Get the result value. This function is safe if and only * the error() method returns a value that evaluates to false. */ simdjson_really_inline const T& value_unsafe() const& noexcept; - + /** + * Get the result value. This function is safe if and only + * the error() method returns a value that evaluates to false. + */ + simdjson_really_inline T& value_unsafe() & noexcept; /** * Take the result value (move it). This function is safe if and only * the error() method returns a value that evaluates to false. 
*/ simdjson_really_inline T&& value_unsafe() && noexcept; - -#endif // SIMDJSON_EXCEPTIONS - - T first{}; - error_code second{UNINITIALIZED}; +protected: + /** users should never directly access first and second. **/ + T first{}; /** Users should never directly access 'first'. **/ + error_code second{UNINITIALIZED}; /** Users should never directly access 'second'. **/ }; // struct implementation_simdjson_result_base } // namespace SIMDJSON_BUILTIN_IMPLEMENTATION @@ -18773,23 +20730,26 @@ namespace logger { static constexpr const bool LOG_ENABLED = false; #endif -static simdjson_really_inline void log_headers() noexcept; -static simdjson_really_inline void log_line(const json_iterator &iter, token_position index, depth_t depth, const char *title_prefix, const char *title, std::string_view detail) noexcept; -static simdjson_really_inline void log_line(const json_iterator &iter, const char *title_prefix, const char *title, std::string_view detail, int delta, int depth_delta) noexcept; -static simdjson_really_inline void log_event(const json_iterator &iter, const char *type, std::string_view detail="", int delta=0, int depth_delta=0) noexcept; -static simdjson_really_inline void log_value(const json_iterator &iter, token_position index, depth_t depth, const char *type, std::string_view detail="") noexcept; -static simdjson_really_inline void log_value(const json_iterator &iter, const char *type, std::string_view detail="", int delta=-1, int depth_delta=0) noexcept; -static simdjson_really_inline void log_start_value(const json_iterator &iter, token_position index, depth_t depth, const char *type, std::string_view detail="") noexcept; -static simdjson_really_inline void log_start_value(const json_iterator &iter, const char *type, int delta=-1, int depth_delta=0) noexcept; -static simdjson_really_inline void log_end_value(const json_iterator &iter, const char *type, int delta=-1, int depth_delta=0) noexcept; -static simdjson_really_inline void log_error(const 
json_iterator &iter, token_position index, depth_t depth, const char *error, const char *detail="") noexcept; -static simdjson_really_inline void log_error(const json_iterator &iter, const char *error, const char *detail="", int delta=-1, int depth_delta=0) noexcept; +// We do not want these functions to be 'really inlined' since real inlining is +// for performance purposes and if you are using the loggers, you do not care about +// performance (or should not). +static inline void log_headers() noexcept; +static inline void log_line(const json_iterator &iter, token_position index, depth_t depth, const char *title_prefix, const char *title, std::string_view detail) noexcept; +static inline void log_line(const json_iterator &iter, const char *title_prefix, const char *title, std::string_view detail, int delta, int depth_delta) noexcept; +static inline void log_event(const json_iterator &iter, const char *type, std::string_view detail="", int delta=0, int depth_delta=0) noexcept; +static inline void log_value(const json_iterator &iter, token_position index, depth_t depth, const char *type, std::string_view detail="") noexcept; +static inline void log_value(const json_iterator &iter, const char *type, std::string_view detail="", int delta=-1, int depth_delta=0) noexcept; +static inline void log_start_value(const json_iterator &iter, token_position index, depth_t depth, const char *type, std::string_view detail="") noexcept; +static inline void log_start_value(const json_iterator &iter, const char *type, int delta=-1, int depth_delta=0) noexcept; +static inline void log_end_value(const json_iterator &iter, const char *type, int delta=-1, int depth_delta=0) noexcept; +static inline void log_error(const json_iterator &iter, token_position index, depth_t depth, const char *error, const char *detail="") noexcept; +static inline void log_error(const json_iterator &iter, const char *error, const char *detail="", int delta=-1, int depth_delta=0) noexcept; -static 
simdjson_really_inline void log_event(const value_iterator &iter, const char *type, std::string_view detail="", int delta=0, int depth_delta=0) noexcept; -static simdjson_really_inline void log_value(const value_iterator &iter, const char *type, std::string_view detail="", int delta=-1, int depth_delta=0) noexcept; -static simdjson_really_inline void log_start_value(const value_iterator &iter, const char *type, int delta=-1, int depth_delta=0) noexcept; -static simdjson_really_inline void log_end_value(const value_iterator &iter, const char *type, int delta=-1, int depth_delta=0) noexcept; -static simdjson_really_inline void log_error(const value_iterator &iter, const char *error, const char *detail="", int delta=-1, int depth_delta=0) noexcept; +static inline void log_event(const value_iterator &iter, const char *type, std::string_view detail="", int delta=0, int depth_delta=0) noexcept; +static inline void log_value(const value_iterator &iter, const char *type, std::string_view detail="", int delta=-1, int depth_delta=0) noexcept; +static inline void log_start_value(const value_iterator &iter, const char *type, int delta=-1, int depth_delta=0) noexcept; +static inline void log_end_value(const value_iterator &iter, const char *type, int delta=-1, int depth_delta=0) noexcept; +static inline void log_error(const value_iterator &iter, const char *error, const char *detail="", int delta=-1, int depth_delta=0) noexcept; } // namespace logger } // namespace ondemand @@ -18873,7 +20833,7 @@ public: * long strings. * * If target is a compile-time constant, and your compiler likes you, - * you should be able to do the following without performance penatly... + * you should be able to do the following without performance penalty... * * static_assert(raw_json_string::is_free_from_unescaped_quote(target), ""); * s.unsafe_is_equal(target); @@ -18887,7 +20847,7 @@ public: * the caller is responsible for this check. See is_free_from_unescaped_quote. 
* * If target is a compile-time constant, and your compiler likes you, - * you should be able to do the following without performance penatly... + * you should be able to do the following without performance penalty... * * static_assert(raw_json_string::is_free_from_unescaped_quote(target), ""); * s.unsafe_is_equal(target); @@ -19018,11 +20978,12 @@ public: /** * Advance to the next token (returning the current one). - * - * Does not check or update depth/expect_value. Caller is responsible for that. */ - simdjson_really_inline const uint8_t *advance() noexcept; - + simdjson_really_inline const uint8_t *return_current_and_advance() noexcept; + /** + * Reports the current offset in bytes from the start of the underlying buffer. + */ + simdjson_really_inline uint32_t current_offset() const noexcept; /** * Get the JSON text for a given token (relative). * @@ -19052,8 +21013,6 @@ public: * * @param position The position of the token. * - * TODO consider a string_view, assuming the length will get stripped out by the optimizer when - * it isn't used ... */ simdjson_really_inline const uint8_t *peek(token_position position) const noexcept; /** @@ -19066,13 +21025,13 @@ public: simdjson_really_inline uint32_t peek_length(token_position position) const noexcept; /** - * Save the current index to be restored later. + * Return the current index. */ simdjson_really_inline token_position position() const noexcept; /** * Reset to a previously saved index. */ - simdjson_really_inline void set_position(token_position target_checkpoint) noexcept; + simdjson_really_inline void set_position(token_position target_position) noexcept; // NOTE: we don't support a full C++ iterator interface, because we expect people to make // different calls to advance the iterator based on *their own* state. 
@@ -19085,7 +21044,7 @@ public: simdjson_really_inline bool operator<=(const token_iterator &other) const noexcept; protected: - simdjson_really_inline token_iterator(const uint8_t *buf, token_position index) noexcept; + simdjson_really_inline token_iterator(const uint8_t *buf, token_position position) noexcept; /** * Get the index of the JSON text for a given token (relative). @@ -19107,7 +21066,7 @@ protected: simdjson_really_inline uint32_t peek_index(token_position position) const noexcept; const uint8_t *buf{}; - token_position index{}; + token_position _position{}; friend class json_iterator; friend class value_iterator; @@ -19139,6 +21098,7 @@ namespace SIMDJSON_BUILTIN_IMPLEMENTATION { namespace ondemand { class document; +class document_stream; class object; class array; class value; @@ -19180,14 +21140,27 @@ protected: * - 3 = key or value inside root array/object. */ depth_t _depth{}; + /** + * Beginning of the document indexes. + * Normally we have root == parser->implementation->structural_indexes.get() + * but this may differ, especially in streaming mode (where we have several + * documents); + */ + token_position _root{}; + /** + * Normally, a json_iterator operates over a single document, but in + * some cases, we may have a stream of documents. This attribute is meant + * as meta-data: the json_iterator works the same irrespective of the + * value of this attribute. 
+ */ + bool _streaming{false}; public: simdjson_really_inline json_iterator() noexcept = default; simdjson_really_inline json_iterator(json_iterator &&other) noexcept; simdjson_really_inline json_iterator &operator=(json_iterator &&other) noexcept; - simdjson_really_inline json_iterator(const json_iterator &other) noexcept = delete; - simdjson_really_inline json_iterator &operator=(const json_iterator &other) noexcept = delete; - + simdjson_really_inline explicit json_iterator(const json_iterator &other) noexcept = default; + simdjson_really_inline json_iterator &operator=(const json_iterator &other) noexcept = default; /** * Skips a JSON value, whether it is a scalar, array or object. */ @@ -19198,10 +21171,18 @@ public: */ simdjson_really_inline bool at_root() const noexcept; + /** + * Tell whether we should be expected to run in streaming + * mode (iterating over many documents). It is pure metadata + * that does not affect how the iterator works. It is used by + * start_root_array() and start_root_object(). + */ + simdjson_really_inline bool streaming() const noexcept; + /** * Get the root value iterator */ - simdjson_really_inline token_position root_checkpoint() const noexcept; + simdjson_really_inline token_position root_position() const noexcept; /** * Assert if the iterator is not at the start @@ -19211,7 +21192,7 @@ public: /** * Tell whether the iterator is at the EOF mark */ - simdjson_really_inline bool at_eof() const noexcept; + simdjson_really_inline bool at_end() const noexcept; /** * Tell whether the iterator is live (has not been moved). @@ -19224,10 +21205,22 @@ public: simdjson_really_inline void abandon() noexcept; /** - * Advance the current token. + * Advance the current token without modifying depth. */ - simdjson_really_inline const uint8_t *advance() noexcept; + simdjson_really_inline const uint8_t *return_current_and_advance() noexcept; + /** + * Assert that there are at least the given number of tokens left. 
+ * + * Has no effect in release builds. + */ + simdjson_really_inline void assert_more_tokens(uint32_t required_tokens=1) const noexcept; + /** + * Assert that the given position addresses an actual token (is within bounds). + * + * Has no effect in release builds. + */ + simdjson_really_inline void assert_valid_position(token_position position) const noexcept; /** * Get the JSON text for a given token (relative). * @@ -19247,12 +21240,21 @@ public: * @param delta The relative position of the token to retrieve. e.g. 0 = next token, -1 = prev token. */ simdjson_really_inline uint32_t peek_length(int32_t delta=0) const noexcept; + /** + * Get a pointer to the current location in the input buffer. + * + * This is not null-terminated; it is a view into the JSON. + * + * You may be pointing outside of the input buffer: it is not generally + * safe to dereference this pointer. + */ + simdjson_really_inline const uint8_t *unsafe_pointer() const noexcept; /** * Get the JSON text for a given token. * * This is not null-terminated; it is a view into the JSON. * - * @param index The position of the token to retrieve. + * @param position The position of the token to retrieve. * * TODO consider a string_view, assuming the length will get stripped out by the optimizer when * it isn't used ... @@ -19263,7 +21265,7 @@ public: * * The length will include any whitespace at the end of the token. * - * @param index The position of the token to retrieve. + * @param position The position of the token to retrieve. */ simdjson_really_inline uint32_t peek_length(token_position position) const noexcept; /** @@ -19292,8 +21294,8 @@ public: * * @param child_depth the expected child depth.
*/ - simdjson_really_inline void descend_to(depth_t parent_depth) noexcept; - simdjson_really_inline void descend_to(depth_t parent_depth, int32_t delta) noexcept; + simdjson_really_inline void descend_to(depth_t child_depth) noexcept; + simdjson_really_inline void descend_to(depth_t child_depth, int32_t delta) noexcept; /** * Get current depth. @@ -19321,8 +21323,6 @@ public: simdjson_really_inline error_code optional_error(error_code error, const char *message) noexcept; template simdjson_warn_unused simdjson_really_inline bool copy_to_buffer(const uint8_t *json, uint32_t max_len, uint8_t (&tmpbuf)[N]) noexcept; - template simdjson_warn_unused simdjson_really_inline bool peek_to_buffer(uint8_t (&tmpbuf)[N]) noexcept; - template simdjson_warn_unused simdjson_really_inline bool advance_to_buffer(uint8_t (&tmpbuf)[N]) noexcept; simdjson_really_inline token_position position() const noexcept; simdjson_really_inline void reenter_child(token_position position, depth_t child_depth) noexcept; @@ -19330,12 +21330,24 @@ public: simdjson_really_inline token_position start_position(depth_t depth) const noexcept; simdjson_really_inline void set_start_position(depth_t depth, token_position position) noexcept; #endif - + /* Useful for debugging and logging purposes. */ + inline std::string to_string() const noexcept; + /** + * Updates this json iterator so that it is back at the beginning of the document, + * as if it had just been created. + */ + inline void rewind() noexcept; protected: simdjson_really_inline json_iterator(const uint8_t *buf, ondemand::parser *parser) noexcept; - simdjson_really_inline token_position last_document_position() const noexcept; + /// The last token before the end + simdjson_really_inline token_position last_position() const noexcept; + /// The token *at* the end. This points at gibberish and should only be used for comparison. + simdjson_really_inline token_position end_position() const noexcept; + /// The end of the buffer. 
+ simdjson_really_inline token_position end() const noexcept; friend class document; + friend class document_stream; friend class object; friend class array; friend class value; @@ -19391,8 +21403,6 @@ protected: depth_t _depth{}; /** * The starting token index for this value - * - * PERF NOTE: this is a safety check; we expect this to be elided in release builds. */ token_position _start_position{}; @@ -19414,7 +21424,7 @@ public: /** * Tell whether the iterator is at the EOF mark */ - simdjson_really_inline bool at_eof() const noexcept; + simdjson_really_inline bool at_end() const noexcept; /** * Tell whether the iterator is at the start of the value @@ -19451,7 +21461,7 @@ public: * * @error TAPE_ERROR when the JSON value is a bad token like "}" "," or "alse". */ - simdjson_really_inline simdjson_result type() noexcept; + simdjson_really_inline simdjson_result type() const noexcept; /** * @addtogroup object Object iteration @@ -19481,11 +21491,23 @@ public: /** * Start an object iteration after the user has already checked and moved past the {. * - * Does not move the iterator. + * Does not move the iterator unless the object is empty ({}). * * @returns Whether the object had any fields (returns false for empty). + * @error INCOMPLETE_ARRAY_OR_OBJECT If there are no more tokens (implying the *parent* + * array or object is incomplete). */ - simdjson_warn_unused simdjson_really_inline bool started_object() noexcept; + simdjson_warn_unused simdjson_really_inline simdjson_result started_object() noexcept; + /** + * Start an object iteration from the root, after the user has already checked and moved past the {. + * + * Does not move the iterator unless the object is empty ({}). + * + * @returns Whether the object had any fields (returns false for empty). + * @error INCOMPLETE_ARRAY_OR_OBJECT If there are no more tokens (implying the *parent* + * array or object is incomplete). 
+ */ + simdjson_warn_unused simdjson_really_inline simdjson_result started_root_object() noexcept; /** * Moves to the next field in an object. @@ -19495,6 +21517,7 @@ public: * * @return whether there is another field in the object. * @error TAPE_ERROR If there is a comma missing between fields. + * @error TAPE_ERROR If there is a comma, but not enough tokens remaining to have a key, :, and value. */ simdjson_warn_unused simdjson_really_inline simdjson_result has_next_field() noexcept; @@ -19591,13 +21614,25 @@ public: simdjson_warn_unused simdjson_really_inline simdjson_result start_root_array() noexcept; /** - * Start an array iteration after the user has already checked and moved past the [. + * Start an array iteration, after the user has already checked and moved past the [. * - * Does not move the iterator. + * Does not move the iterator unless the array is empty ([]). * * @returns Whether the array had any elements (returns false for empty). + * @error INCOMPLETE_ARRAY_OR_OBJECT If there are no more tokens (implying the *parent* + * array or object is incomplete). */ - simdjson_warn_unused simdjson_really_inline bool started_array() noexcept; + simdjson_warn_unused simdjson_really_inline simdjson_result started_array() noexcept; + /** + * Start an array iteration from the root, after the user has already checked and moved past the [. + * + * Does not move the iterator unless the array is empty ([]). + * + * @returns Whether the array had any elements (returns false for empty). + * @error INCOMPLETE_ARRAY_OR_OBJECT If there are no more tokens (implying the *parent* + * array or object is incomplete). + */ + simdjson_warn_unused simdjson_really_inline simdjson_result started_root_array() noexcept; /** * Moves to the next element in an array. 
@@ -19626,16 +21661,22 @@ public: simdjson_warn_unused simdjson_really_inline simdjson_result get_string() noexcept; simdjson_warn_unused simdjson_really_inline simdjson_result get_raw_json_string() noexcept; simdjson_warn_unused simdjson_really_inline simdjson_result get_uint64() noexcept; + simdjson_warn_unused simdjson_really_inline simdjson_result get_uint64_in_string() noexcept; simdjson_warn_unused simdjson_really_inline simdjson_result get_int64() noexcept; + simdjson_warn_unused simdjson_really_inline simdjson_result get_int64_in_string() noexcept; simdjson_warn_unused simdjson_really_inline simdjson_result get_double() noexcept; + simdjson_warn_unused simdjson_really_inline simdjson_result get_double_in_string() noexcept; simdjson_warn_unused simdjson_really_inline simdjson_result get_bool() noexcept; simdjson_really_inline bool is_null() noexcept; simdjson_warn_unused simdjson_really_inline simdjson_result get_root_string() noexcept; simdjson_warn_unused simdjson_really_inline simdjson_result get_root_raw_json_string() noexcept; simdjson_warn_unused simdjson_really_inline simdjson_result get_root_uint64() noexcept; + simdjson_warn_unused simdjson_really_inline simdjson_result get_root_uint64_in_string() noexcept; simdjson_warn_unused simdjson_really_inline simdjson_result get_root_int64() noexcept; + simdjson_warn_unused simdjson_really_inline simdjson_result get_root_int64_in_string() noexcept; simdjson_warn_unused simdjson_really_inline simdjson_result get_root_double() noexcept; + simdjson_warn_unused simdjson_really_inline simdjson_result get_root_double_in_string() noexcept; simdjson_warn_unused simdjson_really_inline simdjson_result get_root_bool() noexcept; simdjson_really_inline bool is_root_null() noexcept; @@ -19648,31 +21689,130 @@ public: simdjson_really_inline bool is_valid() const noexcept; /** @} */ - protected: + /** + * Restarts an array iteration. + * @returns Whether the array has any elements (returns false for empty). 
+ */ + simdjson_really_inline simdjson_result reset_array() noexcept; + /** + * Restarts an object iteration. + * @returns Whether the object has any fields (returns false for empty). + */ + simdjson_really_inline simdjson_result reset_object() noexcept; + /** + * move_at_start(): moves us so that we are pointing at the beginning of + * the container. It updates the index so that at_start() is true and it + * syncs the depth. The user can then create a new container instance. + * + * Usage: used with value::count_elements(). + **/ + simdjson_really_inline void move_at_start() noexcept; + + /** + * move_at_container_start(): moves us so that we are pointing at the beginning of + * the container so that assert_at_container_start() passes. + * + * Usage: used with reset_array() and reset_object(). + **/ + simdjson_really_inline void move_at_container_start() noexcept; + /* Useful for debugging and logging purposes. */ + inline std::string to_string() const noexcept; simdjson_really_inline value_iterator(json_iterator *json_iter, depth_t depth, token_position start_index) noexcept; simdjson_really_inline bool parse_null(const uint8_t *json) const noexcept; simdjson_really_inline simdjson_result parse_bool(const uint8_t *json) const noexcept; - simdjson_really_inline const uint8_t *peek_start() const noexcept; simdjson_really_inline uint32_t peek_start_length() const noexcept; - simdjson_really_inline const uint8_t *advance_start(const char *type) const noexcept; - simdjson_really_inline error_code advance_container_start(const char *type, const uint8_t *&json) const noexcept; - simdjson_really_inline const uint8_t *advance_root_scalar(const char *type) const noexcept; - simdjson_really_inline const uint8_t *advance_non_root_scalar(const char *type) const noexcept; + + /** + * The general idea of the advance_... methods and the peek_* methods + * is that you first peek and check that you have desired type. If you do, + * and only if you do, then you advance. 
+ * + * We used to unconditionally advance. But this made reasoning about our + * current state difficult. + * Suppose you always advance. Look at the 'value' matching the key + * "shadowable" in the following example... + * + * ({"globals":{"a":{"shadowable":[}}}}) + * + * If the user thinks it is a Boolean and asks for it, then we check the '[', + * decide it is not a Boolean, but still move into the next character ('}'). Now + * we are left pointing at '}' right after a '['. And we have not yet reported + * an error, only that we do not have a Boolean. + * + * If, instead, you just stand your ground until it is content that you know, then + * you will only even move beyond the '[' if the user tells you that you have an + * array. So you will be at the '}' character inside the array and, hopefully, you + * will then catch the error because an array cannot start with '}', but the code + * processing Boolean values does not know this. + * + * So the contract is: first call 'peek_...' and then call 'advance_...' only + * if you have determined that it is a type you can handle. + * + * Unfortunately, it makes the code more verbose, longer and maybe more error prone. + */ + + simdjson_really_inline void advance_scalar(const char *type) noexcept; + simdjson_really_inline void advance_root_scalar(const char *type) noexcept; + simdjson_really_inline void advance_non_root_scalar(const char *type) noexcept; + + simdjson_really_inline const uint8_t *peek_scalar(const char *type) noexcept; + simdjson_really_inline const uint8_t *peek_root_scalar(const char *type) noexcept; + simdjson_really_inline const uint8_t *peek_non_root_scalar(const char *type) noexcept; + + + simdjson_really_inline error_code start_container(uint8_t start_char, const char *incorrect_type_message, const char *type) noexcept; + simdjson_really_inline error_code end_container() noexcept; + + /** + * Advance to a place expecting a value (increasing depth). 
+ * + * @return The current token (the one left behind). + * @error TAPE_ERROR If the document ended early. + */ + simdjson_really_inline simdjson_result advance_to_value() noexcept; simdjson_really_inline error_code incorrect_type_error(const char *message) const noexcept; + simdjson_really_inline error_code error_unless_more_tokens(uint32_t tokens=1) const noexcept; simdjson_really_inline bool is_at_start() const noexcept; - simdjson_really_inline bool is_at_container_start() const noexcept; + /** + * is_at_iterator_start() returns true on an array or object after it has just been + * created, whether the instance is empty or not. + * + * Usage: used by array::begin() in debug mode (SIMDJSON_DEVELOPMENT_CHECKS) + */ simdjson_really_inline bool is_at_iterator_start() const noexcept; - simdjson_really_inline void assert_at_start() const noexcept; - simdjson_really_inline void assert_at_container_start() const noexcept; - simdjson_really_inline void assert_at_root() const noexcept; - simdjson_really_inline void assert_at_child() const noexcept; - simdjson_really_inline void assert_at_next() const noexcept; - simdjson_really_inline void assert_at_non_root_start() const noexcept; + + /** + * Assuming that we are within an object, this returns true if we + * are pointing at a key. + * + * Usage: the skip_child() method should never be used while we are pointing + * at a key inside an object. 
+ */ + simdjson_really_inline bool is_at_key() const noexcept; + + inline void assert_at_start() const noexcept; + inline void assert_at_container_start() const noexcept; + inline void assert_at_root() const noexcept; + inline void assert_at_child() const noexcept; + inline void assert_at_next() const noexcept; + inline void assert_at_non_root_start() const noexcept; + + /** Get the starting position of this value */ + simdjson_really_inline token_position start_position() const noexcept; + + /** @copydoc token_position json_iterator::position() const noexcept; */ + simdjson_really_inline token_position position() const noexcept; + /** @copydoc token_position json_iterator::last_position() const noexcept; */ + simdjson_really_inline token_position last_position() const noexcept; + /** @copydoc token_position json_iterator::end_position() const noexcept; */ + simdjson_really_inline token_position end_position() const noexcept; + /** @copydoc error_code json_iterator::report_error(error_code error, const char *message) noexcept; */ + simdjson_really_inline error_code report_error(error_code error, const char *message) noexcept; friend class document; friend class object; @@ -19894,8 +22034,60 @@ public: * Part of the std::iterable interface. */ simdjson_really_inline simdjson_result end() noexcept; + /** + * This method scans the array and counts the number of elements. + * The count_elements method should always be called before you have begun + * iterating through the array: it is expected that you are pointing at + * the beginning of the array. + * The runtime complexity is linear in the size of the array. After + * calling this function, if successful, the array is 'rewinded' at its + * beginning as if it had never been accessed. If the JSON is malformed (e.g., + * there is a missing comma), then an error is returned and it is no longer + * safe to continue.
+ */ + simdjson_really_inline simdjson_result count_elements() & noexcept; + + /** + * Get the value associated with the given JSON pointer. We use the RFC 6901 + * https://tools.ietf.org/html/rfc6901 standard, interpreting the current node + * as the root of its own JSON document. + * + * ondemand::parser parser; + * auto json = R"([ { "foo": { "a": [ 10, 20, 30 ] }} ])"_padded; + * auto doc = parser.iterate(json); + * doc.at_pointer("/0/foo/a/1") == 20 + * + * Note that at_pointer() called on the document automatically calls the document's rewind + * method between each call. It invalidates all previously accessed arrays, objects and values + * that have not been consumed. Yet it is not the case when calling at_pointer on an array + * instance: there is no rewind and no invalidation. + * + * You may only call at_pointer on an array after it has been created, but before it has + * been first accessed. When calling at_pointer on an array, the pointer is advanced to + * the location indicated by the JSON pointer (in case of success). It is no longer possible + * to call at_pointer on the same array. + * + * Also note that at_pointer() relies on find_field() which implies that we do not unescape keys when matching. + * + * @return The value associated with the given JSON pointer, or: + * - NO_SUCH_FIELD if a field does not exist in an object + * - INDEX_OUT_OF_BOUNDS if an array index is larger than an array length + * - INCORRECT_TYPE if a non-integer is used to access an array + * - INVALID_JSON_POINTER if the JSON pointer is invalid and cannot be parsed + */ + inline simdjson_result at_pointer(std::string_view json_pointer) noexcept; + /** + * Consumes the array and returns a string_view instance corresponding to the + * array as represented in JSON. It points inside the original document. + */ + simdjson_really_inline simdjson_result raw_json() noexcept; protected: + /** + * Go to the end of the array, no matter where you are right now. 
+ */ + simdjson_really_inline error_code consume() noexcept; + /** * Begin array iteration. * @@ -19921,7 +22113,7 @@ protected: * * @param iter The iterator. Must be after the initial [. Will be *moved* into the resulting array. */ - static simdjson_really_inline array started(value_iterator &iter) noexcept; + static simdjson_really_inline simdjson_result started(value_iterator &iter) noexcept; /** * Create an array at the given Internal array creation. Call array::start() or array::started() instead of this. @@ -19932,6 +22124,15 @@ protected: */ simdjson_really_inline array(const value_iterator &iter) noexcept; + /** + * Get the value at the given index. This function has linear-time complexity. + * This function should only be called once as the array iterator is not reset between each call. + * + * @return The value at the given index, or: + * - INDEX_OUT_OF_BOUNDS if the array index is larger than an array length + */ + simdjson_really_inline simdjson_result at(size_t index) noexcept; + /** * Iterator marking current position. * @@ -19961,6 +22162,9 @@ public: simdjson_really_inline simdjson_result begin() noexcept; simdjson_really_inline simdjson_result end() noexcept; + simdjson_really_inline simdjson_result count_elements() & noexcept; + simdjson_really_inline simdjson_result at(size_t index) noexcept; + simdjson_really_inline simdjson_result at_pointer(std::string_view json_pointer) noexcept; }; } // namespace simdjson @@ -19977,9 +22181,10 @@ class object; class value; class raw_json_string; class array_iterator; +class document_stream; /** - * A JSON document iteration. + * A JSON document. It holds a json_iterator instance. * * Used by tokens to get text, and string buffer location. * @@ -19993,7 +22198,7 @@ public: * Exists so you can declare a variable and later assign to it before use. 
*/ simdjson_really_inline document() noexcept = default; - simdjson_really_inline document(const document &other) noexcept = delete; + simdjson_really_inline document(const document &other) noexcept = delete; // pass your documents by reference, not by copy simdjson_really_inline document(document &&other) noexcept = default; simdjson_really_inline document &operator=(const document &other) noexcept = delete; simdjson_really_inline document &operator=(document &&other) noexcept = default; @@ -20019,6 +22224,13 @@ public: * @returns INCORRECT_TYPE If the JSON value is not a 64-bit unsigned integer. */ simdjson_really_inline simdjson_result get_uint64() noexcept; + /** + * Cast this JSON value (inside string) to an unsigned integer. + * + * @returns An unsigned 64-bit integer. + * @returns INCORRECT_TYPE If the JSON value is not a 64-bit unsigned integer. + */ + simdjson_really_inline simdjson_result get_uint64_in_string() noexcept; /** * Cast this JSON value to a signed integer. * @@ -20026,6 +22238,13 @@ public: * @returns INCORRECT_TYPE If the JSON value is not a 64-bit integer. */ simdjson_really_inline simdjson_result get_int64() noexcept; + /** + * Cast this JSON value (inside string) to a signed integer. + * + * @returns A signed 64-bit integer. + * @returns INCORRECT_TYPE If the JSON value is not a 64-bit integer. + */ + simdjson_really_inline simdjson_result get_int64_in_string() noexcept; /** * Cast this JSON value to a double. * @@ -20033,6 +22252,14 @@ public: * @returns INCORRECT_TYPE If the JSON value is not a valid floating-point number. */ simdjson_really_inline simdjson_result get_double() noexcept; + + /** + * Cast this JSON value (inside string) to a double. + * + * @returns A double. + * @returns INCORRECT_TYPE If the JSON value is not a valid floating-point number. + */ + simdjson_really_inline simdjson_result get_double_in_string() noexcept; /** * Cast this JSON value to a string.
* @@ -20165,7 +22392,26 @@ public: */ simdjson_really_inline operator bool() noexcept(false); #endif - + /** + * This method scans the array and counts the number of elements. + * The count_elements method should always be called before you have begun + * iterating through the array: it is expected that you are pointing at + * the beginning of the array. + * The runtime complexity is linear in the size of the array. After + * calling this function, if successful, the array is 'rewinded' at its + * beginning as if it had never been accessed. If the JSON is malformed (e.g., + * there is a missing comma), then an error is returned and it is no longer + * safe to continue. + */ + simdjson_really_inline simdjson_result count_elements() & noexcept; + /** + * Get the value at the given index in the array. This function has linear-time complexity. + * This function should only be called once as the array iterator is not reset between each call. + * + * @return The value at the given index, or: + * - INDEX_OUT_OF_BOUNDS if the array index is larger than an array length + */ + simdjson_really_inline simdjson_result at(size_t index) & noexcept; /** * Begin array iteration. * @@ -20265,13 +22511,67 @@ public: */ simdjson_really_inline simdjson_result raw_json_token() noexcept; + /** + * Reset the iterator inside the document instance so we are pointing back at the + * beginning of the document, as if it had just been created. It invalidates all + * values, objects and arrays that you have created so far (including unescaped strings). + */ + inline void rewind() noexcept; + /** + * Returns debugging information. + */ + inline std::string to_debug_string() noexcept; + + /** + * Get the value associated with the given JSON pointer. We use the RFC 6901 + * https://tools.ietf.org/html/rfc6901 standard. 
+ * + * ondemand::parser parser; + * auto json = R"({ "foo": { "a": [ 10, 20, 30 ] }})"_padded; + * auto doc = parser.iterate(json); + * doc.at_pointer("/foo/a/1") == 20 + * + * It is allowed for a key to be the empty string: + * + * ondemand::parser parser; + * auto json = R"({ "": { "a": [ 10, 20, 30 ] }})"_padded; + * auto doc = parser.iterate(json); + * doc.at_pointer("//a/1") == 20 + * + * Note that at_pointer() automatically calls rewind between each call. Thus + * all values, objects and arrays that you have created so far (including unescaped strings) + * are invalidated. After calling at_pointer, you need to consume the result: string values + * should be stored in your own variables, arrays should be decoded and stored in your own array-like + * structures and so forth. + * + * Also note that at_pointer() relies on find_field() which implies that we do not unescape keys when matching + * + * @return The value associated with the given JSON pointer, or: + * - NO_SUCH_FIELD if a field does not exist in an object + * - INDEX_OUT_OF_BOUNDS if an array index is larger than an array length + * - INCORRECT_TYPE if a non-integer is used to access an array + * - INVALID_JSON_POINTER if the JSON pointer is invalid and cannot be parsed + */ + simdjson_really_inline simdjson_result at_pointer(std::string_view json_pointer) noexcept; + /** + * Consumes the document and returns a string_view instance corresponding to the + * document as represented in JSON. It points inside the original byte array containing + * the JSON document. + */ + simdjson_really_inline simdjson_result raw_json() noexcept; protected: + /** + * Consumes the document.
+ */ + simdjson_really_inline error_code consume() noexcept; + simdjson_really_inline document(ondemand::json_iterator &&iter) noexcept; simdjson_really_inline const uint8_t *text(uint32_t idx) const noexcept; simdjson_really_inline value_iterator resume_value_iterator() noexcept; simdjson_really_inline value_iterator get_root_value_iterator() noexcept; - simdjson_really_inline value resume_value() noexcept; + simdjson_really_inline simdjson_result get_value_unsafe() noexcept; + simdjson_really_inline simdjson_result start_or_resume_object() noexcept; static simdjson_really_inline document start(ondemand::json_iterator &&iter) noexcept; // @@ -20280,7 +22580,6 @@ protected: json_iterator iter{}; ///< Current position in the document static constexpr depth_t DOCUMENT_DEPTH = 0; ///< document depth is always 0 - friend struct simdjson_result; friend class array_iterator; friend class value; friend class ondemand::parser; @@ -20288,8 +22587,58 @@ protected: friend class array; friend class field; friend class token; + friend class document_stream; }; + +/** + * A document_reference is a thin wrapper around a document reference instance. 
+ */ +class document_reference { +public: + simdjson_really_inline document_reference() noexcept; + simdjson_really_inline document_reference(document &d) noexcept; + simdjson_really_inline document_reference(const document_reference &other) noexcept = default; + simdjson_really_inline void rewind() noexcept; + simdjson_really_inline simdjson_result get_array() & noexcept; + simdjson_really_inline simdjson_result get_object() & noexcept; + simdjson_really_inline simdjson_result get_uint64() noexcept; + simdjson_really_inline simdjson_result get_int64() noexcept; + simdjson_really_inline simdjson_result get_double() noexcept; + simdjson_really_inline simdjson_result get_string() noexcept; + simdjson_really_inline simdjson_result get_raw_json_string() noexcept; + simdjson_really_inline simdjson_result get_bool() noexcept; + simdjson_really_inline bool is_null() noexcept; + simdjson_really_inline simdjson_result raw_json() noexcept; + simdjson_really_inline operator document&() const noexcept; + +#if SIMDJSON_EXCEPTIONS + simdjson_really_inline operator array() & noexcept(false); + simdjson_really_inline operator object() & noexcept(false); + simdjson_really_inline operator uint64_t() noexcept(false); + simdjson_really_inline operator int64_t() noexcept(false); + simdjson_really_inline operator double() noexcept(false); + simdjson_really_inline operator std::string_view() noexcept(false); + simdjson_really_inline operator raw_json_string() noexcept(false); + simdjson_really_inline operator bool() noexcept(false); +#endif + simdjson_really_inline simdjson_result count_elements() & noexcept; + simdjson_really_inline simdjson_result at(size_t index) & noexcept; + simdjson_really_inline simdjson_result begin() & noexcept; + simdjson_really_inline simdjson_result end() & noexcept; + simdjson_really_inline simdjson_result find_field(std::string_view key) & noexcept; + simdjson_really_inline simdjson_result find_field(const char *key) & noexcept; + simdjson_really_inline 
simdjson_result operator[](std::string_view key) & noexcept; + simdjson_really_inline simdjson_result operator[](const char *key) & noexcept; + simdjson_really_inline simdjson_result find_field_unordered(std::string_view key) & noexcept; + simdjson_really_inline simdjson_result find_field_unordered(const char *key) & noexcept; + + simdjson_really_inline simdjson_result type() noexcept; + simdjson_really_inline simdjson_result raw_json_token() noexcept; + simdjson_really_inline simdjson_result at_pointer(std::string_view json_pointer) noexcept; +private: + document *doc{nullptr}; +}; } // namespace ondemand } // namespace SIMDJSON_BUILTIN_IMPLEMENTATION } // namespace simdjson @@ -20302,12 +22651,14 @@ public: simdjson_really_inline simdjson_result(SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::document &&value) noexcept; ///< @private simdjson_really_inline simdjson_result(error_code error) noexcept; ///< @private simdjson_really_inline simdjson_result() noexcept = default; + simdjson_really_inline error_code rewind() noexcept; simdjson_really_inline simdjson_result get_array() & noexcept; simdjson_really_inline simdjson_result get_object() & noexcept; simdjson_really_inline simdjson_result get_uint64() noexcept; simdjson_really_inline simdjson_result get_int64() noexcept; simdjson_really_inline simdjson_result get_double() noexcept; + simdjson_really_inline simdjson_result get_double_from_string() noexcept; simdjson_really_inline simdjson_result get_string() noexcept; simdjson_really_inline simdjson_result get_raw_json_string() noexcept; simdjson_really_inline simdjson_result get_bool() noexcept; @@ -20329,7 +22680,8 @@ public: simdjson_really_inline operator SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::raw_json_string() noexcept(false); simdjson_really_inline operator bool() noexcept(false); #endif - + simdjson_really_inline simdjson_result count_elements() & noexcept; + simdjson_really_inline simdjson_result at(size_t index) & noexcept; simdjson_really_inline 
simdjson_result begin() & noexcept; simdjson_really_inline simdjson_result end() & noexcept; simdjson_really_inline simdjson_result find_field(std::string_view key) & noexcept; @@ -20343,8 +22695,64 @@ public: /** @copydoc simdjson_really_inline std::string_view document::raw_json_token() const noexcept */ simdjson_really_inline simdjson_result raw_json_token() noexcept; + + simdjson_really_inline simdjson_result at_pointer(std::string_view json_pointer) noexcept; }; + +} // namespace simdjson + + + +namespace simdjson { + +template<> +struct simdjson_result : public SIMDJSON_BUILTIN_IMPLEMENTATION::implementation_simdjson_result_base { +public: + simdjson_really_inline simdjson_result(SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::document_reference value, error_code error) noexcept; + simdjson_really_inline simdjson_result() noexcept = default; + simdjson_really_inline error_code rewind() noexcept; + + simdjson_really_inline simdjson_result get_array() & noexcept; + simdjson_really_inline simdjson_result get_object() & noexcept; + simdjson_really_inline simdjson_result get_uint64() noexcept; + simdjson_really_inline simdjson_result get_int64() noexcept; + simdjson_really_inline simdjson_result get_double() noexcept; + simdjson_really_inline simdjson_result get_string() noexcept; + simdjson_really_inline simdjson_result get_raw_json_string() noexcept; + simdjson_really_inline simdjson_result get_bool() noexcept; + simdjson_really_inline bool is_null() noexcept; + +#if SIMDJSON_EXCEPTIONS + simdjson_really_inline operator SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::array() & noexcept(false); + simdjson_really_inline operator SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::object() & noexcept(false); + simdjson_really_inline operator uint64_t() noexcept(false); + simdjson_really_inline operator int64_t() noexcept(false); + simdjson_really_inline operator double() noexcept(false); + simdjson_really_inline operator std::string_view() noexcept(false); + simdjson_really_inline 
operator SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::raw_json_string() noexcept(false); + simdjson_really_inline operator bool() noexcept(false); +#endif + simdjson_really_inline simdjson_result count_elements() & noexcept; + simdjson_really_inline simdjson_result at(size_t index) & noexcept; + simdjson_really_inline simdjson_result begin() & noexcept; + simdjson_really_inline simdjson_result end() & noexcept; + simdjson_really_inline simdjson_result find_field(std::string_view key) & noexcept; + simdjson_really_inline simdjson_result find_field(const char *key) & noexcept; + simdjson_really_inline simdjson_result operator[](std::string_view key) & noexcept; + simdjson_really_inline simdjson_result operator[](const char *key) & noexcept; + simdjson_really_inline simdjson_result find_field_unordered(std::string_view key) & noexcept; + simdjson_really_inline simdjson_result find_field_unordered(const char *key) & noexcept; + + simdjson_really_inline simdjson_result type() noexcept; + + /** @copydoc simdjson_really_inline std::string_view document_reference::raw_json_token() const noexcept */ + simdjson_really_inline simdjson_result raw_json_token() noexcept; + + simdjson_really_inline simdjson_result at_pointer(std::string_view json_pointer) noexcept; +}; + + } // namespace simdjson /* end file include/simdjson/generic/ondemand/document.h */ /* begin file include/simdjson/generic/ondemand/value.h */ @@ -20418,11 +22826,19 @@ public: /** * Cast this JSON value to an unsigned integer. * - * @returns A signed 64-bit integer. + * @returns A unsigned 64-bit integer. * @returns INCORRECT_TYPE If the JSON value is not a 64-bit unsigned integer. */ simdjson_really_inline simdjson_result get_uint64() noexcept; + /** + * Cast this JSON value (inside string) to a unsigned integer. + * + * @returns A unsigned 64-bit integer. + * @returns INCORRECT_TYPE If the JSON value is not a 64-bit unsigned integer. 
+ */ + simdjson_really_inline simdjson_result get_uint64_in_string() noexcept; + /** * Cast this JSON value to a signed integer. * @@ -20431,6 +22847,14 @@ public: */ simdjson_really_inline simdjson_result get_int64() noexcept; + /** + * Cast this JSON value (inside string) to a signed integer. + * + * @returns A signed 64-bit integer. + * @returns INCORRECT_TYPE If the JSON value is not a 64-bit integer. + */ + simdjson_really_inline simdjson_result get_int64_in_string() noexcept; + /** * Cast this JSON value to a double. * @@ -20439,6 +22863,14 @@ public: */ simdjson_really_inline simdjson_result get_double() noexcept; + /** + * Cast this JSON value (inside string) to a double + * + * @returns A double. + * @returns INCORRECT_TYPE If the JSON value is not a valid floating-point number. + */ + simdjson_really_inline simdjson_result get_double_in_string() noexcept; + /** * Cast this JSON value to a string. * @@ -20557,7 +22989,26 @@ public: * Part of the std::iterable interface. */ simdjson_really_inline simdjson_result end() & noexcept; - + /** + * This method scans the array and counts the number of elements. + * The count_elements method should always be called before you have begun + * iterating through the array: it is expected that you are pointing at + * the beginning of the array. + * The runtime complexity is linear in the size of the array. After + * calling this function, if successful, the array is 'rewinded' at its + * beginning as if it had never been accessed. If the JSON is malformed (e.g., + * there is a missing comma), then an error is returned and it is no longer + * safe to continue. + */ + simdjson_really_inline simdjson_result count_elements() & noexcept; + /** + * Get the value at the given index in the array. This function has linear-time complexity. + * This function should only be called once as the array iterator is not reset between each call. 
+ * + * @return The value at the given index, or: + * - INDEX_OUT_OF_BOUNDS if the array index is larger than an array length + */ + simdjson_really_inline simdjson_result at(size_t index) noexcept; /** * Look up a field by name on an object (order-sensitive). * @@ -20652,6 +23103,50 @@ public: */ simdjson_really_inline std::string_view raw_json_token() noexcept; + /** + * Get the value associated with the given JSON pointer. We use the RFC 6901 + * https://tools.ietf.org/html/rfc6901 standard. + * + * ondemand::parser parser; + * auto json = R"({ "foo": { "a": [ 10, 20, 30 ] }})"_padded; + * auto doc = parser.iterate(json); + * doc.at_pointer("/foo/a/1") == 20 + * + * It is allowed for a key to be the empty string: + * + * ondemand::parser parser; + * auto json = R"({ "": { "a": [ 10, 20, 30 ] }})"_padded; + * auto doc = parser.iterate(json); + * doc.at_pointer("//a/1") == 20 + * + * Note that at_pointer() called on the document automatically calls the document's rewind + * method between each call. It invalidates all previously accessed arrays, objects and values + * that have not been consumed. + * + * Calling at_pointer() on non-document instances (e.g., arrays and objects) is not + * standardized (by RFC 6901). We provide some experimental support for JSON pointers + * on non-document instances. Yet it is not the case when calling at_pointer on an array + * or an object instance: there is no rewind and no invalidation. + * + * You may only call at_pointer on an array after it has been created, but before it has + * been first accessed. When calling at_pointer on an array, the pointer is advanced to + * the location indicated by the JSON pointer (in case of success). It is no longer possible + * to call at_pointer on the same array. + * + * You may call at_pointer more than once on an object, but each time the pointer is advanced + * to be within the value matched by the key indicated by the JSON pointer query. 
Thus any preceding + * key (as well as the current key) can no longer be used with following JSON pointer calls. + * + * Also note that at_pointer() relies on find_field() which implies that we do not unescape keys when matching. + * + * @return The value associated with the given JSON pointer, or: + * - NO_SUCH_FIELD if a field does not exist in an object + * - INDEX_OUT_OF_BOUNDS if an array index is larger than an array length + * - INCORRECT_TYPE if a non-integer is used to access an array + * - INVALID_JSON_POINTER if the JSON pointer is invalid and cannot be parsed + */ + simdjson_really_inline simdjson_result at_pointer(std::string_view json_pointer) noexcept; + protected: /** * Create a value. @@ -20690,7 +23185,6 @@ protected: friend class field; friend class object; friend struct simdjson_result; - friend struct simdjson_result; friend struct simdjson_result; }; @@ -20711,8 +23205,11 @@ public: simdjson_really_inline simdjson_result get_object() noexcept; simdjson_really_inline simdjson_result get_uint64() noexcept; + simdjson_really_inline simdjson_result get_uint64_in_string() noexcept; simdjson_really_inline simdjson_result get_int64() noexcept; + simdjson_really_inline simdjson_result get_int64_in_string() noexcept; simdjson_really_inline simdjson_result get_double() noexcept; + simdjson_really_inline simdjson_result get_double_in_string() noexcept; simdjson_really_inline simdjson_result get_string() noexcept; simdjson_really_inline simdjson_result get_raw_json_string() noexcept; simdjson_really_inline simdjson_result get_bool() noexcept; @@ -20732,7 +23229,8 @@ public: simdjson_really_inline operator SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::raw_json_string() noexcept(false); simdjson_really_inline operator bool() noexcept(false); #endif - + simdjson_really_inline simdjson_result count_elements() & noexcept; + simdjson_really_inline simdjson_result at(size_t index) noexcept; simdjson_really_inline simdjson_result begin() & noexcept; 
simdjson_really_inline simdjson_result end() & noexcept; @@ -20798,6 +23296,8 @@ public: /** @copydoc simdjson_really_inline std::string_view value::raw_json_token() const noexcept */ simdjson_really_inline simdjson_result raw_json_token() noexcept; + + simdjson_really_inline simdjson_result at_pointer(std::string_view json_pointer) noexcept; }; } // namespace simdjson @@ -20895,7 +23395,6 @@ public: simdjson_really_inline simdjson_result begin() noexcept; simdjson_really_inline simdjson_result end() noexcept; - /** * Look up a field by name on an object (order-sensitive). * @@ -20952,10 +23451,57 @@ public: /** @overload simdjson_really_inline simdjson_result find_field_unordered(std::string_view key) & noexcept; */ simdjson_really_inline simdjson_result operator[](std::string_view key) && noexcept; + /** + * Get the value associated with the given JSON pointer. We use the RFC 6901 + * https://tools.ietf.org/html/rfc6901 standard, interpreting the current node + * as the root of its own JSON document. + * + * ondemand::parser parser; + * auto json = R"({ "foo": { "a": [ 10, 20, 30 ] }})"_padded; + * auto doc = parser.iterate(json); + * doc.at_pointer("/foo/a/1") == 20 + * + * It is allowed for a key to be the empty string: + * + * ondemand::parser parser; + * auto json = R"({ "": { "a": [ 10, 20, 30 ] }})"_padded; + * auto doc = parser.iterate(json); + * doc.at_pointer("//a/1") == 20 + * + * Note that at_pointer() called on the document automatically calls the document's rewind + * method between each call. It invalidates all previously accessed arrays, objects and values + * that have not been consumed. Yet it is not the case when calling at_pointer on an object + * instance: there is no rewind and no invalidation. + * + * You may call at_pointer more than once on an object, but each time the pointer is advanced + * to be within the value matched by the key indicated by the JSON pointer query. 
Thus any preceding + * key (as well as the current key) can no longer be used with following JSON pointer calls. + * + * Also note that at_pointer() relies on find_field() which implies that we do not unescape keys when matching. + * + * @return The value associated with the given JSON pointer, or: + * - NO_SUCH_FIELD if a field does not exist in an object + * - INDEX_OUT_OF_BOUNDS if an array index is larger than an array length + * - INCORRECT_TYPE if a non-integer is used to access an array + * - INVALID_JSON_POINTER if the JSON pointer is invalid and cannot be parsed + */ + inline simdjson_result at_pointer(std::string_view json_pointer) noexcept; + + /** + * Consumes the object and returns a string_view instance corresponding to the + * object as represented in JSON. It points inside the original byte array containing + * the JSON document. + */ + simdjson_really_inline simdjson_result raw_json() noexcept; + protected: + /** + * Go to the end of the object, no matter where you are right now. 
+ */ + simdjson_really_inline error_code consume() noexcept; static simdjson_really_inline simdjson_result start(value_iterator &iter) noexcept; static simdjson_really_inline simdjson_result start_root(value_iterator &iter) noexcept; - static simdjson_really_inline object started(value_iterator &iter) noexcept; + static simdjson_really_inline simdjson_result started(value_iterator &iter) noexcept; static simdjson_really_inline object resume(const value_iterator &iter) noexcept; simdjson_really_inline object(const value_iterator &iter) noexcept; @@ -20989,6 +23535,7 @@ public: simdjson_really_inline simdjson_result find_field_unordered(std::string_view key) && noexcept; simdjson_really_inline simdjson_result operator[](std::string_view key) & noexcept; simdjson_really_inline simdjson_result operator[](std::string_view key) && noexcept; + simdjson_really_inline simdjson_result at_pointer(std::string_view json_pointer) noexcept; }; } // namespace simdjson @@ -21003,6 +23550,23 @@ class array; class object; class value; class raw_json_string; +class document_stream; + +/** + * The default batch size for document_stream instances for this On Demand kernel. + * Note that different On Demand kernel may use a different DEFAULT_BATCH_SIZE value + * in the future. + */ +static constexpr size_t DEFAULT_BATCH_SIZE = 1000000; +/** + * Some adversary might try to set the batch size to 0 or 1, which might cause problems. + * We set a minimum of 32B since anything else is highly likely to be an error. In practice, + * most users will want a much larger batch size. + * + * All non-negative MINIMAL_BATCH_SIZE values should be 'safe' except that, obviously, no JSON + * document can ever span 0 or 1 byte and that very large values would create memory allocation issues. + */ +static constexpr size_t MINIMAL_BATCH_SIZE = 32; /** * A JSON fragment iterator. @@ -21016,11 +23580,12 @@ public: * * The new parser will have zero capacity. 
*/ - inline parser() noexcept = default; + inline explicit parser(size_t max_capacity = SIMDJSON_MAXSIZE_BYTES) noexcept; inline parser(parser &&other) noexcept = default; simdjson_really_inline parser(const parser &other) = delete; simdjson_really_inline parser &operator=(const parser &other) = delete; + simdjson_really_inline parser &operator=(parser &&other) noexcept = default; /** Deallocate the JSON parser. */ inline ~parser() noexcept = default; @@ -21031,6 +23596,11 @@ public: * ondemand::parser parser; * document doc = parser.iterate(json); * + * ### IMPORTANT: Validate what you use + * + * Calling iterate on an invalid JSON document may not immediately trigger an error. The call to + * iterate does not parse and validate the whole document. + * * ### IMPORTANT: Buffer Lifetime * * Because parsing is done while you iterate, you *must* keep the JSON buffer around at least as @@ -21101,8 +23671,6 @@ public: * those bytes are initialized to, as long as they are allocated. * * @param json The JSON to parse. - * @param len The length of the JSON. - * @param allocated The number of bytes allocated in the JSON (must be at least len+SIMDJSON_PADDING). * * @return The iterator, or an error: * - INSUFFICIENT_PADDING if the input has less than SIMDJSON_PADDING extra bytes. @@ -21115,21 +23683,88 @@ public: */ simdjson_warn_unused simdjson_result iterate_raw(padded_string_view json) & noexcept; + + /** + * Parse a buffer containing many JSON documents. + * + * auto json = R"({ "foo": 1 } { "foo": 2 } { "foo": 3 } )"_padded; + * ondemand::parser parser; + * ondemand::document_stream docs = parser.iterate_many(json); + * for (auto & doc : docs) { + * std::cout << doc["foo"] << std::endl; + * } + * // Prints 1 2 3 + * + * No copy of the input buffer is made. + * + * The function is lazy: it may be that no more than one JSON document at a time is parsed. 
+ * + * The caller is responsible for ensuring that the input string data remains unchanged and is + * not deleted during the loop. + * + * ### Format + * + * The buffer must contain a series of one or more JSON documents, concatenated into a single + * buffer, separated by whitespace. It effectively parses until it has a fully valid document, + * then starts parsing the next document at that point. (It does this with more parallelism and + * lookahead than you might think, though.) + * + * Documents that consist of an object or array may omit the whitespace between them, concatenating + * with no separator. Documents that consist of a single primitive (i.e. documents that are not + * arrays or objects) MUST be separated with whitespace. + * + * The documents must not exceed batch_size bytes (by default 1MB) or they will fail to parse. + * Setting batch_size to excessively large or excessively small values may negatively impact + * performance. + * + * ### REQUIRED: Buffer Padding + * + * The buffer must have at least SIMDJSON_PADDING extra allocated bytes. It does not matter what + * those bytes are initialized to, as long as they are allocated. + * + * ### Threads + * + * When compiled with SIMDJSON_THREADS_ENABLED, this method will use a single thread under the + * hood to do some lookahead. + * + * ### Parser Capacity + * + * If the parser's current capacity is less than batch_size, it will allocate enough capacity + * to handle it (up to max_capacity). + * + * @param buf The concatenated JSON to parse. + * @param len The length of the concatenated JSON. + * @param batch_size The batch size to use. MUST be larger than the largest document. The sweet + * spot is cache-related: small enough to fit in cache, yet big enough to + * parse as many documents as possible in one tight loop. + * Defaults to 1MB (DEFAULT_BATCH_SIZE), which has been a reasonable sweet spot in our tests. + * @return The stream, or an error. 
An empty input will yield 0 documents rather than an EMPTY error. 
Errors: + * - MEMALLOC if the parser does not have enough capacity and memory allocation fails + * - CAPACITY if the parser does not have enough capacity and batch_size > max_capacity. + * - other json errors if parsing fails. You should not rely on these errors to always be the same for the + * same document: they may vary under runtime dispatch (so they may vary depending on your system and hardware). + */ + inline simdjson_result iterate_many(const uint8_t *buf, size_t len, size_t batch_size = DEFAULT_BATCH_SIZE) noexcept; + /** @overload iterate_many(const uint8_t *buf, size_t len, size_t batch_size) */ + inline simdjson_result iterate_many(const char *buf, size_t len, size_t batch_size = DEFAULT_BATCH_SIZE) noexcept; + /** @overload iterate_many(const uint8_t *buf, size_t len, size_t batch_size) */ + inline simdjson_result iterate_many(const std::string &s, size_t batch_size = DEFAULT_BATCH_SIZE) noexcept; + inline simdjson_result iterate_many(const std::string &&s, size_t batch_size) = delete;// unsafe + /** @overload iterate_many(const uint8_t *buf, size_t len, size_t batch_size) */ + inline simdjson_result iterate_many(const padded_string &s, size_t batch_size = DEFAULT_BATCH_SIZE) noexcept; + inline simdjson_result iterate_many(const padded_string &&s, size_t batch_size) = delete;// unsafe + + /** @private We do not want to allow implicit conversion from C string to std::string. */ + simdjson_result iterate_many(const char *buf, size_t batch_size = DEFAULT_BATCH_SIZE) noexcept = delete; + /** The capacity of this parser (the largest document it can process). */ simdjson_really_inline size_t capacity() const noexcept; + /** The maximum capacity of this parser (the largest document it is allowed to process). */ + simdjson_really_inline size_t max_capacity() const noexcept; + simdjson_really_inline void set_max_capacity(size_t max_capacity) noexcept; /** The maximum depth of this parser (the most deeply nested objects and arrays it can process). 
*/ simdjson_really_inline size_t max_depth() const noexcept; -private: - /** @private [for benchmarking access] The implementation to use */ - std::unique_ptr implementation{}; - size_t _capacity{0}; - size_t _max_depth{DEFAULT_MAX_DEPTH}; - std::unique_ptr string_buf{}; -#ifdef SIMDJSON_DEVELOPMENT_CHECKS - std::unique_ptr start_positions{}; -#endif - /** * Ensure this parser has enough memory to process JSON documents up to `capacity` bytes in length * and `max_depth` depth. @@ -21140,7 +23775,28 @@ private: */ simdjson_warn_unused error_code allocate(size_t capacity, size_t max_depth=DEFAULT_MAX_DEPTH) noexcept; + #ifdef SIMDJSON_THREADS_ENABLED + /** + * The parser instance can use threads when they are available to speed up some + * operations. It is enabled by default. Changing this attribute will change the + * behavior of the parser for future operations. + */ + bool threaded{true}; + #endif + +private: + /** @private [for benchmarking access] The implementation to use */ + std::unique_ptr implementation{}; + size_t _capacity{0}; + size_t _max_capacity; + size_t _max_depth{DEFAULT_MAX_DEPTH}; + std::unique_ptr string_buf{}; +#ifdef SIMDJSON_DEVELOPMENT_CHECKS + std::unique_ptr start_positions{}; +#endif + friend class json_iterator; + friend class document_stream; }; } // namespace ondemand @@ -21159,6 +23815,417 @@ public: } // namespace simdjson /* end file include/simdjson/generic/ondemand/parser.h */ +/* begin file include/simdjson/generic/ondemand/document_stream.h */ +#ifdef SIMDJSON_THREADS_ENABLED +#include +#include +#include +#endif + +namespace simdjson { +namespace SIMDJSON_BUILTIN_IMPLEMENTATION { +namespace ondemand { + +class parser; +class json_iterator; +class document; + +#ifdef SIMDJSON_THREADS_ENABLED +/** @private Custom worker class **/ +struct stage1_worker { + stage1_worker() noexcept = default; + stage1_worker(const stage1_worker&) = delete; + stage1_worker(stage1_worker&&) = delete; + stage1_worker operator=(const stage1_worker&) = 
delete; + ~stage1_worker(); + /** + * We only start the thread when it is needed, not at object construction, this may throw. + * You should only call this once. + **/ + void start_thread(); + /** + * Start a stage 1 job. You should first call 'run', then 'finish'. + * You must call start_thread once before. + */ + void run(document_stream * ds, parser * stage1, size_t next_batch_start); + /** Wait for the run to finish (blocking). You should first call 'run', then 'finish'. **/ + void finish(); + +private: + + /** + * Normally, we would never stop the thread. But we do in the destructor. + * This function is only safe assuming that you are not waiting for results. You + * should have called run, then finish, and be done. + **/ + void stop_thread(); + + std::thread thread{}; + /** These three variables define the work done by the thread. **/ + ondemand::parser * stage1_thread_parser{}; + size_t _next_batch_start{}; + document_stream * owner{}; + /** + * We have two state variables. This could be streamlined to one variable in the future but + * we use two for clarity. + */ + bool has_work{false}; + bool can_work{true}; + + /** + * We lock using a mutex. + */ + std::mutex locking_mutex{}; + std::condition_variable cond_var{}; + + friend class document_stream; +}; +#endif // SIMDJSON_THREADS_ENABLED + +/** + * A forward-only stream of documents. + * + * Produced by parser::iterate_many. + * + */ +class document_stream { +public: + /** + * Construct an uninitialized document_stream. + * + * ```c++ + * document_stream docs; + * auto error = parser.iterate_many(json).get(docs); + * ``` + */ + simdjson_really_inline document_stream() noexcept; + /** Move one document_stream to another. */ + simdjson_really_inline document_stream(document_stream &&other) noexcept = default; + /** Move one document_stream to another. 
*/ + simdjson_really_inline document_stream &operator=(document_stream &&other) noexcept = default; + + simdjson_really_inline ~document_stream() noexcept; + + /** + * Returns the input size in bytes. + */ + inline size_t size_in_bytes() const noexcept; + + /** + * After iterating through the stream, this method + * returns the number of bytes that were not parsed at the end + * of the stream. If truncated_bytes() differs from zero, + * then the input was truncated maybe because incomplete JSON + * documents were found at the end of the stream. You + * may need to process the bytes in the interval [size_in_bytes()-truncated_bytes(), size_in_bytes()). + * + * You should only call truncated_bytes() after streaming through all + * documents, like so: + * + * document_stream stream = parser.iterate_many(json,window); + * for(auto & doc : stream) { + * // do something with doc + * } + * size_t truncated = stream.truncated_bytes(); + * + */ + inline size_t truncated_bytes() const noexcept; + + class iterator { + public: + using value_type = simdjson_result; + using reference = value_type; + + using difference_type = std::ptrdiff_t; + + using iterator_category = std::input_iterator_tag; + + /** + * Default constructor. + */ + simdjson_really_inline iterator() noexcept; + /** + * Get the current document (or error). + */ + simdjson_really_inline simdjson_result operator*() noexcept; + /** + * Advance to the next document (prefix). + */ + inline iterator& operator++() noexcept; + /** + * Check if we're at the end yet. + * @param other the end iterator to compare to. + */ + simdjson_really_inline bool operator!=(const iterator &other) const noexcept; + /** + * @private + * + * Gives the current index in the input document in bytes. 
+ * + * document_stream stream = parser.iterate_many(json,window); + * for(auto i = stream.begin(); i != stream.end(); ++i) { + * auto doc = *i; + * size_t index = i.current_index(); + * } + * + * This function (current_index()) is experimental and the usage + * may change in future versions of simdjson: we find the API somewhat + * awkward and we would like to offer something friendlier. + */ + simdjson_really_inline size_t current_index() const noexcept; + + /** + * @private + * + * Gives a view of the current document at the current position. + * + * document_stream stream = parser.iterate_many(json,window); + * for(auto i = stream.begin(); i != stream.end(); ++i) { + * std::string_view v = i.source(); + * } + * + * The returned string_view instance is simply a map to the (unparsed) + * source string: it may thus include white-space characters and all manner + * of padding. + * + * This function (source()) is experimental and the usage + * may change in future versions of simdjson: we find the API somewhat + * awkward and we would like to offer something friendlier. + * + */ + simdjson_really_inline std::string_view source() const noexcept; + + /** + * Returns error of the stream (if any). + */ + inline error_code error() const noexcept; + + private: + simdjson_really_inline iterator(document_stream *s, bool finished) noexcept; + /** The document_stream we're iterating through. */ + document_stream* stream; + /** Whether we're finished or not. */ + bool finished; + + friend class document; + friend class document_stream; + friend class json_iterator; + }; + + /** + * Start iterating the documents in the stream. + */ + simdjson_really_inline iterator begin() noexcept; + /** + * The end of the stream, for iterator comparison purposes. 
+ */ + simdjson_really_inline iterator end() noexcept; + +private: + + document_stream &operator=(const document_stream &) = delete; // Disallow copying + document_stream(const document_stream &other) = delete; // Disallow copying + + /** + * Construct a document_stream. Does not allocate or parse anything until the iterator is + * used. + * + * @param parser is a reference to the parser instance used to generate this document_stream + * @param buf is the raw byte buffer we need to process + * @param len is the length of the raw byte buffer in bytes + * @param batch_size is the size of the windows (must be strictly greater or equal to the largest JSON document) + */ + simdjson_really_inline document_stream( + ondemand::parser &parser, + const uint8_t *buf, + size_t len, + size_t batch_size + ) noexcept; + + /** + * Parse the first document in the buffer. Used by begin(), to handle allocation and + * initialization. + */ + inline void start() noexcept; + + /** + * Parse the next document found in the buffer previously given to document_stream. + * + * The content should be a valid JSON document encoded as UTF-8. If there is a + * UTF-8 BOM, the caller is responsible for omitting it, UTF-8 BOM are + * discouraged. + * + * You do NOT need to pre-allocate a parser. This function takes care of + * pre-allocating a capacity defined by the batch_size defined when creating the + * document_stream object. + * + * The function returns simdjson::EMPTY if there is no more data to be parsed. + * + * The function returns simdjson::SUCCESS (as integer = 0) in case of success + * and indicates that the buffer has successfully been parsed to the end. + * Every document it contained has been parsed without error. + * + * The function returns an error code from simdjson/simdjson.h in case of failure + * such as simdjson::CAPACITY, simdjson::MEMALLOC, simdjson::DEPTH_ERROR and so forth; + * the simdjson::error_message function converts these error codes into a string). 
+ * + * You can also check validity by calling parser.is_valid(). The same parser can + * and should be reused for the other documents in the buffer. + */ + inline void next() noexcept; + + /** Move the json_iterator of the document to the location of the next document in the stream. */ + inline void next_document() noexcept; + + /** Get the next document index. */ + inline size_t next_batch_start() const noexcept; + + /** Pass the next batch through stage 1 with the given parser. */ + inline error_code run_stage1(ondemand::parser &p, size_t batch_start) noexcept; + + // Fields + ondemand::parser *parser; + const uint8_t *buf; + size_t len; + size_t batch_size; + /** + * We are going to use just one document instance. The document owns + * the json_iterator. It implies that we only ever pass a reference + * to the document to the users. + */ + document doc{}; + /** The error (or lack thereof) from the current document. */ + error_code error; + size_t batch_start{0}; + size_t doc_index{}; + + #ifdef SIMDJSON_THREADS_ENABLED + /** Indicates whether we use threads. Note that this needs to be a constant during the execution of the parsing. */ + bool use_thread; + + inline void load_from_stage1_thread() noexcept; + + /** Start a thread to run stage 1 on the next batch. */ + inline void start_stage1_thread() noexcept; + + /** Wait for the stage 1 thread to finish and capture the results. */ + inline void finish_stage1_thread() noexcept; + + /** The error returned from the stage 1 thread. */ + error_code stage1_thread_error{UNINITIALIZED}; + /** The thread used to run stage 1 against the next batch in the background. */ + std::unique_ptr worker{new(std::nothrow) stage1_worker()}; + /** + * The parser used to run stage 1 in the background. Will be swapped + * with the regular parser when finished. 
+ */ + ondemand::parser stage1_thread_parser{}; + + friend struct stage1_worker; + #endif // SIMDJSON_THREADS_ENABLED + + friend class parser; + friend class document; + friend class json_iterator; + friend struct simdjson_result; + friend struct internal::simdjson_result_base; +}; // document_stream + +} // namespace ondemand +} // namespace SIMDJSON_BUILTIN_IMPLEMENTATION +} // namespace simdjson + +namespace simdjson { +template<> +struct simdjson_result : public SIMDJSON_BUILTIN_IMPLEMENTATION::implementation_simdjson_result_base { +public: + simdjson_really_inline simdjson_result(SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::document_stream &&value) noexcept; ///< @private + simdjson_really_inline simdjson_result(error_code error) noexcept; ///< @private + simdjson_really_inline simdjson_result() noexcept = default; +}; + +} // namespace simdjson +/* end file include/simdjson/generic/ondemand/document_stream.h */ +/* begin file include/simdjson/generic/ondemand/serialization.h */ + +namespace simdjson { +/** + * Create a string-view instance out of a document instance. The string-view instance + * contains JSON text that is suitable to be parsed as JSON again. + */ +inline simdjson_result to_json_string(SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::document& x) noexcept; +/** + * Create a string-view instance out of a value instance. The string-view instance + * contains JSON text that is suitable to be parsed as JSON again. The value must + * not have been accessed previously. + */ +inline simdjson_result to_json_string(SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::value& x) noexcept; +/** + * Create a string-view instance out of an object instance. The string-view instance + * contains JSON text that is suitable to be parsed as JSON again. + */ +inline simdjson_result to_json_string(SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::object& x) noexcept; +/** + * Create a string-view instance out of an array instance. 
The string-view instance + * contains JSON text that is suitable to be parsed as JSON again. + */ +inline simdjson_result to_json_string(SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::array& x) noexcept; +inline simdjson_result to_json_string(simdjson_result x); +inline simdjson_result to_json_string(simdjson_result x); +inline simdjson_result to_json_string(simdjson_result x); +inline simdjson_result to_json_string(simdjson_result x); +} // namespace simdjson + + +/** + * Print JSON to an output stream. + * + * @param out The output stream. + * @param value The element. + * @throw if there is an error with the underlying output stream. simdjson itself will not throw. + */ +inline std::ostream& operator<<(std::ostream& out, simdjson::SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::value x); +#if SIMDJSON_EXCEPTIONS +inline std::ostream& operator<<(std::ostream& out, simdjson::simdjson_result x); +#endif +/** + * Print JSON to an output stream. + * + * @param out The output stream. + * @param value The array. + * @throw if there is an error with the underlying output stream. simdjson itself will not throw. + */ +inline std::ostream& operator<<(std::ostream& out, simdjson::SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::array value); +#if SIMDJSON_EXCEPTIONS +inline std::ostream& operator<<(std::ostream& out, simdjson::simdjson_result x); +#endif +/** + * Print JSON to an output stream. + * + * @param out The output stream. + * @param value The array. + * @throw if there is an error with the underlying output stream. simdjson itself will not throw. 
+ */ +inline std::ostream& operator<<(std::ostream& out, simdjson::SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::document& value); +#if SIMDJSON_EXCEPTIONS +inline std::ostream& operator<<(std::ostream& out, simdjson::simdjson_result&& x); +#endif +inline std::ostream& operator<<(std::ostream& out, simdjson::SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::document_reference& value); +#if SIMDJSON_EXCEPTIONS +inline std::ostream& operator<<(std::ostream& out, simdjson::simdjson_result&& x); +#endif +/** + * Print JSON to an output stream. + * + * @param out The output stream. + * @param value The object. + * @throw if there is an error with the underlying output stream. simdjson itself will not throw. + */ +inline std::ostream& operator<<(std::ostream& out, simdjson::SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::object value); +#if SIMDJSON_EXCEPTIONS +inline std::ostream& operator<<(std::ostream& out, simdjson::simdjson_result x); +#endif +/* end file include/simdjson/generic/ondemand/serialization.h */ /* end file include/simdjson/generic/ondemand.h */ // Inline definitions @@ -21214,18 +24281,23 @@ simdjson_really_inline implementation_simdjson_result_base::operator T&&() && return std::forward>(*this).take_value(); } +#endif // SIMDJSON_EXCEPTIONS + template simdjson_really_inline const T& implementation_simdjson_result_base::value_unsafe() const& noexcept { return this->first; } +template +simdjson_really_inline T& implementation_simdjson_result_base::value_unsafe() & noexcept { + return this->first; +} + template simdjson_really_inline T&& implementation_simdjson_result_base::value_unsafe() && noexcept { return std::forward(this->first); } -#endif // SIMDJSON_EXCEPTIONS - template simdjson_really_inline implementation_simdjson_result_base::implementation_simdjson_result_base(T &&value, error_code error) noexcept : first{std::forward(value)}, second{error} {} @@ -21290,7 +24362,7 @@ static constexpr const int LOG_SMALL_BUFFER_LEN = 10; static int log_depth = 0; // Not 
threadsafe. Log only. // Helper to turn unprintable or newline characters into spaces -static simdjson_really_inline char printable_char(char c) { +static inline char printable_char(char c) { if (c >= 0x20) { return c; } else { @@ -21298,62 +24370,90 @@ static simdjson_really_inline char printable_char(char c) { } } -simdjson_really_inline void log_event(const json_iterator &iter, const char *type, std::string_view detail, int delta, int depth_delta) noexcept { +inline void log_event(const json_iterator &iter, const char *type, std::string_view detail, int delta, int depth_delta) noexcept { log_line(iter, "", type, detail, delta, depth_delta); } -simdjson_really_inline void log_value(const json_iterator &iter, token_position index, depth_t depth, const char *type, std::string_view detail) noexcept { +inline void log_value(const json_iterator &iter, token_position index, depth_t depth, const char *type, std::string_view detail) noexcept { log_line(iter, index, depth, "", type, detail); } -simdjson_really_inline void log_value(const json_iterator &iter, const char *type, std::string_view detail, int delta, int depth_delta) noexcept { +inline void log_value(const json_iterator &iter, const char *type, std::string_view detail, int delta, int depth_delta) noexcept { log_line(iter, "", type, detail, delta, depth_delta); } -simdjson_really_inline void log_start_value(const json_iterator &iter, token_position index, depth_t depth, const char *type, std::string_view detail) noexcept { +inline void log_start_value(const json_iterator &iter, token_position index, depth_t depth, const char *type, std::string_view detail) noexcept { log_line(iter, index, depth, "+", type, detail); if (LOG_ENABLED) { log_depth++; } } -simdjson_really_inline void log_start_value(const json_iterator &iter, const char *type, int delta, int depth_delta) noexcept { +inline void log_start_value(const json_iterator &iter, const char *type, int delta, int depth_delta) noexcept { log_line(iter, "+", 
type, "", delta, depth_delta); if (LOG_ENABLED) { log_depth++; } } -simdjson_really_inline void log_end_value(const json_iterator &iter, const char *type, int delta, int depth_delta) noexcept { +inline void log_end_value(const json_iterator &iter, const char *type, int delta, int depth_delta) noexcept { if (LOG_ENABLED) { log_depth--; } log_line(iter, "-", type, "", delta, depth_delta); } -simdjson_really_inline void log_error(const json_iterator &iter, const char *error, const char *detail, int delta, int depth_delta) noexcept { +inline void log_error(const json_iterator &iter, const char *error, const char *detail, int delta, int depth_delta) noexcept { log_line(iter, "ERROR: ", error, detail, delta, depth_delta); } -simdjson_really_inline void log_error(const json_iterator &iter, token_position index, depth_t depth, const char *error, const char *detail) noexcept { +inline void log_error(const json_iterator &iter, token_position index, depth_t depth, const char *error, const char *detail) noexcept { log_line(iter, index, depth, "ERROR: ", error, detail); } -simdjson_really_inline void log_event(const value_iterator &iter, const char *type, std::string_view detail, int delta, int depth_delta) noexcept { +inline void log_event(const value_iterator &iter, const char *type, std::string_view detail, int delta, int depth_delta) noexcept { log_event(iter.json_iter(), type, detail, delta, depth_delta); } -simdjson_really_inline void log_value(const value_iterator &iter, const char *type, std::string_view detail, int delta, int depth_delta) noexcept { +inline void log_value(const value_iterator &iter, const char *type, std::string_view detail, int delta, int depth_delta) noexcept { log_value(iter.json_iter(), type, detail, delta, depth_delta); } -simdjson_really_inline void log_start_value(const value_iterator &iter, const char *type, int delta, int depth_delta) noexcept { +inline void log_start_value(const value_iterator &iter, const char *type, int delta, int 
depth_delta) noexcept { log_start_value(iter.json_iter(), type, delta, depth_delta); } -simdjson_really_inline void log_end_value(const value_iterator &iter, const char *type, int delta, int depth_delta) noexcept { +inline void log_end_value(const value_iterator &iter, const char *type, int delta, int depth_delta) noexcept { log_end_value(iter.json_iter(), type, delta, depth_delta); } -simdjson_really_inline void log_error(const value_iterator &iter, const char *error, const char *detail, int delta, int depth_delta) noexcept { +inline void log_error(const value_iterator &iter, const char *error, const char *detail, int delta, int depth_delta) noexcept { log_error(iter.json_iter(), error, detail, delta, depth_delta); } -simdjson_really_inline void log_headers() noexcept { +inline void log_headers() noexcept { if (LOG_ENABLED) { + // Technically a static variable is not thread-safe, but if you are using threads + // and logging... well... + static bool displayed_hint{false}; log_depth = 0; printf("\n"); + if(!displayed_hint) { + // We only print this helpful header once. + printf("# Logging provides the depth and position of the iterator user-visible steps:\n"); + printf("# +array says 'this is where we were when we discovered the start array'\n"); + printf("# -array says 'this is where we were when we ended the array'\n"); + printf("# skip says 'this is a structural or value I am skipping'\n"); + printf("# +/-skip says 'this is a start/end array or object I am skipping'\n"); + printf("#\n"); + printf("# The identation of the terms (array, string,...) 
indicates the depth,\n"); + printf("# in addition to the depth being displayed.\n"); + printf("#\n"); + printf("# Every token in the document has a single depth determined by the tokens before it,\n"); + printf("# and is not affected by what the token actually is.\n"); + printf("#\n"); + printf("# Not all structural elements are presented as tokens in the logs.\n"); + printf("#\n"); + printf("# We never give control to the user within an empty array or an empty object.\n"); + printf("#\n"); + printf("# Inside an array, having a depth greater than the array's depth means that\n"); + printf("# we are pointing inside a value.\n"); + printf("# Having a depth equal to the array means that we are pointing right before a value.\n"); + printf("# Having a depth smaller than the array means that we have moved beyond the array.\n"); + displayed_hint = true; + } + printf("\n"); printf("| %-*s ", LOG_EVENT_LEN, "Event"); printf("| %-*s ", LOG_BUFFER_LEN, "Buffer"); printf("| %-*s ", LOG_SMALL_BUFFER_LEN, "Next"); @@ -21373,10 +24473,10 @@ simdjson_really_inline void log_headers() noexcept { } } -simdjson_really_inline void log_line(const json_iterator &iter, const char *title_prefix, const char *title, std::string_view detail, int delta, int depth_delta) noexcept { - log_line(iter, iter.token.index+delta, depth_t(iter.depth()+depth_delta), title_prefix, title, detail); +inline void log_line(const json_iterator &iter, const char *title_prefix, const char *title, std::string_view detail, int delta, int depth_delta) noexcept { + log_line(iter, iter.position()+delta, depth_t(iter.depth()+depth_delta), title_prefix, title, detail); } -simdjson_really_inline void log_line(const json_iterator &iter, token_position index, depth_t depth, const char *title_prefix, const char *title, std::string_view detail) noexcept { +inline void log_line(const json_iterator &iter, token_position index, depth_t depth, const char *title_prefix, const char *title, std::string_view detail) noexcept { if 
(LOG_ENABLED) { const int indent = depth*2; const auto buf = iter.token.buf; @@ -21618,13 +24718,20 @@ namespace simdjson { namespace SIMDJSON_BUILTIN_IMPLEMENTATION { namespace ondemand { -simdjson_really_inline token_iterator::token_iterator(const uint8_t *_buf, token_position _index) noexcept - : buf{_buf}, index{_index} +simdjson_really_inline token_iterator::token_iterator( + const uint8_t *_buf, + token_position position +) noexcept : buf{_buf}, _position{position} { } -simdjson_really_inline const uint8_t *token_iterator::advance() noexcept { - return &buf[*(index++)]; +simdjson_really_inline uint32_t token_iterator::current_offset() const noexcept { + return *(_position); +} + + +simdjson_really_inline const uint8_t *token_iterator::return_current_and_advance() noexcept { + return &buf[*(_position++)]; } simdjson_really_inline const uint8_t *token_iterator::peek(token_position position) const noexcept { @@ -21638,39 +24745,39 @@ simdjson_really_inline uint32_t token_iterator::peek_length(token_position posit } simdjson_really_inline const uint8_t *token_iterator::peek(int32_t delta) const noexcept { - return &buf[*(index+delta)]; + return &buf[*(_position+delta)]; } simdjson_really_inline uint32_t token_iterator::peek_index(int32_t delta) const noexcept { - return *(index+delta); + return *(_position+delta); } simdjson_really_inline uint32_t token_iterator::peek_length(int32_t delta) const noexcept { - return *(index+delta+1) - *(index+delta); + return *(_position+delta+1) - *(_position+delta); } simdjson_really_inline token_position token_iterator::position() const noexcept { - return index; + return _position; } -simdjson_really_inline void token_iterator::set_position(token_position target_checkpoint) noexcept { - index = target_checkpoint; +simdjson_really_inline void token_iterator::set_position(token_position target_position) noexcept { + _position = target_position; } simdjson_really_inline bool token_iterator::operator==(const token_iterator &other) 
const noexcept { - return index == other.index; + return _position == other._position; } simdjson_really_inline bool token_iterator::operator!=(const token_iterator &other) const noexcept { - return index != other.index; + return _position != other._position; } simdjson_really_inline bool token_iterator::operator>(const token_iterator &other) const noexcept { - return index > other.index; + return _position > other._position; } simdjson_really_inline bool token_iterator::operator>=(const token_iterator &other) const noexcept { - return index >= other.index; + return _position >= other._position; } simdjson_really_inline bool token_iterator::operator<(const token_iterator &other) const noexcept { - return index < other.index; + return _position < other._position; } simdjson_really_inline bool token_iterator::operator<=(const token_iterator &other) const noexcept { - return index <= other.index; + return _position <= other._position; } } // namespace ondemand @@ -21695,7 +24802,10 @@ simdjson_really_inline json_iterator::json_iterator(json_iterator &&other) noexc : token(std::forward(other.token)), parser{other.parser}, _string_buf_loc{other._string_buf_loc}, - _depth{other._depth} + error{other.error}, + _depth{other._depth}, + _root{other._root}, + _streaming{other._streaming} { other.parser = nullptr; } @@ -21703,18 +24813,34 @@ simdjson_really_inline json_iterator &json_iterator::operator=(json_iterator &&o token = other.token; parser = other.parser; _string_buf_loc = other._string_buf_loc; + error = other.error; _depth = other._depth; + _root = other._root; + _streaming = other._streaming; other.parser = nullptr; return *this; } simdjson_really_inline json_iterator::json_iterator(const uint8_t *buf, ondemand::parser *_parser) noexcept - : token(buf, _parser->implementation->structural_indexes.get()), + : token(buf, &_parser->implementation->structural_indexes[0]), parser{_parser}, _string_buf_loc{parser->string_buf.get()}, - _depth{1} + _depth{1}, + 
_root{parser->implementation->structural_indexes.get()}, + _streaming{false} + { logger::log_headers(); +#if SIMDJSON_CHECK_EOF + assert_more_tokens(); +#endif +} + +inline void json_iterator::rewind() noexcept { + token.set_position( root_position() ); + logger::log_headers(); // We start again + _string_buf_loc = parser->string_buf.get(); + _depth = 1; } // GCC 7 warns when the first line of this function is inlined away into oblivion due to the caller @@ -21723,9 +24849,19 @@ simdjson_really_inline json_iterator::json_iterator(const uint8_t *buf, ondemand SIMDJSON_PUSH_DISABLE_WARNINGS SIMDJSON_DISABLE_STRICT_OVERFLOW_WARNING simdjson_warn_unused simdjson_really_inline error_code json_iterator::skip_child(depth_t parent_depth) noexcept { + /*** + * WARNING: + * Inside an object, a string value is a depth of +1 compared to the object. Yet a key + * is at the same depth as the object. + * But json_iterator cannot easily tell whether we are pointing at a key or a string value. + * Instead, it assumes that if you are pointing at a string, then it is a value, not a key. + * To be clear... + * the following code assumes that we are *not* pointing at a key. If we are then a bug + * will follow. Unfortunately, it is not possible for the json_iterator its to make this + * check. + */ if (depth() <= parent_depth) { return SUCCESS; } - - switch (*advance()) { + switch (*return_current_and_advance()) { // TODO consider whether matching braces is a requirement: if non-matching braces indicates // *missing* braces, then future lookups are not in the object/arrays they think they are, // violating the rule "validate enough structure that the user can be confident they are @@ -21746,7 +24882,24 @@ simdjson_warn_unused simdjson_really_inline error_code json_iterator::skip_child logger::log_end_value(*this, "skip"); _depth--; if (depth() <= parent_depth) { return SUCCESS; } +#if SIMDJSON_CHECK_EOF + // If there are no more tokens, the parent is incomplete. 
+ if (at_end()) { return report_error(INCOMPLETE_ARRAY_OR_OBJECT, "Missing [ or { at start"); } +#endif // SIMDJSON_CHECK_EOF break; + /*case '"': + if(*peek() == ':') { + // we are at a key!!! This is + // only possible if someone searched + // for a key in an object and the key + // was not found but our code then + // decided the consume the separating + // comma before returning. + logger::log_value(*this, "key"); + advance(); // eat up the ':' + break; // important!!! + } + simdjson_fallthrough;*/ // Anything else must be a scalar value default: // For the first scalar, we will have incremented depth already, so we decrement it here. @@ -21757,9 +24910,8 @@ simdjson_warn_unused simdjson_really_inline error_code json_iterator::skip_child } // Now that we've considered the first value, we only increment/decrement for arrays/objects - auto end = &parser->implementation->structural_indexes[parser->implementation->n_structural_indexes]; - while (token.index <= end) { - switch (*advance()) { + while (position() < end_position()) { + switch (*return_current_and_advance()) { case '[': case '{': logger::log_start_value(*this, "skip"); _depth++; @@ -21786,23 +24938,53 @@ simdjson_warn_unused simdjson_really_inline error_code json_iterator::skip_child SIMDJSON_POP_DISABLE_WARNINGS simdjson_really_inline bool json_iterator::at_root() const noexcept { - return token.position() == root_checkpoint(); + return position() == root_position(); } -simdjson_really_inline token_position json_iterator::root_checkpoint() const noexcept { - return parser->implementation->structural_indexes.get(); +simdjson_really_inline bool json_iterator::streaming() const noexcept { + return _streaming; +} + +simdjson_really_inline token_position json_iterator::root_position() const noexcept { + return _root; } simdjson_really_inline void json_iterator::assert_at_root() const noexcept { SIMDJSON_ASSUME( _depth == 1 ); - // Visual Studio Clang treats unique_ptr.get() as "side effecting." 
#ifndef SIMDJSON_CLANG_VISUAL_STUDIO - SIMDJSON_ASSUME( token.index == parser->implementation->structural_indexes.get() ); + // Under Visual Studio, the next SIMDJSON_ASSUME fails with: the argument + // has side effects that will be discarded. + SIMDJSON_ASSUME( token.position() == _root ); #endif } -simdjson_really_inline bool json_iterator::at_eof() const noexcept { - return token.index == &parser->implementation->structural_indexes[parser->implementation->n_structural_indexes]; +simdjson_really_inline void json_iterator::assert_more_tokens(uint32_t required_tokens) const noexcept { + assert_valid_position(token._position + required_tokens - 1); +} + +simdjson_really_inline void json_iterator::assert_valid_position(token_position position) const noexcept { +#ifndef SIMDJSON_CLANG_VISUAL_STUDIO + SIMDJSON_ASSUME( position >= &parser->implementation->structural_indexes[0] ); + SIMDJSON_ASSUME( position < &parser->implementation->structural_indexes[parser->implementation->n_structural_indexes] ); +#endif +} + +simdjson_really_inline bool json_iterator::at_end() const noexcept { + return position() == end_position(); +} +simdjson_really_inline token_position json_iterator::end_position() const noexcept { + uint32_t n_structural_indexes{parser->implementation->n_structural_indexes}; + return &parser->implementation->structural_indexes[n_structural_indexes]; +} + +inline std::string json_iterator::to_string() const noexcept { + if( !is_alive() ) { return "dead json_iterator instance"; } + const char * current_structural = reinterpret_cast(token.peek()); + return std::string("json_iterator [ depth : ") + std::to_string(_depth) + + std::string(", structural : '") + std::string(current_structural,1) + + std::string("', offset : ") + std::to_string(token.current_offset()) + + std::string("', error : ") + error_message(error) + + std::string(" ]"); } simdjson_really_inline bool json_iterator::is_alive() const noexcept { @@ -21814,27 +24996,49 @@ simdjson_really_inline void 
json_iterator::abandon() noexcept { _depth = 0; } -simdjson_really_inline const uint8_t *json_iterator::advance() noexcept { - return token.advance(); +simdjson_really_inline const uint8_t *json_iterator::return_current_and_advance() noexcept { +#if SIMDJSON_CHECK_EOF + assert_more_tokens(); +#endif // SIMDJSON_CHECK_EOF + return token.return_current_and_advance(); +} + +simdjson_really_inline const uint8_t *json_iterator::unsafe_pointer() const noexcept { + // deliberately done without safety guard: + return token.peek(0); } simdjson_really_inline const uint8_t *json_iterator::peek(int32_t delta) const noexcept { +#if SIMDJSON_CHECK_EOF + assert_more_tokens(delta+1); +#endif // SIMDJSON_CHECK_EOF return token.peek(delta); } simdjson_really_inline uint32_t json_iterator::peek_length(int32_t delta) const noexcept { +#if SIMDJSON_CHECK_EOF + assert_more_tokens(delta+1); +#endif // #if SIMDJSON_CHECK_EOF return token.peek_length(delta); } simdjson_really_inline const uint8_t *json_iterator::peek(token_position position) const noexcept { + // todo: currently we require end-of-string buffering, but the following + // assert_valid_position should be turned on if/when we lift that condition. + // assert_valid_position(position); + // This is almost surely related to SIMDJSON_CHECK_EOF but given that SIMDJSON_CHECK_EOF + // is ON by default, we have no choice but to disable it for real with a comment. return token.peek(position); } simdjson_really_inline uint32_t json_iterator::peek_length(token_position position) const noexcept { +#if SIMDJSON_CHECK_EOF + assert_valid_position(position); +#endif // SIMDJSON_CHECK_EOF return token.peek_length(position); } -simdjson_really_inline token_position json_iterator::last_document_position() const noexcept { +simdjson_really_inline token_position json_iterator::last_position() const noexcept { // The following line fails under some compilers... 
// SIMDJSON_ASSUME(parser->implementation->n_structural_indexes > 0); // since it has side-effects. @@ -21843,7 +25047,7 @@ simdjson_really_inline token_position json_iterator::last_document_position() co return &parser->implementation->structural_indexes[n_structural_indexes - 1]; } simdjson_really_inline const uint8_t *json_iterator::peek_last() const noexcept { - return token.peek(last_document_position()); + return token.peek(last_position()); } simdjson_really_inline void json_iterator::ascend_to(depth_t parent_depth) noexcept { @@ -21876,6 +25080,7 @@ simdjson_really_inline error_code json_iterator::report_error(error_code _error, simdjson_really_inline token_position json_iterator::position() const noexcept { return token.position(); } + simdjson_really_inline void json_iterator::reenter_child(token_position position, depth_t child_depth) noexcept { SIMDJSON_ASSUME(child_depth >= 1 && child_depth < INT32_MAX); SIMDJSON_ASSUME(_depth == child_depth - 1); @@ -21889,9 +25094,11 @@ simdjson_really_inline void json_iterator::reenter_child(token_position position } #ifdef SIMDJSON_DEVELOPMENT_CHECKS + simdjson_really_inline token_position json_iterator::start_position(depth_t depth) const noexcept { return parser->start_positions[depth]; } + simdjson_really_inline void json_iterator::set_start_position(depth_t depth, token_position position) noexcept { parser->start_positions[depth] = position; } @@ -21907,9 +25114,11 @@ simdjson_really_inline error_code json_iterator::optional_error(error_code _erro template simdjson_warn_unused simdjson_really_inline bool json_iterator::copy_to_buffer(const uint8_t *json, uint32_t max_len, uint8_t (&tmpbuf)[N]) noexcept { + // Let us guard against silly cases: + if((N < max_len) || (N == 0)) { return false; } // Truncate whitespace to fit the buffer. 
if (max_len > N-1) { - if (jsoncharutils::is_not_structural_or_whitespace(json[N-1])) { return false; } + // if (jsoncharutils::is_not_structural_or_whitespace(json[N-1])) { return false; } max_len = N-1; } @@ -21919,20 +25128,6 @@ simdjson_warn_unused simdjson_really_inline bool json_iterator::copy_to_buffer(c return true; } -template -simdjson_warn_unused simdjson_really_inline bool json_iterator::peek_to_buffer(uint8_t (&tmpbuf)[N]) noexcept { - auto max_len = token.peek_length(); - auto json = token.peek(); - return copy_to_buffer(json, max_len, tmpbuf); -} - -template -simdjson_warn_unused simdjson_really_inline bool json_iterator::advance_to_buffer(uint8_t (&tmpbuf)[N]) noexcept { - auto max_len = peek_length(); - auto json = advance(); - return copy_to_buffer(json, max_len, tmpbuf); -} - } // namespace ondemand } // namespace SIMDJSON_BUILTIN_IMPLEMENTATION } // namespace simdjson @@ -21951,54 +25146,72 @@ namespace simdjson { namespace SIMDJSON_BUILTIN_IMPLEMENTATION { namespace ondemand { -simdjson_really_inline value_iterator::value_iterator(json_iterator *json_iter, depth_t depth, token_position start_index) noexcept - : _json_iter{json_iter}, - _depth{depth}, - _start_position{start_index} +simdjson_really_inline value_iterator::value_iterator( + json_iterator *json_iter, + depth_t depth, + token_position start_position +) noexcept : _json_iter{json_iter}, _depth{depth}, _start_position{start_position} { } simdjson_warn_unused simdjson_really_inline simdjson_result value_iterator::start_object() noexcept { - const uint8_t *json; - SIMDJSON_TRY( advance_container_start("object", json) ); - if (*json != '{') { return incorrect_type_error("Not an object"); } + SIMDJSON_TRY( start_container('{', "Not an object", "object") ); return started_object(); } simdjson_warn_unused simdjson_really_inline simdjson_result value_iterator::start_root_object() noexcept { - bool result; - SIMDJSON_TRY( start_object().get(result) ); - if (*_json_iter->peek_last() != '}') { 
return _json_iter->report_error(TAPE_ERROR, "object invalid: { at beginning of document unmatched by } at end of document"); } - return result; + SIMDJSON_TRY( start_container('{', "Not an object", "object") ); + return started_root_object(); } -simdjson_warn_unused simdjson_really_inline bool value_iterator::started_object() noexcept { +simdjson_warn_unused simdjson_really_inline simdjson_result value_iterator::started_object() noexcept { assert_at_container_start(); +#ifdef SIMDJSON_DEVELOPMENT_CHECKS + _json_iter->set_start_position(_depth, start_position()); +#endif if (*_json_iter->peek() == '}') { logger::log_value(*_json_iter, "empty object"); - _json_iter->advance(); - _json_iter->ascend_to(depth()-1); + _json_iter->return_current_and_advance(); + end_container(); return false; } - logger::log_start_value(*_json_iter, "object"); -#ifdef SIMDJSON_DEVELOPMENT_CHECKS - _json_iter->set_start_position(_depth, _start_position); -#endif return true; } +simdjson_warn_unused simdjson_really_inline simdjson_result value_iterator::started_root_object() noexcept { + // When in streaming mode, we cannot expect peek_last() to be the last structural element of the + // current document. It only works in the normal mode where we have indexed a single document. + // Note that adding a check for 'streaming' is not expensive since we only have at most + // one root element. + if (! 
_json_iter->streaming() && (*_json_iter->peek_last() != '}')) { + return report_error(INCOMPLETE_ARRAY_OR_OBJECT, "missing } at end"); + } + return started_object(); +} + +simdjson_warn_unused simdjson_really_inline error_code value_iterator::end_container() noexcept { +#if SIMDJSON_CHECK_EOF + if (depth() > 1 && at_end()) { return report_error(INCOMPLETE_ARRAY_OR_OBJECT, "missing parent ] or }"); } + // if (depth() <= 1 && !at_end()) { return report_error(INCOMPLETE_ARRAY_OR_OBJECT, "missing [ or { at start"); } +#endif // SIMDJSON_CHECK_EOF + _json_iter->ascend_to(depth()-1); + return SUCCESS; +} + simdjson_warn_unused simdjson_really_inline simdjson_result value_iterator::has_next_field() noexcept { assert_at_next(); - switch (*_json_iter->advance()) { + // It's illegal to call this unless there are more tokens: anything that ends in } or ] is + // obligated to verify there are more tokens if they are not the top level. + switch (*_json_iter->return_current_and_advance()) { case '}': logger::log_end_value(*_json_iter, "object"); - _json_iter->ascend_to(depth()-1); + SIMDJSON_TRY( end_container() ); return false; case ',': return true; default: - return _json_iter->report_error(TAPE_ERROR, "Missing comma between object fields"); + return report_error(TAPE_ERROR, "Missing comma between object fields"); } } @@ -22014,7 +25227,6 @@ simdjson_warn_unused simdjson_really_inline simdjson_result value_iterator // { "a": [ 1, 2 ], "b": [ 3, 4 ] } // ^ (depth 2, index 1) // ``` - // if (at_first_field()) { has_value = true; @@ -22035,7 +25247,7 @@ simdjson_warn_unused simdjson_really_inline simdjson_result value_iterator // this object iterator will blithely scan that object for fields. if (_json_iter->depth() < depth() - 1) { return OUT_OF_ORDER_ITERATION; } #endif - has_value = false; + return false; // 3. 
When a previous search found a field or an iterator yielded a value: // @@ -22055,15 +25267,19 @@ simdjson_warn_unused simdjson_really_inline simdjson_result value_iterator if ((error = skip_child() )) { abandon(); return error; } if ((error = has_next_field().get(has_value) )) { abandon(); return error; } #ifdef SIMDJSON_DEVELOPMENT_CHECKS - if (_json_iter->start_position(_depth) != _start_position) { return OUT_OF_ORDER_ITERATION; } + if (_json_iter->start_position(_depth) != start_position()) { return OUT_OF_ORDER_ITERATION; } #endif } while (has_value) { // Get the key and colon, stopping at the value. raw_json_string actual_key; // size_t max_key_length = _json_iter->peek_length() - 2; // -2 for the two quotes + // Note: _json_iter->peek_length() - 2 might overflow if _json_iter->peek_length() < 2. + // field_key() advances the pointer and checks that '"' is found (corresponding to a key). + // The depth is left unchanged by field_key(). if ((error = field_key().get(actual_key) )) { abandon(); return error; }; - + // field_value() will advance and check that we find a ':' separating the + // key and the value. It will also increment the depth by one. if ((error = field_value() )) { abandon(); return error; } // If it matches, stop and return // We could do it this way if we wanted to allow arbitrary @@ -22075,12 +25291,18 @@ simdjson_warn_unused simdjson_really_inline simdjson_result value_iterator // input). if (actual_key.unsafe_is_equal(key)) { logger::log_event(*this, "match", key, -2); + // If we return here, then we return while pointing at the ':' that we just checked. return true; } // No match: skip the value and see if , or } is next logger::log_event(*this, "no match", key, -2); + // The call to skip_child is meant to skip over the value corresponding to the key. + // After skip_child(), we are right before the next comma (',') or the final brace ('}'). 
SIMDJSON_TRY( skip_child() ); // Skip the value entirely + // The has_next_field() advances the pointer and check that either ',' or '}' is found. + // It returns true if ',' is found, false otherwise. If anything other than ',' or '}' is found, + // then we are in error and we abort. if ((error = has_next_field().get(has_value) )) { abandon(); return error; } } @@ -22089,20 +25311,33 @@ simdjson_warn_unused simdjson_really_inline simdjson_result value_iterator } simdjson_warn_unused simdjson_really_inline simdjson_result value_iterator::find_field_unordered_raw(const std::string_view key) noexcept { + /** + * When find_field_unordered_raw is called, we can either be pointing at the + * first key, pointing outside (at the closing brace) or if a key was matched + * we can be either pointing right afterthe ':' right before the value (that we need skip), + * or we may have consumed the value and we might be at a comma or at the + * final brace (ready for a call to has_next_field()). + */ error_code error; bool has_value; - // + + // First, we scan from that point to the end. + // If we don't find a match, we may loop back around, and scan from the beginning to that point. + token_position search_start = _json_iter->position(); + + // We want to know whether we need to go back to the beginning. + bool at_first = at_first_field(); + /////////////// // Initially, the object can be in one of a few different places: // - // 1. The start of the object, at the first field: + // 1. At the first key: // // ``` // { "a": [ 1, 2 ], "b": [ 3, 4 ] } // ^ (depth 2, index 1) // ``` // - if (at_first_field()) { - // If we're at the beginning of the object, we definitely have a field + if (at_first) { has_value = true; // 2. 
When a previous search did not yield a value or the object is empty: @@ -22115,14 +25350,15 @@ simdjson_warn_unused simdjson_really_inline simdjson_result value_iterator // ``` // } else if (!is_open()) { + #ifdef SIMDJSON_DEVELOPMENT_CHECKS // If we're past the end of the object, we're being iterated out of order. // Note: this isn't perfect detection. It's possible the user is inside some other object; if so, // this object iterator will blithely scan that object for fields. if (_json_iter->depth() < depth() - 1) { return OUT_OF_ORDER_ITERATION; } #endif - has_value = false; - + SIMDJSON_TRY(reset_object().get(has_value)); + at_first = true; // 3. When a previous search found a field or an iterator yielded a value: // // ``` @@ -22138,11 +25374,14 @@ simdjson_warn_unused simdjson_really_inline simdjson_result value_iterator // ``` // } else { - // Finish the previous value and see if , or } is next + // If someone queried a key but they not did access the value, then we are left pointing + // at the ':' and we need to move forward through the value... If the value was + // processed then skip_child() does not move the iterator (but may adjust the depth). if ((error = skip_child() )) { abandon(); return error; } + search_start = _json_iter->position(); if ((error = has_next_field().get(has_value) )) { abandon(); return error; } #ifdef SIMDJSON_DEVELOPMENT_CHECKS - if (_json_iter->start_position(_depth) != _start_position) { return OUT_OF_ORDER_ITERATION; } + if (_json_iter->start_position(_depth) != start_position()) { return OUT_OF_ORDER_ITERATION; } #endif } @@ -22159,11 +25398,6 @@ simdjson_warn_unused simdjson_really_inline simdjson_result value_iterator // ^ (depth 0) // ``` // - - // First, we scan from that point to the end. - // If we don't find a match, we loop back around, and scan from the beginning to that point. - token_position search_start = _json_iter->position(); - // Next, we find a match starting from the current position. 
while (has_value) { SIMDJSON_ASSUME( _json_iter->_depth == _depth ); // We must be at the start of a field @@ -22171,8 +25405,12 @@ simdjson_warn_unused simdjson_really_inline simdjson_result value_iterator // Get the key and colon, stopping at the value. raw_json_string actual_key; // size_t max_key_length = _json_iter->peek_length() - 2; // -2 for the two quotes - + // Note: _json_iter->peek_length() - 2 might overflow if _json_iter->peek_length() < 2. + // field_key() advances the pointer and checks that '"' is found (corresponding to a key). + // The depth is left unchanged by field_key(). if ((error = field_key().get(actual_key) )) { abandon(); return error; }; + // field_value() will advance and check that we find a ':' separating the + // key and the value. It will also increment the depth by one. if ((error = field_value() )) { abandon(); return error; } // If it matches, stop and return @@ -22185,31 +25423,44 @@ simdjson_warn_unused simdjson_really_inline simdjson_result value_iterator // input). if (actual_key.unsafe_is_equal(key)) { logger::log_event(*this, "match", key, -2); + // If we return here, then we return while pointing at the ':' that we just checked. return true; } // No match: skip the value and see if , or } is next logger::log_event(*this, "no match", key, -2); + // The call to skip_child is meant to skip over the value corresponding to the key. + // After skip_child(), we are right before the next comma (',') or the final brace ('}'). SIMDJSON_TRY( skip_child() ); + // The has_next_field() advances the pointer and check that either ',' or '}' is found. + // It returns true if ',' is found, false otherwise. If anything other than ',' or '}' is found, + // then we are in error and we abort. if ((error = has_next_field().get(has_value) )) { abandon(); return error; } } + // Performance note: it maybe wasteful to rewind to the beginning when there might be + // no other query following. Indeed, it would require reskipping the whole object. 
+ // Instead, you can just stay where you are. If there is a new query, there is always time + // to rewind. + if(at_first) { return false; } // If we reach the end without finding a match, search the rest of the fields starting at the // beginning of the object. // (We have already run through the object before, so we've already validated its structure. We // don't check errors in this bit.) - _json_iter->reenter_child(_start_position + 1, _depth); - - has_value = started_object(); - while (_json_iter->position() < search_start) { + SIMDJSON_TRY(reset_object().get(has_value)); + while (true) { SIMDJSON_ASSUME(has_value); // we should reach search_start before ever reaching the end of the object SIMDJSON_ASSUME( _json_iter->_depth == _depth ); // We must be at the start of a field // Get the key and colon, stopping at the value. raw_json_string actual_key; // size_t max_key_length = _json_iter->peek_length() - 2; // -2 for the two quotes - + // Note: _json_iter->peek_length() - 2 might overflow if _json_iter->peek_length() < 2. + // field_key() advances the pointer and checks that '"' is found (corresponding to a key). + // The depth is left unchanged by field_key(). error = field_key().get(actual_key); SIMDJSON_ASSUME(!error); + // field_value() will advance and check that we find a ':' separating the + // key and the value. It will also increment the depth by one. error = field_value(); SIMDJSON_ASSUME(!error); // If it matches, stop and return @@ -22222,78 +25473,104 @@ simdjson_warn_unused simdjson_really_inline simdjson_result value_iterator // input). if (actual_key.unsafe_is_equal(key)) { logger::log_event(*this, "match", key, -2); + // If we return here, then we return while pointing at the ':' that we just checked. return true; } // No match: skip the value and see if , or } is next logger::log_event(*this, "no match", key, -2); + // The call to skip_child is meant to skip over the value corresponding to the key. 
+ // After skip_child(), we are right before the next comma (',') or the final brace ('}'). SIMDJSON_TRY( skip_child() ); + // If we reached the end of the key-value pair we started from, then we know + // that the key is not there so we return false. We are either right before + // the next comma or the final brace. + if(_json_iter->position() == search_start) { return false; } + // The has_next_field() advances the pointer and check that either ',' or '}' is found. + // It returns true if ',' is found, false otherwise. If anything other than ',' or '}' is found, + // then we are in error and we abort. error = has_next_field().get(has_value); SIMDJSON_ASSUME(!error); + // If we make the mistake of exiting here, then we could be left pointing at a key + // in the middle of an object. That's not an allowable state. } - - // If the loop ended, we're out of fields to look at. + // If the loop ended, we're out of fields to look at. The program should + // never reach this point. return false; } simdjson_warn_unused simdjson_really_inline simdjson_result value_iterator::field_key() noexcept { assert_at_next(); - const uint8_t *key = _json_iter->advance(); - if (*(key++) != '"') { return _json_iter->report_error(TAPE_ERROR, "Object key is not a string"); } + const uint8_t *key = _json_iter->return_current_and_advance(); + if (*(key++) != '"') { return report_error(TAPE_ERROR, "Object key is not a string"); } return raw_json_string(key); } simdjson_warn_unused simdjson_really_inline error_code value_iterator::field_value() noexcept { assert_at_next(); - if (*_json_iter->advance() != ':') { return _json_iter->report_error(TAPE_ERROR, "Missing colon in object field"); } + if (*_json_iter->return_current_and_advance() != ':') { return report_error(TAPE_ERROR, "Missing colon in object field"); } _json_iter->descend_to(depth()+1); return SUCCESS; } simdjson_warn_unused simdjson_really_inline simdjson_result value_iterator::start_array() noexcept { - const uint8_t *json; - 
SIMDJSON_TRY( advance_container_start("array", json) ); - if (*json != '[') { return incorrect_type_error("Not an array"); } + SIMDJSON_TRY( start_container('[', "Not an array", "array") ); return started_array(); } simdjson_warn_unused simdjson_really_inline simdjson_result value_iterator::start_root_array() noexcept { - bool result; - SIMDJSON_TRY( start_array().get(result) ); - if (*_json_iter->peek_last() != ']') { return _json_iter->report_error(TAPE_ERROR, "array invalid: [ at beginning of document unmatched by ] at end of document"); } - return result; + SIMDJSON_TRY( start_container('[', "Not an array", "array") ); + return started_root_array(); } -simdjson_warn_unused simdjson_really_inline bool value_iterator::started_array() noexcept { +inline std::string value_iterator::to_string() const noexcept { + auto answer = std::string("value_iterator [ depth : ") + std::to_string(_depth) + std::string(", "); + if(_json_iter != nullptr) { answer += _json_iter->to_string(); } + answer += std::string(" ]"); + return answer; +} + +simdjson_warn_unused simdjson_really_inline simdjson_result value_iterator::started_array() noexcept { assert_at_container_start(); if (*_json_iter->peek() == ']') { logger::log_value(*_json_iter, "empty array"); - _json_iter->advance(); - _json_iter->ascend_to(depth()-1); + _json_iter->return_current_and_advance(); + SIMDJSON_TRY( end_container() ); return false; } - logger::log_start_value(*_json_iter, "array"); _json_iter->descend_to(depth()+1); #ifdef SIMDJSON_DEVELOPMENT_CHECKS - _json_iter->set_start_position(_depth, _start_position); + _json_iter->set_start_position(_depth, start_position()); #endif return true; } +simdjson_warn_unused simdjson_really_inline simdjson_result value_iterator::started_root_array() noexcept { + // When in streaming mode, we cannot expect peek_last() to be the last structural element of the + // current document. It only works in the normal mode where we have indexed a single document. 
+ // Note that adding a check for 'streaming' is not expensive since we only have at most + // one root element. + if ( ! _json_iter->streaming() && (*_json_iter->peek_last() != ']')) { + return report_error(INCOMPLETE_ARRAY_OR_OBJECT, "missing ] at end"); + } + return started_array(); +} + simdjson_warn_unused simdjson_really_inline simdjson_result value_iterator::has_next_element() noexcept { assert_at_next(); - switch (*_json_iter->advance()) { + logger::log_event(*this, "has_next_element"); + switch (*_json_iter->return_current_and_advance()) { case ']': logger::log_end_value(*_json_iter, "array"); - _json_iter->ascend_to(depth()-1); + SIMDJSON_TRY( end_container() ); return false; case ',': _json_iter->descend_to(depth()+1); return true; default: - return _json_iter->report_error(TAPE_ERROR, "Missing comma between array elements"); + return report_error(TAPE_ERROR, "Missing comma between array elements"); } } @@ -22312,24 +25589,50 @@ simdjson_warn_unused simdjson_really_inline simdjson_result va return get_raw_json_string().unescape(_json_iter->string_buf_loc()); } simdjson_warn_unused simdjson_really_inline simdjson_result value_iterator::get_raw_json_string() noexcept { - auto json = advance_start("string"); + auto json = peek_scalar("string"); if (*json != '"') { return incorrect_type_error("Not a string"); } + advance_scalar("string"); return raw_json_string(json+1); } simdjson_warn_unused simdjson_really_inline simdjson_result value_iterator::get_uint64() noexcept { - return numberparsing::parse_unsigned(advance_non_root_scalar("uint64")); + auto result = numberparsing::parse_unsigned(peek_non_root_scalar("uint64")); + if(result.error() != INCORRECT_TYPE) { advance_non_root_scalar("uint64"); } + return result; +} +simdjson_warn_unused simdjson_really_inline simdjson_result value_iterator::get_uint64_in_string() noexcept { + auto result = numberparsing::parse_unsigned_in_string(peek_non_root_scalar("uint64")); + if(result.error() != INCORRECT_TYPE) { 
advance_non_root_scalar("uint64"); } + return result; } simdjson_warn_unused simdjson_really_inline simdjson_result value_iterator::get_int64() noexcept { - return numberparsing::parse_integer(advance_non_root_scalar("int64")); + auto result = numberparsing::parse_integer(peek_non_root_scalar("int64")); + if(result.error() != INCORRECT_TYPE) { advance_non_root_scalar("int64"); } + return result; +} +simdjson_warn_unused simdjson_really_inline simdjson_result value_iterator::get_int64_in_string() noexcept { + auto result = numberparsing::parse_integer_in_string(peek_non_root_scalar("int64")); + if(result.error() != INCORRECT_TYPE) { advance_non_root_scalar("int64"); } + return result; } simdjson_warn_unused simdjson_really_inline simdjson_result value_iterator::get_double() noexcept { - return numberparsing::parse_double(advance_non_root_scalar("double")); + auto result = numberparsing::parse_double(peek_non_root_scalar("double")); + if(result.error() != INCORRECT_TYPE) { advance_non_root_scalar("double"); } + return result; +} +simdjson_warn_unused simdjson_really_inline simdjson_result value_iterator::get_double_in_string() noexcept { + auto result = numberparsing::parse_double_in_string(peek_non_root_scalar("double")); + if(result.error() != INCORRECT_TYPE) { advance_non_root_scalar("double"); } + return result; } simdjson_warn_unused simdjson_really_inline simdjson_result value_iterator::get_bool() noexcept { - return parse_bool(advance_non_root_scalar("bool")); + auto result = parse_bool(peek_non_root_scalar("bool")); + if(result.error() != INCORRECT_TYPE) { advance_non_root_scalar("bool"); } + return result; } simdjson_really_inline bool value_iterator::is_null() noexcept { - return parse_null(advance_non_root_scalar("null")); + auto result = parse_null(peek_non_root_scalar("null")); + if(result) { advance_non_root_scalar("null"); } + return result; } constexpr const uint32_t MAX_INT_LENGTH = 1024; @@ -22342,42 +25645,103 @@ simdjson_warn_unused 
simdjson_really_inline simdjson_result val } simdjson_warn_unused simdjson_really_inline simdjson_result value_iterator::get_root_uint64() noexcept { auto max_len = peek_start_length(); - auto json = advance_root_scalar("uint64"); + auto json = peek_root_scalar("uint64"); uint8_t tmpbuf[20+1]; // <20 digits> is the longest possible unsigned integer - if (!_json_iter->copy_to_buffer(json, max_len, tmpbuf)) { logger::log_error(*_json_iter, _start_position, depth(), "Root number more than 20 characters"); return NUMBER_ERROR; } - return numberparsing::parse_unsigned(tmpbuf); + if (!_json_iter->copy_to_buffer(json, max_len, tmpbuf)) { + logger::log_error(*_json_iter, start_position(), depth(), "Root number more than 20 characters"); + return NUMBER_ERROR; + } + auto result = numberparsing::parse_unsigned(tmpbuf); + if(result.error() != INCORRECT_TYPE) { advance_root_scalar("uint64"); } + return result; +} +simdjson_warn_unused simdjson_really_inline simdjson_result value_iterator::get_root_uint64_in_string() noexcept { + auto max_len = peek_start_length(); + auto json = peek_root_scalar("uint64"); + uint8_t tmpbuf[20+1]; // <20 digits> is the longest possible unsigned integer + if (!_json_iter->copy_to_buffer(json, max_len, tmpbuf)) { + logger::log_error(*_json_iter, start_position(), depth(), "Root number more than 20 characters"); + return NUMBER_ERROR; + } + auto result = numberparsing::parse_unsigned_in_string(tmpbuf); + if(result.error() != INCORRECT_TYPE) { advance_root_scalar("uint64"); } + return result; } simdjson_warn_unused simdjson_really_inline simdjson_result value_iterator::get_root_int64() noexcept { auto max_len = peek_start_length(); - auto json = advance_root_scalar("int64"); + auto json = peek_root_scalar("int64"); uint8_t tmpbuf[20+1]; // -<19 digits> is the longest possible integer - if (!_json_iter->copy_to_buffer(json, max_len, tmpbuf)) { logger::log_error(*_json_iter, _start_position, depth(), "Root number more than 20 characters"); return 
NUMBER_ERROR; } - return numberparsing::parse_integer(tmpbuf); + if (!_json_iter->copy_to_buffer(json, max_len, tmpbuf)) { + logger::log_error(*_json_iter, start_position(), depth(), "Root number more than 20 characters"); + return NUMBER_ERROR; + } + + auto result = numberparsing::parse_integer(tmpbuf); + if(result.error() != INCORRECT_TYPE) { advance_root_scalar("int64"); } + return result; +} +simdjson_warn_unused simdjson_really_inline simdjson_result value_iterator::get_root_int64_in_string() noexcept { + auto max_len = peek_start_length(); + auto json = peek_root_scalar("int64"); + uint8_t tmpbuf[20+1]; // -<19 digits> is the longest possible integer + if (!_json_iter->copy_to_buffer(json, max_len, tmpbuf)) { + logger::log_error(*_json_iter, start_position(), depth(), "Root number more than 20 characters"); + return NUMBER_ERROR; + } + + auto result = numberparsing::parse_integer_in_string(tmpbuf); + if(result.error() != INCORRECT_TYPE) { advance_root_scalar("int64"); } + return result; } simdjson_warn_unused simdjson_really_inline simdjson_result value_iterator::get_root_double() noexcept { auto max_len = peek_start_length(); - auto json = advance_root_scalar("double"); - // Per https://www.exploringbinary.com/maximum-number-of-decimal-digits-in-binary-floating-point-numbers/, 1074 is the maximum number of significant fractional digits. Add 8 more digits for the biggest number: -0.e-308. + auto json = peek_root_scalar("double"); + // Per https://www.exploringbinary.com/maximum-number-of-decimal-digits-in-binary-floating-point-numbers/, + // 1074 is the maximum number of significant fractional digits. Add 8 more digits for the biggest + // number: -0.e-308. 
uint8_t tmpbuf[1074+8+1]; - if (!_json_iter->copy_to_buffer(json, max_len, tmpbuf)) { logger::log_error(*_json_iter, _start_position, depth(), "Root number more than 1082 characters"); return NUMBER_ERROR; } - return numberparsing::parse_double(tmpbuf); + if (!_json_iter->copy_to_buffer(json, max_len, tmpbuf)) { + logger::log_error(*_json_iter, start_position(), depth(), "Root number more than 1082 characters"); + return NUMBER_ERROR; + } + auto result = numberparsing::parse_double(tmpbuf); + if(result.error() != INCORRECT_TYPE) { advance_root_scalar("double"); } + return result; +} +simdjson_warn_unused simdjson_really_inline simdjson_result value_iterator::get_root_double_in_string() noexcept { + auto max_len = peek_start_length(); + auto json = peek_root_scalar("double"); + // Per https://www.exploringbinary.com/maximum-number-of-decimal-digits-in-binary-floating-point-numbers/, + // 1074 is the maximum number of significant fractional digits. Add 8 more digits for the biggest + // number: -0.e-308. 
+ uint8_t tmpbuf[1074+8+1]; + if (!_json_iter->copy_to_buffer(json, max_len, tmpbuf)) { + logger::log_error(*_json_iter, start_position(), depth(), "Root number more than 1082 characters"); + return NUMBER_ERROR; + } + auto result = numberparsing::parse_double_in_string(tmpbuf); + if(result.error() != INCORRECT_TYPE) { advance_root_scalar("double"); } + return result; } simdjson_warn_unused simdjson_really_inline simdjson_result value_iterator::get_root_bool() noexcept { auto max_len = peek_start_length(); - auto json = advance_root_scalar("bool"); + auto json = peek_root_scalar("bool"); uint8_t tmpbuf[5+1]; if (!_json_iter->copy_to_buffer(json, max_len, tmpbuf)) { return incorrect_type_error("Not a boolean"); } + advance_root_scalar("bool"); return parse_bool(tmpbuf); } simdjson_really_inline bool value_iterator::is_root_null() noexcept { auto max_len = peek_start_length(); - auto json = advance_root_scalar("null"); - return max_len >= 4 && !atomparsing::str4ncmp(json, "null") && - (max_len == 4 || jsoncharutils::is_structural_or_whitespace(json[5])); + auto json = peek_root_scalar("null"); + auto result = (max_len >= 4 && !atomparsing::str4ncmp(json, "null") && + (max_len == 4 || jsoncharutils::is_structural_or_whitespace(json[5]))); + if(result) { advance_root_scalar("null"); } + return result; } simdjson_warn_unused simdjson_really_inline error_code value_iterator::skip_child() noexcept { - SIMDJSON_ASSUME( _json_iter->token.index > _start_position ); + SIMDJSON_ASSUME( _json_iter->token._position > _start_position ); SIMDJSON_ASSUME( _json_iter->_depth >= _depth ); return _json_iter->skip_child(depth()); @@ -22398,17 +25762,17 @@ simdjson_really_inline bool value_iterator::is_open() const noexcept { } SIMDJSON_POP_DISABLE_WARNINGS -simdjson_really_inline bool value_iterator::at_eof() const noexcept { - return _json_iter->at_eof(); +simdjson_really_inline bool value_iterator::at_end() const noexcept { + return _json_iter->at_end(); } simdjson_really_inline bool 
value_iterator::at_start() const noexcept { - return _json_iter->token.index == _start_position; + return _json_iter->token.position() == start_position(); } simdjson_really_inline bool value_iterator::at_first_field() const noexcept { - SIMDJSON_ASSUME( _json_iter->token.index > _start_position ); - return _json_iter->token.index == _start_position + 1; + SIMDJSON_ASSUME( _json_iter->token._position > _start_position ); + return _json_iter->token.position() == start_position() + 1; } simdjson_really_inline void value_iterator::abandon() noexcept { @@ -22432,111 +25796,169 @@ simdjson_warn_unused simdjson_really_inline json_iterator &value_iterator::json_ } simdjson_really_inline const uint8_t *value_iterator::peek_start() const noexcept { - return _json_iter->peek(_start_position); + return _json_iter->peek(start_position()); } simdjson_really_inline uint32_t value_iterator::peek_start_length() const noexcept { - return _json_iter->peek_length(_start_position); + return _json_iter->peek_length(start_position()); } -simdjson_really_inline const uint8_t *value_iterator::advance_start(const char *type) const noexcept { - logger::log_value(*_json_iter, _start_position, depth(), type); +simdjson_really_inline const uint8_t *value_iterator::peek_scalar(const char *type) noexcept { + logger::log_value(*_json_iter, start_position(), depth(), type); // If we're not at the position anymore, we don't want to advance the cursor. if (!is_at_start()) { return peek_start(); } // Get the JSON and advance the cursor, decreasing depth to signify that we have retrieved the value. 
assert_at_start(); - auto result = _json_iter->advance(); - _json_iter->ascend_to(depth()-1); - return result; + return _json_iter->peek(); } -simdjson_really_inline error_code value_iterator::advance_container_start(const char *type, const uint8_t *&json) const noexcept { - logger::log_start_value(*_json_iter, _start_position, depth(), type); +simdjson_really_inline void value_iterator::advance_scalar(const char *type) noexcept { + logger::log_value(*_json_iter, start_position(), depth(), type); // If we're not at the position anymore, we don't want to advance the cursor. + if (!is_at_start()) { return; } + + // Get the JSON and advance the cursor, decreasing depth to signify that we have retrieved the value. + assert_at_start(); + _json_iter->return_current_and_advance(); + _json_iter->ascend_to(depth()-1); +} + +simdjson_really_inline error_code value_iterator::start_container(uint8_t start_char, const char *incorrect_type_message, const char *type) noexcept { + logger::log_start_value(*_json_iter, start_position(), depth(), type); + // If we're not at the position anymore, we don't want to advance the cursor. + const uint8_t *json; if (!is_at_start()) { #ifdef SIMDJSON_DEVELOPMENT_CHECKS if (!is_at_iterator_start()) { return OUT_OF_ORDER_ITERATION; } #endif json = peek_start(); - return SUCCESS; + if (*json != start_char) { return incorrect_type_error(incorrect_type_message); } + } else { + assert_at_start(); + /** + * We should be prudent. Let us peek. If it is not the right type, we + * return an error. Only once we have determined that we have the right + * type are we allowed to advance! + */ + json = _json_iter->peek(); + if (*json != start_char) { return incorrect_type_error(incorrect_type_message); } + _json_iter->return_current_and_advance(); } - // Get the JSON and advance the cursor, decreasing depth to signify that we have retrieved the value. 
- assert_at_start(); - json = _json_iter->advance(); + return SUCCESS; } -simdjson_really_inline const uint8_t *value_iterator::advance_root_scalar(const char *type) const noexcept { - logger::log_value(*_json_iter, _start_position, depth(), type); + + +simdjson_really_inline const uint8_t *value_iterator::peek_root_scalar(const char *type) noexcept { + logger::log_value(*_json_iter, start_position(), depth(), type); if (!is_at_start()) { return peek_start(); } assert_at_root(); - auto result = _json_iter->advance(); - _json_iter->ascend_to(depth()-1); - return result; + return _json_iter->peek(); } -simdjson_really_inline const uint8_t *value_iterator::advance_non_root_scalar(const char *type) const noexcept { - logger::log_value(*_json_iter, _start_position, depth(), type); +simdjson_really_inline const uint8_t *value_iterator::peek_non_root_scalar(const char *type) noexcept { + logger::log_value(*_json_iter, start_position(), depth(), type); if (!is_at_start()) { return peek_start(); } assert_at_non_root_start(); - auto result = _json_iter->advance(); + return _json_iter->peek(); +} + +simdjson_really_inline void value_iterator::advance_root_scalar(const char *type) noexcept { + logger::log_value(*_json_iter, start_position(), depth(), type); + if (!is_at_start()) { return; } + + assert_at_root(); + _json_iter->return_current_and_advance(); + _json_iter->ascend_to(depth()-1); +} +simdjson_really_inline void value_iterator::advance_non_root_scalar(const char *type) noexcept { + logger::log_value(*_json_iter, start_position(), depth(), type); + if (!is_at_start()) { return; } + + assert_at_non_root_start(); + _json_iter->return_current_and_advance(); _json_iter->ascend_to(depth()-1); - return result; } simdjson_really_inline error_code value_iterator::incorrect_type_error(const char *message) const noexcept { - logger::log_error(*_json_iter, _start_position, depth(), message); + logger::log_error(*_json_iter, start_position(), depth(), message); return 
INCORRECT_TYPE; } simdjson_really_inline bool value_iterator::is_at_start() const noexcept { - return _json_iter->token.index == _start_position; + return position() == start_position(); } -simdjson_really_inline bool value_iterator::is_at_container_start() const noexcept { - return _json_iter->token.index == _start_position + 1; + +simdjson_really_inline bool value_iterator::is_at_key() const noexcept { + // Keys are at the same depth as the object. + // Note here that we could be safer and check that we are within an object, + // but we do not. + return _depth == _json_iter->_depth && *_json_iter->peek() == '"'; } + simdjson_really_inline bool value_iterator::is_at_iterator_start() const noexcept { // We can legitimately be either at the first value ([1]), or after the array if it's empty ([]). - auto delta = _json_iter->token.index - _start_position; + auto delta = position() - start_position(); return delta == 1 || delta == 2; } -simdjson_really_inline void value_iterator::assert_at_start() const noexcept { - SIMDJSON_ASSUME( _json_iter->token.index == _start_position ); +inline void value_iterator::assert_at_start() const noexcept { + SIMDJSON_ASSUME( _json_iter->token._position == _start_position ); SIMDJSON_ASSUME( _json_iter->_depth == _depth ); SIMDJSON_ASSUME( _depth > 0 ); } -simdjson_really_inline void value_iterator::assert_at_container_start() const noexcept { - SIMDJSON_ASSUME( _json_iter->token.index == _start_position + 1 ); +inline void value_iterator::assert_at_container_start() const noexcept { + SIMDJSON_ASSUME( _json_iter->token._position == _start_position + 1 ); SIMDJSON_ASSUME( _json_iter->_depth == _depth ); SIMDJSON_ASSUME( _depth > 0 ); } -simdjson_really_inline void value_iterator::assert_at_next() const noexcept { - SIMDJSON_ASSUME( _json_iter->token.index > _start_position ); +inline void value_iterator::assert_at_next() const noexcept { + SIMDJSON_ASSUME( _json_iter->token._position > _start_position ); SIMDJSON_ASSUME( 
_json_iter->_depth == _depth ); SIMDJSON_ASSUME( _depth > 0 ); } -simdjson_really_inline void value_iterator::assert_at_child() const noexcept { - SIMDJSON_ASSUME( _json_iter->token.index > _start_position ); +simdjson_really_inline void value_iterator::move_at_start() noexcept { + _json_iter->_depth = _depth; + _json_iter->token.set_position(_start_position); +} + +simdjson_really_inline void value_iterator::move_at_container_start() noexcept { + _json_iter->_depth = _depth; + _json_iter->token.set_position(_start_position + 1); +} + +simdjson_really_inline simdjson_result value_iterator::reset_array() noexcept { + move_at_container_start(); + return started_array(); +} + +simdjson_really_inline simdjson_result value_iterator::reset_object() noexcept { + move_at_container_start(); + return started_object(); +} + +inline void value_iterator::assert_at_child() const noexcept { + SIMDJSON_ASSUME( _json_iter->token._position > _start_position ); SIMDJSON_ASSUME( _json_iter->_depth == _depth + 1 ); SIMDJSON_ASSUME( _depth > 0 ); } -simdjson_really_inline void value_iterator::assert_at_root() const noexcept { +inline void value_iterator::assert_at_root() const noexcept { assert_at_start(); SIMDJSON_ASSUME( _depth == 1 ); } -simdjson_really_inline void value_iterator::assert_at_non_root_start() const noexcept { +inline void value_iterator::assert_at_non_root_start() const noexcept { assert_at_start(); SIMDJSON_ASSUME( _depth > 1 ); } -simdjson_really_inline void value_iterator::assert_is_valid() const noexcept { +inline void value_iterator::assert_is_valid() const noexcept { SIMDJSON_ASSUME( _json_iter != nullptr ); } @@ -22544,8 +25966,7 @@ simdjson_really_inline bool value_iterator::is_valid() const noexcept { return _json_iter != nullptr; } - -simdjson_really_inline simdjson_result value_iterator::type() noexcept { +simdjson_really_inline simdjson_result value_iterator::type() const noexcept { switch (*peek_start()) { case '{': return json_type::object; @@ -22566,6 
+25987,26 @@ simdjson_really_inline simdjson_result value_iterator::type() noexcep } } +simdjson_really_inline token_position value_iterator::start_position() const noexcept { + return _start_position; +} + +simdjson_really_inline token_position value_iterator::position() const noexcept { + return _json_iter->position(); +} + +simdjson_really_inline token_position value_iterator::end_position() const noexcept { + return _json_iter->end_position(); +} + +simdjson_really_inline token_position value_iterator::last_position() const noexcept { + return _json_iter->last_position(); +} + +simdjson_really_inline error_code value_iterator::report_error(error_code error, const char *message) noexcept { + return _json_iter->report_error(error, message); +} + } // namespace ondemand } // namespace SIMDJSON_BUILTIN_IMPLEMENTATION } // namespace simdjson @@ -22602,9 +26043,9 @@ simdjson_really_inline array_iterator &array_iterator::operator++() noexcept { error_code error; // PERF NOTE this is a safety rail ... users should exit loops as soon as they receive an error, so we'll never get here. // However, it does not seem to make a perf difference, so we add it out of an abundance of caution. 
- if ((error = iter.error()) ) { return *this; } - if ((error = iter.skip_child() )) { return *this; } - if ((error = iter.has_next_element().error() )) { return *this; } + if (( error = iter.error() )) { return *this; } + if (( error = iter.skip_child() )) { return *this; } + if (( error = iter.has_next_element().error() )) { return *this; } return *this; } @@ -22832,8 +26273,9 @@ simdjson_really_inline simdjson_result array::start_root(value_iterator & SIMDJSON_TRY( iter.start_root_array().get(has_value) ); return array(iter); } -simdjson_really_inline array array::started(value_iterator &iter) noexcept { - simdjson_unused bool has_value = iter.started_array(); +simdjson_really_inline simdjson_result array::started(value_iterator &iter) noexcept { + bool has_value; + SIMDJSON_TRY(iter.started_array().get(has_value)); return array(iter); } @@ -22846,6 +26288,79 @@ simdjson_really_inline simdjson_result array::begin() noexcept { simdjson_really_inline simdjson_result array::end() noexcept { return array_iterator(iter); } +simdjson_really_inline error_code array::consume() noexcept { + auto error = iter.json_iter().skip_child(iter.depth()-1); + if(error) { iter.abandon(); } + return error; +} + +simdjson_really_inline simdjson_result array::raw_json() noexcept { + const uint8_t * starting_point{iter.peek_start()}; + auto error = consume(); + if(error) { return error; } + // After 'consume()', we could be left pointing just beyond the document, but that + // is ok because we are not going to dereference the final pointer position, we just + // use it to compute the length in bytes. + const uint8_t * final_point{iter._json_iter->unsafe_pointer()}; + return std::string_view(reinterpret_cast(starting_point), size_t(final_point - starting_point)); +} + +simdjson_really_inline simdjson_result array::count_elements() & noexcept { + size_t count{0}; + // Important: we do not consume any of the values. 
+ for(simdjson_unused auto v : *this) { count++; } + // The above loop will always succeed, but we want to report errors. + if(iter.error()) { return iter.error(); } + // We need to move back at the start because we expect users to iterate through + // the array after counting the number of elements. + iter.reset_array(); + return count; +} + +inline simdjson_result array::at_pointer(std::string_view json_pointer) noexcept { + if (json_pointer[0] != '/') { return INVALID_JSON_POINTER; } + json_pointer = json_pointer.substr(1); + // - means "the append position" or "the element after the end of the array" + // We don't support this, because we're returning a real element, not a position. + if (json_pointer == "-") { return INDEX_OUT_OF_BOUNDS; } + + // Read the array index + size_t array_index = 0; + size_t i; + for (i = 0; i < json_pointer.length() && json_pointer[i] != '/'; i++) { + uint8_t digit = uint8_t(json_pointer[i] - '0'); + // Check for non-digit in array index. If it's there, we're trying to get a field in an object + if (digit > 9) { return INCORRECT_TYPE; } + array_index = array_index*10 + digit; + } + + // 0 followed by other digits is invalid + if (i > 1 && json_pointer[0] == '0') { return INVALID_JSON_POINTER; } // "JSON pointer array index has other characters after 0" + + // Empty string is invalid; so is a "/" with no digits before it + if (i == 0) { return INVALID_JSON_POINTER; } // "Empty string in JSON pointer array index" + // Get the child + auto child = at(array_index); + // If there is an error, it ends here + if(child.error()) { + return child; + } + + // If there is a /, we're not done yet, call recursively. 
+ if (i < json_pointer.length()) { + child = child.at_pointer(json_pointer.substr(i)); + } + return child; +} + +simdjson_really_inline simdjson_result array::at(size_t index) noexcept { + size_t i = 0; + for (auto value : *this) { + if (i == index) { return value; } + i++; + } + return INDEX_OUT_OF_BOUNDS; +} } // namespace ondemand } // namespace SIMDJSON_BUILTIN_IMPLEMENTATION @@ -22876,7 +26391,18 @@ simdjson_really_inline simdjson_result simdjson_result::count_elements() & noexcept { + if (error()) { return error(); } + return first.count_elements(); +} +simdjson_really_inline simdjson_result simdjson_result::at(size_t index) noexcept { + if (error()) { return error(); } + return first.at(index); +} +simdjson_really_inline simdjson_result simdjson_result::at_pointer(std::string_view json_pointer) noexcept { + if (error()) { return error(); } + return first.at_pointer(json_pointer); +} } // namespace simdjson /* end file include/simdjson/generic/ondemand/array-inl.h */ /* begin file include/simdjson/generic/ondemand/document-inl.h */ @@ -22894,14 +26420,46 @@ simdjson_really_inline document document::start(json_iterator &&iter) noexcept { return document(std::forward(iter)); } +inline void document::rewind() noexcept { + iter.rewind(); +} + +inline std::string document::to_debug_string() noexcept { + return iter.to_string(); +} + simdjson_really_inline value_iterator document::resume_value_iterator() noexcept { - return value_iterator(&iter, 1, iter.root_checkpoint()); + return value_iterator(&iter, 1, iter.root_position()); } simdjson_really_inline value_iterator document::get_root_value_iterator() noexcept { return resume_value_iterator(); } -simdjson_really_inline value document::resume_value() noexcept { - return resume_value_iterator(); +simdjson_really_inline simdjson_result document::start_or_resume_object() noexcept { + if (iter.at_root()) { + return get_object(); + } else { + return object::resume(resume_value_iterator()); + } +} 
+simdjson_really_inline simdjson_result document::get_value_unsafe() noexcept { + // Make sure we start any arrays or objects before returning, so that start_root_() + // gets called. + switch (*iter.peek()) { + case '[': { + array result; + SIMDJSON_TRY( get_array().get(result) ); + return value(result.iter); + } + case '{': { + object result; + SIMDJSON_TRY( get_object().get(result) ); + return value(result.iter); + } + default: + // TODO it is still wrong to convert this to a value! get_root_bool / etc. will not be + // called if you do this. + return value(get_root_value_iterator()); + } } simdjson_really_inline simdjson_result document::get_array() & noexcept { auto value = get_root_value_iterator(); @@ -22914,12 +26472,21 @@ simdjson_really_inline simdjson_result document::get_object() & noexcept simdjson_really_inline simdjson_result document::get_uint64() noexcept { return get_root_value_iterator().get_root_uint64(); } +simdjson_really_inline simdjson_result document::get_uint64_in_string() noexcept { + return get_root_value_iterator().get_root_uint64_in_string(); +} simdjson_really_inline simdjson_result document::get_int64() noexcept { return get_root_value_iterator().get_root_int64(); } +simdjson_really_inline simdjson_result document::get_int64_in_string() noexcept { + return get_root_value_iterator().get_root_int64_in_string(); +} simdjson_really_inline simdjson_result document::get_double() noexcept { return get_root_value_iterator().get_root_double(); } +simdjson_really_inline simdjson_result document::get_double_in_string() noexcept { + return get_root_value_iterator().get_root_double_in_string(); +} simdjson_really_inline simdjson_result document::get_string() noexcept { return get_root_value_iterator().get_root_string(); } @@ -22966,7 +26533,17 @@ simdjson_really_inline document::operator std::string_view() noexcept(false) { r simdjson_really_inline document::operator raw_json_string() noexcept(false) { return get_raw_json_string(); } 
simdjson_really_inline document::operator bool() noexcept(false) { return get_bool(); } #endif - +simdjson_really_inline simdjson_result document::count_elements() & noexcept { + auto a = get_array(); + simdjson_result answer = a.count_elements(); + /* If there was an array, we are now left pointing at its first element. */ + if(answer.error() == SUCCESS) { iter._depth -= 1 ; /* undoing the increment so we go back at the doc depth.*/ } + return answer; +} +simdjson_really_inline simdjson_result document::at(size_t index) & noexcept { + auto a = get_array(); + return a.at(index); +} simdjson_really_inline simdjson_result document::begin() & noexcept { return get_array().begin(); } @@ -22975,22 +26552,40 @@ simdjson_really_inline simdjson_result document::end() & noexcep } simdjson_really_inline simdjson_result document::find_field(std::string_view key) & noexcept { - return resume_value().find_field(key); + return start_or_resume_object().find_field(key); } simdjson_really_inline simdjson_result document::find_field(const char *key) & noexcept { - return resume_value().find_field(key); + return start_or_resume_object().find_field(key); } simdjson_really_inline simdjson_result document::find_field_unordered(std::string_view key) & noexcept { - return resume_value().find_field_unordered(key); + return start_or_resume_object().find_field_unordered(key); } simdjson_really_inline simdjson_result document::find_field_unordered(const char *key) & noexcept { - return resume_value().find_field_unordered(key); + return start_or_resume_object().find_field_unordered(key); } simdjson_really_inline simdjson_result document::operator[](std::string_view key) & noexcept { - return resume_value()[key]; + return start_or_resume_object()[key]; } simdjson_really_inline simdjson_result document::operator[](const char *key) & noexcept { - return resume_value()[key]; + return start_or_resume_object()[key]; +} + +simdjson_really_inline error_code document::consume() noexcept { + auto error 
= iter.skip_child(0); + if(error) { iter.abandon(); } + return error; +} + +simdjson_really_inline simdjson_result document::raw_json() noexcept { + auto _iter = get_root_value_iterator(); + const uint8_t * starting_point{_iter.peek_start()}; + auto error = consume(); + if(error) { return error; } + // After 'consume()', we could be left pointing just beyond the document, but that + // is ok because we are not going to dereference the final pointer position, we just + // use it to compute the length in bytes. + const uint8_t * final_point{iter.unsafe_pointer()}; + return std::string_view(reinterpret_cast(starting_point), size_t(final_point - starting_point)); } simdjson_really_inline simdjson_result document::type() noexcept { @@ -23002,6 +26597,24 @@ simdjson_really_inline simdjson_result document::raw_json_toke return std::string_view(reinterpret_cast(_iter.peek_start()), _iter.peek_start_length()); } +simdjson_really_inline simdjson_result document::at_pointer(std::string_view json_pointer) noexcept { + rewind(); // Rewind the document each time at_pointer is called + if (json_pointer.empty()) { + return this->get_value_unsafe(); + } + json_type t; + SIMDJSON_TRY(type().get(t)); + switch (t) + { + case json_type::array: + return (*this).get_array().at_pointer(json_pointer); + case json_type::object: + return (*this).get_object().at_pointer(json_pointer); + default: + return INVALID_JSON_POINTER; + } +} + } // namespace ondemand } // namespace SIMDJSON_BUILTIN_IMPLEMENTATION } // namespace simdjson @@ -23024,7 +26637,19 @@ simdjson_really_inline simdjson_result simdjson_result::count_elements() & noexcept { + if (error()) { return error(); } + return first.count_elements(); +} +simdjson_really_inline simdjson_result simdjson_result::at(size_t index) & noexcept { + if (error()) { return error(); } + return first.at(index); +} +simdjson_really_inline error_code simdjson_result::rewind() noexcept { + if (error()) { return error(); } + first.rewind(); + return 
SUCCESS; +} simdjson_really_inline simdjson_result simdjson_result::begin() & noexcept { if (error()) { return error(); } return first.begin(); @@ -23171,6 +26796,201 @@ simdjson_really_inline simdjson_result simdjson_result simdjson_result::at_pointer(std::string_view json_pointer) noexcept { + if (error()) { return error(); } + return first.at_pointer(json_pointer); +} + + +} // namespace simdjson + + +namespace simdjson { +namespace SIMDJSON_BUILTIN_IMPLEMENTATION { +namespace ondemand { + +simdjson_really_inline document_reference::document_reference() noexcept : doc{nullptr} {} +simdjson_really_inline document_reference::document_reference(document &d) noexcept : doc(&d) {} +simdjson_really_inline void document_reference::rewind() noexcept { doc->rewind(); } +simdjson_really_inline simdjson_result document_reference::get_array() & noexcept { return doc->get_array(); } +simdjson_really_inline simdjson_result document_reference::get_object() & noexcept { return doc->get_object(); } +simdjson_really_inline simdjson_result document_reference::get_uint64() noexcept { return doc->get_uint64(); } +simdjson_really_inline simdjson_result document_reference::get_int64() noexcept { return doc->get_int64(); } +simdjson_really_inline simdjson_result document_reference::get_double() noexcept { return doc->get_double(); } +simdjson_really_inline simdjson_result document_reference::get_string() noexcept { return doc->get_string(); } +simdjson_really_inline simdjson_result document_reference::get_raw_json_string() noexcept { return doc->get_raw_json_string(); } +simdjson_really_inline simdjson_result document_reference::get_bool() noexcept { return doc->get_bool(); } +simdjson_really_inline bool document_reference::is_null() noexcept { return doc->is_null(); } + +#if SIMDJSON_EXCEPTIONS +simdjson_really_inline document_reference::operator array() & noexcept(false) { return array(*doc); } +simdjson_really_inline document_reference::operator object() & noexcept(false) { return 
object(*doc); } +simdjson_really_inline document_reference::operator uint64_t() noexcept(false) { return uint64_t(*doc); } +simdjson_really_inline document_reference::operator int64_t() noexcept(false) { return int64_t(*doc); } +simdjson_really_inline document_reference::operator double() noexcept(false) { return double(*doc); } +simdjson_really_inline document_reference::operator std::string_view() noexcept(false) { return std::string_view(*doc); } +simdjson_really_inline document_reference::operator raw_json_string() noexcept(false) { return raw_json_string(*doc); } +simdjson_really_inline document_reference::operator bool() noexcept(false) { return bool(*doc); } +#endif +simdjson_really_inline simdjson_result document_reference::count_elements() & noexcept { return doc->count_elements(); } +simdjson_really_inline simdjson_result document_reference::at(size_t index) & noexcept { return doc->at(index); } +simdjson_really_inline simdjson_result document_reference::begin() & noexcept { return doc->begin(); } +simdjson_really_inline simdjson_result document_reference::end() & noexcept { return doc->end(); } +simdjson_really_inline simdjson_result document_reference::find_field(std::string_view key) & noexcept { return doc->find_field(key); } +simdjson_really_inline simdjson_result document_reference::find_field(const char *key) & noexcept { return doc->find_field(key); } +simdjson_really_inline simdjson_result document_reference::operator[](std::string_view key) & noexcept { return (*doc)[key]; } +simdjson_really_inline simdjson_result document_reference::operator[](const char *key) & noexcept { return (*doc)[key]; } +simdjson_really_inline simdjson_result document_reference::find_field_unordered(std::string_view key) & noexcept { return doc->find_field_unordered(key); } +simdjson_really_inline simdjson_result document_reference::find_field_unordered(const char *key) & noexcept { return doc->find_field_unordered(key); } + +simdjson_really_inline simdjson_result 
document_reference::type() noexcept { return doc->type(); } +simdjson_really_inline simdjson_result document_reference::raw_json_token() noexcept { return doc->raw_json_token(); } +simdjson_really_inline simdjson_result document_reference::at_pointer(std::string_view json_pointer) noexcept { return doc->at_pointer(json_pointer); } +simdjson_really_inline simdjson_result document_reference::raw_json() noexcept { return doc->raw_json();} +simdjson_really_inline document_reference::operator document&() const noexcept { return *doc; } + +} // namespace ondemand +} // namespace SIMDJSON_BUILTIN_IMPLEMENTATION +} // namespace simdjson + + + +namespace simdjson { +simdjson_really_inline simdjson_result::simdjson_result(SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::document_reference value, error_code error) + noexcept : implementation_simdjson_result_base(std::forward(value), error) {} + + +simdjson_really_inline simdjson_result simdjson_result::count_elements() & noexcept { + if (error()) { return error(); } + return first.count_elements(); +} +simdjson_really_inline simdjson_result simdjson_result::at(size_t index) & noexcept { + if (error()) { return error(); } + return first.at(index); +} +simdjson_really_inline error_code simdjson_result::rewind() noexcept { + if (error()) { return error(); } + first.rewind(); + return SUCCESS; +} +simdjson_really_inline simdjson_result simdjson_result::begin() & noexcept { + if (error()) { return error(); } + return first.begin(); +} +simdjson_really_inline simdjson_result simdjson_result::end() & noexcept { + return {}; +} +simdjson_really_inline simdjson_result simdjson_result::find_field_unordered(std::string_view key) & noexcept { + if (error()) { return error(); } + return first.find_field_unordered(key); +} +simdjson_really_inline simdjson_result simdjson_result::find_field_unordered(const char *key) & noexcept { + if (error()) { return error(); } + return first.find_field_unordered(key); +} +simdjson_really_inline 
simdjson_result simdjson_result::operator[](std::string_view key) & noexcept { + if (error()) { return error(); } + return first[key]; +} +simdjson_really_inline simdjson_result simdjson_result::operator[](const char *key) & noexcept { + if (error()) { return error(); } + return first[key]; +} +simdjson_really_inline simdjson_result simdjson_result::find_field(std::string_view key) & noexcept { + if (error()) { return error(); } + return first.find_field(key); +} +simdjson_really_inline simdjson_result simdjson_result::find_field(const char *key) & noexcept { + if (error()) { return error(); } + return first.find_field(key); +} +simdjson_really_inline simdjson_result simdjson_result::get_array() & noexcept { + if (error()) { return error(); } + return first.get_array(); +} +simdjson_really_inline simdjson_result simdjson_result::get_object() & noexcept { + if (error()) { return error(); } + return first.get_object(); +} +simdjson_really_inline simdjson_result simdjson_result::get_uint64() noexcept { + if (error()) { return error(); } + return first.get_uint64(); +} +simdjson_really_inline simdjson_result simdjson_result::get_int64() noexcept { + if (error()) { return error(); } + return first.get_int64(); +} +simdjson_really_inline simdjson_result simdjson_result::get_double() noexcept { + if (error()) { return error(); } + return first.get_double(); +} +simdjson_really_inline simdjson_result simdjson_result::get_string() noexcept { + if (error()) { return error(); } + return first.get_string(); +} +simdjson_really_inline simdjson_result simdjson_result::get_raw_json_string() noexcept { + if (error()) { return error(); } + return first.get_raw_json_string(); +} +simdjson_really_inline simdjson_result simdjson_result::get_bool() noexcept { + if (error()) { return error(); } + return first.get_bool(); +} +simdjson_really_inline bool simdjson_result::is_null() noexcept { + if (error()) { return error(); } + return first.is_null(); +} +simdjson_really_inline 
simdjson_result simdjson_result::type() noexcept { + if (error()) { return error(); } + return first.type(); +} + +#if SIMDJSON_EXCEPTIONS +simdjson_really_inline simdjson_result::operator SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::array() & noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return first; +} +simdjson_really_inline simdjson_result::operator SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::object() & noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return first; +} +simdjson_really_inline simdjson_result::operator uint64_t() noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return first; +} +simdjson_really_inline simdjson_result::operator int64_t() noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return first; +} +simdjson_really_inline simdjson_result::operator double() noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return first; +} +simdjson_really_inline simdjson_result::operator std::string_view() noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return first; +} +simdjson_really_inline simdjson_result::operator SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::raw_json_string() noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return first; +} +simdjson_really_inline simdjson_result::operator bool() noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return first; +} +#endif + +simdjson_really_inline simdjson_result simdjson_result::raw_json_token() noexcept { + if (error()) { return error(); } + return first.raw_json_token(); +} + +simdjson_really_inline simdjson_result simdjson_result::at_pointer(std::string_view json_pointer) noexcept { + if (error()) { return error(); } + return first.at_pointer(json_pointer); +} + + } // namespace simdjson /* end file include/simdjson/generic/ondemand/document-inl.h */ /* begin file include/simdjson/generic/ondemand/value-inl.h */ @@ -23212,12 +27032,21 @@ 
simdjson_really_inline simdjson_result value::get_string() noe simdjson_really_inline simdjson_result value::get_double() noexcept { return iter.get_double(); } +simdjson_really_inline simdjson_result value::get_double_in_string() noexcept { + return iter.get_double_in_string(); +} simdjson_really_inline simdjson_result value::get_uint64() noexcept { return iter.get_uint64(); } +simdjson_really_inline simdjson_result value::get_uint64_in_string() noexcept { + return iter.get_uint64_in_string(); +} simdjson_really_inline simdjson_result value::get_int64() noexcept { return iter.get_int64(); } +simdjson_really_inline simdjson_result value::get_int64_in_string() noexcept { + return iter.get_int64_in_string(); +} simdjson_really_inline simdjson_result value::get_bool() noexcept { return iter.get_bool(); } @@ -23271,6 +27100,20 @@ simdjson_really_inline simdjson_result value::begin() & noexcept simdjson_really_inline simdjson_result value::end() & noexcept { return {}; } +simdjson_really_inline simdjson_result value::count_elements() & noexcept { + simdjson_result answer; + auto a = get_array(); + answer = a.count_elements(); + // count_elements leaves you pointing inside the array, at the first element. + // We need to move back so that the user can create a new array (which requires that + // we point at '['). 
+ iter.move_at_start(); + return answer; +} +simdjson_really_inline simdjson_result value::at(size_t index) noexcept { + auto a = get_array(); + return a.at(index); +} simdjson_really_inline simdjson_result value::find_field(std::string_view key) noexcept { return start_or_resume_object().find_field(key); @@ -23301,6 +27144,20 @@ simdjson_really_inline std::string_view value::raw_json_token() noexcept { return std::string_view(reinterpret_cast(iter.peek_start()), iter.peek_start_length()); } +simdjson_really_inline simdjson_result value::at_pointer(std::string_view json_pointer) noexcept { + json_type t; + SIMDJSON_TRY(type().get(t)); + switch (t) + { + case json_type::array: + return (*this).get_array().at_pointer(json_pointer); + case json_type::object: + return (*this).get_object().at_pointer(json_pointer); + default: + return INVALID_JSON_POINTER; + } +} + } // namespace ondemand } // namespace SIMDJSON_BUILTIN_IMPLEMENTATION } // namespace simdjson @@ -23321,7 +27178,14 @@ simdjson_really_inline simdjson_result(error) { } - +simdjson_really_inline simdjson_result simdjson_result::count_elements() & noexcept { + if (error()) { return error(); } + return first.count_elements(); +} +simdjson_really_inline simdjson_result simdjson_result::at(size_t index) noexcept { + if (error()) { return error(); } + return first.at(index); +} simdjson_really_inline simdjson_result simdjson_result::begin() & noexcept { if (error()) { return error(); } return first.begin(); @@ -23370,14 +27234,26 @@ simdjson_really_inline simdjson_result simdjson_result simdjson_result::get_uint64_in_string() noexcept { + if (error()) { return error(); } + return first.get_uint64_in_string(); +} simdjson_really_inline simdjson_result simdjson_result::get_int64() noexcept { if (error()) { return error(); } return first.get_int64(); } +simdjson_really_inline simdjson_result simdjson_result::get_int64_in_string() noexcept { + if (error()) { return error(); } + return first.get_int64_in_string(); +} 
simdjson_really_inline simdjson_result simdjson_result::get_double() noexcept { if (error()) { return error(); } return first.get_double(); } +simdjson_really_inline simdjson_result simdjson_result::get_double_in_string() noexcept { + if (error()) { return error(); } + return first.get_double_in_string(); +} simdjson_really_inline simdjson_result simdjson_result::get_string() noexcept { if (error()) { return error(); } return first.get_string(); @@ -23459,6 +27335,11 @@ simdjson_really_inline simdjson_result simdjson_result simdjson_result::at_pointer(std::string_view json_pointer) noexcept { + if (error()) { return error(); } + return first.at_pointer(json_pointer); +} + } // namespace simdjson /* end file include/simdjson/generic/ondemand/value-inl.h */ /* begin file include/simdjson/generic/ondemand/field-inl.h */ @@ -23486,7 +27367,7 @@ simdjson_really_inline simdjson_result field::start(const value_iterator } simdjson_really_inline simdjson_warn_unused simdjson_result field::unescaped_key() noexcept { - SIMDJSON_ASSUME(first.buf != nullptr); // We would like to call .alive() by Visual Studio won't let us. + SIMDJSON_ASSUME(first.buf != nullptr); // We would like to call .alive() but Visual Studio won't let us. simdjson_result answer = first.unescape(second.iter.string_buf_loc()); first.consume(); return answer; @@ -23578,21 +27459,52 @@ simdjson_really_inline simdjson_result object::find_field(const std::stri } simdjson_really_inline simdjson_result object::start(value_iterator &iter) noexcept { - // We don't need to know if the object is empty to start iteration, but we do want to know if there - // is an error--thus `simdjson_unused`. 
- simdjson_unused bool has_value; - SIMDJSON_TRY( iter.start_object().get(has_value) ); + SIMDJSON_TRY( iter.start_object().error() ); return object(iter); } simdjson_really_inline simdjson_result object::start_root(value_iterator &iter) noexcept { - simdjson_unused bool has_value; - SIMDJSON_TRY( iter.start_root_object().get(has_value) ); + SIMDJSON_TRY( iter.start_root_object().error() ); return object(iter); } -simdjson_really_inline object object::started(value_iterator &iter) noexcept { - simdjson_unused bool has_value = iter.started_object(); - return iter; +simdjson_really_inline error_code object::consume() noexcept { + if(iter.is_at_key()) { + /** + * whenever you are pointing at a key, calling skip_child() is + * unsafe because you will hit a string and you will assume that + * it is string value, and this mistake will lead you to make bad + * depth computation. + */ + /** + * We want to 'consume' the key. We could really + * just do _json_iter->return_current_and_advance(); at this + * point, but, for clarity, we will use the high-level API to + * eat the key. We assume that the compiler optimizes away + * most of the work. + */ + simdjson_unused raw_json_string actual_key; + auto error = iter.field_key().get(actual_key); + if (error) { iter.abandon(); return error; }; + // Let us move to the value while we are at it. 
+ if ((error = iter.field_value())) { iter.abandon(); return error; } + } + auto error_skip = iter.json_iter().skip_child(iter.depth()-1); + if(error_skip) { iter.abandon(); } + return error_skip; } + +simdjson_really_inline simdjson_result object::raw_json() noexcept { + const uint8_t * starting_point{iter.peek_start()}; + auto error = consume(); + if(error) { return error; } + const uint8_t * final_point{iter._json_iter->peek(0)}; + return std::string_view(reinterpret_cast(starting_point), size_t(final_point - starting_point)); +} + +simdjson_really_inline simdjson_result object::started(value_iterator &iter) noexcept { + SIMDJSON_TRY( iter.started_object().error() ); + return object(iter); +} + simdjson_really_inline object object::resume(const value_iterator &iter) noexcept { return iter; } @@ -23612,6 +27524,46 @@ simdjson_really_inline simdjson_result object::end() noexcept { return object_iterator(iter); } +inline simdjson_result object::at_pointer(std::string_view json_pointer) noexcept { + if (json_pointer[0] != '/') { return INVALID_JSON_POINTER; } + json_pointer = json_pointer.substr(1); + size_t slash = json_pointer.find('/'); + std::string_view key = json_pointer.substr(0, slash); + // Grab the child with the given key + simdjson_result child; + + // If there is an escape character in the key, unescape it and then get the child. 
+ size_t escape = key.find('~'); + if (escape != std::string_view::npos) { + // Unescape the key + std::string unescaped(key); + do { + switch (unescaped[escape+1]) { + case '0': + unescaped.replace(escape, 2, "~"); + break; + case '1': + unescaped.replace(escape, 2, "/"); + break; + default: + return INVALID_JSON_POINTER; // "Unexpected ~ escape character in JSON pointer"); + } + escape = unescaped.find('~', escape+1); + } while (escape != std::string::npos); + child = find_field(unescaped); // Take note find_field does not unescape keys when matching + } else { + child = find_field(key); + } + if(child.error()) { + return child; // we do not continue if there was an error + } + // If there is a /, we have to recurse and look up more of the path + if (slash != std::string_view::npos) { + child = child.at_pointer(json_pointer.substr(slash)); + } + return child; +} + } // namespace ondemand } // namespace SIMDJSON_BUILTIN_IMPLEMENTATION } // namespace simdjson @@ -23656,6 +27608,11 @@ simdjson_really_inline simdjson_result(first).find_field(key); } +simdjson_really_inline simdjson_result simdjson_result::at_pointer(std::string_view json_pointer) noexcept { + if (error()) { return error(); } + return first.at_pointer(json_pointer); +} + } // namespace simdjson /* end file include/simdjson/generic/ondemand/object-inl.h */ /* begin file include/simdjson/generic/ondemand/parser-inl.h */ @@ -23663,7 +27620,12 @@ namespace simdjson { namespace SIMDJSON_BUILTIN_IMPLEMENTATION { namespace ondemand { +simdjson_really_inline parser::parser(size_t max_capacity) noexcept + : _max_capacity{max_capacity} { +} + simdjson_warn_unused simdjson_really_inline error_code parser::allocate(size_t new_capacity, size_t new_max_depth) noexcept { + if (new_capacity > max_capacity()) { return CAPACITY; } if (string_buf && new_capacity == capacity() && new_max_depth == max_depth()) { return SUCCESS; } // string_capacity copied from document::allocate @@ -23693,7 +27655,7 @@ 
simdjson_warn_unused simdjson_really_inline simdjson_result parser::it } // Run stage 1. - SIMDJSON_TRY( implementation->stage1(reinterpret_cast(json.data()), json.length(), false) ); + SIMDJSON_TRY( implementation->stage1(reinterpret_cast(json.data()), json.length(), stage1_mode::regular) ); return document::start({ reinterpret_cast(json.data()), this }); } @@ -23736,17 +27698,42 @@ simdjson_warn_unused simdjson_really_inline simdjson_result parse } // Run stage 1. - SIMDJSON_TRY( implementation->stage1(reinterpret_cast(json.data()), json.length(), false) ); + SIMDJSON_TRY( implementation->stage1(reinterpret_cast(json.data()), json.length(), stage1_mode::regular) ); return json_iterator(reinterpret_cast(json.data()), this); } +inline simdjson_result parser::iterate_many(const uint8_t *buf, size_t len, size_t batch_size) noexcept { + if(batch_size < MINIMAL_BATCH_SIZE) { batch_size = MINIMAL_BATCH_SIZE; } + return document_stream(*this, buf, len, batch_size); +} +inline simdjson_result parser::iterate_many(const char *buf, size_t len, size_t batch_size) noexcept { + return iterate_many(reinterpret_cast(buf), len, batch_size); +} +inline simdjson_result parser::iterate_many(const std::string &s, size_t batch_size) noexcept { + return iterate_many(s.data(), s.length(), batch_size); +} +inline simdjson_result parser::iterate_many(const padded_string &s, size_t batch_size) noexcept { + return iterate_many(s.data(), s.length(), batch_size); +} + simdjson_really_inline size_t parser::capacity() const noexcept { return _capacity; } +simdjson_really_inline size_t parser::max_capacity() const noexcept { + return _max_capacity; +} simdjson_really_inline size_t parser::max_depth() const noexcept { return _max_depth; } +simdjson_really_inline void parser::set_max_capacity(size_t max_capacity) noexcept { /* Sets the limit that allocate() checks new_capacity against; a request below the 32-byte floor is raised to the floor. */ + size_t MINIMAL_DOCUMENT_CAPACITY = 32; + if(max_capacity > MINIMAL_DOCUMENT_CAPACITY) { /* honor the caller's limit when it is above the floor */ + _max_capacity = max_capacity; + } else { + _max_capacity = 
MINIMAL_DOCUMENT_CAPACITY; + } +} } // namespace ondemand } // namespace SIMDJSON_BUILTIN_IMPLEMENTATION @@ -23761,6 +27748,635 @@ simdjson_really_inline simdjson_result +#include +#include +namespace simdjson { +namespace SIMDJSON_BUILTIN_IMPLEMENTATION { +namespace ondemand { + +#ifdef SIMDJSON_THREADS_ENABLED + +inline void stage1_worker::finish() { + // After calling "run" someone would call finish() to wait + // for the end of the processing. + // This function will wait until either the thread has done + // the processing or, else, the destructor has been called. + std::unique_lock lock(locking_mutex); + cond_var.wait(lock, [this]{return has_work == false;}); +} + +inline stage1_worker::~stage1_worker() { + // The thread may never outlive the stage1_worker instance + // and will always be stopped/joined before the stage1_worker + // instance is gone. + stop_thread(); +} + +inline void stage1_worker::start_thread() { + std::unique_lock lock(locking_mutex); + if(thread.joinable()) { + return; // This should never happen but we never want to create more than one thread. + } + thread = std::thread([this]{ + while(true) { + std::unique_lock thread_lock(locking_mutex); + // We wait for either "run" or "stop_thread" to be called. + cond_var.wait(thread_lock, [this]{return has_work || !can_work;}); + // If, for some reason, the stop_thread() method was called (i.e., the + // destructor of stage1_worker is called, then we want to immediately destroy + // the thread (and not do any more processing). + if(!can_work) { + break; + } + this->owner->stage1_thread_error = this->owner->run_stage1(*this->stage1_thread_parser, + this->_next_batch_start); + this->has_work = false; + // The condition variable call should be moved after thread_lock.unlock() for performance + // reasons but thread sanitizers may report it as a data race if we do. 
+ // See https://stackoverflow.com/questions/35775501/c-should-condition-variable-be-notified-under-lock + cond_var.notify_one(); // will notify "finish" + thread_lock.unlock(); + } + } + ); +} + + +inline void stage1_worker::stop_thread() { + std::unique_lock lock(locking_mutex); + // We have to make sure that all locks can be released. + can_work = false; + has_work = false; + cond_var.notify_all(); + lock.unlock(); + if(thread.joinable()) { + thread.join(); + } +} + +inline void stage1_worker::run(document_stream * ds, parser * stage1, size_t next_batch_start) { + std::unique_lock lock(locking_mutex); + owner = ds; + _next_batch_start = next_batch_start; + stage1_thread_parser = stage1; + has_work = true; + // The condition variable call should be moved after thread_lock.unlock() for performance + // reasons but thread sanitizers may report it as a data race if we do. + // See https://stackoverflow.com/questions/35775501/c-should-condition-variable-be-notified-under-lock + cond_var.notify_one(); // will notify the thread lock that we have work + lock.unlock(); +} + +#endif // SIMDJSON_THREADS_ENABLED + +simdjson_really_inline document_stream::document_stream( + ondemand::parser &_parser, + const uint8_t *_buf, + size_t _len, + size_t _batch_size +) noexcept + : parser{&_parser}, + buf{_buf}, + len{_len}, + batch_size{_batch_size <= MINIMAL_BATCH_SIZE ? 
MINIMAL_BATCH_SIZE : _batch_size}, + error{SUCCESS} + #ifdef SIMDJSON_THREADS_ENABLED + , use_thread(_parser.threaded) // we need to make a copy because _parser.threaded can change + #endif +{ +#ifdef SIMDJSON_THREADS_ENABLED + if(worker.get() == nullptr) { + error = MEMALLOC; + } +#endif +} + +simdjson_really_inline document_stream::document_stream() noexcept + : parser{nullptr}, + buf{nullptr}, + len{0}, + batch_size{0}, + error{UNINITIALIZED} + #ifdef SIMDJSON_THREADS_ENABLED + , use_thread(false) + #endif +{ +} + +simdjson_really_inline document_stream::~document_stream() noexcept +{ + #ifdef SIMDJSON_THREADS_ENABLED + worker.reset(); + #endif +} + +inline size_t document_stream::size_in_bytes() const noexcept { + return len; +} + +inline size_t document_stream::truncated_bytes() const noexcept { + if(error == CAPACITY) { return len - batch_start; } + return parser->implementation->structural_indexes[parser->implementation->n_structural_indexes] - parser->implementation->structural_indexes[parser->implementation->n_structural_indexes + 1]; +} + +simdjson_really_inline document_stream::iterator::iterator() noexcept + : stream{nullptr}, finished{true} { +} + +simdjson_really_inline document_stream::iterator::iterator(document_stream* _stream, bool is_end) noexcept + : stream{_stream}, finished{is_end} { +} + +simdjson_really_inline simdjson_result document_stream::iterator::operator*() noexcept { + //if(stream->error) { return stream->error; } + return simdjson_result(stream->doc, stream->error); +} + +simdjson_really_inline document_stream::iterator& document_stream::iterator::operator++() noexcept { + // If there is an error, then we want the iterator + // to be finished, no matter what. (E.g., we do not + // keep generating documents with errors, or go beyond + // a document with errors.) + // + // Users do not have to call "operator*()" when they use operator++, + // so we need to end the stream in the operator++ function. 
+ // + // Note that setting finished = true is essential otherwise + // we would enter an infinite loop. + if (stream->error) { finished = true; } + // Note that stream->next() is guarded against error conditions + // (it will immediately return if stream->error casts to true). + // In effect, this next function does nothing when (stream->error) + // is true (hence the risk of an infinite loop). + stream->next(); + // If that was the last document, we're finished. + // It is the only type of error we do not want to appear + // in operator*. + if (stream->error == EMPTY) { finished = true; } + // If we had any other kind of error (not EMPTY) then we want + // to pass it along to the operator* and we cannot mark the result + // as "finished" just yet. + return *this; +} + +simdjson_really_inline bool document_stream::iterator::operator!=(const document_stream::iterator &other) const noexcept { + return finished != other.finished; +} + +simdjson_really_inline document_stream::iterator document_stream::begin() noexcept { + start(); + // If there are no documents, we're finished. 
+ return iterator(this, error == EMPTY); +} + +simdjson_really_inline document_stream::iterator document_stream::end() noexcept { + return iterator(this, true); +} + +inline void document_stream::start() noexcept { + if (error) { return; } + error = parser->allocate(batch_size); + if (error) { return; } + // Always run the first stage 1 parse immediately + batch_start = 0; + error = run_stage1(*parser, batch_start); + while(error == EMPTY) { + // In exceptional cases, we may start with an empty block + batch_start = next_batch_start(); + if (batch_start >= len) { return; } + error = run_stage1(*parser, batch_start); + } + if (error) { return; } + doc_index = batch_start; + doc = document(json_iterator(&buf[batch_start], parser)); + doc.iter._streaming = true; + + #ifdef SIMDJSON_THREADS_ENABLED + if (use_thread && next_batch_start() < len) { + // Kick off the first thread on next batch if needed + error = stage1_thread_parser.allocate(batch_size); + if (error) { return; } + worker->start_thread(); + start_stage1_thread(); + if (error) { return; } + } + #endif // SIMDJSON_THREADS_ENABLED +} + +inline void document_stream::next() noexcept { + // We always exit at once, once in an error condition. + if (error) { return; } + next_document(); + if (error) { return; } + auto cur_struct_index = doc.iter._root - parser->implementation->structural_indexes.get(); + doc_index = batch_start + parser->implementation->structural_indexes[cur_struct_index]; + + // Check if at end of structural indexes (i.e. 
at end of batch) + if(cur_struct_index >= static_cast(parser->implementation->n_structural_indexes)) { + error = EMPTY; + // Load another batch (if available) + while (error == EMPTY) { + batch_start = next_batch_start(); + if (batch_start >= len) { break; } + #ifdef SIMDJSON_THREADS_ENABLED + if(use_thread) { + load_from_stage1_thread(); + } else { + error = run_stage1(*parser, batch_start); + } + #else + error = run_stage1(*parser, batch_start); + #endif + /** + * Whenever we move to another window, we need to update all pointers to make + * it appear as if the input buffer started at the beginning of the window. + * + * Take this input: + * + * {"z":5} {"1":1,"2":2,"4":4} [7, 10, 9] [15, 11, 12, 13] [154, 110, 112, 1311] + * + * Say you process the following window... + * + * '{"z":5} {"1":1,"2":2,"4":4} [7, 10, 9]' + * + * When you do so, the json_iterator has a pointer at the beginning of the memory region + * (pointing at the beginning of '{"z"...'. + * + * When you move to the window that starts at... + * + * '[7, 10, 9] [15, 11, 12, 13] ... + * + * then it is not sufficient to just run stage 1. You also need to re-anchor the + * json_iterator so that it believes we are starting at '[7, 10, 9]...'. + * + * Under the DOM front-end, this gets done automatically because the parser owns + * the pointer the data, and when you call stage1 and then stage2 on the same + * parser, then stage2 will run on the pointer acquired by stage1. + * + * That is, stage1 calls "this->buf = _buf" so the parser remembers the buffer that + * we used. But json_iterator has no callback when stage1 is called on the parser. + * In fact, I think that the parser is unaware of json_iterator. + * + * + * So we need to re-anchor the json_iterator after each call to stage 1 so that + * all of the pointers are in sync. + */ + doc.iter = json_iterator(&buf[batch_start], parser); + doc.iter._streaming = true; + /** + * End of resync. 
+ */ + + if (error) { continue; } // If the error was EMPTY, we may want to load another batch. + doc_index = batch_start; + } + } +} + +inline void document_stream::next_document() noexcept { + // Go to next place where depth=0 (document depth) + error = doc.iter.skip_child(0); + if (error) { return; } + // Always set depth=1 at the start of document + doc.iter._depth = 1; + // Resets the string buffer at the beginning, thus invalidating the strings. + doc.iter._string_buf_loc = parser->string_buf.get(); + doc.iter._root = doc.iter.position(); +} + +inline size_t document_stream::next_batch_start() const noexcept { + return batch_start + parser->implementation->structural_indexes[parser->implementation->n_structural_indexes]; +} + +inline error_code document_stream::run_stage1(ondemand::parser &p, size_t _batch_start) noexcept { + // This code only updates the structural index in the parser, it does not update any json_iterator + // instance. + size_t remaining = len - _batch_start; + if (remaining <= batch_size) { + return p.implementation->stage1(&buf[_batch_start], remaining, stage1_mode::streaming_final); + } else { + return p.implementation->stage1(&buf[_batch_start], batch_size, stage1_mode::streaming_partial); + } +} + +simdjson_really_inline size_t document_stream::iterator::current_index() const noexcept { + return stream->doc_index; +} + +simdjson_really_inline std::string_view document_stream::iterator::source() const noexcept { + auto depth = stream->doc.iter.depth(); + auto cur_struct_index = stream->doc.iter._root - stream->parser->implementation->structural_indexes.get(); + + // If at root, process the first token to determine if scalar value + if (stream->doc.iter.at_root()) { + switch (stream->buf[stream->batch_start + stream->parser->implementation->structural_indexes[cur_struct_index]]) { + case '{': case '[': // Depth=1 already at start of document + break; + case '}': case ']': + depth--; + break; + default: // Scalar value document + // TODO: 
Remove any trailing whitespaces + // This returns a string spanning from start of value to the beginning of the next document (excluded) + return std::string_view(reinterpret_cast(stream->buf) + current_index(), stream->parser->implementation->structural_indexes[++cur_struct_index] - current_index() - 1); + } + cur_struct_index++; + } + + while (cur_struct_index <= static_cast(stream->parser->implementation->n_structural_indexes)) { + switch (stream->buf[stream->batch_start + stream->parser->implementation->structural_indexes[cur_struct_index]]) { + case '{': case '[': + depth++; + break; + case '}': case ']': + depth--; + break; + } + if (depth == 0) { break; } + cur_struct_index++; + } + + return std::string_view(reinterpret_cast(stream->buf) + current_index(), stream->parser->implementation->structural_indexes[cur_struct_index] - current_index() + stream->batch_start + 1);; +} + +inline error_code document_stream::iterator::error() const noexcept { + return stream->error; +} + +#ifdef SIMDJSON_THREADS_ENABLED + +inline void document_stream::load_from_stage1_thread() noexcept { + worker->finish(); + // Swap to the parser that was loaded up in the thread. Make sure the parser has + // enough memory to swap to, as well. + std::swap(stage1_thread_parser,*parser); + error = stage1_thread_error; + if (error) { return; } + + // If there's anything left, start the stage 1 thread! + if (next_batch_start() < len) { + start_stage1_thread(); + } +} + +inline void document_stream::start_stage1_thread() noexcept { + // we call the thread on a lambda that will update + // this->stage1_thread_error + // there is only one thread that may write to this value + // TODO this is NOT exception-safe. 
+ this->stage1_thread_error = UNINITIALIZED; // In case something goes wrong, make sure it's an error + size_t _next_batch_start = this->next_batch_start(); + + worker->run(this, & this->stage1_thread_parser, _next_batch_start); +} + +#endif // SIMDJSON_THREADS_ENABLED + +} // namespace ondemand +} // namespace SIMDJSON_BUILTIN_IMPLEMENTATION +} // namespace simdjson + +namespace simdjson { + +simdjson_really_inline simdjson_result::simdjson_result( + error_code error +) noexcept : + implementation_simdjson_result_base(error) +{ +} +simdjson_really_inline simdjson_result::simdjson_result( + SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::document_stream &&value +) noexcept : + implementation_simdjson_result_base( + std::forward(value) + ) +{ +} + +} +/* end file include/simdjson/generic/ondemand/document_stream-inl.h */ +/* begin file include/simdjson/generic/ondemand/serialization-inl.h */ + + +namespace simdjson { + +inline std::string_view trim(const std::string_view str) noexcept { + // We can almost surely do better by rolling our own find_first_not_of function. + size_t first = str.find_first_not_of(" \t\n\r"); + // If we have the empty string (just white space), then no trimming is possible, and + // we return the empty string_view. 
+ if (std::string_view::npos == first) { return std::string_view(); } + size_t last = str.find_last_not_of(" \t\n\r"); + return str.substr(first, (last - first + 1)); +} + + +inline simdjson_result to_json_string(SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::document& x) noexcept { + std::string_view v; + auto error = x.raw_json().get(v); + if(error) {return error; } + return trim(v); +} + +inline simdjson_result to_json_string(SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::document_reference& x) noexcept { + std::string_view v; + auto error = x.raw_json().get(v); + if(error) {return error; } + return trim(v); +} + +inline simdjson_result to_json_string(SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::value& x) noexcept { + /** + * If we somehow receive a value that has already been consumed, + * then the following code could be in trouble. E.g., we create + * an array as needed, but if an array was already created, then + * it could be bad. + */ + using namespace SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand; + SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::json_type t; + auto error = x.type().get(t); + if(error != SUCCESS) { return error; } + switch (t) + { + case json_type::array: + { + SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::array array; + error = x.get_array().get(array); + if(error) { return error; } + return to_json_string(array); + } + case json_type::object: + { + SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::object object; + error = x.get_object().get(object); + if(error) { return error; } + return to_json_string(object); + } + default: + return trim(x.raw_json_token()); + } +} + +inline simdjson_result to_json_string(SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::object& x) noexcept { + std::string_view v; + auto error = x.raw_json().get(v); + if(error) {return error; } + return trim(v); +} + +inline simdjson_result to_json_string(SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::array& x) noexcept { + std::string_view v; + auto error = x.raw_json().get(v); + if(error) {return error; } 
+ return trim(v); +} + +inline simdjson_result to_json_string(simdjson_result x) { + if (x.error()) { return x.error(); } + return to_json_string(x.value_unsafe()); +} + +inline simdjson_result to_json_string(simdjson_result x) { + if (x.error()) { return x.error(); } + return to_json_string(x.value_unsafe()); +} + +inline simdjson_result to_json_string(simdjson_result x) { + if (x.error()) { return x.error(); } + return to_json_string(x.value_unsafe()); +} + +inline simdjson_result to_json_string(simdjson_result x) { + if (x.error()) { return x.error(); } + return to_json_string(x.value_unsafe()); +} + +inline simdjson_result to_json_string(simdjson_result x) { + if (x.error()) { return x.error(); } + return to_json_string(x.value_unsafe()); +} +} // namespace simdjson + + +#if SIMDJSON_EXCEPTIONS +inline std::ostream& operator<<(std::ostream& out, simdjson::SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::value x) { + std::string_view v; + auto error = simdjson::to_json_string(x).get(v); + if(error == simdjson::SUCCESS) { + return (out << v); + } else { + throw simdjson::simdjson_error(error); + } +} +inline std::ostream& operator<<(std::ostream& out, simdjson::simdjson_result x) { + if (x.error()) { throw simdjson::simdjson_error(x.error()); } + return (out << x.value()); +} +#else +inline std::ostream& operator<<(std::ostream& out, simdjson::SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::value x) { + std::string_view v; + auto error = simdjson::to_json_string(x).get(v); + if(error == simdjson::SUCCESS) { + return (out << v); + } else { + return (out << error); + } +} +#endif + +#if SIMDJSON_EXCEPTIONS +inline std::ostream& operator<<(std::ostream& out, simdjson::SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::array value) { + std::string_view v; + auto error = simdjson::to_json_string(value).get(v); + if(error == simdjson::SUCCESS) { + return (out << v); + } else { + throw simdjson::simdjson_error(error); + } +} +inline std::ostream& operator<<(std::ostream& out, 
simdjson::simdjson_result x) { + if (x.error()) { throw simdjson::simdjson_error(x.error()); } + return (out << x.value()); +} +#else +inline std::ostream& operator<<(std::ostream& out, simdjson::SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::array value) { + std::string_view v; + auto error = simdjson::to_json_string(value).get(v); + if(error == simdjson::SUCCESS) { + return (out << v); + } else { + return (out << error); + } +} +#endif + +#if SIMDJSON_EXCEPTIONS +inline std::ostream& operator<<(std::ostream& out, simdjson::SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::document& value) { + std::string_view v; + auto error = simdjson::to_json_string(value).get(v); + if(error == simdjson::SUCCESS) { + return (out << v); + } else { + throw simdjson::simdjson_error(error); + } +} +inline std::ostream& operator<<(std::ostream& out, simdjson::SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::document_reference& value) { + std::string_view v; + auto error = simdjson::to_json_string(value).get(v); + if(error == simdjson::SUCCESS) { + return (out << v); + } else { + throw simdjson::simdjson_error(error); + } +} +inline std::ostream& operator<<(std::ostream& out, simdjson::simdjson_result&& x) { + if (x.error()) { throw simdjson::simdjson_error(x.error()); } + return (out << x.value()); +} +inline std::ostream& operator<<(std::ostream& out, simdjson::simdjson_result&& x) { + if (x.error()) { throw simdjson::simdjson_error(x.error()); } + return (out << x.value()); +} +#else +inline std::ostream& operator<<(std::ostream& out, simdjson::SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::document& value) { + std::string_view v; + auto error = simdjson::to_json_string(value).get(v); + if(error == simdjson::SUCCESS) { + return (out << v); + } else { + return (out << error); + } +} +#endif + +#if SIMDJSON_EXCEPTIONS +inline std::ostream& operator<<(std::ostream& out, simdjson::SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::object value) { + std::string_view v; + auto error = 
simdjson::to_json_string(value).get(v); + if(error == simdjson::SUCCESS) { + return (out << v); + } else { + throw simdjson::simdjson_error(error); + } +} +inline std::ostream& operator<<(std::ostream& out, simdjson::simdjson_result x) { + if (x.error()) { throw simdjson::simdjson_error(x.error()); } + return (out << x.value()); +} +#else +inline std::ostream& operator<<(std::ostream& out, simdjson::SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::object value) { + std::string_view v; + auto error = simdjson::to_json_string(value).get(v); + if(error == simdjson::SUCCESS) { + return (out << v); + } else { + return (out << error); + } +} +#endif +/* end file include/simdjson/generic/ondemand/serialization-inl.h */ /* end file include/simdjson/generic/ondemand-inl.h */