/* auto-generated on Sun Aug 4 15:43:41 EDT 2019. Do not edit! */
#include "simdjson.h"

/* used for http://dmalloc.com/ Dmalloc - Debug Malloc Library */
#ifdef DMALLOC
#include "dmalloc.h"
#endif

/* begin file src/simdjson.cpp */
#include <map>
namespace simdjson {
// Human-readable description of every error code the parser can report.
// Every code that is ever stored in ParsedJson::error_code needs an entry
// here; DEPTH_ERROR (set by the stage 2 state machine when the nesting limit
// is exceeded) was previously missing.
const std::map<int, const std::string> error_strings = {
    {SUCCESS, "No errors"},
    {CAPACITY, "This ParsedJson can't support a document that big"},
    {MEMALLOC, "Error allocating memory, we're most likely out of memory"},
    {TAPE_ERROR, "Something went wrong while writing to the tape"},
    {DEPTH_ERROR, "The JSON document was too deep (too many nested objects "
                  "and arrays)"},
    {STRING_ERROR, "Problem while parsing a string"},
    {T_ATOM_ERROR,
     "Problem while parsing an atom starting with the letter 't'"},
    {F_ATOM_ERROR,
     "Problem while parsing an atom starting with the letter 'f'"},
    {N_ATOM_ERROR,
     "Problem while parsing an atom starting with the letter 'n'"},
    {NUMBER_ERROR, "Problem while parsing a number"},
    {UTF8_ERROR, "The input is not valid UTF-8"},
    {UNITIALIZED, "Unitialized"},
    {EMPTY, "Empty"},
    {UNESCAPED_CHARS, "Within strings, some characters must be escapted, we "
                      "found unescapted characters"},
    {UNEXPECTED_ERROR, "Unexpected error, consider reporting this problem as "
                       "you may have found a bug in simdjson"},
};

// Returns the message associated with error_code. An unknown code yields a
// generic message instead of throwing std::out_of_range, so that reporting an
// error can never itself fail. The returned reference stays valid for the
// lifetime of the program.
const std::string &error_message(const int error_code) {
  static const std::string unknown_error = "Unknown error code";
  const auto entry = error_strings.find(error_code);
  if (entry == error_strings.end()) {
    return unknown_error;
  }
  return entry->second;
}
} // namespace simdjson
/* end file src/simdjson.cpp */
/* begin file src/jsonioutil.cpp */
#include <cstdlib>
#include <cstring>

namespace simdjson {
// Allocates length + SIMDJSON_PADDING bytes through aligned_malloc_char with
// an alignment of 64. Returns whatever aligned_malloc_char returns, or nullptr
// when the padded size does not fit in a size_t.
char *allocate_padded_buffer(size_t length) {
  // we could do a simple malloc
  // return (char *) malloc(length + SIMDJSON_PADDING);
  // However, we might as well align to cache lines...
  size_t totalpaddedlength = length + SIMDJSON_PADDING;
  if (totalpaddedlength < length) {
    return nullptr; // length + SIMDJSON_PADDING wrapped around
  }
  char *padded_buffer = aligned_malloc_char(64, totalpaddedlength);
  return padded_buffer;
}

// Loads the whole content of the file `filename` into a padded_string.
// Throws std::runtime_error when the file cannot be opened, sized, allocated
// or read in full. The FILE handle is closed on each of these paths.
padded_string get_corpus(const std::string &filename) {
  std::FILE *fp = std::fopen(filename.c_str(), "rb");
  if (fp == nullptr) {
    throw std::runtime_error("could not load corpus");
  }
  if (std::fseek(fp, 0, SEEK_END) != 0) {
    std::fclose(fp);
    throw std::runtime_error("cannot seek in the file");
  }
  // ftell returns a long and reports failure as -1; converting that straight
  // to size_t would request a wrapped-around (tiny) buffer and then fread far
  // more bytes than it can hold.
  const long file_size = std::ftell(fp);
  if (file_size < 0) {
    std::fclose(fp);
    throw std::runtime_error("cannot tell where we are in the file");
  }
  const size_t len = static_cast<size_t>(file_size);
  padded_string s(len);
  if (s.data() == nullptr) {
    std::fclose(fp);
    throw std::runtime_error("could not allocate memory");
  }
  std::rewind(fp);
  size_t readb = std::fread(s.data(), 1, len, fp);
  std::fclose(fp);
  if (readb != len) {
    throw std::runtime_error("could not read the data");
  }
  return s;
}
} // namespace simdjson
/* end file src/jsonioutil.cpp */
/* begin file src/jsonminifier.cpp */
#include <cstdint>
#ifndef __AVX2__

namespace simdjson {
// Three entries per byte value c, read by json_minify as meta = table + 3 * c:
//   meta[0] : 1 when c is a double quote (toggles the in-string state)
//   meta[1] : 0 when c is a backslash (the next byte is escaped), 1 otherwise
//   meta[2] : 0 when c is tab, line feed, carriage return or space (dropped
//             outside of strings), 1 otherwise
static const uint8_t jump_table[256 * 3] = {
    // 0x00 - 0x07
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
    // 0x08 - 0x0f ('\t' = 0x09, '\n' = 0x0a, '\r' = 0x0d)
    0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1,
    // 0x10 - 0x17
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
    // 0x18 - 0x1f
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
    // 0x20 - 0x27 (' ' = 0x20, '"' = 0x22)
    0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
    // 0x28 - 0x2f
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
    // 0x30 - 0x37
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
    // 0x38 - 0x3f
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
    // 0x40 - 0x47
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
    // 0x48 - 0x4f
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
    // 0x50 - 0x57
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
    // 0x58 - 0x5f ('\\' = 0x5c)
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
    // 0x60 - 0x67
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
    // 0x68 - 0x6f
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
    // 0x70 - 0x77
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
    // 0x78 - 0x7f
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
    // 0x80 - 0x87
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
    // 0x88 - 0x8f
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
    // 0x90 - 0x97
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
    // 0x98 - 0x9f
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
    // 0xa0 - 0xa7
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
    // 0xa8 - 0xaf
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
    // 0xb0 - 0xb7
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
    // 0xb8 - 0xbf
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
    // 0xc0 - 0xc7
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
    // 0xc8 - 0xcf
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
    // 0xd0 - 0xd7
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
    // 0xd8 - 0xdf
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
    // 0xe0 - 0xe7
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
    // 0xe8 - 0xef
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
    // 0xf0 - 0xf7
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
    // 0xf8 - 0xff
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
};

// Scalar fallback used when AVX2 is not available at compile time.
// Copies `bytes` to `out` while dropping tab, line feed, carriage return and
// space characters that sit outside of JSON strings; returns the number of
// bytes kept. Every input byte is first stored at out[pos] and only then is
// pos advanced (or not), so `out` must be able to hold how_many bytes. `out`
// may be the same buffer as `bytes`.
// NOTE(review): unlike the AVX2 implementation in the #else branch, this
// version does not write a terminating '\0' - confirm which contract callers
// rely on.
size_t json_minify(const unsigned char *bytes, size_t how_many,
                   unsigned char *out) {
  size_t i = 0, pos = 0;
  uint8_t quote = 0;     // 1 while inside a string
  uint8_t nonescape = 1; // 0 when the current byte is escaped by a backslash
  while (i < how_many) {
    unsigned char c = bytes[i];
    const uint8_t *meta = jump_table + 3 * c;
    // an unescaped quote toggles the in-string state
    quote = quote ^ (meta[0] & nonescape);
    out[pos] = c;
    // keep the byte when it is not whitespace or when we are inside a string
    pos += meta[2] | quote;
    i += 1;
    nonescape = (~nonescape) | (meta[1]);
  }
  return pos;
}
} // namespace simdjson
#else
#include <cstring>

namespace simdjson {

// some intrinsics are missing under GCC?
#ifndef __clang__ #ifndef _MSC_VER static __m256i inline _mm256_loadu2_m128i(__m128i const *__addr_hi, __m128i const *__addr_lo) { __m256i __v256 = _mm256_castsi128_si256(_mm_loadu_si128(__addr_lo)); return _mm256_insertf128_si256(__v256, _mm_loadu_si128(__addr_hi), 1); } static inline void _mm256_storeu2_m128i(__m128i *__addr_hi, __m128i *__addr_lo, __m256i __a) { __m128i __v128; __v128 = _mm256_castsi256_si128(__a); _mm_storeu_si128(__addr_lo, __v128); __v128 = _mm256_extractf128_si256(__a, 1); _mm_storeu_si128(__addr_hi, __v128); } #endif #endif // a straightforward comparison of a mask against input. static uint64_t cmp_mask_against_input_mini(__m256i input_lo, __m256i input_hi, __m256i mask) { __m256i cmp_res_0 = _mm256_cmpeq_epi8(input_lo, mask); uint64_t res_0 = static_cast(_mm256_movemask_epi8(cmp_res_0)); __m256i cmp_res_1 = _mm256_cmpeq_epi8(input_hi, mask); uint64_t res_1 = _mm256_movemask_epi8(cmp_res_1); return res_0 | (res_1 << 32); } // take input from buf and remove useless whitespace, input and output can be // the same, result is null terminated, return the string length (minus the null // termination) size_t json_minify(const uint8_t *buf, size_t len, uint8_t *out) { // Useful constant masks const uint64_t even_bits = 0x5555555555555555ULL; const uint64_t odd_bits = ~even_bits; uint8_t *initout(out); uint64_t prev_iter_ends_odd_backslash = 0ULL; // either 0 or 1, but a 64-bit value uint64_t prev_iter_inside_quote = 0ULL; // either all zeros or all ones size_t idx = 0; if (len >= 64) { size_t avx_len = len - 63; for (; idx < avx_len; idx += 64) { __m256i input_lo = _mm256_loadu_si256(reinterpret_cast(buf + idx + 0)); __m256i input_hi = _mm256_loadu_si256(reinterpret_cast(buf + idx + 32)); uint64_t bs_bits = cmp_mask_against_input_mini(input_lo, input_hi, _mm256_set1_epi8('\\')); uint64_t start_edges = bs_bits & ~(bs_bits << 1); uint64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash; uint64_t even_starts = start_edges & 
even_start_mask; uint64_t odd_starts = start_edges & ~even_start_mask; uint64_t even_carries = bs_bits + even_starts; uint64_t odd_carries; bool iter_ends_odd_backslash = add_overflow(bs_bits, odd_starts, &odd_carries); odd_carries |= prev_iter_ends_odd_backslash; prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL; uint64_t even_carry_ends = even_carries & ~bs_bits; uint64_t odd_carry_ends = odd_carries & ~bs_bits; uint64_t even_start_odd_end = even_carry_ends & odd_bits; uint64_t odd_start_even_end = odd_carry_ends & even_bits; uint64_t odd_ends = even_start_odd_end | odd_start_even_end; uint64_t quote_bits = cmp_mask_against_input_mini(input_lo, input_hi, _mm256_set1_epi8('"')); quote_bits = quote_bits & ~odd_ends; uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128( _mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFF), 0)); quote_mask ^= prev_iter_inside_quote; prev_iter_inside_quote = static_cast( static_cast(quote_mask) >> 63); // might be undefined behavior, should be fully defined in C++20, // ok according to John Regher from Utah University const __m256i low_nibble_mask = _mm256_setr_epi8( // 0 9 a b c d 16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0, 16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0); const __m256i high_nibble_mask = _mm256_setr_epi8( // 0 2 3 5 7 8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0, 8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0); __m256i whitespace_shufti_mask = _mm256_set1_epi8(0x18); __m256i v_lo = _mm256_and_si256( _mm256_shuffle_epi8(low_nibble_mask, input_lo), _mm256_shuffle_epi8(high_nibble_mask, _mm256_and_si256(_mm256_srli_epi32(input_lo, 4), _mm256_set1_epi8(0x7f)))); __m256i v_hi = _mm256_and_si256( _mm256_shuffle_epi8(low_nibble_mask, input_hi), _mm256_shuffle_epi8(high_nibble_mask, _mm256_and_si256(_mm256_srli_epi32(input_hi, 4), _mm256_set1_epi8(0x7f)))); __m256i tmp_ws_lo = _mm256_cmpeq_epi8( _mm256_and_si256(v_lo, whitespace_shufti_mask), _mm256_set1_epi8(0)); __m256i tmp_ws_hi 
= _mm256_cmpeq_epi8( _mm256_and_si256(v_hi, whitespace_shufti_mask), _mm256_set1_epi8(0)); uint64_t ws_res_0 = static_cast(_mm256_movemask_epi8(tmp_ws_lo)); uint64_t ws_res_1 = _mm256_movemask_epi8(tmp_ws_hi); uint64_t whitespace = ~(ws_res_0 | (ws_res_1 << 32)); whitespace &= ~quote_mask; int mask1 = whitespace & 0xFFFF; int mask2 = (whitespace >> 16) & 0xFFFF; int mask3 = (whitespace >> 32) & 0xFFFF; int mask4 = (whitespace >> 48) & 0xFFFF; int pop1 = hamming((~whitespace) & 0xFFFF); int pop2 = hamming((~whitespace) & UINT64_C(0xFFFFFFFF)); int pop3 = hamming((~whitespace) & UINT64_C(0xFFFFFFFFFFFF)); int pop4 = hamming((~whitespace)); __m256i vmask1 = _mm256_loadu2_m128i( reinterpret_cast(mask128_epi8) + (mask2 & 0x7FFF), reinterpret_cast(mask128_epi8) + (mask1 & 0x7FFF)); __m256i vmask2 = _mm256_loadu2_m128i( reinterpret_cast(mask128_epi8) + (mask4 & 0x7FFF), reinterpret_cast(mask128_epi8) + (mask3 & 0x7FFF)); __m256i result1 = _mm256_shuffle_epi8(input_lo, vmask1); __m256i result2 = _mm256_shuffle_epi8(input_hi, vmask2); _mm256_storeu2_m128i(reinterpret_cast<__m128i *>(out + pop1), reinterpret_cast<__m128i *>(out), result1); _mm256_storeu2_m128i(reinterpret_cast<__m128i *>(out + pop3), reinterpret_cast<__m128i *>(out + pop2), result2); out += pop4; } } // we finish off the job... copying and pasting the code is not ideal here, // but it gets the job done. 
if (idx < len) { uint8_t buffer[64]; memset(buffer, 0, 64); memcpy(buffer, buf + idx, len - idx); __m256i input_lo = _mm256_loadu_si256(reinterpret_cast(buffer)); __m256i input_hi = _mm256_loadu_si256(reinterpret_cast(buffer + 32)); uint64_t bs_bits = cmp_mask_against_input_mini(input_lo, input_hi, _mm256_set1_epi8('\\')); uint64_t start_edges = bs_bits & ~(bs_bits << 1); uint64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash; uint64_t even_starts = start_edges & even_start_mask; uint64_t odd_starts = start_edges & ~even_start_mask; uint64_t even_carries = bs_bits + even_starts; uint64_t odd_carries; // bool iter_ends_odd_backslash = add_overflow(bs_bits, odd_starts, &odd_carries); odd_carries |= prev_iter_ends_odd_backslash; // prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL; // // we never use it uint64_t even_carry_ends = even_carries & ~bs_bits; uint64_t odd_carry_ends = odd_carries & ~bs_bits; uint64_t even_start_odd_end = even_carry_ends & odd_bits; uint64_t odd_start_even_end = odd_carry_ends & even_bits; uint64_t odd_ends = even_start_odd_end | odd_start_even_end; uint64_t quote_bits = cmp_mask_against_input_mini(input_lo, input_hi, _mm256_set1_epi8('"')); quote_bits = quote_bits & ~odd_ends; uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128( _mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFF), 0)); quote_mask ^= prev_iter_inside_quote; // prev_iter_inside_quote = (uint64_t)((int64_t)quote_mask >> 63);// we // don't need this anymore __m256i mask_20 = _mm256_set1_epi8(0x20); // c==32 __m256i mask_70 = _mm256_set1_epi8(0x70); // adding 0x70 does not check low 4-bits // but moves any value >= 16 above 128 __m256i lut_cntrl = _mm256_setr_epi8( 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00); __m256i tmp_ws_lo = _mm256_or_si256( _mm256_cmpeq_epi8(mask_20, input_lo), 
_mm256_shuffle_epi8(lut_cntrl, _mm256_adds_epu8(mask_70, input_lo))); __m256i tmp_ws_hi = _mm256_or_si256( _mm256_cmpeq_epi8(mask_20, input_hi), _mm256_shuffle_epi8(lut_cntrl, _mm256_adds_epu8(mask_70, input_hi))); uint64_t ws_res_0 = static_cast(_mm256_movemask_epi8(tmp_ws_lo)); uint64_t ws_res_1 = _mm256_movemask_epi8(tmp_ws_hi); uint64_t whitespace = (ws_res_0 | (ws_res_1 << 32)); whitespace &= ~quote_mask; if (len - idx < 64) { whitespace |= UINT64_C(0xFFFFFFFFFFFFFFFF) << (len - idx); } int mask1 = whitespace & 0xFFFF; int mask2 = (whitespace >> 16) & 0xFFFF; int mask3 = (whitespace >> 32) & 0xFFFF; int mask4 = (whitespace >> 48) & 0xFFFF; int pop1 = hamming((~whitespace) & 0xFFFF); int pop2 = hamming((~whitespace) & UINT64_C(0xFFFFFFFF)); int pop3 = hamming((~whitespace) & UINT64_C(0xFFFFFFFFFFFF)); int pop4 = hamming((~whitespace)); __m256i vmask1 = _mm256_loadu2_m128i( reinterpret_cast(mask128_epi8) + (mask2 & 0x7FFF), reinterpret_cast(mask128_epi8) + (mask1 & 0x7FFF)); __m256i vmask2 = _mm256_loadu2_m128i( reinterpret_cast(mask128_epi8) + (mask4 & 0x7FFF), reinterpret_cast(mask128_epi8) + (mask3 & 0x7FFF)); __m256i result1 = _mm256_shuffle_epi8(input_lo, vmask1); __m256i result2 = _mm256_shuffle_epi8(input_hi, vmask2); _mm256_storeu2_m128i(reinterpret_cast<__m128i *>(buffer + pop1), reinterpret_cast<__m128i *>(buffer), result1); _mm256_storeu2_m128i(reinterpret_cast<__m128i *>(buffer + pop3), reinterpret_cast<__m128i *>(buffer + pop2), result2); memcpy(out, buffer, pop4); out += pop4; } *out = '\0'; // NULL termination return out - initout; } } // namespace simdjson #endif /* end file src/jsonminifier.cpp */ /* begin file src/jsonparser.cpp */ namespace simdjson { // The function that users are expected to call is json_parse. // We have more than one such function because we want to support several // instruction sets. 
// function pointer type for json_parse using json_parse_functype = int(const uint8_t *buf, size_t len, ParsedJson &pj, bool realloc_if_needed); // Pointer that holds the json_parse implementation corresponding to the // available SIMD instruction set extern json_parse_functype *json_parse_ptr; int json_parse(const uint8_t *buf, size_t len, ParsedJson &pj, bool realloc_if_needed) { return json_parse_ptr(buf, len, pj, realloc_if_needed); } int json_parse(const char *buf, size_t len, ParsedJson &pj, bool realloc_if_needed) { return json_parse_ptr(reinterpret_cast(buf), len, pj, realloc_if_needed); } Architecture find_best_supported_implementation() { constexpr uint32_t haswell_flags = instruction_set::AVX2 | instruction_set::PCLMULQDQ | instruction_set::BMI1 | instruction_set::BMI2; constexpr uint32_t westmere_flags = instruction_set::SSE42 | instruction_set::PCLMULQDQ; uint32_t supports = detect_supported_architectures(); // Order from best to worst (within architecture) if ((haswell_flags & supports) == haswell_flags) return Architecture::HASWELL; if ((westmere_flags & supports) == westmere_flags) return Architecture::WESTMERE; if (instruction_set::NEON) return Architecture::ARM64; return Architecture::NONE; } // Responsible to select the best json_parse implementation int json_parse_dispatch(const uint8_t *buf, size_t len, ParsedJson &pj, bool realloc_if_needed) { Architecture best_implementation = find_best_supported_implementation(); // Selecting the best implementation switch (best_implementation) { #ifdef IS_X86_64 case Architecture::HASWELL: json_parse_ptr = &json_parse_implementation; break; case Architecture::WESTMERE: json_parse_ptr = &json_parse_implementation; break; #endif #ifdef IS_ARM64 case Architecture::ARM64: json_parse_ptr = &json_parse_implementation; break; #endif default: std::cerr << "The processor is not supported by simdjson." 
<< std::endl; return simdjson::UNEXPECTED_ERROR; } return json_parse_ptr(buf, len, pj, realloc_if_needed); } json_parse_functype *json_parse_ptr = &json_parse_dispatch; WARN_UNUSED ParsedJson build_parsed_json(const uint8_t *buf, size_t len, bool realloc_if_needed) { ParsedJson pj; bool ok = pj.allocate_capacity(len); if (ok) { json_parse(buf, len, pj, realloc_if_needed); } else { std::cerr << "failure during memory allocation " << std::endl; } return pj; } } // namespace simdjson /* end file src/jsonparser.cpp */ /* begin file src/stage1_find_marks.cpp */ #ifdef IS_X86_64 TARGET_HASWELL namespace simdjson { template <> int find_structural_bits(const uint8_t *buf, size_t len, ParsedJson &pj) { FIND_STRUCTURAL_BITS(Architecture::HASWELL, buf, len, pj, simdjson::haswell::flatten_bits); } } // namespace simdjson UNTARGET_REGION TARGET_WESTMERE namespace simdjson { template <> int find_structural_bits(const uint8_t *buf, size_t len, ParsedJson &pj) { FIND_STRUCTURAL_BITS(Architecture::WESTMERE, buf, len, pj, simdjson::flatten_bits); } } // namespace simdjson UNTARGET_REGION #endif #ifdef IS_ARM64 namespace simdjson { template <> int find_structural_bits(const uint8_t *buf, size_t len, ParsedJson &pj) { FIND_STRUCTURAL_BITS(Architecture::ARM64, buf, len, pj, simdjson::flatten_bits); } } // namespace simdjson #endif /* end file src/stage1_find_marks.cpp */ /* begin file src/stage2_build_tape.cpp */ namespace simdjson { // this macro reads the next structural character, updating idx, i and c. 
#define UPDATE_CHAR() \ { \ idx = pj.structural_indexes[i++]; \ c = buf[idx]; \ } #ifdef SIMDJSON_USE_COMPUTED_GOTO #define SET_GOTO_ARRAY_CONTINUE() pj.ret_address[depth] = &&array_continue; #define SET_GOTO_OBJECT_CONTINUE() pj.ret_address[depth] = &&object_continue; #define SET_GOTO_START_CONTINUE() pj.ret_address[depth] = &&start_continue; #define GOTO_CONTINUE() goto *pj.ret_address[depth]; #else #define SET_GOTO_ARRAY_CONTINUE() pj.ret_address[depth] = 'a'; #define SET_GOTO_OBJECT_CONTINUE() pj.ret_address[depth] = 'o'; #define SET_GOTO_START_CONTINUE() pj.ret_address[depth] = 's'; #define GOTO_CONTINUE() \ { \ if (pj.ret_address[depth] == 'a') { \ goto array_continue; \ } else if (pj.ret_address[depth] == 'o') { \ goto object_continue; \ } else { \ goto start_continue; \ } \ } #endif /************ * The JSON is parsed to a tape, see the accompanying tape.md file * for documentation. ***********/ // We need to compile that code for multiple architectures. However, target // attributes can be used only once by function definition. Huge macro seemed // better than huge code duplication. int UNIFIED_MACHINE(const uint8_t *buf, // size_t len, ParsedJson &pj) #define UNIFIED_MACHINE(T, buf, len, pj) \ { \ if (ALLOW_SAME_PAGE_BUFFER_OVERRUN) { \ memset((uint8_t *)buf + len, 0, \ SIMDJSON_PADDING); /* to please valgrind */ \ } \ uint32_t i = 0; /* index of the structural character (0,1,2,3...) 
*/ \ uint32_t \ idx; /* location of the structural character in the input (buf) */ \ uint8_t c; /* used to track the (structural) character we are looking at, \ updated */ \ /* by UPDATE_CHAR macro */ \ uint32_t depth = 0; /* could have an arbitrary starting depth */ \ pj.init(); /* sets is_valid to false */ \ if (pj.byte_capacity < len) { \ pj.error_code = simdjson::CAPACITY; \ return pj.error_code; \ } \ \ /*//////////////////////////// START STATE ///////////////////////////// \ */ \ SET_GOTO_START_CONTINUE() \ pj.containing_scope_offset[depth] = pj.get_current_loc(); \ pj.write_tape(0, 'r'); /* r for root, 0 is going to get overwritten */ \ /* the root is used, if nothing else, to capture the size of the tape */ \ depth++; /* everything starts at depth = 1, depth = 0 is just for the \ root, the root may contain an object, an array or something \ else. */ \ if (depth >= pj.depth_capacity) { \ goto fail; \ } \ \ UPDATE_CHAR(); \ switch (c) { \ case '{': \ pj.containing_scope_offset[depth] = pj.get_current_loc(); \ SET_GOTO_START_CONTINUE(); \ depth++; \ if (depth >= pj.depth_capacity) { \ goto fail; \ } \ pj.write_tape( \ 0, \ c); /* strangely, moving this to object_begin slows things down */ \ goto object_begin; \ case '[': \ pj.containing_scope_offset[depth] = pj.get_current_loc(); \ SET_GOTO_START_CONTINUE(); \ depth++; \ if (depth >= pj.depth_capacity) { \ goto fail; \ } \ pj.write_tape(0, c); \ goto array_begin; \ /* #define SIMDJSON_ALLOWANYTHINGINROOT \ * A JSON text is a serialized value. Note that certain previous \ * specifications of JSON constrained a JSON text to be an object or an \ * array. Implementations that generate only objects or arrays where a \ * JSON text is called for will be interoperable in the sense that all \ * implementations will accept these as conforming JSON texts. 
\ * https://tools.ietf.org/html/rfc8259 \ * #ifdef SIMDJSON_ALLOWANYTHINGINROOT */ \ case '"': { \ if (!parse_string(buf, len, pj, depth, idx)) { \ goto fail; \ } \ break; \ } \ case 't': { \ /* we need to make a copy to make sure that the string is space \ * terminated. \ * this only applies to the JSON document made solely of the true value. \ * this will almost never be called in practice */ \ char *copy = static_cast(malloc(len + SIMDJSON_PADDING)); \ if (copy == nullptr) { \ goto fail; \ } \ memcpy(copy, buf, len); \ copy[len] = ' '; \ if (!is_valid_true_atom(reinterpret_cast(copy) + \ idx)) { \ free(copy); \ goto fail; \ } \ free(copy); \ pj.write_tape(0, c); \ break; \ } \ case 'f': { \ /* we need to make a copy to make sure that the string is space \ * terminated. \ * this only applies to the JSON document made solely of the false \ * value. \ * this will almost never be called in practice */ \ char *copy = static_cast(malloc(len + SIMDJSON_PADDING)); \ if (copy == nullptr) { \ goto fail; \ } \ memcpy(copy, buf, len); \ copy[len] = ' '; \ if (!is_valid_false_atom(reinterpret_cast(copy) + \ idx)) { \ free(copy); \ goto fail; \ } \ free(copy); \ pj.write_tape(0, c); \ break; \ } \ case 'n': { \ /* we need to make a copy to make sure that the string is space \ * terminated. \ * this only applies to the JSON document made solely of the null value. \ * this will almost never be called in practice */ \ char *copy = static_cast(malloc(len + SIMDJSON_PADDING)); \ if (copy == nullptr) { \ goto fail; \ } \ memcpy(copy, buf, len); \ copy[len] = ' '; \ if (!is_valid_null_atom(reinterpret_cast(copy) + \ idx)) { \ free(copy); \ goto fail; \ } \ free(copy); \ pj.write_tape(0, c); \ break; \ } \ case '0': \ case '1': \ case '2': \ case '3': \ case '4': \ case '5': \ case '6': \ case '7': \ case '8': \ case '9': { \ /* we need to make a copy to make sure that the string is space \ * terminated. 
\ * this is done only for JSON documents made of a sole number \ * this will almost never be called in practice. We terminate with a \ * space \ * because we do not want to allow NULLs in the middle of a number \ * (whereas a \ * space in the middle of a number would be identified in stage 1). */ \ char *copy = static_cast(malloc(len + SIMDJSON_PADDING)); \ if (copy == nullptr) { \ goto fail; \ } \ memcpy(copy, buf, len); \ copy[len] = ' '; \ if (!parse_number(reinterpret_cast(copy), pj, idx, \ false)) { \ free(copy); \ goto fail; \ } \ free(copy); \ break; \ } \ case '-': { \ /* we need to make a copy to make sure that the string is NULL \ * terminated. \ * this is done only for JSON documents made of a sole number \ * this will almost never be called in practice */ \ char *copy = static_cast(malloc(len + SIMDJSON_PADDING)); \ if (copy == nullptr) { \ goto fail; \ } \ memcpy(copy, buf, len); \ copy[len] = ' '; \ if (!parse_number(reinterpret_cast(copy), pj, idx, \ true)) { \ free(copy); \ goto fail; \ } \ free(copy); \ break; \ } \ default: \ goto fail; \ } \ start_continue: \ /* the string might not be NULL terminated. 
*/ \ if (i + 1 == pj.n_structural_indexes) { \ goto succeed; \ } else { \ goto fail; \ } \ /*//////////////////////////// OBJECT STATES ///////////////////////////*/ \ \ object_begin: \ UPDATE_CHAR(); \ switch (c) { \ case '"': { \ if (!parse_string(buf, len, pj, depth, idx)) { \ goto fail; \ } \ goto object_key_state; \ } \ case '}': \ goto scope_end; /* could also go to object_continue */ \ default: \ goto fail; \ } \ \ object_key_state: \ UPDATE_CHAR(); \ if (c != ':') { \ goto fail; \ } \ UPDATE_CHAR(); \ switch (c) { \ case '"': { \ if (!parse_string(buf, len, pj, depth, idx)) { \ goto fail; \ } \ break; \ } \ case 't': \ if (!is_valid_true_atom(buf + idx)) { \ goto fail; \ } \ pj.write_tape(0, c); \ break; \ case 'f': \ if (!is_valid_false_atom(buf + idx)) { \ goto fail; \ } \ pj.write_tape(0, c); \ break; \ case 'n': \ if (!is_valid_null_atom(buf + idx)) { \ goto fail; \ } \ pj.write_tape(0, c); \ break; \ case '0': \ case '1': \ case '2': \ case '3': \ case '4': \ case '5': \ case '6': \ case '7': \ case '8': \ case '9': { \ if (!parse_number(buf, pj, idx, false)) { \ goto fail; \ } \ break; \ } \ case '-': { \ if (!parse_number(buf, pj, idx, true)) { \ goto fail; \ } \ break; \ } \ case '{': { \ pj.containing_scope_offset[depth] = pj.get_current_loc(); \ pj.write_tape(0, c); /* here the compilers knows what c is so this gets \ optimized */ \ /* we have not yet encountered } so we need to come back for it */ \ SET_GOTO_OBJECT_CONTINUE() \ /* we found an object inside an object, so we need to increment the \ * depth */ \ depth++; \ if (depth >= pj.depth_capacity) { \ goto fail; \ } \ \ goto object_begin; \ } \ case '[': { \ pj.containing_scope_offset[depth] = pj.get_current_loc(); \ pj.write_tape(0, c); /* here the compilers knows what c is so this gets \ optimized */ \ /* we have not yet encountered } so we need to come back for it */ \ SET_GOTO_OBJECT_CONTINUE() \ /* we found an array inside an object, so we need to increment the depth \ */ \ depth++; \ if 
(depth >= pj.depth_capacity) { \ goto fail; \ } \ goto array_begin; \ } \ default: \ goto fail; \ } \ \ object_continue: \ UPDATE_CHAR(); \ switch (c) { \ case ',': \ UPDATE_CHAR(); \ if (c != '"') { \ goto fail; \ } else { \ if (!parse_string(buf, len, pj, depth, idx)) { \ goto fail; \ } \ goto object_key_state; \ } \ case '}': \ goto scope_end; \ default: \ goto fail; \ } \ \ /*//////////////////////////// COMMON STATE ///////////////////////////*/ \ \ scope_end: \ /* write our tape location to the header scope */ \ depth--; \ pj.write_tape(pj.containing_scope_offset[depth], c); \ pj.annotate_previous_loc(pj.containing_scope_offset[depth], \ pj.get_current_loc()); \ /* goto saved_state */ \ GOTO_CONTINUE() \ \ /*//////////////////////////// ARRAY STATES ///////////////////////////*/ \ array_begin: \ UPDATE_CHAR(); \ if (c == ']') { \ goto scope_end; /* could also go to array_continue */ \ } \ \ main_array_switch: \ /* we call update char on all paths in, so we can peek at c on the \ * on paths that can accept a close square brace (post-, and at start) */ \ switch (c) { \ case '"': { \ if (!parse_string(buf, len, pj, depth, idx)) { \ goto fail; \ } \ break; \ } \ case 't': \ if (!is_valid_true_atom(buf + idx)) { \ goto fail; \ } \ pj.write_tape(0, c); \ break; \ case 'f': \ if (!is_valid_false_atom(buf + idx)) { \ goto fail; \ } \ pj.write_tape(0, c); \ break; \ case 'n': \ if (!is_valid_null_atom(buf + idx)) { \ goto fail; \ } \ pj.write_tape(0, c); \ break; /* goto array_continue; */ \ \ case '0': \ case '1': \ case '2': \ case '3': \ case '4': \ case '5': \ case '6': \ case '7': \ case '8': \ case '9': { \ if (!parse_number(buf, pj, idx, false)) { \ goto fail; \ } \ break; /* goto array_continue; */ \ } \ case '-': { \ if (!parse_number(buf, pj, idx, true)) { \ goto fail; \ } \ break; /* goto array_continue; */ \ } \ case '{': { \ /* we have not yet encountered ] so we need to come back for it */ \ pj.containing_scope_offset[depth] = pj.get_current_loc(); \ 
pj.write_tape(0, c); /* here the compilers knows what c is so this gets \ optimized */ \ SET_GOTO_ARRAY_CONTINUE() \ /* we found an object inside an array, so we need to increment the depth \ */ \ depth++; \ if (depth >= pj.depth_capacity) { \ goto fail; \ } \ \ goto object_begin; \ } \ case '[': { \ /* we have not yet encountered ] so we need to come back for it */ \ pj.containing_scope_offset[depth] = pj.get_current_loc(); \ pj.write_tape(0, c); /* here the compilers knows what c is so this gets \ optimized */ \ SET_GOTO_ARRAY_CONTINUE() \ /* we found an array inside an array, so we need to increment the depth \ */ \ depth++; \ if (depth >= pj.depth_capacity) { \ goto fail; \ } \ goto array_begin; \ } \ default: \ goto fail; \ } \ \ array_continue: \ UPDATE_CHAR(); \ switch (c) { \ case ',': \ UPDATE_CHAR(); \ goto main_array_switch; \ case ']': \ goto scope_end; \ default: \ goto fail; \ } \ \ /*//////////////////////////// FINAL STATES ///////////////////////////*/ \ \ succeed: \ depth--; \ if (depth != 0) { \ fprintf(stderr, "internal bug\n"); \ abort(); \ } \ if (pj.containing_scope_offset[depth] != 0) { \ fprintf(stderr, "internal bug\n"); \ abort(); \ } \ pj.annotate_previous_loc(pj.containing_scope_offset[depth], \ pj.get_current_loc()); \ pj.write_tape(pj.containing_scope_offset[depth], 'r'); /* r is root */ \ \ pj.valid = true; \ pj.error_code = simdjson::SUCCESS; \ return pj.error_code; \ fail: \ /* we do not need the next line because this is done by pj.init(), \ * pessimistically. \ * pj.is_valid = false; \ * At this point in the code, we have all the time in the world. \ * Note that we know exactly where we are in the document so we could, \ * without any overhead on the processing code, report a specific \ * location. \ * We could even trigger special code paths to assess what happened \ * carefully, \ * all without any added cost. 
*/ \ if (depth >= pj.depth_capacity) { \ pj.error_code = simdjson::DEPTH_ERROR; \ return pj.error_code; \ } \ switch (c) { \ case '"': \ pj.error_code = simdjson::STRING_ERROR; \ return pj.error_code; \ case '0': \ case '1': \ case '2': \ case '3': \ case '4': \ case '5': \ case '6': \ case '7': \ case '8': \ case '9': \ case '-': \ pj.error_code = simdjson::NUMBER_ERROR; \ return pj.error_code; \ case 't': \ pj.error_code = simdjson::T_ATOM_ERROR; \ return pj.error_code; \ case 'n': \ pj.error_code = simdjson::N_ATOM_ERROR; \ return pj.error_code; \ case 'f': \ pj.error_code = simdjson::F_ATOM_ERROR; \ return pj.error_code; \ default: \ break; \ } \ pj.error_code = simdjson::TAPE_ERROR; \ return pj.error_code; \ } } // namespace simdjson #ifdef IS_X86_64 TARGET_HASWELL namespace simdjson { template <> WARN_UNUSED ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER LENIENT_MEM_SANITIZER int unified_machine(const uint8_t *buf, size_t len, ParsedJson &pj) { UNIFIED_MACHINE(Architecture::HASWELL, buf, len, pj); } } // namespace simdjson UNTARGET_REGION TARGET_WESTMERE namespace simdjson { template <> WARN_UNUSED ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER LENIENT_MEM_SANITIZER int unified_machine(const uint8_t *buf, size_t len, ParsedJson &pj) { UNIFIED_MACHINE(Architecture::WESTMERE, buf, len, pj); } } // namespace simdjson UNTARGET_REGION #endif // IS_X86_64 #ifdef IS_ARM64 namespace simdjson { template <> WARN_UNUSED ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER LENIENT_MEM_SANITIZER int unified_machine(const uint8_t *buf, size_t len, ParsedJson &pj) { UNIFIED_MACHINE(Architecture::ARM64, buf, len, pj); } } // namespace simdjson #endif /* end file src/stage2_build_tape.cpp */ /* begin file src/parsedjson.cpp */ namespace simdjson { ParsedJson::ParsedJson() : structural_indexes(nullptr), tape(nullptr), containing_scope_offset(nullptr), ret_address(nullptr), string_buf(nullptr), current_string_buf_loc(nullptr) {} ParsedJson::~ParsedJson() { deallocate(); } 
ParsedJson::ParsedJson(ParsedJson &&p) : byte_capacity(p.byte_capacity), depth_capacity(p.depth_capacity), tape_capacity(p.tape_capacity), string_capacity(p.string_capacity), current_loc(p.current_loc), n_structural_indexes(p.n_structural_indexes), structural_indexes(p.structural_indexes), tape(p.tape), containing_scope_offset(p.containing_scope_offset), ret_address(p.ret_address), string_buf(p.string_buf), current_string_buf_loc(p.current_string_buf_loc), valid(p.valid) { p.structural_indexes = nullptr; p.tape = nullptr; p.containing_scope_offset = nullptr; p.ret_address = nullptr; p.string_buf = nullptr; p.current_string_buf_loc = nullptr; } WARN_UNUSED bool ParsedJson::allocate_capacity(size_t len, size_t max_depth) { if (max_depth <= 0) { max_depth = 1; // don't let the user allocate nothing } if (len <= 0) { len = 64; // allocating 0 bytes is wasteful. } if (len > SIMDJSON_MAXSIZE_BYTES) { return false; } if ((len <= byte_capacity) && (depth_capacity < max_depth)) { return true; } deallocate(); valid = false; byte_capacity = 0; // will only set it to len after allocations are a success n_structural_indexes = 0; uint32_t max_structures = ROUNDUP_N(len, 64) + 2 + 7; structural_indexes = new (std::nothrow) uint32_t[max_structures]; // a pathological input like "[[[[..." would generate len tape elements, so // need a capacity of len + 1 size_t local_tape_capacity = ROUNDUP_N(len + 1, 64); // a document with only zero-length strings... 
could have len/3 string // and we would need len/3 * 5 bytes on the string buffer size_t local_string_capacity = ROUNDUP_N(5 * len / 3 + 32, 64); string_buf = new (std::nothrow) uint8_t[local_string_capacity]; tape = new (std::nothrow) uint64_t[local_tape_capacity]; containing_scope_offset = new (std::nothrow) uint32_t[max_depth]; #ifdef SIMDJSON_USE_COMPUTED_GOTO ret_address = new (std::nothrow) void *[max_depth]; #else ret_address = new (std::nothrow) char[max_depth]; #endif if ((string_buf == nullptr) || (tape == nullptr) || (containing_scope_offset == nullptr) || (ret_address == nullptr) || (structural_indexes == nullptr)) { std::cerr << "Could not allocate memory" << std::endl; delete[] ret_address; delete[] containing_scope_offset; delete[] tape; delete[] string_buf; delete[] structural_indexes; return false; } /* // We do not need to initialize this content for parsing, though we could // need to initialize it for safety. memset(string_buf, 0 , local_string_capacity); memset(structural_indexes, 0, max_structures * sizeof(uint32_t)); memset(tape, 0, local_tape_capacity * sizeof(uint64_t)); */ byte_capacity = len; depth_capacity = max_depth; tape_capacity = local_tape_capacity; string_capacity = local_string_capacity; return true; } bool ParsedJson::is_valid() const { return valid; } int ParsedJson::get_error_code() const { return error_code; } std::string ParsedJson::get_error_message() const { return error_message(error_code); } void ParsedJson::deallocate() { byte_capacity = 0; depth_capacity = 0; tape_capacity = 0; string_capacity = 0; delete[] ret_address; delete[] containing_scope_offset; delete[] tape; delete[] string_buf; delete[] structural_indexes; valid = false; } void ParsedJson::init() { current_string_buf_loc = string_buf; current_loc = 0; valid = false; } WARN_UNUSED bool ParsedJson::print_json(std::ostream &os) { if (!valid) { return false; } uint32_t string_length; size_t tape_idx = 0; uint64_t tape_val = tape[tape_idx]; uint8_t type = 
(tape_val >> 56); size_t how_many = 0; if (type == 'r') { how_many = tape_val & JSON_VALUE_MASK; } else { fprintf(stderr, "Error: no starting root node?"); return false; } if (how_many > tape_capacity) { fprintf( stderr, "We may be exceeding the tape capacity. Is this a valid document?\n"); return false; } tape_idx++; bool *in_object = new bool[depth_capacity]; auto *in_object_idx = new size_t[depth_capacity]; int depth = 1; // only root at level 0 in_object_idx[depth] = 0; in_object[depth] = false; for (; tape_idx < how_many; tape_idx++) { tape_val = tape[tape_idx]; uint64_t payload = tape_val & JSON_VALUE_MASK; type = (tape_val >> 56); if (!in_object[depth]) { if ((in_object_idx[depth] > 0) && (type != ']')) { os << ","; } in_object_idx[depth]++; } else { // if (in_object) { if ((in_object_idx[depth] > 0) && ((in_object_idx[depth] & 1) == 0) && (type != '}')) { os << ","; } if (((in_object_idx[depth] & 1) == 1)) { os << ":"; } in_object_idx[depth]++; } switch (type) { case '"': // we have a string os << '"'; memcpy(&string_length, string_buf + payload, sizeof(uint32_t)); print_with_escapes( (const unsigned char *)(string_buf + payload + sizeof(uint32_t)), string_length); os << '"'; break; case 'l': // we have a long int if (tape_idx + 1 >= how_many) { delete[] in_object; delete[] in_object_idx; return false; } os << static_cast(tape[++tape_idx]); break; case 'd': // we have a double if (tape_idx + 1 >= how_many) { delete[] in_object; delete[] in_object_idx; return false; } double answer; memcpy(&answer, &tape[++tape_idx], sizeof(answer)); os << answer; break; case 'n': // we have a null os << "null"; break; case 't': // we have a true os << "true"; break; case 'f': // we have a false os << "false"; break; case '{': // we have an object os << '{'; depth++; in_object[depth] = true; in_object_idx[depth] = 0; break; case '}': // we end an object depth--; os << '}'; break; case '[': // we start an array os << '['; depth++; in_object[depth] = false; 
in_object_idx[depth] = 0; break; case ']': // we end an array depth--; os << ']'; break; case 'r': // we start and end with the root node fprintf(stderr, "should we be hitting the root node?\n"); delete[] in_object; delete[] in_object_idx; return false; default: fprintf(stderr, "bug %c\n", type); delete[] in_object; delete[] in_object_idx; return false; } } delete[] in_object; delete[] in_object_idx; return true; } WARN_UNUSED bool ParsedJson::dump_raw_tape(std::ostream &os) { if (!valid) { return false; } uint32_t string_length; size_t tape_idx = 0; uint64_t tape_val = tape[tape_idx]; uint8_t type = (tape_val >> 56); os << tape_idx << " : " << type; tape_idx++; size_t how_many = 0; if (type == 'r') { how_many = tape_val & JSON_VALUE_MASK; } else { fprintf(stderr, "Error: no starting root node?"); return false; } os << "\t// pointing to " << how_many << " (right after last node)\n"; uint64_t payload; for (; tape_idx < how_many; tape_idx++) { os << tape_idx << " : "; tape_val = tape[tape_idx]; payload = tape_val & JSON_VALUE_MASK; type = (tape_val >> 56); switch (type) { case '"': // we have a string os << "string \""; memcpy(&string_length, string_buf + payload, sizeof(uint32_t)); print_with_escapes( (const unsigned char *)(string_buf + payload + sizeof(uint32_t)), string_length); os << '"'; os << '\n'; break; case 'l': // we have a long int if (tape_idx + 1 >= how_many) { return false; } os << "integer " << static_cast(tape[++tape_idx]) << "\n"; break; case 'd': // we have a double os << "float "; if (tape_idx + 1 >= how_many) { return false; } double answer; memcpy(&answer, &tape[++tape_idx], sizeof(answer)); os << answer << '\n'; break; case 'n': // we have a null os << "null\n"; break; case 't': // we have a true os << "true\n"; break; case 'f': // we have a false os << "false\n"; break; case '{': // we have an object os << "{\t// pointing to next tape location " << payload << " (first node after the scope) \n"; break; case '}': // we end an object os << "}\t// 
pointing to previous tape location " << payload << " (start of the scope) \n"; break; case '[': // we start an array os << "[\t// pointing to next tape location " << payload << " (first node after the scope) \n"; break; case ']': // we end an array os << "]\t// pointing to previous tape location " << payload << " (start of the scope) \n"; break; case 'r': // we start and end with the root node printf("end of root\n"); return false; default: return false; } } tape_val = tape[tape_idx]; payload = tape_val & JSON_VALUE_MASK; type = (tape_val >> 56); os << tape_idx << " : " << type << "\t// pointing to " << payload << " (start root)\n"; return true; } } // namespace simdjson /* end file src/parsedjson.cpp */ /* begin file src/parsedjsoniterator.cpp */ #include namespace simdjson { ParsedJson::Iterator::Iterator(ParsedJson &pj_) : pj(pj_), depth(0), location(0), tape_length(0), depth_index(nullptr) { if (!pj.is_valid()) { throw InvalidJSON(); } // we overallocate by "1" to silence a warning in Visual Studio depth_index = new scopeindex_t[pj.depth_capacity + 1]; // memory allocation would throw // if(depth_index == nullptr) { // return; //} depth_index[0].start_of_scope = location; current_val = pj.tape[location++]; current_type = (current_val >> 56); depth_index[0].scope_type = current_type; if (current_type == 'r') { tape_length = current_val & JSON_VALUE_MASK; if (location < tape_length) { // If we make it here, then depth_capacity must >=2, but the compiler // may not know this. 
current_val = pj.tape[location]; current_type = (current_val >> 56); depth++; depth_index[depth].start_of_scope = location; depth_index[depth].scope_type = current_type; } } else { // should never happen throw InvalidJSON(); } } ParsedJson::Iterator::~Iterator() { delete[] depth_index; } ParsedJson::Iterator::Iterator(const Iterator &o) noexcept : pj(o.pj), depth(o.depth), location(o.location), tape_length(0), current_type(o.current_type), current_val(o.current_val), depth_index(nullptr) { depth_index = new scopeindex_t[pj.depth_capacity]; // allocation might throw memcpy(depth_index, o.depth_index, pj.depth_capacity * sizeof(depth_index[0])); tape_length = o.tape_length; } ParsedJson::Iterator::Iterator(Iterator &&o) noexcept : pj(o.pj), depth(o.depth), location(o.location), tape_length(o.tape_length), current_type(o.current_type), current_val(o.current_val), depth_index(o.depth_index) { o.depth_index = nullptr; // we take ownership } bool ParsedJson::Iterator::print(std::ostream &os, bool escape_strings) const { if (!is_ok()) { return false; } switch (current_type) { case '"': // we have a string os << '"'; if (escape_strings) { print_with_escapes(get_string(), os, get_string_length()); } else { // was: os << get_string();, but given that we can include null chars, we // have to do something crazier: std::copy(get_string(), get_string() + get_string_length(), std::ostream_iterator(os)); } os << '"'; break; case 'l': // we have a long int os << get_integer(); break; case 'd': os << get_double(); break; case 'n': // we have a null os << "null"; break; case 't': // we have a true os << "true"; break; case 'f': // we have a false os << "false"; break; case '{': // we have an object case '}': // we end an object case '[': // we start an array case ']': // we end an array os << static_cast(current_type); break; default: return false; } return true; } bool ParsedJson::Iterator::move_to(const char *pointer, uint32_t length) { char *new_pointer = nullptr; if (pointer[0] 
== '#') { // Converting fragment representation to string representation new_pointer = new char[length]; uint32_t new_length = 0; for (uint32_t i = 1; i < length; i++) { if (pointer[i] == '%' && pointer[i + 1] == 'x') { try { int fragment = std::stoi(std::string(&pointer[i + 2], 2), nullptr, 16); if (fragment == '\\' || fragment == '"' || (fragment <= 0x1F)) { // escaping the character new_pointer[new_length] = '\\'; new_length++; } new_pointer[new_length] = fragment; i += 3; } catch (std::invalid_argument &) { delete[] new_pointer; return false; // the fragment is invalid } } else { new_pointer[new_length] = pointer[i]; } new_length++; } length = new_length; pointer = new_pointer; } // saving the current state size_t depth_s = depth; size_t location_s = location; uint8_t current_type_s = current_type; uint64_t current_val_s = current_val; scopeindex_t *depth_index_s = depth_index; rewind(); // The json pointer is used from the root of the document. bool found = relative_move_to(pointer, length); delete[] new_pointer; if (!found) { // since the pointer has found nothing, we get back to the original // position. depth = depth_s; location = location_s; current_type = current_type_s; current_val = current_val_s; depth_index = depth_index_s; } return found; } bool ParsedJson::Iterator::relative_move_to(const char *pointer, uint32_t length) { if (length == 0) { // returns the whole document return true; } if (pointer[0] != '/') { // '/' must be the first character return false; } // finding the key in an object or the index in an array std::string key_or_index; uint32_t offset = 1; // checking for the "-" case if (is_array() && pointer[1] == '-') { if (length != 2) { // the pointer must be exactly "/-" // there can't be anything more after '-' as an index return false; } key_or_index = '-'; offset = length; // will skip the loop coming right after } // We either transform the first reference token to a valid json key // or we make sure it is a valid index in an array. 
for (; offset < length; offset++) { if (pointer[offset] == '/') { // beginning of the next key or index break; } if (is_array() && (pointer[offset] < '0' || pointer[offset] > '9')) { // the index of an array must be an integer // we also make sure std::stoi won't discard whitespaces later return false; } if (pointer[offset] == '~') { // "~1" represents "/" if (pointer[offset + 1] == '1') { key_or_index += '/'; offset++; continue; } // "~0" represents "~" if (pointer[offset + 1] == '0') { key_or_index += '~'; offset++; continue; } } if (pointer[offset] == '\\') { if (pointer[offset + 1] == '\\' || pointer[offset + 1] == '"' || (pointer[offset + 1] <= 0x1F)) { key_or_index += pointer[offset + 1]; offset++; continue; } return false; // invalid escaped character } if (pointer[offset] == '\"') { // unescaped quote character. this is an invalid case. // lets do nothing and assume most pointers will be valid. // it won't find any corresponding json key anyway. // return false; } key_or_index += pointer[offset]; } bool found = false; if (is_object()) { if (move_to_key(key_or_index.c_str(), key_or_index.length())) { found = relative_move_to(pointer + offset, length - offset); } } else if (is_array()) { if (key_or_index == "-") { // handling "-" case first if (down()) { while (next()) ; // moving to the end of the array // moving to the nonexistent value right after... size_t npos; if ((current_type == '[') || (current_type == '{')) { // we need to jump npos = (current_val & JSON_VALUE_MASK); } else { npos = location + ((current_type == 'd' || current_type == 'l') ? 2 : 1); } location = npos; current_val = pj.tape[npos]; current_type = (current_val >> 56); return true; // how could it fail ? 
} } else { // regular numeric index // The index can't have a leading '0' if (key_or_index[0] == '0' && key_or_index.length() > 1) { return false; } // it cannot be empty if (key_or_index.length() == 0) { return false; } // we already checked the index contains only valid digits uint32_t index = std::stoi(key_or_index); if (move_to_index(index)) { found = relative_move_to(pointer + offset, length - offset); } } } return found; } } // namespace simdjson /* end file src/parsedjsoniterator.cpp */