diff --git a/benchmark/parse.cpp b/benchmark/parse.cpp index f7039646..fcad31d7 100644 --- a/benchmark/parse.cpp +++ b/benchmark/parse.cpp @@ -144,7 +144,11 @@ int main(int argc, char *argv[]) { std::cout << "[verbose] allocated memory for parsed JSON " << std::endl; } unified.start(); - isok = (find_structural_bits(p.data(), p.size(), pj) == simdjson::SUCCESS); +#ifdef __AVX2__ + isok = (find_structural_bits(p.data(), p.size(), pj) == simdjson::SUCCESS); +#elif defined (__ARM_NEON) + isok = (find_structural_bits(p.data(), p.size(), pj) == simdjson::SUCCESS); +#endif unified.end(results); cy1 += results[0]; cl1 += results[1]; @@ -185,7 +189,11 @@ int main(int argc, char *argv[]) { } auto start = std::chrono::steady_clock::now(); - isok = (find_structural_bits(p.data(), p.size(), pj) == simdjson::SUCCESS); +#ifdef __AVX2__ + isok = (find_structural_bits(p.data(), p.size(), pj) == simdjson::SUCCESS); +#elif defined (__ARM_NEON) + isok = (find_structural_bits(p.data(), p.size(), pj) == simdjson::SUCCESS); +#endif isok = isok && (simdjson::SUCCESS == unified_machine(p.data(), p.size(), pj)); auto end = std::chrono::steady_clock::now(); std::chrono::duration secs = end - start; diff --git a/benchmark/statisticalmodel.cpp b/benchmark/statisticalmodel.cpp index 7b429674..c0b331b1 100644 --- a/benchmark/statisticalmodel.cpp +++ b/benchmark/statisticalmodel.cpp @@ -180,7 +180,11 @@ int main(int argc, char *argv[]) { results.resize(evts.size()); for (uint32_t i = 0; i < iterations; i++) { unified.start(); - bool isok = (find_structural_bits(p.data(), p.size(), pj) == simdjson::SUCCESS); +#ifdef __AVX2__ + bool isok = (find_structural_bits(p.data(), p.size(), pj) == simdjson::SUCCESS); +#elif defined (__ARM_NEON) + bool isok = (find_structural_bits(p.data(), p.size(), pj) == simdjson::SUCCESS); +#endif unified.end(results); cy1 += results[0]; diff --git a/include/simdjson/jsonparser.h b/include/simdjson/jsonparser.h index 40e55e86..a5e2fcb4 100644 --- a/include/simdjson/jsonparser.h +++ b/include/simdjson/jsonparser.h @@ -8,6 +8,62 @@ #include "simdjson/stage1_find_marks.h" #include "simdjson/stage2_build_tape.h" #include "simdjson/simdjson.h" +#ifdef _MSC_VER +#include +#include +#else +#include +#endif + +// function pointer type for json_parse +using json_parse_functype = int (const uint8_t *buf, size_t len, ParsedJson &pj, bool reallocifneeded); + +// Pointer that holds the json_parse implementation corresponding to the available SIMD instruction set +extern json_parse_functype *json_parse_ptr; + +template +int json_parse_implementation(const uint8_t *buf, size_t len, ParsedJson &pj, bool reallocifneeded = true) { + if (pj.bytecapacity < len) { + return simdjson::CAPACITY; + } + bool reallocated = false; + if(reallocifneeded) { +#ifdef ALLOW_SAME_PAGE_BUFFER_OVERRUN + // realloc is needed if the end of the memory crosses a page +#ifdef _MSC_VER + SYSTEM_INFO sysInfo; + GetSystemInfo(&sysInfo); + long pagesize = sysInfo.dwPageSize; +#else + long pagesize = sysconf (_SC_PAGESIZE); +#endif + ////////////// + // We want to check that buf + len - 1 and buf + len - 1 + SIMDJSON_PADDING + // are in the same page. + // That is, we want to check that + // (buf + len - 1) / pagesize == (buf + len - 1 + SIMDJSON_PADDING) / pagesize + // That's true if (buf + len - 1) % pagesize + SIMDJSON_PADDING < pagesize. + /////////// + if ( (reinterpret_cast(buf + len - 1) % pagesize ) + SIMDJSON_PADDING < static_cast(pagesize) ) { +#else // SIMDJSON_SAFE_SAME_PAGE_READ_OVERRUN + if(true) { // if not SIMDJSON_SAFE_SAME_PAGE_READ_OVERRUN, we always reallocate +#endif + const uint8_t *tmpbuf = buf; + buf = (uint8_t *) allocate_padded_buffer(len); + if(buf == NULL) return simdjson::MEMALLOC; + memcpy((void*)buf,tmpbuf,len); + reallocated = true; + } + } + int stage1_is_ok = find_structural_bits(buf, len, pj); + if(stage1_is_ok != simdjson::SUCCESS) { + pj.errorcode = stage1_is_ok; + return pj.errorcode; + } + int res = unified_machine(buf, len, pj); + if(reallocated) { aligned_free((void*)buf);} + return res; +} // Parse a document found in buf. // You need to preallocate ParsedJson with a capacity of len (e.g., pj.allocateCapacity(len)). @@ -24,8 +80,11 @@ // The input buf should be readable up to buf + len + SIMDJSON_PADDING if reallocifneeded is false, // all bytes at and after buf + len are ignored (can be garbage). // The ParsedJson object can be reused. + WARN_UNUSED -int json_parse(const uint8_t *buf, size_t len, ParsedJson &pj, bool reallocifneeded = true); +inline int json_parse(const uint8_t *buf, size_t len, ParsedJson &pj, bool reallocifneeded = true) { + return json_parse_ptr(buf, len, pj, reallocifneeded); +} // Parse a document found in buf. // You need to preallocate ParsedJson with a capacity of len (e.g., pj.allocateCapacity(len)). @@ -45,7 +104,7 @@ int json_parse(const uint8_t *buf, size_t len, ParsedJson &pj, bool reallocifnee // The ParsedJson object can be reused. WARN_UNUSED inline int json_parse(const char * buf, size_t len, ParsedJson &pj, bool reallocifneeded = true) { - return json_parse(reinterpret_cast(buf), len, pj, reallocifneeded); + return json_parse_ptr(reinterpret_cast(buf), len, pj, reallocifneeded); } // We do not want to allow implicit conversion from C string to std::string. @@ -140,4 +199,4 @@ inline ParsedJson build_parsed_json(const padded_string &s) { -#endif +#endif \ No newline at end of file diff --git a/include/simdjson/simdjson.h b/include/simdjson/simdjson.h index 09012679..343c727d 100644 --- a/include/simdjson/simdjson.h +++ b/include/simdjson/simdjson.h @@ -4,6 +4,13 @@ #include struct simdjson { + enum class instruction_set { + avx2, + sse4_2, + neon, + none + }; + enum errorValues { SUCCESS = 0, CAPACITY, // This ParsedJson can't support a document that big diff --git a/include/simdjson/stage1_find_marks.h b/include/simdjson/stage1_find_marks.h index 869af0a5..f594a13c 100644 --- a/include/simdjson/stage1_find_marks.h +++ b/include/simdjson/stage1_find_marks.h @@ -1,14 +1,855 @@ #ifndef SIMDJSON_STAGE1_FIND_MARKS_H #define SIMDJSON_STAGE1_FIND_MARKS_H +#include #include "simdjson/common_defs.h" +#include "simdjson/parsedjson.h" +#include "simdjson/portability.h" -struct ParsedJson; +#ifdef __AVX2__ -WARN_UNUSED -int find_structural_bits(const uint8_t *buf, size_t len, ParsedJson &pj); - -WARN_UNUSED -int find_structural_bits(const char *buf, size_t len, ParsedJson &pj); +#ifndef SIMDJSON_SKIPUTF8VALIDATION +#define SIMDJSON_UTF8VALIDATE + +#endif +#else +// currently we don't UTF8 validate for ARM +// also we assume that if you're not __AVX2__ +// you're ARM, which is a bit dumb. TODO: Fix... +#ifdef __ARM_NEON +#include +#else +#warning It appears that neither ARM NEON nor AVX2 are detected. +#endif // __ARM_NEON +#endif // __AVX2__ + +// It seems that many parsers do UTF-8 validation. +// RapidJSON does not do it by default, but a flag +// allows it. +#ifdef SIMDJSON_UTF8VALIDATE +#include "simdjson/simdutf8check.h" +#endif + +#define TRANSPOSE + +template +struct simd_input; +#ifdef __AVX2__ +template<> +struct simd_input +{ + __m256i lo; + __m256i hi; +}; +#endif + +#ifdef __ARM_NEON +template<> struct simd_input +{ +#ifndef TRANSPOSE + uint8x16_t i0; + uint8x16_t i1; + uint8x16_t i2; + uint8x16_t i3; +#else + uint8x16x4_t i; +#endif +}; +#endif + +#ifdef __ARM_NEON +really_inline +uint16_t neonmovemask(uint8x16_t input) { + const uint8x16_t bitmask = { 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80}; + uint8x16_t minput = vandq_u8(input, bitmask); + uint8x16_t tmp = vpaddq_u8(minput, minput); + tmp = vpaddq_u8(tmp, tmp); + tmp = vpaddq_u8(tmp, tmp); + return vgetq_lane_u16(vreinterpretq_u16_u8(tmp), 0); +} + +really_inline +uint64_t neonmovemask_bulk(uint8x16_t p0, uint8x16_t p1, uint8x16_t p2, uint8x16_t p3) { +#ifndef TRANSPOSE + const uint8x16_t bitmask = { 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80}; + uint8x16_t t0 = vandq_u8(p0, bitmask); + uint8x16_t t1 = vandq_u8(p1, bitmask); + uint8x16_t t2 = vandq_u8(p2, bitmask); + uint8x16_t t3 = vandq_u8(p3, bitmask); + uint8x16_t sum0 = vpaddq_u8(t0, t1); + uint8x16_t sum1 = vpaddq_u8(t2, t3); + sum0 = vpaddq_u8(sum0, sum1); + sum0 = vpaddq_u8(sum0, sum0); + return vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0); +#else + const uint8x16_t bitmask1 = { 0x01, 0x10, 0x01, 0x10, 0x01, 0x10, 0x01, 0x10, + 0x01, 0x10, 0x01, 0x10, 0x01, 0x10, 0x01, 0x10}; + const uint8x16_t bitmask2 = { 0x02, 0x20, 0x02, 0x20, 0x02, 0x20, 0x02, 0x20, + 0x02, 0x20, 0x02, 0x20, 0x02, 0x20, 0x02, 0x20}; + const uint8x16_t bitmask3 = { 0x04, 0x40, 0x04, 0x40, 0x04, 0x40, 0x04, 0x40, + 0x04, 0x40, 0x04, 0x40, 0x04, 0x40, 0x04, 0x40}; + const uint8x16_t bitmask4 = { 0x08, 0x80, 0x08, 0x80, 0x08, 0x80, 0x08, 0x80, + 0x08, 0x80, 0x08, 0x80, 0x08, 0x80, 0x08, 0x80}; +#if 0 + uint8x16_t t0 = vandq_u8(p0, bitmask1); + uint8x16_t t1 = vandq_u8(p1, bitmask2); + uint8x16_t t2 = vandq_u8(p2, bitmask3); + uint8x16_t t3 = vandq_u8(p3, bitmask4); + uint8x16_t tmp = vorrq_u8(vorrq_u8(t0, t1), vorrq_u8(t2, t3)); +#else + uint8x16_t t0 = vandq_u8(p0, bitmask1); + uint8x16_t t1 = vbslq_u8(bitmask2, p1, t0); + uint8x16_t t2 = vbslq_u8(bitmask3, p2, t1); + uint8x16_t tmp = vbslq_u8(bitmask4, p3, t2); +#endif + uint8x16_t sum = vpaddq_u8(tmp, tmp); + return vgetq_lane_u64(vreinterpretq_u64_u8(sum), 0); +#endif +} +#endif + +template +uint64_t compute_quote_mask(uint64_t quote_bits); + +// In practice, if you have NEON or __PCLMUL__, you would +// always want to use them, but it might be useful, for research +// purposes, to disable it willingly, that's what SIMDJSON_AVOID_CLMUL +// does. +// Also: we don't know of an instance where AVX2 is supported but +// where clmul is not supported, so check for both, to be sure. +#ifdef SIMDJSON_AVOID_CLMUL +template really_inline +uint64_t compute_quote_mask(uint64_t quote_bits) +{ + uint64_t quote_mask = quote_bits ^ (quote_bits << 1); + quote_mask = quote_mask ^ (quote_mask << 2); + quote_mask = quote_mask ^ (quote_mask << 4); + quote_mask = quote_mask ^ (quote_mask << 8); + quote_mask = quote_mask ^ (quote_mask << 16); + quote_mask = quote_mask ^ (quote_mask << 32); + return quote_mask; +} +#else +template +uint64_t compute_quote_mask(uint64_t quote_bits); + +#ifdef __AVX2__ +template<> really_inline +uint64_t compute_quote_mask(uint64_t quote_bits) { + uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128( + _mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFF), 0)); + return quote_mask; +} +#endif + +#ifdef __ARM_NEON +template<> really_inline +uint64_t compute_quote_mask(uint64_t quote_bits) { +#ifdef __PCLMUL__ // Might cause problems on runtime dispatch + uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128( + _mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFF), 0)); +#else + uint64_t quote_mask = vmull_p64( -1ULL, quote_bits); +#endif + return quote_mask; +} +#endif +#endif + +#ifdef SIMDJSON_UTF8VALIDATE +templatereally_inline +void check_utf8(simd_input in, + __m256i &has_error, + struct avx_processed_utf_bytes &previous) { + __m256i highbit = _mm256_set1_epi8(0x80); + if ((_mm256_testz_si256(_mm256_or_si256(in.lo, in.hi), highbit)) == 1) { + // it is ascii, we just check continuation + has_error = _mm256_or_si256( + _mm256_cmpgt_epi8( + previous.carried_continuations, + _mm256_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1)), + has_error); + } else { + // it is not ascii so we have to do heavy work + previous = avxcheckUTF8Bytes(in.lo, &previous, &has_error); + previous = avxcheckUTF8Bytes(in.hi, &previous, &has_error); + } +} +#endif + +template +simd_input fill_input(const uint8_t * ptr); + +#ifdef __AVX2__ +template<> really_inline +simd_input fill_input(const uint8_t * ptr) { + struct simd_input in; + in.lo = _mm256_loadu_si256(reinterpret_cast(ptr + 0)); + in.hi = _mm256_loadu_si256(reinterpret_cast(ptr + 32)); + return in; +} +#endif + +#ifdef __ARM_NEON +template<> really_inline +simd_input fill_input(const uint8_t * ptr) { + struct simd_input in; +#ifndef TRANSPOSE + in.i0 = vld1q_u8(ptr + 0); + in.i1 = vld1q_u8(ptr + 16); + in.i2 = vld1q_u8(ptr + 32); + in.i3 = vld1q_u8(ptr + 48); +#else + in.i = vld4q_u8(ptr); +#endif + return in; +} +#endif + +// a straightforward comparison of a mask against input. 5 uops; would be +// cheaper in AVX512. +template +uint64_t cmp_mask_against_input(simd_input in, uint8_t m); + +#ifdef __AVX2__ +template<> really_inline +uint64_t cmp_mask_against_input(simd_input in, uint8_t m) { + + const __m256i mask = _mm256_set1_epi8(m); + __m256i cmp_res_0 = _mm256_cmpeq_epi8(in.lo, mask); + uint64_t res_0 = static_cast(_mm256_movemask_epi8(cmp_res_0)); + __m256i cmp_res_1 = _mm256_cmpeq_epi8(in.hi, mask); + uint64_t res_1 = _mm256_movemask_epi8(cmp_res_1); + return res_0 | (res_1 << 32); +} +#endif + +#ifdef __ARM_NEON +template<> really_inline +uint64_t cmp_mask_against_input(simd_input in, uint8_t m) { + const uint8x16_t mask = vmovq_n_u8(m); + uint8x16_t cmp_res_0 = vceqq_u8(in.i.val[0], mask); + uint8x16_t cmp_res_1 = vceqq_u8(in.i.val[1], mask); + uint8x16_t cmp_res_2 = vceqq_u8(in.i.val[2], mask); + uint8x16_t cmp_res_3 = vceqq_u8(in.i.val[3], mask); + return neonmovemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3); +} +#endif + +// find all values less than or equal than the content of maxval (using unsigned arithmetic) +template +uint64_t unsigned_lteq_against_input(simd_input in, uint8_t m); + +#ifdef __AVX2__ +template<> really_inline +uint64_t unsigned_lteq_against_input(simd_input in, uint8_t m) { + const __m256i maxval = _mm256_set1_epi8(m); + __m256i cmp_res_0 = _mm256_cmpeq_epi8(_mm256_max_epu8(maxval,in.lo),maxval); + uint64_t res_0 = static_cast(_mm256_movemask_epi8(cmp_res_0)); + __m256i cmp_res_1 = _mm256_cmpeq_epi8(_mm256_max_epu8(maxval,in.hi),maxval); + uint64_t res_1 = _mm256_movemask_epi8(cmp_res_1); + return res_0 | (res_1 << 32); +} +#endif + +#ifdef __ARM_NEON +template<> really_inline +uint64_t unsigned_lteq_against_input(simd_input in, uint8_t m) { + const uint8x16_t mask = vmovq_n_u8(m); + uint8x16_t cmp_res_0 = vcleq_u8(in.i.val[0], mask); + uint8x16_t cmp_res_1 = vcleq_u8(in.i.val[1], mask); + uint8x16_t cmp_res_2 = vcleq_u8(in.i.val[2], mask); + uint8x16_t cmp_res_3 = vcleq_u8(in.i.val[3], mask); + return neonmovemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3); +} +#endif + +// return a bitvector indicating where we have characters that end an odd-length +// sequence of backslashes (and thus change the behavior of the next character +// to follow). A even-length sequence of backslashes, and, for that matter, the +// largest even-length prefix of our odd-length sequence of backslashes, simply +// modify the behavior of the backslashes themselves. +// We also update the prev_iter_ends_odd_backslash reference parameter to +// indicate whether we end an iteration on an odd-length sequence of +// backslashes, which modifies our subsequent search for odd-length +// sequences of backslashes in an obvious way. +template really_inline +uint64_t find_odd_backslash_sequences(simd_input in, uint64_t &prev_iter_ends_odd_backslash) { + const uint64_t even_bits = 0x5555555555555555ULL; + const uint64_t odd_bits = ~even_bits; + uint64_t bs_bits = cmp_mask_against_input(in, '\\'); + uint64_t start_edges = bs_bits & ~(bs_bits << 1); + // flip lowest if we have an odd-length run at the end of the prior + // iteration + uint64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash; + uint64_t even_starts = start_edges & even_start_mask; + uint64_t odd_starts = start_edges & ~even_start_mask; + uint64_t even_carries = bs_bits + even_starts; + + uint64_t odd_carries; + // must record the carry-out of our odd-carries out of bit 63; this + // indicates whether the sense of any edge going to the next iteration + // should be flipped + bool iter_ends_odd_backslash = + add_overflow(bs_bits, odd_starts, &odd_carries); + + odd_carries |= + prev_iter_ends_odd_backslash; // push in bit zero as a potential end + // if we had an odd-numbered run at the + // end of the previous iteration + prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL; + uint64_t even_carry_ends = even_carries & ~bs_bits; + uint64_t odd_carry_ends = odd_carries & ~bs_bits; + uint64_t even_start_odd_end = even_carry_ends & odd_bits; + uint64_t odd_start_even_end = odd_carry_ends & even_bits; + uint64_t odd_ends = even_start_odd_end | odd_start_even_end; + return odd_ends; +} + +// return both the quote mask (which is a half-open mask that covers the first +// quote +// in an unescaped quote pair and everything in the quote pair) and the quote +// bits, which are the simple +// unescaped quoted bits. We also update the prev_iter_inside_quote value to +// tell the next iteration +// whether we finished the final iteration inside a quote pair; if so, this +// inverts our behavior of +// whether we're inside quotes for the next iteration. +// Note that we don't do any error checking to see if we have backslash +// sequences outside quotes; these +// backslash sequences (of any length) will be detected elsewhere. +template really_inline +uint64_t find_quote_mask_and_bits(simd_input in, uint64_t odd_ends, + uint64_t &prev_iter_inside_quote, uint64_t "e_bits, uint64_t &error_mask) { + quote_bits = cmp_mask_against_input(in, '"'); + quote_bits = quote_bits & ~odd_ends; + uint64_t quote_mask = compute_quote_mask(quote_bits); + quote_mask ^= prev_iter_inside_quote; + // All Unicode characters may be placed within the + // quotation marks, except for the characters that MUST be escaped: + // quotation mark, reverse solidus, and the control characters (U+0000 + //through U+001F). + // https://tools.ietf.org/html/rfc8259 + uint64_t unescaped = unsigned_lteq_against_input(in, 0x1F); + error_mask |= quote_mask & unescaped; + // right shift of a signed value expected to be well-defined and standard + // compliant as of C++20, + // John Regher from Utah U. says this is fine code + prev_iter_inside_quote = + static_cast(static_cast(quote_mask) >> 63); + return quote_mask; +} + +// do a 'shufti' to detect structural JSON characters +// they are { 0x7b } 0x7d : 0x3a [ 0x5b ] 0x5d , 0x2c +// these go into the first 3 buckets of the comparison (1/2/4) + +// we are also interested in the four whitespace characters +// space 0x20, linefeed 0x0a, horizontal tab 0x09 and carriage return 0x0d +// these go into the next 2 buckets of the comparison (8/16) +template +void find_whitespace_and_structurals(simd_input in, + uint64_t &whitespace, + uint64_t &structurals); + +#ifdef __AVX2__ +template<> really_inline +void find_whitespace_and_structurals(simd_input in, + uint64_t &whitespace, + uint64_t &structurals) { +#ifdef SIMDJSON_NAIVE_STRUCTURAL + // You should never need this naive approach, but it can be useful + // for research purposes + const __m256i mask_open_brace = _mm256_set1_epi8(0x7b); + __m256i struct_lo = _mm256_cmpeq_epi8(in.lo, mask_open_brace); + __m256i struct_hi = _mm256_cmpeq_epi8(in.hi, mask_open_brace); + const __m256i mask_close_brace = _mm256_set1_epi8(0x7d); + struct_lo = _mm256_or_si256(struct_lo,_mm256_cmpeq_epi8(in.lo, mask_close_brace)); + struct_hi = _mm256_or_si256(struct_hi,_mm256_cmpeq_epi8(in.hi, mask_close_brace)); + const __m256i mask_open_bracket = _mm256_set1_epi8(0x5b); + struct_lo = _mm256_or_si256(struct_lo,_mm256_cmpeq_epi8(in.lo, mask_open_bracket)); + struct_hi = _mm256_or_si256(struct_hi,_mm256_cmpeq_epi8(in.hi, mask_open_bracket)); + const __m256i mask_close_bracket = _mm256_set1_epi8(0x5d); + struct_lo = _mm256_or_si256(struct_lo,_mm256_cmpeq_epi8(in.lo, mask_close_bracket)); + struct_hi = _mm256_or_si256(struct_hi,_mm256_cmpeq_epi8(in.hi, mask_close_bracket)); + const __m256i mask_column = _mm256_set1_epi8(0x3a); + struct_lo = _mm256_or_si256(struct_lo,_mm256_cmpeq_epi8(in.lo, mask_column)); + struct_hi = _mm256_or_si256(struct_hi,_mm256_cmpeq_epi8(in.hi, mask_column)); + const __m256i mask_comma = _mm256_set1_epi8(0x2c); + struct_lo = _mm256_or_si256(struct_lo,_mm256_cmpeq_epi8(in.lo, mask_comma)); + struct_hi = _mm256_or_si256(struct_hi,_mm256_cmpeq_epi8(in.hi, mask_comma)); + uint64_t structural_res_0 = static_cast(_mm256_movemask_epi8(struct_lo)); + uint64_t structural_res_1 = _mm256_movemask_epi8(struct_hi); + structurals = (structural_res_0 | (structural_res_1 << 32)); + + const __m256i mask_space = _mm256_set1_epi8(0x20); + __m256i space_lo = _mm256_cmpeq_epi8(in.lo, mask_space); + __m256i space_hi = _mm256_cmpeq_epi8(in.hi, mask_space); + const __m256i mask_linefeed = _mm256_set1_epi8(0x0a); + space_lo = _mm256_or_si256(space_lo,_mm256_cmpeq_epi8(in.lo, mask_linefeed)); + space_hi = _mm256_or_si256(space_hi,_mm256_cmpeq_epi8(in.hi, mask_linefeed)); + const __m256i mask_tab = _mm256_set1_epi8(0x09); + space_lo = _mm256_or_si256(space_lo,_mm256_cmpeq_epi8(in.lo, mask_tab)); + space_hi = _mm256_or_si256(space_hi,_mm256_cmpeq_epi8(in.hi, mask_tab)); + const __m256i mask_carriage = _mm256_set1_epi8(0x0d); + space_lo = _mm256_or_si256(space_lo,_mm256_cmpeq_epi8(in.lo, mask_carriage)); + space_hi = _mm256_or_si256(space_hi,_mm256_cmpeq_epi8(in.hi, mask_carriage)); + + uint64_t ws_res_0 = static_cast(_mm256_movemask_epi8(space_lo)); + uint64_t ws_res_1 = _mm256_movemask_epi8(space_hi); + whitespace = (ws_res_0 | (ws_res_1 << 32)); + // end of naive approach + +#else // SIMDJSON_NAIVE_STRUCTURAL + const __m256i low_nibble_mask = _mm256_setr_epi8( + 16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0, + 16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0); + const __m256i high_nibble_mask = _mm256_setr_epi8( + 8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0, + 8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0); + + __m256i structural_shufti_mask = _mm256_set1_epi8(0x7); + __m256i whitespace_shufti_mask = _mm256_set1_epi8(0x18); + + __m256i v_lo = _mm256_and_si256( + _mm256_shuffle_epi8(low_nibble_mask, in.lo), + _mm256_shuffle_epi8(high_nibble_mask, + _mm256_and_si256(_mm256_srli_epi32(in.lo, 4), + _mm256_set1_epi8(0x7f)))); + + __m256i v_hi = _mm256_and_si256( + _mm256_shuffle_epi8(low_nibble_mask, in.hi), + _mm256_shuffle_epi8(high_nibble_mask, + _mm256_and_si256(_mm256_srli_epi32(in.hi, 4), + _mm256_set1_epi8(0x7f)))); + __m256i tmp_lo = _mm256_cmpeq_epi8( + _mm256_and_si256(v_lo, structural_shufti_mask), _mm256_set1_epi8(0)); + __m256i tmp_hi = _mm256_cmpeq_epi8( + _mm256_and_si256(v_hi, structural_shufti_mask), _mm256_set1_epi8(0)); + + uint64_t structural_res_0 = + static_cast(_mm256_movemask_epi8(tmp_lo)); + uint64_t structural_res_1 = _mm256_movemask_epi8(tmp_hi); + structurals = ~(structural_res_0 | (structural_res_1 << 32)); + + __m256i tmp_ws_lo = _mm256_cmpeq_epi8( + _mm256_and_si256(v_lo, whitespace_shufti_mask), _mm256_set1_epi8(0)); + __m256i tmp_ws_hi = _mm256_cmpeq_epi8( + _mm256_and_si256(v_hi, whitespace_shufti_mask), _mm256_set1_epi8(0)); + + uint64_t ws_res_0 = static_cast(_mm256_movemask_epi8(tmp_ws_lo)); + uint64_t ws_res_1 = _mm256_movemask_epi8(tmp_ws_hi); + whitespace = ~(ws_res_0 | (ws_res_1 << 32)); +#endif // SIMDJSON_NAIVE_STRUCTURAL +} +#endif + +#ifdef __ARM_NEON +template<> really_inline +void find_whitespace_and_structurals(simd_input in, + uint64_t &whitespace, + uint64_t &structurals) { +#ifndef FUNKY_BAD_TABLE + const uint8x16_t low_nibble_mask = (uint8x16_t){ + 16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0}; + const uint8x16_t high_nibble_mask = (uint8x16_t){ + 8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0}; + const uint8x16_t structural_shufti_mask = vmovq_n_u8(0x7); + const uint8x16_t whitespace_shufti_mask = vmovq_n_u8(0x18); + const uint8x16_t low_nib_and_mask = vmovq_n_u8(0xf); + + uint8x16_t nib_0_lo = vandq_u8(in.i.val[0], low_nib_and_mask); + uint8x16_t nib_0_hi = vshrq_n_u8(in.i.val[0], 4); + uint8x16_t shuf_0_lo = vqtbl1q_u8(low_nibble_mask, nib_0_lo); + uint8x16_t shuf_0_hi = vqtbl1q_u8(high_nibble_mask, nib_0_hi); + uint8x16_t v_0 = vandq_u8(shuf_0_lo, shuf_0_hi); + + uint8x16_t nib_1_lo = vandq_u8(in.i.val[1], low_nib_and_mask); + uint8x16_t nib_1_hi = vshrq_n_u8(in.i.val[1], 4); + uint8x16_t shuf_1_lo = vqtbl1q_u8(low_nibble_mask, nib_1_lo); + uint8x16_t shuf_1_hi = vqtbl1q_u8(high_nibble_mask, nib_1_hi); + uint8x16_t v_1 = vandq_u8(shuf_1_lo, shuf_1_hi); + + uint8x16_t nib_2_lo = vandq_u8(in.i.val[2], low_nib_and_mask); + uint8x16_t nib_2_hi = vshrq_n_u8(in.i.val[2], 4); + uint8x16_t shuf_2_lo = vqtbl1q_u8(low_nibble_mask, nib_2_lo); + uint8x16_t shuf_2_hi = vqtbl1q_u8(high_nibble_mask, nib_2_hi); + uint8x16_t v_2 = vandq_u8(shuf_2_lo, shuf_2_hi); + + uint8x16_t nib_3_lo = vandq_u8(in.i.val[3], low_nib_and_mask); + uint8x16_t nib_3_hi = vshrq_n_u8(in.i.val[3], 4); + uint8x16_t shuf_3_lo = vqtbl1q_u8(low_nibble_mask, nib_3_lo); + uint8x16_t shuf_3_hi = vqtbl1q_u8(high_nibble_mask, nib_3_hi); + uint8x16_t v_3 = vandq_u8(shuf_3_lo, shuf_3_hi); + + uint8x16_t tmp_0 = vtstq_u8(v_0, structural_shufti_mask); + uint8x16_t tmp_1 = vtstq_u8(v_1, structural_shufti_mask); + uint8x16_t tmp_2 = vtstq_u8(v_2, structural_shufti_mask); + uint8x16_t tmp_3 = vtstq_u8(v_3, structural_shufti_mask); + structurals = neonmovemask_bulk(tmp_0, tmp_1, tmp_2, tmp_3); + + uint8x16_t tmp_ws_0 = vtstq_u8(v_0, whitespace_shufti_mask); + uint8x16_t tmp_ws_1 = vtstq_u8(v_1, whitespace_shufti_mask); + uint8x16_t tmp_ws_2 = vtstq_u8(v_2, whitespace_shufti_mask); + uint8x16_t tmp_ws_3 = vtstq_u8(v_3, whitespace_shufti_mask); + whitespace = neonmovemask_bulk(tmp_ws_0, tmp_ws_1, tmp_ws_2, tmp_ws_3); +#else + // I think this one is garbage. In order to save the expense + // of another shuffle, I use an equally expensive shift, and + // this gets glued to the end of the dependency chain. Seems a bit + // slower for no good reason. + // + // need to use a weird arrangement. Bytes in this bitvector + // are in conventional order, but bits are reversed as we are + // using a signed left shift (that is a +ve value from 0..7) to + // shift upwards to 0x80 in the bit. So we need to reverse bits. + + // note no structural/whitespace has the high bit on + // so it's OK to put the high 5 bits into our TBL shuffle + // + + // structurals are { 0x7b } 0x7d : 0x3a [ 0x5b ] 0x5d , 0x2c + // or in 5 bit, 3 bit form thats + // (15,3) (15, 5) (7,2) (11,3) (11,5) (5,4) + // bit-reversing (subtract low 3 bits from 7) yields: + // (15,4) (15, 2) (7,5) (11,4) (11,2) (5,3) + + const uint8x16_t structural_bitvec = (uint8x16_t){ + 0, 0, 0, 0, + 0, 8, 0, 32, + 0, 0, 0, 20, + 0, 0, 0, 20}; + // we are also interested in the four whitespace characters + // space 0x20, linefeed 0x0a, horizontal tab 0x09 and carriage return 0x0d + // (4,0) (1, 2) (1, 1) (1, 5) + // bit-reversing (subtract low 3 bits from 7) yields: + // (4,7) (1, 5) (1, 6) (1, 2) + + const uint8x16_t whitespace_bitvec = (uint8x16_t){ + 0, 100, 0, 0, + 128, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0}; + const uint8x16_t low_3bits_and_mask = vmovq_n_u8(0x7); + const uint8x16_t high_1bit_tst_mask = vmovq_n_u8(0x80); + + int8x16_t low_3bits_0 = vreinterpretq_s8_u8(vandq_u8(in.i.val[0], low_3bits_and_mask)); + uint8x16_t high_5bits_0 = vshrq_n_u8(in.i.val[0], 3); + uint8x16_t shuffle_structural_0 = vshlq_u8(vqtbl1q_u8(structural_bitvec, high_5bits_0), low_3bits_0); + uint8x16_t shuffle_ws_0 = vshlq_u8(vqtbl1q_u8(whitespace_bitvec, high_5bits_0), low_3bits_0); + uint8x16_t tmp_0 = vtstq_u8(shuffle_structural_0, high_1bit_tst_mask); + uint8x16_t tmp_ws_0 = vtstq_u8(shuffle_ws_0, high_1bit_tst_mask); + + int8x16_t low_3bits_1 = vreinterpretq_s8_u8(vandq_u8(in.i.val[1], low_3bits_and_mask)); + uint8x16_t high_5bits_1 = vshrq_n_u8(in.i.val[1], 3); + uint8x16_t shuffle_structural_1 = vshlq_u8(vqtbl1q_u8(structural_bitvec, high_5bits_1), low_3bits_1); + uint8x16_t shuffle_ws_1 = vshlq_u8(vqtbl1q_u8(whitespace_bitvec, high_5bits_1), low_3bits_1); + uint8x16_t tmp_1 = vtstq_u8(shuffle_structural_1, high_1bit_tst_mask); + uint8x16_t tmp_ws_1 = vtstq_u8(shuffle_ws_1, high_1bit_tst_mask); + + int8x16_t low_3bits_2 = vreinterpretq_s8_u8(vandq_u8(in.i.val[2], low_3bits_and_mask)); + uint8x16_t high_5bits_2 = vshrq_n_u8(in.i.val[2], 3); + uint8x16_t shuffle_structural_2 = vshlq_u8(vqtbl1q_u8(structural_bitvec, high_5bits_2), low_3bits_2); + uint8x16_t shuffle_ws_2 = vshlq_u8(vqtbl1q_u8(whitespace_bitvec, high_5bits_2), low_3bits_2); + uint8x16_t tmp_2 = vtstq_u8(shuffle_structural_2, high_1bit_tst_mask); + uint8x16_t tmp_ws_2 = vtstq_u8(shuffle_ws_2, high_1bit_tst_mask); + + int8x16_t low_3bits_3 = vreinterpretq_s8_u8(vandq_u8(in.i.val[3], low_3bits_and_mask)); + uint8x16_t high_5bits_3 = vshrq_n_u8(in.i.val[3], 3); + uint8x16_t shuffle_structural_3 = vshlq_u8(vqtbl1q_u8(structural_bitvec, high_5bits_3), low_3bits_3); + uint8x16_t shuffle_ws_3 = vshlq_u8(vqtbl1q_u8(whitespace_bitvec, high_5bits_3), low_3bits_3); + uint8x16_t tmp_3 = vtstq_u8(shuffle_structural_3, high_1bit_tst_mask); + uint8x16_t tmp_ws_3 = vtstq_u8(shuffle_ws_3, high_1bit_tst_mask); + + structurals = neonmovemask_bulk(tmp_0, tmp_1, tmp_2, tmp_3); + whitespace = neonmovemask_bulk(tmp_ws_0, tmp_ws_1, tmp_ws_2, tmp_ws_3); +#endif +} +#endif + + +#ifdef SIMDJSON_NAIVE_FLATTEN // useful for benchmarking +// +// This is just a naive implementation. It should be normally +// disable, but can be used for research purposes to compare +// again our optimized version. +really_inline void flatten_bits(uint32_t *base_ptr, uint32_t &base, + uint32_t idx, uint64_t bits) { + uint32_t * out_ptr = base_ptr + base; + idx -= 64; + while(bits != 0) { + out_ptr[0] = idx + trailingzeroes(bits); + bits = bits & (bits - 1); + out_ptr++; + } + base = (out_ptr - base_ptr); +} + +#else +// flatten out values in 'bits' assuming that they are are to have values of idx +// plus their position in the bitvector, and store these indexes at +// base_ptr[base] incrementing base as we go +// will potentially store extra values beyond end of valid bits, so base_ptr +// needs to be large enough to handle this +really_inline void flatten_bits(uint32_t *base_ptr, uint32_t &base, + uint32_t idx, uint64_t bits) { + // In some instances, the next branch is expensive because it is mispredicted. + // Unfortunately, in other cases, + // it helps tremendously. + if(bits == 0) return; + uint32_t cnt = hamming(bits); + uint32_t next_base = base + cnt; + idx -= 64; + base_ptr += base; + { + base_ptr[0] = idx + trailingzeroes(bits); + bits = bits & (bits - 1); + base_ptr[1] = idx + trailingzeroes(bits); + bits = bits & (bits - 1); + base_ptr[2] = idx + trailingzeroes(bits); + bits = bits & (bits - 1); + base_ptr[3] = idx + trailingzeroes(bits); + bits = bits & (bits - 1); + base_ptr[4] = idx + trailingzeroes(bits); + bits = bits & (bits - 1); + base_ptr[5] = idx + trailingzeroes(bits); + bits = bits & (bits - 1); + base_ptr[6] = idx + trailingzeroes(bits); + bits = bits & (bits - 1); + base_ptr[7] = idx + trailingzeroes(bits); + bits = bits & (bits - 1); + base_ptr += 8; + } + // We hope that the next branch is easily predicted. + if (cnt > 8) { + base_ptr[0] = idx + trailingzeroes(bits); + bits = bits & (bits - 1); + base_ptr[1] = idx + trailingzeroes(bits); + bits = bits & (bits - 1); + base_ptr[2] = idx + trailingzeroes(bits); + bits = bits & (bits - 1); + base_ptr[3] = idx + trailingzeroes(bits); + bits = bits & (bits - 1); + base_ptr[4] = idx + trailingzeroes(bits); + bits = bits & (bits - 1); + base_ptr[5] = idx + trailingzeroes(bits); + bits = bits & (bits - 1); + base_ptr[6] = idx + trailingzeroes(bits); + bits = bits & (bits - 1); + base_ptr[7] = idx + trailingzeroes(bits); + bits = bits & (bits - 1); + base_ptr += 8; + } + if (cnt > 16) { // unluckly: we rarely get here + // since it means having one structural or pseudo-structral element + // every 4 characters (possible with inputs like "","","",...). + do { + base_ptr[0] = idx + trailingzeroes(bits); + bits = bits & (bits - 1); + base_ptr++; + } while(bits != 0); + } + base = next_base; +} +#endif + +// return a updated structural bit vector with quoted contents cleared out and +// pseudo-structural characters added to the mask +// updates prev_iter_ends_pseudo_pred which tells us whether the previous +// iteration ended on a whitespace or a structural character (which means that +// the next iteration +// will have a pseudo-structural character at its start) +really_inline uint64_t finalize_structurals( + uint64_t structurals, uint64_t whitespace, uint64_t quote_mask, + uint64_t quote_bits, uint64_t &prev_iter_ends_pseudo_pred) { + // mask off anything inside quotes + structurals &= ~quote_mask; + // add the real quote bits back into our bitmask as well, so we can + // quickly traverse the strings we've spent all this trouble gathering + structurals |= quote_bits; + // Now, establish "pseudo-structural characters". These are non-whitespace + // characters that are (a) outside quotes and (b) have a predecessor that's + // either whitespace or a structural character. This means that subsequent + // passes will get a chance to encounter the first character of every string + // of non-whitespace and, if we're parsing an atom like true/false/null or a + // number we can stop at the first whitespace or structural character + // following it. + + // a qualified predecessor is something that can happen 1 position before an + // pseudo-structural character + uint64_t pseudo_pred = structurals | whitespace; + + uint64_t shifted_pseudo_pred = + (pseudo_pred << 1) | prev_iter_ends_pseudo_pred; + prev_iter_ends_pseudo_pred = pseudo_pred >> 63; + uint64_t pseudo_structurals = + shifted_pseudo_pred & (~whitespace) & (~quote_mask); + structurals |= pseudo_structurals; + + // now, we've used our close quotes all we need to. So let's switch them off + // they will be off in the quote mask and on in quote bits. + structurals &= ~(quote_bits & ~quote_mask); + return structurals; +} + +template +WARN_UNUSED +/*never_inline*/ int find_structural_bits(const uint8_t *buf, size_t len, + ParsedJson &pj) { + if (len > pj.bytecapacity) { + std::cerr << "Your ParsedJson object only supports documents up to " + << pj.bytecapacity << " bytes but you are trying to process " << len + << " bytes" << std::endl; + return simdjson::CAPACITY; + } + uint32_t *base_ptr = pj.structural_indexes; + uint32_t base = 0; +#ifdef SIMDJSON_UTF8VALIDATE + __m256i has_error = _mm256_setzero_si256(); + struct avx_processed_utf_bytes previous {}; + previous.rawbytes = _mm256_setzero_si256(); + previous.high_nibbles = _mm256_setzero_si256(); + previous.carried_continuations = _mm256_setzero_si256(); +#endif + + // we have padded the input out to 64 byte multiple with the remainder being + // zeros + + // persistent state across loop + // does the last iteration end with an odd-length sequence of backslashes? + // either 0 or 1, but a 64-bit value + uint64_t prev_iter_ends_odd_backslash = 0ULL; + // does the previous iteration end inside a double-quote pair? + uint64_t prev_iter_inside_quote = 0ULL; // either all zeros or all ones + // does the previous iteration end on something that is a predecessor of a + // pseudo-structural character - i.e. whitespace or a structural character + // effectively the very first char is considered to follow "whitespace" for + // the + // purposes of pseudo-structural character detection so we initialize to 1 + uint64_t prev_iter_ends_pseudo_pred = 1ULL; + + // structurals are persistent state across loop as we flatten them on the + // subsequent iteration into our array pointed to be base_ptr. + // This is harmless on the first iteration as structurals==0 + // and is done for performance reasons; we can hide some of the latency of the + // expensive carryless multiply in the previous step with this work + uint64_t structurals = 0; + + size_t lenminus64 = len < 64 ? 0 : len - 64; + size_t idx = 0; + uint64_t error_mask = 0; // for unescaped characters within strings (ASCII code points < 0x20) + + for (; idx < lenminus64; idx += 64) { +#ifndef _MSC_VER + __builtin_prefetch(buf + idx + 128); +#endif + simd_input in = fill_input(buf+idx); +#ifdef SIMDJSON_UTF8VALIDATE + check_utf8(in, has_error, previous); +#endif + // detect odd sequences of backslashes + uint64_t odd_ends = find_odd_backslash_sequences( + in, prev_iter_ends_odd_backslash); + + // detect insides of quote pairs ("quote_mask") and also our quote_bits + // themselves + uint64_t quote_bits; + uint64_t quote_mask = find_quote_mask_and_bits( + in, odd_ends, prev_iter_inside_quote, quote_bits, error_mask); + + // take the previous iterations structural bits, not our current iteration, + // and flatten + flatten_bits(base_ptr, base, idx, structurals); + + uint64_t whitespace; + find_whitespace_and_structurals(in, whitespace, structurals); + + // fixup structurals to reflect quotes and add pseudo-structural characters + structurals = finalize_structurals(structurals, whitespace, quote_mask, + quote_bits, prev_iter_ends_pseudo_pred); + } + + //////////////// + /// we use a giant copy-paste which is ugly. + /// but otherwise the string needs to be properly padded or else we + /// risk invalidating the UTF-8 checks. + //////////// + if (idx < len) { + uint8_t tmpbuf[64]; + memset(tmpbuf, 0x20, 64); + memcpy(tmpbuf, buf + idx, len - idx); + simd_input in = fill_input(tmpbuf); +#ifdef SIMDJSON_UTF8VALIDATE + check_utf8(in, has_error, previous); +#endif + + // detect odd sequences of backslashes + uint64_t odd_ends = find_odd_backslash_sequences( + in, prev_iter_ends_odd_backslash); + + // detect insides of quote pairs ("quote_mask") and also our quote_bits + // themselves + uint64_t quote_bits; + uint64_t quote_mask = find_quote_mask_and_bits( + in, odd_ends, prev_iter_inside_quote, quote_bits, error_mask); + + // take the previous iterations structural bits, not our current iteration, + // and flatten + flatten_bits(base_ptr, base, idx, structurals); + + uint64_t whitespace; + find_whitespace_and_structurals(in, whitespace, structurals); + + // fixup structurals to reflect quotes and add pseudo-structural characters + structurals = finalize_structurals(structurals, whitespace, quote_mask, + quote_bits, prev_iter_ends_pseudo_pred); + idx += 64; + } + + // is last string quote closed? + if (prev_iter_inside_quote) { + return simdjson::UNCLOSED_STRING; + } + + // finally, flatten out the remaining structurals from the last iteration + flatten_bits(base_ptr, base, idx, structurals); + + pj.n_structural_indexes = base; + // a valid JSON file cannot have zero structural indexes - we should have + // found something + if (pj.n_structural_indexes == 0u) { + fprintf(stderr, "Empty document?\n"); + return simdjson::EMPTY; + } + if (base_ptr[pj.n_structural_indexes - 1] > len) { + fprintf(stderr, "Internal bug\n"); + return simdjson::UNEXPECTED_ERROR; + } + if (len != base_ptr[pj.n_structural_indexes - 1]) { + // the string might not be NULL terminated, but we add a virtual NULL ending + // character. + base_ptr[pj.n_structural_indexes++] = len; + } + // make it safe to dereference one beyond this array + base_ptr[pj.n_structural_indexes] = 0; + if (error_mask) { + fprintf(stderr, "Unescaped characters\n"); + return simdjson::UNESCAPED_CHARS; + } +#ifdef SIMDJSON_UTF8VALIDATE + return _mm256_testz_si256(has_error, has_error) == 0 ? simdjson::UTF8_ERROR : simdjson::SUCCESS; +#else + return simdjson::SUCCESS; +#endif +} + +template +WARN_UNUSED +int find_structural_bits(const char *buf, size_t len, ParsedJson &pj) { + return find_structural_bits(reinterpret_cast(buf), len, pj); +} #endif diff --git a/src/jsonparser.cpp b/src/jsonparser.cpp index a7d6306d..14238c36 100644 --- a/src/jsonparser.cpp +++ b/src/jsonparser.cpp @@ -7,51 +7,57 @@ #endif #include "simdjson/simdjson.h" -// parse a document found in buf, need to preallocate ParsedJson. -WARN_UNUSED -int json_parse(const uint8_t *buf, size_t len, ParsedJson &pj, bool reallocifneeded) { - if (pj.bytecapacity < len) { - return simdjson::CAPACITY; - } - bool reallocated = false; - if(reallocifneeded) { -#ifdef ALLOW_SAME_PAGE_BUFFER_OVERRUN - // realloc is needed if the end of the memory crosses a page -#ifdef _MSC_VER - SYSTEM_INFO sysInfo; - GetSystemInfo(&sysInfo); - long pagesize = sysInfo.dwPageSize; + +// Responsible to select the best json_parse implementation +int json_parse_dispatch(const uint8_t *buf, size_t len, ParsedJson &pj, bool reallocifneeded) { + // Versions for each implementation +#ifdef __AVX2__ + json_parse_functype* avx_implementation = &json_parse_implementation; +#endif +#ifdef __SSE4_2__ + // json_parse_functype* sse4_2_implementation = &json_parse_implementation; // not implemented yet +#endif +#ifdef __ARM_NEON + json_parse_functype* neon_implementation = &json_parse_implementation; +#endif + + // Determining which implementation is the more suitable + // Should be done at runtime. Does not make any sense on preprocessor. +#ifdef __AVX2__ + simdjson::instruction_set best_implementation = simdjson::instruction_set::avx2; +#elif defined (__SSE4_2__) + simdjson::instruction_set best_implementation = simdjson::instruction_set::sse4_2; +#elif defined (__ARM_NEON) + simdjson::instruction_set best_implementation = simdjson::instruction_set::neon; #else - long pagesize = sysconf (_SC_PAGESIZE); + simdjson::instruction_set best_implementation = simdjson::instruction_set::none; #endif - ////////////// - // We want to check that buf + len - 1 and buf + len - 1 + SIMDJSON_PADDING - // are in the same page. - // That is, we want to check that - // (buf + len - 1) / pagesize == (buf + len - 1 + SIMDJSON_PADDING) / pagesize - // That's true if (buf + len - 1) % pagesize + SIMDJSON_PADDING < pagesize. - /////////// - if ( (reinterpret_cast(buf + len - 1) % pagesize ) + SIMDJSON_PADDING < static_cast(pagesize) ) { -#else // SIMDJSON_SAFE_SAME_PAGE_READ_OVERRUN - if(true) { // if not SIMDJSON_SAFE_SAME_PAGE_READ_OVERRUN, we always reallocate + + // Selecting the best implementation + switch (best_implementation) { +#ifdef __AVX2__ + case simdjson::instruction_set::avx2 : + json_parse_ptr = avx_implementation; + break; +#elif defined (__SSE4_2__) + /*case simdjson::instruction_set::sse4_2 : + json_parse_ptr = sse4_2_implementation; + break;*/ +#elif defined (__ARM_NEON) + case simdjson::instruction_set::neon : + json_parse_ptr = neon_implementation; + break; #endif - const uint8_t *tmpbuf = buf; - buf = (uint8_t *) allocate_padded_buffer(len); - if(buf == NULL) return simdjson::MEMALLOC; - memcpy((void*)buf,tmpbuf,len); - reallocated = true; - } + default : + std::cerr << "No implemented simd instruction set supported" << std::endl; + return simdjson::UNEXPECTED_ERROR; } - int stage1_is_ok = find_structural_bits(buf, len, pj); - if(stage1_is_ok != simdjson::SUCCESS) { - pj.errorcode = stage1_is_ok; - return pj.errorcode; - } - int res = unified_machine(buf, len, pj); - if(reallocated) { aligned_free((void*)buf);} - return res; + + return json_parse_ptr(buf, len, pj, reallocifneeded); } +json_parse_functype *json_parse_ptr = &json_parse_dispatch; + WARN_UNUSED ParsedJson build_parsed_json(const uint8_t *buf, size_t len, bool reallocifneeded) { ParsedJson pj; diff --git a/src/stage1_find_marks.cpp b/src/stage1_find_marks.cpp index 3b51a22a..d859edd6 100644 --- a/src/stage1_find_marks.cpp +++ b/src/stage1_find_marks.cpp @@ -1,780 +1 @@ -#include -#include "simdjson/common_defs.h" -#include "simdjson/parsedjson.h" -#include "simdjson/portability.h" - - -#ifdef __AVX2__ - -#ifndef SIMDJSON_SKIPUTF8VALIDATION -#define SIMDJSON_UTF8VALIDATE - -#endif -#else -// currently we don't UTF8 validate for ARM -// also we assume that if you're not __AVX2__ -// you're ARM, which is a bit dumb. TODO: Fix... -#ifdef __ARM_NEON -#include -#else -#warning It appears that neither ARM NEON nor AVX2 are detected. -#endif // __ARM_NEON -#endif // __AVX2__ - -// It seems that many parsers do UTF-8 validation. -// RapidJSON does not do it by default, but a flag -// allows it. -#ifdef SIMDJSON_UTF8VALIDATE -#include "simdjson/simdutf8check.h" -#endif - -#define TRANSPOSE - -struct simd_input { -#ifdef __AVX2__ - __m256i lo; - __m256i hi; -#elif defined(__ARM_NEON) -#ifndef TRANSPOSE - uint8x16_t i0; - uint8x16_t i1; - uint8x16_t i2; - uint8x16_t i3; -#else - uint8x16x4_t i; -#endif -#else -#warning It appears that neither ARM NEON nor AVX2 are detected. -#endif -}; - -really_inline uint64_t compute_quote_mask(uint64_t quote_bits) { - // In practice, if you have NEON or __PCLMUL__, you would - // always want to use them, but it might be useful, for research - // purposes, to disable it willingly, that's what SIMDJSON_AVOID_CLMUL - // does. - // Also: we don't know of an instance where AVX2 is supported but - // where clmul is not supported, so check for both, to be sure. -#if (defined(__PCLMUL__) || defined(__AVX2__)) && !defined(SIMDJSON_AVOID_CLMUL) - uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128( - _mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFF), 0)); -#elif defined(__ARM_NEON) && !defined(SIMDJSON_AVOID_CLMUL) - uint64_t quote_mask = vmull_p64( -1ULL, quote_bits); -#else - // this code should always be used if SIMDJSON_AVOID_CLMUL is defined. - uint64_t quote_mask = quote_bits ^ (quote_bits << 1); - quote_mask = quote_mask ^ (quote_mask << 2); - quote_mask = quote_mask ^ (quote_mask << 4); - quote_mask = quote_mask ^ (quote_mask << 8); - quote_mask = quote_mask ^ (quote_mask << 16); - quote_mask = quote_mask ^ (quote_mask << 32); -#endif - return quote_mask; -} - -really_inline simd_input fill_input(const uint8_t * ptr) { - struct simd_input in; -#ifdef __AVX2__ - in.lo = _mm256_loadu_si256(reinterpret_cast(ptr + 0)); - in.hi = _mm256_loadu_si256(reinterpret_cast(ptr + 32)); -#elif defined(__ARM_NEON) -#ifndef TRANSPOSE - in.i0 = vld1q_u8(ptr + 0); - in.i1 = vld1q_u8(ptr + 16); - in.i2 = vld1q_u8(ptr + 32); - in.i3 = vld1q_u8(ptr + 48); -#else - in.i = vld4q_u8(ptr); -#endif -#else -#warning It appears that neither ARM NEON nor AVX2 are detected. -#endif - return in; -} - -#ifdef SIMDJSON_UTF8VALIDATE -really_inline void check_utf8(simd_input in, - __m256i &has_error, - struct avx_processed_utf_bytes &previous) { - __m256i highbit = _mm256_set1_epi8(0x80); - if ((_mm256_testz_si256(_mm256_or_si256(in.lo, in.hi), highbit)) == 1) { - // it is ascii, we just check continuation - has_error = _mm256_or_si256( - _mm256_cmpgt_epi8( - previous.carried_continuations, - _mm256_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1)), - has_error); - } else { - // it is not ascii so we have to do heavy work - previous = avxcheckUTF8Bytes(in.lo, &previous, &has_error); - previous = avxcheckUTF8Bytes(in.hi, &previous, &has_error); - } -} -#endif - -#ifdef __ARM_NEON -uint16_t neonmovemask(uint8x16_t input) { - const uint8x16_t bitmask = { 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, - 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80}; - uint8x16_t minput = vandq_u8(input, bitmask); - uint8x16_t tmp = vpaddq_u8(minput, minput); - tmp = vpaddq_u8(tmp, tmp); - tmp = vpaddq_u8(tmp, tmp); - return vgetq_lane_u16(vreinterpretq_u16_u8(tmp), 0); -} - -really_inline -uint64_t neonmovemask_bulk(uint8x16_t p0, uint8x16_t p1, uint8x16_t p2, uint8x16_t p3) { -#ifndef TRANSPOSE - const uint8x16_t bitmask = { 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, - 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80}; - uint8x16_t t0 = vandq_u8(p0, bitmask); - uint8x16_t t1 = vandq_u8(p1, bitmask); - uint8x16_t t2 = vandq_u8(p2, bitmask); - uint8x16_t t3 = vandq_u8(p3, bitmask); - uint8x16_t sum0 = vpaddq_u8(t0, t1); - uint8x16_t sum1 = vpaddq_u8(t2, t3); - sum0 = vpaddq_u8(sum0, sum1); - sum0 = vpaddq_u8(sum0, sum0); - return vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0); -#else - const uint8x16_t bitmask1 = { 0x01, 0x10, 0x01, 0x10, 0x01, 0x10, 0x01, 0x10, - 0x01, 0x10, 0x01, 0x10, 0x01, 0x10, 0x01, 0x10}; - const uint8x16_t bitmask2 = { 0x02, 0x20, 0x02, 0x20, 0x02, 0x20, 0x02, 0x20, - 0x02, 0x20, 0x02, 0x20, 0x02, 0x20, 0x02, 0x20}; - const uint8x16_t bitmask3 = { 0x04, 0x40, 0x04, 0x40, 0x04, 0x40, 0x04, 0x40, - 0x04, 0x40, 0x04, 0x40, 0x04, 0x40, 0x04, 0x40}; - const uint8x16_t bitmask4 = { 0x08, 0x80, 0x08, 0x80, 0x08, 0x80, 0x08, 0x80, - 0x08, 0x80, 0x08, 0x80, 0x08, 0x80, 0x08, 0x80}; -#if 0 - uint8x16_t t0 = vandq_u8(p0, bitmask1); - uint8x16_t t1 = vandq_u8(p1, bitmask2); - uint8x16_t t2 = vandq_u8(p2, bitmask3); - uint8x16_t t3 = vandq_u8(p3, bitmask4); - uint8x16_t tmp = vorrq_u8(vorrq_u8(t0, t1), vorrq_u8(t2, t3)); -#else - uint8x16_t t0 = vandq_u8(p0, bitmask1); - uint8x16_t t1 = vbslq_u8(bitmask2, p1, t0); - uint8x16_t t2 = vbslq_u8(bitmask3, p2, t1); - uint8x16_t tmp = vbslq_u8(bitmask4, p3, t2); -#endif - uint8x16_t sum = vpaddq_u8(tmp, tmp); - return vgetq_lane_u64(vreinterpretq_u64_u8(sum), 0); -#endif -} -#endif - -// a straightforward comparison of a mask against input. 5 uops; would be -// cheaper in AVX512. -really_inline uint64_t cmp_mask_against_input(simd_input in, uint8_t m) { -#ifdef __AVX2__ - const __m256i mask = _mm256_set1_epi8(m); - __m256i cmp_res_0 = _mm256_cmpeq_epi8(in.lo, mask); - uint64_t res_0 = static_cast(_mm256_movemask_epi8(cmp_res_0)); - __m256i cmp_res_1 = _mm256_cmpeq_epi8(in.hi, mask); - uint64_t res_1 = _mm256_movemask_epi8(cmp_res_1); - return res_0 | (res_1 << 32); -#elif defined(__ARM_NEON) - const uint8x16_t mask = vmovq_n_u8(m); - uint8x16_t cmp_res_0 = vceqq_u8(in.i.val[0], mask); - uint8x16_t cmp_res_1 = vceqq_u8(in.i.val[1], mask); - uint8x16_t cmp_res_2 = vceqq_u8(in.i.val[2], mask); - uint8x16_t cmp_res_3 = vceqq_u8(in.i.val[3], mask); - return neonmovemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3); -#else -#warning It appears that neither ARM NEON nor AVX2 are detected. -#endif -} - -// find all values less than or equal than the content of maxval (using unsigned arithmetic) -really_inline uint64_t unsigned_lteq_against_input(simd_input in, uint8_t m) { -#ifdef __AVX2__ - const __m256i maxval = _mm256_set1_epi8(m); - __m256i cmp_res_0 = _mm256_cmpeq_epi8(_mm256_max_epu8(maxval,in.lo),maxval); - uint64_t res_0 = static_cast(_mm256_movemask_epi8(cmp_res_0)); - __m256i cmp_res_1 = _mm256_cmpeq_epi8(_mm256_max_epu8(maxval,in.hi),maxval); - uint64_t res_1 = _mm256_movemask_epi8(cmp_res_1); - return res_0 | (res_1 << 32); -#elif defined(__ARM_NEON) - const uint8x16_t mask = vmovq_n_u8(m); - uint8x16_t cmp_res_0 = vcleq_u8(in.i.val[0], mask); - uint8x16_t cmp_res_1 = vcleq_u8(in.i.val[1], mask); - uint8x16_t cmp_res_2 = vcleq_u8(in.i.val[2], mask); - uint8x16_t cmp_res_3 = vcleq_u8(in.i.val[3], mask); - return neonmovemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3); -#else -#warning It appears that neither ARM NEON nor AVX2 are detected. -#endif -} - -// return a bitvector indicating where we have characters that end an odd-length -// sequence of backslashes (and thus change the behavior of the next character -// to follow). A even-length sequence of backslashes, and, for that matter, the -// largest even-length prefix of our odd-length sequence of backslashes, simply -// modify the behavior of the backslashes themselves. -// We also update the prev_iter_ends_odd_backslash reference parameter to -// indicate whether we end an iteration on an odd-length sequence of -// backslashes, which modifies our subsequent search for odd-length -// sequences of backslashes in an obvious way. -really_inline uint64_t -find_odd_backslash_sequences(simd_input in, - uint64_t &prev_iter_ends_odd_backslash) { - const uint64_t even_bits = 0x5555555555555555ULL; - const uint64_t odd_bits = ~even_bits; - uint64_t bs_bits = cmp_mask_against_input(in, '\\'); - uint64_t start_edges = bs_bits & ~(bs_bits << 1); - // flip lowest if we have an odd-length run at the end of the prior - // iteration - uint64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash; - uint64_t even_starts = start_edges & even_start_mask; - uint64_t odd_starts = start_edges & ~even_start_mask; - uint64_t even_carries = bs_bits + even_starts; - - uint64_t odd_carries; - // must record the carry-out of our odd-carries out of bit 63; this - // indicates whether the sense of any edge going to the next iteration - // should be flipped - bool iter_ends_odd_backslash = - add_overflow(bs_bits, odd_starts, &odd_carries); - - odd_carries |= - prev_iter_ends_odd_backslash; // push in bit zero as a potential end - // if we had an odd-numbered run at the - // end of the previous iteration - prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL; - uint64_t even_carry_ends = even_carries & ~bs_bits; - uint64_t odd_carry_ends = odd_carries & ~bs_bits; - uint64_t even_start_odd_end = even_carry_ends & odd_bits; - uint64_t odd_start_even_end = odd_carry_ends & even_bits; - uint64_t odd_ends = even_start_odd_end | odd_start_even_end; - return odd_ends; -} - -// return both the quote mask (which is a half-open mask that covers the first -// quote -// in an unescaped quote pair and everything in the quote pair) and the quote -// bits, which are the simple -// unescaped quoted bits. We also update the prev_iter_inside_quote value to -// tell the next iteration -// whether we finished the final iteration inside a quote pair; if so, this -// inverts our behavior of -// whether we're inside quotes for the next iteration. -// Note that we don't do any error checking to see if we have backslash -// sequences outside quotes; these -// backslash sequences (of any length) will be detected elsewhere. -really_inline uint64_t find_quote_mask_and_bits(simd_input in, uint64_t odd_ends, - uint64_t &prev_iter_inside_quote, uint64_t "e_bits, uint64_t &error_mask) { - quote_bits = cmp_mask_against_input(in, '"'); - quote_bits = quote_bits & ~odd_ends; - uint64_t quote_mask = compute_quote_mask(quote_bits); - quote_mask ^= prev_iter_inside_quote; - // All Unicode characters may be placed within the - // quotation marks, except for the characters that MUST be escaped: - // quotation mark, reverse solidus, and the control characters (U+0000 - //through U+001F). - // https://tools.ietf.org/html/rfc8259 - uint64_t unescaped = unsigned_lteq_against_input(in, 0x1F); - error_mask |= quote_mask & unescaped; - // right shift of a signed value expected to be well-defined and standard - // compliant as of C++20, - // John Regher from Utah U. says this is fine code - prev_iter_inside_quote = - static_cast(static_cast(quote_mask) >> 63); - return quote_mask; -} - -really_inline void find_whitespace_and_structurals(simd_input in, - uint64_t &whitespace, - uint64_t &structurals) { - // do a 'shufti' to detect structural JSON characters - // they are { 0x7b } 0x7d : 0x3a [ 0x5b ] 0x5d , 0x2c - // these go into the first 3 buckets of the comparison (1/2/4) - - // we are also interested in the four whitespace characters - // space 0x20, linefeed 0x0a, horizontal tab 0x09 and carriage return 0x0d - // these go into the next 2 buckets of the comparison (8/16) -#ifdef __AVX2__ -#ifdef SIMDJSON_NAIVE_STRUCTURAL - // You should never need this naive approach, but it can be useful - // for research purposes - const __m256i mask_open_brace = _mm256_set1_epi8(0x7b); - __m256i struct_lo = _mm256_cmpeq_epi8(in.lo, mask_open_brace); - __m256i struct_hi = _mm256_cmpeq_epi8(in.hi, mask_open_brace); - const __m256i mask_close_brace = _mm256_set1_epi8(0x7d); - struct_lo = _mm256_or_si256(struct_lo,_mm256_cmpeq_epi8(in.lo, mask_close_brace)); - struct_hi = _mm256_or_si256(struct_hi,_mm256_cmpeq_epi8(in.hi, mask_close_brace)); - const __m256i mask_open_bracket = _mm256_set1_epi8(0x5b); - struct_lo = _mm256_or_si256(struct_lo,_mm256_cmpeq_epi8(in.lo, mask_open_bracket)); - struct_hi = _mm256_or_si256(struct_hi,_mm256_cmpeq_epi8(in.hi, mask_open_bracket)); - const __m256i mask_close_bracket = _mm256_set1_epi8(0x5d); - struct_lo = _mm256_or_si256(struct_lo,_mm256_cmpeq_epi8(in.lo, mask_close_bracket)); - struct_hi = _mm256_or_si256(struct_hi,_mm256_cmpeq_epi8(in.hi, mask_close_bracket)); - const __m256i mask_column = _mm256_set1_epi8(0x3a); - struct_lo = _mm256_or_si256(struct_lo,_mm256_cmpeq_epi8(in.lo, mask_column)); - struct_hi = _mm256_or_si256(struct_hi,_mm256_cmpeq_epi8(in.hi, mask_column)); - const __m256i mask_comma = _mm256_set1_epi8(0x2c); - struct_lo = _mm256_or_si256(struct_lo,_mm256_cmpeq_epi8(in.lo, mask_comma)); - struct_hi = _mm256_or_si256(struct_hi,_mm256_cmpeq_epi8(in.hi, mask_comma)); - uint64_t structural_res_0 = static_cast(_mm256_movemask_epi8(struct_lo)); - uint64_t structural_res_1 = _mm256_movemask_epi8(struct_hi); - structurals = (structural_res_0 | (structural_res_1 << 32)); - - const __m256i mask_space = _mm256_set1_epi8(0x20); - __m256i space_lo = _mm256_cmpeq_epi8(in.lo, mask_space); - __m256i space_hi = _mm256_cmpeq_epi8(in.hi, mask_space); - const __m256i mask_linefeed = _mm256_set1_epi8(0x0a); - space_lo = _mm256_or_si256(space_lo,_mm256_cmpeq_epi8(in.lo, mask_linefeed)); - space_hi = _mm256_or_si256(space_hi,_mm256_cmpeq_epi8(in.hi, mask_linefeed)); - const __m256i mask_tab = _mm256_set1_epi8(0x09); - space_lo = _mm256_or_si256(space_lo,_mm256_cmpeq_epi8(in.lo, mask_tab)); - space_hi = _mm256_or_si256(space_hi,_mm256_cmpeq_epi8(in.hi, mask_tab)); - const __m256i mask_carriage = _mm256_set1_epi8(0x0d); - space_lo = _mm256_or_si256(space_lo,_mm256_cmpeq_epi8(in.lo, mask_carriage)); - space_hi = _mm256_or_si256(space_hi,_mm256_cmpeq_epi8(in.hi, mask_carriage)); - - uint64_t ws_res_0 = static_cast(_mm256_movemask_epi8(space_lo)); - uint64_t ws_res_1 = _mm256_movemask_epi8(space_hi); - whitespace = (ws_res_0 | (ws_res_1 << 32)); - // end of naive approach - -#else // SIMDJSON_NAIVE_STRUCTURAL - const __m256i low_nibble_mask = _mm256_setr_epi8( - 16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0, - 16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0); - const __m256i high_nibble_mask = _mm256_setr_epi8( - 8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0, - 8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0); - - __m256i structural_shufti_mask = _mm256_set1_epi8(0x7); - __m256i whitespace_shufti_mask = _mm256_set1_epi8(0x18); - - __m256i v_lo = _mm256_and_si256( - _mm256_shuffle_epi8(low_nibble_mask, in.lo), - _mm256_shuffle_epi8(high_nibble_mask, - _mm256_and_si256(_mm256_srli_epi32(in.lo, 4), - _mm256_set1_epi8(0x7f)))); - - __m256i v_hi = _mm256_and_si256( - _mm256_shuffle_epi8(low_nibble_mask, in.hi), - _mm256_shuffle_epi8(high_nibble_mask, - _mm256_and_si256(_mm256_srli_epi32(in.hi, 4), - _mm256_set1_epi8(0x7f)))); - __m256i tmp_lo = _mm256_cmpeq_epi8( - _mm256_and_si256(v_lo, structural_shufti_mask), _mm256_set1_epi8(0)); - __m256i tmp_hi = _mm256_cmpeq_epi8( - _mm256_and_si256(v_hi, structural_shufti_mask), _mm256_set1_epi8(0)); - - uint64_t structural_res_0 = - static_cast(_mm256_movemask_epi8(tmp_lo)); - uint64_t structural_res_1 = _mm256_movemask_epi8(tmp_hi); - structurals = ~(structural_res_0 | (structural_res_1 << 32)); - - __m256i tmp_ws_lo = _mm256_cmpeq_epi8( - _mm256_and_si256(v_lo, whitespace_shufti_mask), _mm256_set1_epi8(0)); - __m256i tmp_ws_hi = _mm256_cmpeq_epi8( - _mm256_and_si256(v_hi, whitespace_shufti_mask), _mm256_set1_epi8(0)); - - uint64_t ws_res_0 = static_cast(_mm256_movemask_epi8(tmp_ws_lo)); - uint64_t ws_res_1 = _mm256_movemask_epi8(tmp_ws_hi); - whitespace = ~(ws_res_0 | (ws_res_1 << 32)); -#endif // SIMDJSON_NAIVE_STRUCTURAL -#elif defined(__ARM_NEON) -#ifndef FUNKY_BAD_TABLE - const uint8x16_t low_nibble_mask = (uint8x16_t){ - 16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0}; - const uint8x16_t high_nibble_mask = (uint8x16_t){ - 8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0}; - const uint8x16_t structural_shufti_mask = vmovq_n_u8(0x7); - const uint8x16_t whitespace_shufti_mask = vmovq_n_u8(0x18); - const uint8x16_t low_nib_and_mask = vmovq_n_u8(0xf); - - uint8x16_t nib_0_lo = vandq_u8(in.i.val[0], low_nib_and_mask); - uint8x16_t nib_0_hi = vshrq_n_u8(in.i.val[0], 4); - uint8x16_t shuf_0_lo = vqtbl1q_u8(low_nibble_mask, nib_0_lo); - uint8x16_t shuf_0_hi = vqtbl1q_u8(high_nibble_mask, nib_0_hi); - uint8x16_t v_0 = vandq_u8(shuf_0_lo, shuf_0_hi); - - uint8x16_t nib_1_lo = vandq_u8(in.i.val[1], low_nib_and_mask); - uint8x16_t nib_1_hi = vshrq_n_u8(in.i.val[1], 4); - uint8x16_t shuf_1_lo = vqtbl1q_u8(low_nibble_mask, nib_1_lo); - uint8x16_t shuf_1_hi = vqtbl1q_u8(high_nibble_mask, nib_1_hi); - uint8x16_t v_1 = vandq_u8(shuf_1_lo, shuf_1_hi); - - uint8x16_t nib_2_lo = vandq_u8(in.i.val[2], low_nib_and_mask); - uint8x16_t nib_2_hi = vshrq_n_u8(in.i.val[2], 4); - uint8x16_t shuf_2_lo = vqtbl1q_u8(low_nibble_mask, nib_2_lo); - uint8x16_t shuf_2_hi = vqtbl1q_u8(high_nibble_mask, nib_2_hi); - uint8x16_t v_2 = vandq_u8(shuf_2_lo, shuf_2_hi); - - uint8x16_t nib_3_lo = vandq_u8(in.i.val[3], low_nib_and_mask); - uint8x16_t nib_3_hi = vshrq_n_u8(in.i.val[3], 4); - uint8x16_t shuf_3_lo = vqtbl1q_u8(low_nibble_mask, nib_3_lo); - uint8x16_t shuf_3_hi = vqtbl1q_u8(high_nibble_mask, nib_3_hi); - uint8x16_t v_3 = vandq_u8(shuf_3_lo, shuf_3_hi); - - uint8x16_t tmp_0 = vtstq_u8(v_0, structural_shufti_mask); - uint8x16_t tmp_1 = vtstq_u8(v_1, structural_shufti_mask); - uint8x16_t tmp_2 = vtstq_u8(v_2, structural_shufti_mask); - uint8x16_t tmp_3 = vtstq_u8(v_3, structural_shufti_mask); - structurals = neonmovemask_bulk(tmp_0, tmp_1, tmp_2, tmp_3); - - uint8x16_t tmp_ws_0 = vtstq_u8(v_0, whitespace_shufti_mask); - uint8x16_t tmp_ws_1 = vtstq_u8(v_1, whitespace_shufti_mask); - uint8x16_t tmp_ws_2 = vtstq_u8(v_2, whitespace_shufti_mask); - uint8x16_t tmp_ws_3 = vtstq_u8(v_3, whitespace_shufti_mask); - whitespace = neonmovemask_bulk(tmp_ws_0, tmp_ws_1, tmp_ws_2, tmp_ws_3); -#else - // I think this one is garbage. In order to save the expense - // of another shuffle, I use an equally expensive shift, and - // this gets glued to the end of the dependency chain. Seems a bit - // slower for no good reason. - // - // need to use a weird arrangement. Bytes in this bitvector - // are in conventional order, but bits are reversed as we are - // using a signed left shift (that is a +ve value from 0..7) to - // shift upwards to 0x80 in the bit. So we need to reverse bits. - - // note no structural/whitespace has the high bit on - // so it's OK to put the high 5 bits into our TBL shuffle - // - - // structurals are { 0x7b } 0x7d : 0x3a [ 0x5b ] 0x5d , 0x2c - // or in 5 bit, 3 bit form thats - // (15,3) (15, 5) (7,2) (11,3) (11,5) (5,4) - // bit-reversing (subtract low 3 bits from 7) yields: - // (15,4) (15, 2) (7,5) (11,4) (11,2) (5,3) - - const uint8x16_t structural_bitvec = (uint8x16_t){ - 0, 0, 0, 0, - 0, 8, 0, 32, - 0, 0, 0, 20, - 0, 0, 0, 20}; - // we are also interested in the four whitespace characters - // space 0x20, linefeed 0x0a, horizontal tab 0x09 and carriage return 0x0d - // (4,0) (1, 2) (1, 1) (1, 5) - // bit-reversing (subtract low 3 bits from 7) yields: - // (4,7) (1, 5) (1, 6) (1, 2) - - const uint8x16_t whitespace_bitvec = (uint8x16_t){ - 0, 100, 0, 0, - 128, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 0}; - const uint8x16_t low_3bits_and_mask = vmovq_n_u8(0x7); - const uint8x16_t high_1bit_tst_mask = vmovq_n_u8(0x80); - - int8x16_t low_3bits_0 = vreinterpretq_s8_u8(vandq_u8(in.i.val[0], low_3bits_and_mask)); - uint8x16_t high_5bits_0 = vshrq_n_u8(in.i.val[0], 3); - uint8x16_t shuffle_structural_0 = vshlq_u8(vqtbl1q_u8(structural_bitvec, high_5bits_0), low_3bits_0); - uint8x16_t shuffle_ws_0 = vshlq_u8(vqtbl1q_u8(whitespace_bitvec, high_5bits_0), low_3bits_0); - uint8x16_t tmp_0 = vtstq_u8(shuffle_structural_0, high_1bit_tst_mask); - uint8x16_t tmp_ws_0 = vtstq_u8(shuffle_ws_0, high_1bit_tst_mask); - - int8x16_t low_3bits_1 = vreinterpretq_s8_u8(vandq_u8(in.i.val[1], low_3bits_and_mask)); - uint8x16_t high_5bits_1 = vshrq_n_u8(in.i.val[1], 3); - uint8x16_t shuffle_structural_1 = vshlq_u8(vqtbl1q_u8(structural_bitvec, high_5bits_1), low_3bits_1); - uint8x16_t shuffle_ws_1 = vshlq_u8(vqtbl1q_u8(whitespace_bitvec, high_5bits_1), low_3bits_1); - uint8x16_t tmp_1 = vtstq_u8(shuffle_structural_1, high_1bit_tst_mask); - uint8x16_t tmp_ws_1 = vtstq_u8(shuffle_ws_1, high_1bit_tst_mask); - - int8x16_t low_3bits_2 = vreinterpretq_s8_u8(vandq_u8(in.i.val[2], low_3bits_and_mask)); - uint8x16_t high_5bits_2 = vshrq_n_u8(in.i.val[2], 3); - uint8x16_t shuffle_structural_2 = vshlq_u8(vqtbl1q_u8(structural_bitvec, high_5bits_2), low_3bits_2); - uint8x16_t shuffle_ws_2 = vshlq_u8(vqtbl1q_u8(whitespace_bitvec, high_5bits_2), low_3bits_2); - uint8x16_t tmp_2 = vtstq_u8(shuffle_structural_2, high_1bit_tst_mask); - uint8x16_t tmp_ws_2 = vtstq_u8(shuffle_ws_2, high_1bit_tst_mask); - - int8x16_t low_3bits_3 = vreinterpretq_s8_u8(vandq_u8(in.i.val[3], low_3bits_and_mask)); - uint8x16_t high_5bits_3 = vshrq_n_u8(in.i.val[3], 3); - uint8x16_t shuffle_structural_3 = vshlq_u8(vqtbl1q_u8(structural_bitvec, high_5bits_3), low_3bits_3); - uint8x16_t shuffle_ws_3 = vshlq_u8(vqtbl1q_u8(whitespace_bitvec, high_5bits_3), low_3bits_3); - uint8x16_t tmp_3 = vtstq_u8(shuffle_structural_3, high_1bit_tst_mask); - uint8x16_t tmp_ws_3 = vtstq_u8(shuffle_ws_3, high_1bit_tst_mask); - - structurals = neonmovemask_bulk(tmp_0, tmp_1, tmp_2, tmp_3); - whitespace = neonmovemask_bulk(tmp_ws_0, tmp_ws_1, tmp_ws_2, tmp_ws_3); -#endif -#else -#warning It appears that neither ARM NEON nor AVX2 are detected. -#endif -} - - -#ifdef SIMDJSON_NAIVE_FLATTEN // useful for benchmarking -// -// This is just a naive implementation. It should be normally -// disable, but can be used for research purposes to compare -// again our optimized version. -really_inline void flatten_bits(uint32_t *base_ptr, uint32_t &base, - uint32_t idx, uint64_t bits) { - uint32_t * out_ptr = base_ptr + base; - idx -= 64; - while(bits != 0) { - out_ptr[0] = idx + trailingzeroes(bits); - bits = bits & (bits - 1); - out_ptr++; - } - base = (out_ptr - base_ptr); -} - -#else -// flatten out values in 'bits' assuming that they are are to have values of idx -// plus their position in the bitvector, and store these indexes at -// base_ptr[base] incrementing base as we go -// will potentially store extra values beyond end of valid bits, so base_ptr -// needs to be large enough to handle this -really_inline void flatten_bits(uint32_t *base_ptr, uint32_t &base, - uint32_t idx, uint64_t bits) { - // In some instances, the next branch is expensive because it is mispredicted. - // Unfortunately, in other cases, - // it helps tremendously. - if(bits == 0) return; - uint32_t cnt = hamming(bits); - uint32_t next_base = base + cnt; - idx -= 64; - base_ptr += base; - { - base_ptr[0] = idx + trailingzeroes(bits); - bits = bits & (bits - 1); - base_ptr[1] = idx + trailingzeroes(bits); - bits = bits & (bits - 1); - base_ptr[2] = idx + trailingzeroes(bits); - bits = bits & (bits - 1); - base_ptr[3] = idx + trailingzeroes(bits); - bits = bits & (bits - 1); - base_ptr[4] = idx + trailingzeroes(bits); - bits = bits & (bits - 1); - base_ptr[5] = idx + trailingzeroes(bits); - bits = bits & (bits - 1); - base_ptr[6] = idx + trailingzeroes(bits); - bits = bits & (bits - 1); - base_ptr[7] = idx + trailingzeroes(bits); - bits = bits & (bits - 1); - base_ptr += 8; - } - // We hope that the next branch is easily predicted. - if (cnt > 8) { - base_ptr[0] = idx + trailingzeroes(bits); - bits = bits & (bits - 1); - base_ptr[1] = idx + trailingzeroes(bits); - bits = bits & (bits - 1); - base_ptr[2] = idx + trailingzeroes(bits); - bits = bits & (bits - 1); - base_ptr[3] = idx + trailingzeroes(bits); - bits = bits & (bits - 1); - base_ptr[4] = idx + trailingzeroes(bits); - bits = bits & (bits - 1); - base_ptr[5] = idx + trailingzeroes(bits); - bits = bits & (bits - 1); - base_ptr[6] = idx + trailingzeroes(bits); - bits = bits & (bits - 1); - base_ptr[7] = idx + trailingzeroes(bits); - bits = bits & (bits - 1); - base_ptr += 8; - } - if (cnt > 16) { // unluckly: we rarely get here - // since it means having one structural or pseudo-structral element - // every 4 characters (possible with inputs like "","","",...). - do { - base_ptr[0] = idx + trailingzeroes(bits); - bits = bits & (bits - 1); - base_ptr++; - } while(bits != 0); - } - base = next_base; -} -#endif - -// return a updated structural bit vector with quoted contents cleared out and -// pseudo-structural characters added to the mask -// updates prev_iter_ends_pseudo_pred which tells us whether the previous -// iteration ended on a whitespace or a structural character (which means that -// the next iteration -// will have a pseudo-structural character at its start) -really_inline uint64_t finalize_structurals( - uint64_t structurals, uint64_t whitespace, uint64_t quote_mask, - uint64_t quote_bits, uint64_t &prev_iter_ends_pseudo_pred) { - // mask off anything inside quotes - structurals &= ~quote_mask; - // add the real quote bits back into our bitmask as well, so we can - // quickly traverse the strings we've spent all this trouble gathering - structurals |= quote_bits; - // Now, establish "pseudo-structural characters". These are non-whitespace - // characters that are (a) outside quotes and (b) have a predecessor that's - // either whitespace or a structural character. This means that subsequent - // passes will get a chance to encounter the first character of every string - // of non-whitespace and, if we're parsing an atom like true/false/null or a - // number we can stop at the first whitespace or structural character - // following it. - - // a qualified predecessor is something that can happen 1 position before an - // pseudo-structural character - uint64_t pseudo_pred = structurals | whitespace; - - uint64_t shifted_pseudo_pred = - (pseudo_pred << 1) | prev_iter_ends_pseudo_pred; - prev_iter_ends_pseudo_pred = pseudo_pred >> 63; - uint64_t pseudo_structurals = - shifted_pseudo_pred & (~whitespace) & (~quote_mask); - structurals |= pseudo_structurals; - - // now, we've used our close quotes all we need to. So let's switch them off - // they will be off in the quote mask and on in quote bits. - structurals &= ~(quote_bits & ~quote_mask); - return structurals; -} - -WARN_UNUSED -/*never_inline*/ int find_structural_bits(const uint8_t *buf, size_t len, - ParsedJson &pj) { - if (len > pj.bytecapacity) { - std::cerr << "Your ParsedJson object only supports documents up to " - << pj.bytecapacity << " bytes but you are trying to process " << len - << " bytes" << std::endl; - return simdjson::CAPACITY; - } - uint32_t *base_ptr = pj.structural_indexes; - uint32_t base = 0; -#ifdef SIMDJSON_UTF8VALIDATE - __m256i has_error = _mm256_setzero_si256(); - struct avx_processed_utf_bytes previous {}; - previous.rawbytes = _mm256_setzero_si256(); - previous.high_nibbles = _mm256_setzero_si256(); - previous.carried_continuations = _mm256_setzero_si256(); -#endif - - // we have padded the input out to 64 byte multiple with the remainder being - // zeros - - // persistent state across loop - // does the last iteration end with an odd-length sequence of backslashes? - // either 0 or 1, but a 64-bit value - uint64_t prev_iter_ends_odd_backslash = 0ULL; - // does the previous iteration end inside a double-quote pair? - uint64_t prev_iter_inside_quote = 0ULL; // either all zeros or all ones - // does the previous iteration end on something that is a predecessor of a - // pseudo-structural character - i.e. whitespace or a structural character - // effectively the very first char is considered to follow "whitespace" for - // the - // purposes of pseudo-structural character detection so we initialize to 1 - uint64_t prev_iter_ends_pseudo_pred = 1ULL; - - // structurals are persistent state across loop as we flatten them on the - // subsequent iteration into our array pointed to be base_ptr. - // This is harmless on the first iteration as structurals==0 - // and is done for performance reasons; we can hide some of the latency of the - // expensive carryless multiply in the previous step with this work - uint64_t structurals = 0; - - size_t lenminus64 = len < 64 ? 0 : len - 64; - size_t idx = 0; - uint64_t error_mask = 0; // for unescaped characters within strings (ASCII code points < 0x20) - - for (; idx < lenminus64; idx += 64) { -#ifndef _MSC_VER - __builtin_prefetch(buf + idx + 128); -#endif - simd_input in = fill_input(buf+idx); -#ifdef SIMDJSON_UTF8VALIDATE - check_utf8(in, has_error, previous); -#endif - // detect odd sequences of backslashes - uint64_t odd_ends = find_odd_backslash_sequences( - in, prev_iter_ends_odd_backslash); - - // detect insides of quote pairs ("quote_mask") and also our quote_bits - // themselves - uint64_t quote_bits; - uint64_t quote_mask = find_quote_mask_and_bits( - in, odd_ends, prev_iter_inside_quote, quote_bits, error_mask); - - // take the previous iterations structural bits, not our current iteration, - // and flatten - flatten_bits(base_ptr, base, idx, structurals); - - uint64_t whitespace; - find_whitespace_and_structurals(in, whitespace, structurals); - - // fixup structurals to reflect quotes and add pseudo-structural characters - structurals = finalize_structurals(structurals, whitespace, quote_mask, - quote_bits, prev_iter_ends_pseudo_pred); - } - - //////////////// - /// we use a giant copy-paste which is ugly. - /// but otherwise the string needs to be properly padded or else we - /// risk invalidating the UTF-8 checks. - //////////// - if (idx < len) { - uint8_t tmpbuf[64]; - memset(tmpbuf, 0x20, 64); - memcpy(tmpbuf, buf + idx, len - idx); - simd_input in = fill_input(tmpbuf); -#ifdef SIMDJSON_UTF8VALIDATE - check_utf8(in, has_error, previous); -#endif - - // detect odd sequences of backslashes - uint64_t odd_ends = find_odd_backslash_sequences( - in, prev_iter_ends_odd_backslash); - - // detect insides of quote pairs ("quote_mask") and also our quote_bits - // themselves - uint64_t quote_bits; - uint64_t quote_mask = find_quote_mask_and_bits( - in, odd_ends, prev_iter_inside_quote, quote_bits, error_mask); - - // take the previous iterations structural bits, not our current iteration, - // and flatten - flatten_bits(base_ptr, base, idx, structurals); - - uint64_t whitespace; - find_whitespace_and_structurals(in, whitespace, structurals); - - // fixup structurals to reflect quotes and add pseudo-structural characters - structurals = finalize_structurals(structurals, whitespace, quote_mask, - quote_bits, prev_iter_ends_pseudo_pred); - idx += 64; - } - - // is last string quote closed? - if (prev_iter_inside_quote) { - return simdjson::UNCLOSED_STRING; - } - - // finally, flatten out the remaining structurals from the last iteration - flatten_bits(base_ptr, base, idx, structurals); - - pj.n_structural_indexes = base; - // a valid JSON file cannot have zero structural indexes - we should have - // found something - if (pj.n_structural_indexes == 0u) { - fprintf(stderr, "Empty document?\n"); - return simdjson::EMPTY; - } - if (base_ptr[pj.n_structural_indexes - 1] > len) { - fprintf(stderr, "Internal bug\n"); - return simdjson::UNEXPECTED_ERROR; - } - if (len != base_ptr[pj.n_structural_indexes - 1]) { - // the string might not be NULL terminated, but we add a virtual NULL ending - // character. - base_ptr[pj.n_structural_indexes++] = len; - } - // make it safe to dereference one beyond this array - base_ptr[pj.n_structural_indexes] = 0; - if (error_mask) { - fprintf(stderr, "Unescaped characters\n"); - return simdjson::UNESCAPED_CHARS; - } -#ifdef SIMDJSON_UTF8VALIDATE - return _mm256_testz_si256(has_error, has_error) == 0 ? simdjson::UTF8_ERROR : simdjson::SUCCESS; -#else - return simdjson::SUCCESS; -#endif -} - -int find_structural_bits(const char *buf, size_t len, ParsedJson &pj) { - return find_structural_bits(reinterpret_cast(buf), len, pj); -} +// File kept in case we want to reuse it soon. (many configuration files to edit) \ No newline at end of file