diff --git a/.gitignore b/.gitignore
index b871189a..5b1caa4c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,6 +14,7 @@
 /jsoncheck
 /jsonpointer
 /jsonstats
+/integer_tests
 /libsimdjson.so*
 /minify
 /numberparsingcheck
diff --git a/benchmark/parse.cpp b/benchmark/parse.cpp
index 07f172cb..310eca41 100644
--- a/benchmark/parse.cpp
+++ b/benchmark/parse.cpp
@@ -34,6 +34,18 @@
 #include "simdjson/parsedjson.h"
 #include "simdjson/stage1_find_marks.h"
 #include "simdjson/stage2_build_tape.h"
+
+// Global arguments
+bool find_marks_only = false;
+bool verbose = false;
+bool dump = false;
+bool json_output = false;
+bool force_one_iteration = false;
+bool just_data = false;
+bool force_sse = false;
+int32_t iterations = -1;
+int32_t warmup_iterations = -1;
+
 namespace simdjson {
 Architecture _find_best_supported_implementation() {
   constexpr uint32_t haswell_flags =
@@ -43,7 +55,7 @@ Architecture _find_best_supported_implementation() {
       instruction_set::SSE42 | instruction_set::PCLMULQDQ;
   uint32_t supports = detect_supported_architectures();
   // Order from best to worst (within architecture)
-  if ((haswell_flags & supports) == haswell_flags) {
+  if ((haswell_flags & supports) == haswell_flags && !force_sse) {
     return Architecture::HASWELL;
   }
   if ((westmere_flags & supports) == westmere_flags) {
@@ -63,6 +75,9 @@ extern unified_functype *unified_ptr;
 extern stage1_functype *stage1_ptr;
 int unified_machine_dispatch(const uint8_t *buf, size_t len, ParsedJson &pj) {
+  if (find_marks_only) {
+    return simdjson::SUCCESS;
+  }
   Architecture best_implementation = _find_best_supported_implementation();
   // Selecting the best implementation
   switch (best_implementation) {
@@ -118,18 +133,11 @@ unified_functype *unified_ptr = &unified_machine_dispatch;
 } // namespace simdjson
 int main(int argc, char *argv[]) {
-  bool verbose = false;
-  bool dump = false;
-  bool json_output = false;
-  bool force_one_iteration = false;
-  bool just_data = false;
-  int32_t iterations = -1;
-  int32_t warmup_iterations = -1;
 #ifndef _MSC_VER
   int c;
-  while ((c = getopt(argc, argv, "1vdtn:w:")) != -1) {
+  while ((c = getopt(argc, argv, "1vdtn:w:fs")) != -1) {
     switch (c) {
     case 'n':
       iterations = atoi(optarg);
@@ -137,6 +145,9 @@ int main(int argc, char *argv[]) {
     case 'w':
       warmup_iterations = atoi(optarg);
       break;
+    case 's':
+      force_sse = true;
+      break;
     case 't':
       just_data = true;
       break;
@@ -152,6 +163,9 @@ int main(int argc, char *argv[]) {
     case '1':
       force_one_iteration = true;
       break;
+    case 'f':
+      find_marks_only = true;
+      break;
     default:
       abort();
     }
@@ -326,7 +340,7 @@ int main(int argc, char *argv[]) {
     isok = (simdjson::stage1_ptr((const uint8_t *)p.data(), p.size(), pj) == simdjson::SUCCESS);
     isok = isok &&
-           (simdjson::SUCCESS == 
+           (simdjson::SUCCESS ==
            simdjson::unified_ptr((const uint8_t *)p.data(), p.size(), pj));
     auto end = std::chrono::steady_clock::now();
     std::chrono::duration<double> secs = end - start;
diff --git a/include/simdjson/common_defs.h b/include/simdjson/common_defs.h
index fe18a00b..b831b634 100644
--- a/include/simdjson/common_defs.h
+++ b/include/simdjson/common_defs.h
@@ -17,6 +17,17 @@
 #define SIMDJSON_PADDING 32
 #endif
+
+#if defined(__GNUC__)
+// Marks a block with a name so that MCA analysis can see it.
+#define BEGIN_DEBUG_BLOCK(name) __asm volatile("# LLVM-MCA-BEGIN " #name);
+#define END_DEBUG_BLOCK(name) __asm volatile("# LLVM-MCA-END " #name);
+#define DEBUG_BLOCK(name, block) BEGIN_DEBUG_BLOCK(name); block; END_DEBUG_BLOCK(name);
+#else
+#define BEGIN_DEBUG_BLOCK(name)
+#define END_DEBUG_BLOCK(name)
+#define DEBUG_BLOCK(name, block)
+#endif
+
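These markers expand to empty asm comments, so they cost nothing at run time; llvm-mca reads them back out of the generated assembly to scope its analysis to the named block. A minimal sketch of the intended workflow (the function and file names below are illustrative, not part of this patch):

    // wrap the region we want llvm-mca to report on in isolation
    uint64_t count_even_bits(uint64_t bits) {
      uint64_t n;
      DEBUG_BLOCK(even_bits,
        n = __builtin_popcountll(bits & 0x5555555555555555ULL);
      )
      return n;
    }
    // then generate assembly and feed it to the analyzer:
    //   c++ -O2 -S -o marked.s file.cpp && llvm-mca marked.s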
 #ifndef _MSC_VER
 // Implemented using Labels as Values which works in GCC and CLANG (and maybe
 // also in Intel's compiler), but won't work in MSVC.
diff --git a/scripts/checkperf.sh b/scripts/checkperf.sh
index 1e90da3a..d348d566 100644
--- a/scripts/checkperf.sh
+++ b/scripts/checkperf.sh
@@ -29,5 +29,5 @@ make parse
 make perfdiff
 echo "Running perfdiff:"
-echo ./perfdiff \"$current/parse -t $perftests\" \"$reference/parse -t $perftests\"
-./perfdiff "$current/parse -t $perftests" "$reference/parse -t $perftests"
+echo ./perfdiff \"$current/parse -t $perftests $CHECKPERF_ARGS\" \"$reference/parse -t $perftests $CHECKPERF_ARGS\"
+./perfdiff "$current/parse -t $perftests $CHECKPERF_ARGS" "$reference/parse -t $perftests $CHECKPERF_ARGS"
diff --git a/src/arm64/simd_input.h b/src/arm64/simd_input.h
index ff126356..fb81d83e 100644
--- a/src/arm64/simd_input.h
+++ b/src/arm64/simd_input.h
@@ -40,25 +40,24 @@ using namespace simdjson::arm64;
 template <>
 struct simd_input<Architecture::ARM64> {
-  uint8x16_t chunks[4];
+  const uint8x16_t chunks[4];

-  really_inline simd_input(const uint8_t *ptr) {
-    this->chunks[0] = vld1q_u8(ptr + 0*16);
-    this->chunks[1] = vld1q_u8(ptr + 1*16);
-    this->chunks[2] = vld1q_u8(ptr + 2*16);
-    this->chunks[3] = vld1q_u8(ptr + 3*16);
-  }
+  really_inline simd_input()
+      : chunks{uint8x16_t(), uint8x16_t(), uint8x16_t(), uint8x16_t()} {}

-  really_inline simd_input(uint8x16_t chunk0, uint8x16_t chunk1, uint8x16_t chunk2, uint8x16_t chunk3) {
-    this->chunks[0] = chunk0;
-    this->chunks[1] = chunk1;
-    this->chunks[2] = chunk2;
-    this->chunks[3] = chunk3;
-  }
+  really_inline simd_input(const uint8x16_t chunk0, const uint8x16_t chunk1, const uint8x16_t chunk2, const uint8x16_t chunk3)
+      : chunks{chunk0, chunk1, chunk2, chunk3} {}
+
+  really_inline simd_input(const uint8_t *ptr)
+      : chunks{
+          vld1q_u8(ptr + 0*16),
+          vld1q_u8(ptr + 1*16),
+          vld1q_u8(ptr + 2*16),
+          vld1q_u8(ptr + 3*16)
+        } {}

   template <typename F>
-  really_inline void each(F const& each_chunk)
-  {
+  really_inline void each(F const& each_chunk) const {
     each_chunk(this->chunks[0]);
     each_chunk(this->chunks[1]);
     each_chunk(this->chunks[2]);
@@ -66,7 +65,7 @@ struct simd_input<Architecture::ARM64> {
   }

   template <typename F>
-  really_inline simd_input map(F const& map_chunk) {
+  really_inline simd_input map(F const& map_chunk) const {
     return simd_input(
       map_chunk(this->chunks[0]),
       map_chunk(this->chunks[1]),
@@ -76,7 +75,7 @@ struct simd_input<Architecture::ARM64> {
   }

   template <typename F>
-  really_inline simd_input map(simd_input b, F const& map_chunk) {
+  really_inline simd_input map(simd_input b, F const& map_chunk) const {
     return simd_input(
       map_chunk(this->chunks[0], b.chunks[0]),
       map_chunk(this->chunks[1], b.chunks[1]),
@@ -86,24 +85,31 @@ struct simd_input<Architecture::ARM64> {
   }

   template <typename F>
-  really_inline uint8x16_t reduce(F const& reduce_pair) {
+  really_inline uint8x16_t reduce(F const& reduce_pair) const {
     uint8x16_t r01 = reduce_pair(this->chunks[0], this->chunks[1]);
     uint8x16_t r23 = reduce_pair(this->chunks[2], this->chunks[3]);
     return reduce_pair(r01, r23);
   }

-  really_inline uint64_t to_bitmask() {
+  really_inline uint64_t to_bitmask() const {
     return neon_movemask_bulk(this->chunks[0], this->chunks[1],
                               this->chunks[2], this->chunks[3]);
   }
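With chunks now const, every operation goes through the each/map/reduce combinators, so a new predicate is one lambda rather than four hand-unrolled intrinsic calls. A hypothetical caller (not part of this patch) composing the bit_or added just below with eq, here for ASCII case folding on a 64-byte input:

    // ASCII letters differ from their lowercase forms only in bit 0x20,
    // so OR the bit in and compare against the lowercase character
    uint64_t eq_ignoring_ascii_case(const uint8_t *buf64, uint8_t lowercase) {
      simd_input<Architecture::ARM64> in(buf64); // four 16-byte NEON loads
      return in.bit_or(0x20).eq(lowercase);      // one result bit per input byte
    }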
-  really_inline uint64_t eq(uint8_t m) {
+  really_inline simd_input bit_or(const uint8_t m) const {
+    const uint8x16_t mask = vmovq_n_u8(m);
+    return this->map( [&](auto a) {
+      return vorrq_u8(a, mask);
+    });
+  }
+
+  really_inline uint64_t eq(const uint8_t m) const {
     const uint8x16_t mask = vmovq_n_u8(m);
     return this->map( [&](auto a) {
       return vceqq_u8(a, mask);
     }).to_bitmask();
   }

-  really_inline uint64_t lteq(uint8_t m) {
+  really_inline uint64_t lteq(const uint8_t m) const {
     const uint8x16_t mask = vmovq_n_u8(m);
     return this->map( [&](auto a) {
       return vcleq_u8(a, mask);
diff --git a/src/arm64/stage1_find_marks.h b/src/arm64/stage1_find_marks.h
index 91978b51..9adc8ae0 100644
--- a/src/arm64/stage1_find_marks.h
+++ b/src/arm64/stage1_find_marks.h
@@ -12,7 +12,7 @@
 namespace simdjson::arm64 {

-really_inline uint64_t compute_quote_mask(uint64_t quote_bits) {
+really_inline uint64_t compute_quote_mask(const uint64_t quote_bits) {
 #ifdef __ARM_FEATURE_CRYPTO // some ARM processors lack this extension
   return vmull_p64(-1ULL, quote_bits);
@@ -21,9 +21,9 @@ really_inline uint64_t compute_quote_mask(uint64_t quote_bits) {
 #endif
 }

-really_inline void find_whitespace_and_structurals(
-    simd_input<Architecture::ARM64> in, uint64_t &whitespace,
-    uint64_t &structurals) {
+really_inline void find_whitespace_and_operators(
+    const simd_input<Architecture::ARM64> in,
+    uint64_t &whitespace, uint64_t &op) {
   const uint8x16_t low_nibble_mask =
       (uint8x16_t){16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0};
   const uint8x16_t high_nibble_mask =
@@ -38,9 +38,9 @@ really_inline void find_whitespace_and_structurals(
     return vandq_u8(shuf_lo, shuf_hi);
   });

-  const uint8x16_t structural_shufti_mask = vmovq_n_u8(0x7);
-  structurals = v.map([&](auto _v) {
-    return vtstq_u8(_v, structural_shufti_mask);
+  const uint8x16_t operator_shufti_mask = vmovq_n_u8(0x7);
+  op = v.map([&](auto _v) {
+    return vtstq_u8(_v, operator_shufti_mask);
   }).to_bitmask();

   const uint8x16_t whitespace_shufti_mask = vmovq_n_u8(0x18);
diff --git a/src/generic/stage1_find_marks.h b/src/generic/stage1_find_marks.h
index 44858f66..85e5b6ae 100644
--- a/src/generic/stage1_find_marks.h
+++ b/src/generic/stage1_find_marks.h
@@ -12,230 +12,271 @@
 // indicate whether we end an iteration on an odd-length sequence of
 // backslashes, which modifies our subsequent search for odd-length
 // sequences of backslashes in an obvious way.
-really_inline uint64_t find_odd_backslash_sequences(
-    simd_input<ARCHITECTURE> in,
-    uint64_t &prev_iter_ends_odd_backslash) {
+really_inline uint64_t follows_odd_sequence_of(const uint64_t match, uint64_t &overflow) {
   const uint64_t even_bits = 0x5555555555555555ULL;
   const uint64_t odd_bits = ~even_bits;
-  uint64_t bs_bits = in.eq('\\');
-  uint64_t start_edges = bs_bits & ~(bs_bits << 1);
+  uint64_t start_edges = match & ~(match << 1);
   /* flip lowest if we have an odd-length run at the end of the prior
    * iteration */
-  uint64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash;
+  uint64_t even_start_mask = even_bits ^ overflow;
   uint64_t even_starts = start_edges & even_start_mask;
   uint64_t odd_starts = start_edges & ~even_start_mask;
-  uint64_t even_carries = bs_bits + even_starts;
+  uint64_t even_carries = match + even_starts;
   uint64_t odd_carries;
   /* must record the carry-out of our odd-carries out of bit 63; this
    * indicates whether the sense of any edge going to the next iteration
    * should be flipped */
-  bool iter_ends_odd_backslash =
-      add_overflow(bs_bits, odd_starts, &odd_carries);
+  bool new_overflow = add_overflow(match, odd_starts, &odd_carries);

-  odd_carries |= prev_iter_ends_odd_backslash; /* push in bit zero as a
-                                                * potential end if we had an
-                                                * odd-numbered run at the
-                                                * end of the previous
-                                                * iteration */
-  prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL;
-  uint64_t even_carry_ends = even_carries & ~bs_bits;
-  uint64_t odd_carry_ends = odd_carries & ~bs_bits;
+  odd_carries |= overflow; /* push in bit zero as a
+                            * potential end if we had an
+                            * odd-numbered run at the
+                            * end of the previous
+                            * iteration */
+  overflow = new_overflow ? 0x1ULL : 0x0ULL;
+  uint64_t even_carry_ends = even_carries & ~match;
+  uint64_t odd_carry_ends = odd_carries & ~match;
   uint64_t even_start_odd_end = even_carry_ends & odd_bits;
   uint64_t odd_start_even_end = odd_carry_ends & even_bits;
   uint64_t odd_ends = even_start_odd_end | odd_start_even_end;
   return odd_ends;
 }

-// return both the quote mask (which is a half-open mask that covers the first
-// quote
-// in an unescaped quote pair and everything in the quote pair) and the quote
-// bits, which are the simple
-// unescaped quoted bits. We also update the prev_iter_inside_quote value to
-// tell the next iteration
-// whether we finished the final iteration inside a quote pair; if so, this
-// inverts our behavior of
-// whether we're inside quotes for the next iteration.
-// Note that we don't do any error checking to see if we have backslash
-// sequences outside quotes; these
-// backslash sequences (of any length) will be detected elsewhere.
-really_inline uint64_t find_quote_mask_and_bits(
-    simd_input<ARCHITECTURE> in, uint64_t odd_ends,
-    uint64_t &prev_iter_inside_quote, uint64_t &quote_bits,
-    uint64_t &error_mask) {
-  quote_bits = in.eq('"');
-  quote_bits = quote_bits & ~odd_ends;
-  uint64_t quote_mask = compute_quote_mask(quote_bits);
-  quote_mask ^= prev_iter_inside_quote;
+//
+// Check if the current character immediately follows a matching character.
+//
+// For example, this checks for quotes with backslashes in front of them:
+//
+//     const uint64_t backslashed_quote = in.eq('"') & follows(in.eq('\\'), prev_backslash);
+//
+really_inline uint64_t follows(const uint64_t match, uint64_t &overflow) {
+  const uint64_t result = match << 1 | overflow;
+  overflow = match >> 63;
+  return result;
+}
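A scalar walk-through of follows() may help: the match mask is shifted left one bit so that each match marks its successor, and bit 63 carries into the next 64-byte block through overflow. Values below are illustrative, not from the patch:

    #include <cstdint>
    uint64_t follows_demo() {
      uint64_t overflow = 0;                   // no match carried in from the prior block
      uint64_t match = 0b0010;                 // e.g. a backslash at byte 1
      uint64_t result = match << 1 | overflow; // bit 2: the byte right after it
      overflow = match >> 63;                  // a match at byte 63 would carry out here
      return result;                           // == 0b0100
    }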
+
+//
+// Check if the current character follows a matching character, with possible "filler" between.
+// For example, this checks for empty curly braces, e.g.:
+//
+//     in.eq('}') & follows(in.eq('{'), in.eq(' '), prev_empty_array) // { * }
+//
+really_inline uint64_t follows(const uint64_t match, const uint64_t filler, uint64_t &overflow) {
+  uint64_t follows_match = follows(match, overflow);
+  uint64_t result;
+  overflow |= add_overflow(follows_match, filler, &result);
+  return result;
+}
+
+really_inline ErrorValues detect_errors_on_eof(
+    uint64_t &unescaped_chars_error,
+    const uint64_t prev_in_string) {
+  if (prev_in_string) {
+    return UNCLOSED_STRING;
+  }
+  if (unescaped_chars_error) {
+    return UNESCAPED_CHARS;
+  }
+  return SUCCESS;
+}
+
+//
+// Return a mask of all string characters plus end quotes.
+//
+// prev_escaped is overflow saying whether the next character is escaped.
+// prev_in_string is overflow saying whether we're still in a string.
+//
+// Backslash sequences outside of quotes will be detected in stage 2.
+//
+really_inline uint64_t find_strings(const simd_input<ARCHITECTURE> in, uint64_t &prev_escaped, uint64_t &prev_in_string) {
+  const uint64_t backslash = in.eq('\\');
+  const uint64_t escaped = follows_odd_sequence_of(backslash, prev_escaped);
+  const uint64_t quote = in.eq('"') & ~escaped;
+  // compute_quote_mask returns start quote plus string contents.
+  const uint64_t in_string = compute_quote_mask(quote) ^ prev_in_string;
+  /* right shift of a signed value expected to be well-defined and standard
+   * compliant as of C++20,
+   * John Regehr from Utah U. says this is fine code */
+  prev_in_string = static_cast<uint64_t>(static_cast<int64_t>(in_string) >> 63);
+  // Use ^ to turn the beginning quote off, and the end quote on.
+  return in_string ^ quote;
+}
+
+really_inline uint64_t invalid_string_bytes(const uint64_t unescaped, const uint64_t quote_mask) {
   /* All Unicode characters may be placed within the
    * quotation marks, except for the characters that MUST be escaped:
    * quotation mark, reverse solidus, and the control characters (U+0000
    * through U+001F).
    * https://tools.ietf.org/html/rfc8259 */
-  uint64_t unescaped = in.lteq(0x1F);
-  error_mask |= quote_mask & unescaped;
-  /* right shift of a signed value expected to be well-defined and standard
-   * compliant as of C++20,
-   * John Regher from Utah U. says this is fine code */
-  prev_iter_inside_quote =
-      static_cast<uint64_t>(static_cast<int64_t>(quote_mask) >> 63);
-  return quote_mask;
+  return quote_mask & unescaped;
 }
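compute_quote_mask is a carryless multiply by all-ones, which is just a prefix XOR: bit i of the output is the XOR of quote bits 0 through i, so the mask flips on at an opening quote and off at its closing quote. A scalar model of what the CLMUL/VMULL intrinsics compute, for illustration only (the per-architecture headers hold the real implementations):

    #include <cstdint>
    uint64_t prefix_xor(uint64_t quote_bits) {
      uint64_t mask = quote_bits;
      mask ^= mask << 1;   // after all six doubling steps, bit i = XOR of bits 0..i
      mask ^= mask << 2;
      mask ^= mask << 4;
      mask ^= mask << 8;
      mask ^= mask << 16;
      mask ^= mask << 32;
      return mask;
    }
    // quote bits: 0b0100010  ("ab": quotes at bits 1 and 5)
    // prefix_xor: 0b0011110  (start quote plus string contents)
    // find_strings then XORs the quote bits back in, turning the start quote
    // off and the end quote on, as its final comment describes.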
-really_inline uint64_t finalize_structurals(
-    uint64_t structurals, uint64_t whitespace, uint64_t quote_mask,
-    uint64_t quote_bits, uint64_t &prev_iter_ends_pseudo_pred) {
-  // mask off anything inside quotes
-  structurals &= ~quote_mask;
-  // add the real quote bits back into our bit_mask as well, so we can
-  // quickly traverse the strings we've spent all this trouble gathering
-  structurals |= quote_bits;
-  // Now, establish "pseudo-structural characters". These are non-whitespace
-  // characters that are (a) outside quotes and (b) have a predecessor that's
-  // either whitespace or a structural character. This means that subsequent
-  // passes will get a chance to encounter the first character of every string
-  // of non-whitespace and, if we're parsing an atom like true/false/null or a
-  // number we can stop at the first whitespace or structural character
-  // following it.
+//
+// Determine which characters are *structural*:
+// - braces: [] and {}
+// - the start of primitives (123, true, false, null)
+// - the start of invalid non-whitespace (+, &, ture, UTF-8)
+//
+// Also detects value sequence errors:
+// - two values with no separator between ("hello" "world")
+// - separators with no values ([1,] [1,,] and [,2])
+//
+// This method will find all of the above whether it is in a string or not.
+//
+// To reduce dependency on the expensive "what is in a string" computation, this method treats the
+// contents of a string the same as content outside. Errors and structurals inside the string or on
+// the trailing quote will need to be removed later when the correct string information is known.
+//
+really_inline uint64_t find_potential_structurals(const simd_input<ARCHITECTURE> in, uint64_t &prev_primitive) {
+  // These use SIMD so let's kick them off before running the regular 64-bit stuff ...
+  uint64_t whitespace, op;
+  find_whitespace_and_operators(in, whitespace, op);

-  // a qualified predecessor is something that can happen 1 position before an
-  // pseudo-structural character
-  uint64_t pseudo_pred = structurals | whitespace;
+  // Detect the start of a run of primitive characters. Includes numbers, booleans, and strings (").
+  // Everything except whitespace, braces, colon and comma.
+  const uint64_t primitive = ~(op | whitespace);
+  const uint64_t follows_primitive = follows(primitive, prev_primitive);
+  const uint64_t start_primitive = primitive & ~follows_primitive;

-  uint64_t shifted_pseudo_pred =
-      (pseudo_pred << 1) | prev_iter_ends_pseudo_pred;
-  prev_iter_ends_pseudo_pred = pseudo_pred >> 63;
-  uint64_t pseudo_structurals =
-      shifted_pseudo_pred & (~whitespace) & (~quote_mask);
-  structurals |= pseudo_structurals;
-
-  // now, we've used our close quotes all we need to. So let's switch them off
-  // they will be off in the quote mask and on in quote bits.
-  structurals &= ~(quote_bits & ~quote_mask);
-  return structurals;
+  // Return final structurals
+  return op | start_primitive;
 }
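The start-of-run computation is easiest to see on a concrete mask: a primitive byte begins a run exactly when the byte before it was not primitive, which costs one shift and two ANDs. A scalar sketch mirroring follows(), with an illustrative input:

    #include <cstdint>
    // bytes:  1 2 3 , t r u e     primitive = 0b11110111 (low bit = byte 0)
    uint64_t start_of_primitives(uint64_t primitive, uint64_t &prev_primitive) {
      uint64_t follows_primitive = primitive << 1 | prev_primitive;
      prev_primitive = primitive >> 63;
      return primitive & ~follows_primitive; // 0b00010001: only 't' and '1'
    }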
-// Find structural bits in a 64-byte chunk.
-really_inline void find_structural_bits_64(
-    const uint8_t *buf, size_t idx, uint32_t *base_ptr, uint32_t &base,
-    uint64_t &prev_iter_ends_odd_backslash, uint64_t &prev_iter_inside_quote,
-    uint64_t &prev_iter_ends_pseudo_pred, uint64_t &structurals,
-    uint64_t &error_mask,
+static const size_t STEP_SIZE = 128;
+
+//
+// Find the important bits of JSON in a 128-byte chunk, and add them to base_ptr.
+//
+// PERF NOTES:
+// We pipe 2 inputs through these stages:
+// 1. Load JSON into registers. This takes a long time and is highly parallelizable, so we load
+//    2 inputs' worth at once so that by the time step 2 is looking for the input, it's available.
+// 2. Scan the JSON for critical data: strings, primitives and operators. This is the critical path.
+//    The final output depends entirely on this information. These functions don't quite use
+//    up enough CPU: the second half of the functions is highly serial, only using 1 execution core
+//    at a time. The second input's scans have some dependency on the first one's finishing, but
+//    they can make a lot of progress before they need that information.
+// 3. Step 1 doesn't use enough capacity, so we run some extra stuff while we're waiting for that
+//    to finish: utf-8 checks and generating the output from the last iteration.
+//
+// The reason we run 2 inputs at a time is that steps 2 and 3 are *still* not enough to soak up all
+// available capacity with just one input. Running 2 at a time seems to give the CPU a good enough
+// workout.
+//
+really_inline void find_structural_bits_128(
+    const uint8_t *buf, const size_t idx, uint32_t *&base_ptr,
+    uint64_t &prev_escaped, uint64_t &prev_in_string,
+    uint64_t &prev_primitive,
+    uint64_t &prev_structurals,
+    uint64_t &unescaped_chars_error,
     utf8_checker &utf8_state) {
-  simd_input<ARCHITECTURE> in(buf);
-  utf8_state.check_next_input(in);
-  /* detect odd sequences of backslashes */
-  uint64_t odd_ends = find_odd_backslash_sequences(
-      in, prev_iter_ends_odd_backslash);
+  //
+  // Load up all 128 bytes into SIMD registers
+  //
+  simd_input<ARCHITECTURE> in_1(buf);
+  simd_input<ARCHITECTURE> in_2(buf+64);

-  /* detect insides of quote pairs ("quote_mask") and also our quote_bits
-   * themselves */
-  uint64_t quote_bits;
-  uint64_t quote_mask = find_quote_mask_and_bits(
-      in, odd_ends, prev_iter_inside_quote, quote_bits, error_mask);
+  //
+  // Find the strings and potential structurals (operators / primitives).
+  //
+  // This will include false structurals that are *inside* strings--we'll filter strings out
+  // before we return.
+  //
+  uint64_t string_1 = find_strings(in_1, prev_escaped, prev_in_string);
+  uint64_t structurals_1 = find_potential_structurals(in_1, prev_primitive);
+  uint64_t string_2 = find_strings(in_2, prev_escaped, prev_in_string);
+  uint64_t structurals_2 = find_potential_structurals(in_2, prev_primitive);

-  /* take the previous iterations structural bits, not our current
-   * iteration,
-   * and flatten */
-  flatten_bits(base_ptr, base, idx, structurals);
+  //
+  // Do miscellaneous work while the processor is busy calculating strings and structurals.
+  //
+  // After that, weed out structurals that are inside strings and find invalid string characters.
+  //
+  uint64_t unescaped_1 = in_1.lteq(0x1F);
+  utf8_state.check_next_input(in_1);
+  flatten_bits(base_ptr, idx, prev_structurals); // Output *last* iteration's structurals to ParsedJson
+  prev_structurals = structurals_1 & ~string_1;
+  unescaped_chars_error |= unescaped_1 & string_1;

-  uint64_t whitespace;
-  find_whitespace_and_structurals(in, whitespace, structurals);
-
-  /* fixup structurals to reflect quotes and add pseudo-structural
-   * characters */
-  structurals = finalize_structurals(structurals, whitespace, quote_mask,
-                                     quote_bits, prev_iter_ends_pseudo_pred);
+  uint64_t unescaped_2 = in_2.lteq(0x1F);
+  utf8_state.check_next_input(in_2);
+  flatten_bits(base_ptr, idx+64, prev_structurals); // Output *last* iteration's structurals to ParsedJson
+  prev_structurals = structurals_2 & ~string_2;
+  unescaped_chars_error |= unescaped_2 & string_2;
 }

 int find_structural_bits(const uint8_t *buf, size_t len,
                          simdjson::ParsedJson &pj) {
-  if (len > pj.byte_capacity) {
+  if (unlikely(len > pj.byte_capacity)) {
     std::cerr << "Your ParsedJson object only supports documents up to "
               << pj.byte_capacity << " bytes but you are trying to process "
               << len << " bytes" << std::endl;
     return simdjson::CAPACITY;
   }
   uint32_t *base_ptr = pj.structural_indexes;
-  uint32_t base = 0;
   utf8_checker utf8_state;

-  /* we have padded the input out to 64 byte multiple with the remainder
-   * being zeros persistent state across loop does the last iteration end
-   * with an odd-length sequence of backslashes? */
-
-  /* either 0 or 1, but a 64-bit value */
-  uint64_t prev_iter_ends_odd_backslash = 0ULL;
-  /* does the previous iteration end inside a double-quote pair?
*/ - uint64_t prev_iter_inside_quote = - 0ULL; /* either all zeros or all ones - * does the previous iteration end on something that is a - * predecessor of a pseudo-structural character - i.e. - * whitespace or a structural character effectively the very - * first char is considered to follow "whitespace" for the - * purposes of pseudo-structural character detection so we - * initialize to 1 */ - uint64_t prev_iter_ends_pseudo_pred = 1ULL; - - /* structurals are persistent state across loop as we flatten them on the - * subsequent iteration into our array pointed to be base_ptr. - * This is harmless on the first iteration as structurals==0 - * and is done for performance reasons; we can hide some of the latency of - * the - * expensive carryless multiply in the previous step with this work */ + // Whether the first character of the next iteration is escaped. + uint64_t prev_escaped = 0ULL; + // Whether the last iteration was still inside a string (all 1's = true, all 0's = false). + uint64_t prev_in_string = 0ULL; + // Whether the last character of the previous iteration is a primitive value character + // (anything except whitespace, braces, comma or colon). + uint64_t prev_primitive = 0ULL; + // Mask of structural characters from the last iteration. + // Kept around for performance reasons, so we can call flatten_bits to soak up some unused + // CPU capacity while the next iteration is busy with an expensive clmul in compute_quote_mask. uint64_t structurals = 0; - size_t lenminus64 = len < 64 ? 0 : len - 64; + size_t lenminusstep = len < STEP_SIZE ? 0 : len - STEP_SIZE; size_t idx = 0; - uint64_t error_mask = 0; /* for unescaped characters within strings (ASCII - code points < 0x20) */ + // Errors with unescaped characters in strings (ASCII codepoints < 0x20) + uint64_t unescaped_chars_error = 0; - for (; idx < lenminus64; idx += 64) { - find_structural_bits_64(&buf[idx], idx, base_ptr, base, - prev_iter_ends_odd_backslash, - prev_iter_inside_quote, prev_iter_ends_pseudo_pred, - structurals, error_mask, utf8_state); + for (; idx < lenminusstep; idx += STEP_SIZE) { + find_structural_bits_128(&buf[idx], idx, base_ptr, + prev_escaped, prev_in_string, prev_primitive, + structurals, unescaped_chars_error, utf8_state); } + /* If we have a final chunk of less than 64 bytes, pad it to 64 with * spaces before processing it (otherwise, we risk invalidating the UTF-8 * checks). */ - if (idx < len) { - uint8_t tmp_buf[64]; - memset(tmp_buf, 0x20, 64); + if (likely(idx < len)) { + uint8_t tmp_buf[STEP_SIZE]; + memset(tmp_buf, 0x20, STEP_SIZE); memcpy(tmp_buf, buf + idx, len - idx); - find_structural_bits_64(&tmp_buf[0], idx, base_ptr, base, - prev_iter_ends_odd_backslash, - prev_iter_inside_quote, prev_iter_ends_pseudo_pred, - structurals, error_mask, utf8_state); - idx += 64; + find_structural_bits_128(&tmp_buf[0], idx, base_ptr, + prev_escaped, prev_in_string, prev_primitive, + structurals, unescaped_chars_error, utf8_state); + idx += STEP_SIZE; } - /* is last string quote closed? 
*/ - if (prev_iter_inside_quote) { - return simdjson::UNCLOSED_STRING; + /* finally, flatten out the remaining structurals from the last iteration */ + flatten_bits(base_ptr, idx, structurals); + + simdjson::ErrorValues error = detect_errors_on_eof(unescaped_chars_error, prev_in_string); + if (unlikely(error != simdjson::SUCCESS)) { + return error; } - /* finally, flatten out the remaining structurals from the last iteration - */ - flatten_bits(base_ptr, base, idx, structurals); - - pj.n_structural_indexes = base; + pj.n_structural_indexes = base_ptr - pj.structural_indexes; /* a valid JSON file cannot have zero structural indexes - we should have * found something */ - if (pj.n_structural_indexes == 0u) { + if (unlikely(pj.n_structural_indexes == 0u)) { return simdjson::EMPTY; } - if (base_ptr[pj.n_structural_indexes - 1] > len) { + if (unlikely(pj.structural_indexes[pj.n_structural_indexes - 1] > len)) { return simdjson::UNEXPECTED_ERROR; } - if (len != base_ptr[pj.n_structural_indexes - 1]) { + if (len != pj.structural_indexes[pj.n_structural_indexes - 1]) { /* the string might not be NULL terminated, but we add a virtual NULL * ending character. */ - base_ptr[pj.n_structural_indexes++] = len; + pj.structural_indexes[pj.n_structural_indexes++] = len; } /* make it safe to dereference one beyond this array */ - base_ptr[pj.n_structural_indexes] = 0; - if (error_mask) { - return simdjson::UNESCAPED_CHARS; - } + pj.structural_indexes[pj.n_structural_indexes] = 0; return utf8_state.errors(); } diff --git a/src/generic/stage1_find_marks_flatten.h b/src/generic/stage1_find_marks_flatten.h index 9583759f..3159229a 100644 --- a/src/generic/stage1_find_marks_flatten.h +++ b/src/generic/stage1_find_marks_flatten.h @@ -26,64 +26,42 @@ really_inline void flatten_bits(uint32_t *base_ptr, uint32_t &base, uint32_t idx // base_ptr[base] incrementing base as we go // will potentially store extra values beyond end of valid bits, so base_ptr // needs to be large enough to handle this -really_inline void flatten_bits(uint32_t *base_ptr, uint32_t &base, uint32_t idx, uint64_t bits) { +really_inline void flatten_bits(uint32_t *&base_ptr, uint32_t idx, uint64_t bits) { // In some instances, the next branch is expensive because it is mispredicted. // Unfortunately, in other cases, // it helps tremendously. if (bits == 0) return; uint32_t cnt = hamming(bits); - uint32_t next_base = base + cnt; idx -= 64; - base_ptr += base; - { - base_ptr[0] = idx + trailing_zeroes(bits); + + // Do the first 8 all together + for (int i=0; i<8; i++) { + base_ptr[i] = idx + trailing_zeroes(bits); bits = bits & (bits - 1); - base_ptr[1] = idx + trailing_zeroes(bits); - bits = bits & (bits - 1); - base_ptr[2] = idx + trailing_zeroes(bits); - bits = bits & (bits - 1); - base_ptr[3] = idx + trailing_zeroes(bits); - bits = bits & (bits - 1); - base_ptr[4] = idx + trailing_zeroes(bits); - bits = bits & (bits - 1); - base_ptr[5] = idx + trailing_zeroes(bits); - bits = bits & (bits - 1); - base_ptr[6] = idx + trailing_zeroes(bits); - bits = bits & (bits - 1); - base_ptr[7] = idx + trailing_zeroes(bits); - bits = bits & (bits - 1); - base_ptr += 8; } - // We hope that the next branch is easily predicted. 
- if (cnt > 8) { - base_ptr[0] = idx + trailing_zeroes(bits); - bits = bits & (bits - 1); - base_ptr[1] = idx + trailing_zeroes(bits); - bits = bits & (bits - 1); - base_ptr[2] = idx + trailing_zeroes(bits); - bits = bits & (bits - 1); - base_ptr[3] = idx + trailing_zeroes(bits); - bits = bits & (bits - 1); - base_ptr[4] = idx + trailing_zeroes(bits); - bits = bits & (bits - 1); - base_ptr[5] = idx + trailing_zeroes(bits); - bits = bits & (bits - 1); - base_ptr[6] = idx + trailing_zeroes(bits); - bits = bits & (bits - 1); - base_ptr[7] = idx + trailing_zeroes(bits); - bits = bits & (bits - 1); - base_ptr += 8; - } - if (cnt > 16) { // unluckly: we rarely get here - // since it means having one structural or pseudo-structral element - // every 4 characters (possible with inputs like "","","",...). - do { - base_ptr[0] = idx + trailing_zeroes(bits); + + // Do the next 8 all together (we hope in most cases it won't happen at all + // and the branch is easily predicted). + if (unlikely(cnt > 8)) { + for (int i=8; i<16; i++) { + base_ptr[i] = idx + trailing_zeroes(bits); bits = bits & (bits - 1); - base_ptr++; - } while (bits != 0); + } + + // Most files don't have 16+ structurals per block, so we take several basically guaranteed + // branch mispredictions here. 16+ structurals per block means either punctuation ({} [] , :) + // or the start of a value ("abc" true 123) every 4 characters. + if (unlikely(cnt > 16)) { + uint32_t i = 16; + do { + base_ptr[i] = idx + trailing_zeroes(bits); + bits = bits & (bits - 1); + i++; + } while (i < cnt); + } } - base = next_base; + + base_ptr += cnt; } #endif // SIMDJSON_NAIVE_FLATTEN diff --git a/src/haswell/simd_input.h b/src/haswell/simd_input.h index c9b8a5f7..58680ddb 100644 --- a/src/haswell/simd_input.h +++ b/src/haswell/simd_input.h @@ -10,29 +10,28 @@ namespace simdjson { template <> struct simd_input { - __m256i chunks[2]; + const __m256i chunks[2]; + + really_inline simd_input() : chunks{__m256i(), __m256i()} {} + + really_inline simd_input(const __m256i chunk0, const __m256i chunk1) + : chunks{chunk0, chunk1} {} really_inline simd_input(const uint8_t *ptr) - { - this->chunks[0] = _mm256_loadu_si256(reinterpret_cast(ptr + 0*32)); - this->chunks[1] = _mm256_loadu_si256(reinterpret_cast(ptr + 1*32)); - } - - really_inline simd_input(__m256i chunk0, __m256i chunk1) - { - this->chunks[0] = chunk0; - this->chunks[1] = chunk1; - } + : chunks{ + _mm256_loadu_si256(reinterpret_cast(ptr + 0*32)), + _mm256_loadu_si256(reinterpret_cast(ptr + 1*32)) + } {} template - really_inline void each(F const& each_chunk) + really_inline void each(F const& each_chunk) const { each_chunk(this->chunks[0]); each_chunk(this->chunks[1]); } template - really_inline simd_input map(F const& map_chunk) { + really_inline simd_input map(F const& map_chunk) const { return simd_input( map_chunk(this->chunks[0]), map_chunk(this->chunks[1]) @@ -40,7 +39,7 @@ struct simd_input { } template - really_inline simd_input map(simd_input b, F const& map_chunk) { + really_inline simd_input map(const simd_input b, F const& map_chunk) const { return simd_input( map_chunk(this->chunks[0], b.chunks[0]), map_chunk(this->chunks[1], b.chunks[1]) @@ -48,24 +47,31 @@ struct simd_input { } template - really_inline __m256i reduce(F const& reduce_pair) { + really_inline __m256i reduce(F const& reduce_pair) const { return reduce_pair(this->chunks[0], this->chunks[1]); } - really_inline uint64_t to_bitmask() { + really_inline uint64_t to_bitmask() const { uint64_t r_lo = 
static_cast<uint32_t>(_mm256_movemask_epi8(this->chunks[0]));
    uint64_t r_hi = _mm256_movemask_epi8(this->chunks[1]);
    return r_lo | (r_hi << 32);
  }

-  really_inline uint64_t eq(uint8_t m) {
+  really_inline simd_input bit_or(const uint8_t m) const {
+    const __m256i mask = _mm256_set1_epi8(m);
+    return this->map( [&](auto a) {
+      return _mm256_or_si256(a, mask);
+    });
+  }
+
+  really_inline uint64_t eq(const uint8_t m) const {
     const __m256i mask = _mm256_set1_epi8(m);
     return this->map( [&](auto a) {
       return _mm256_cmpeq_epi8(a, mask);
     }).to_bitmask();
   }

-  really_inline uint64_t lteq(uint8_t m) {
+  really_inline uint64_t lteq(const uint8_t m) const {
     const __m256i maxval = _mm256_set1_epi8(m);
     return this->map( [&](auto a) {
       return _mm256_cmpeq_epi8(_mm256_max_epu8(maxval, a), maxval);
diff --git a/src/haswell/simdutf8check.h b/src/haswell/simdutf8check.h
index e0b993d4..2ebada18 100644
--- a/src/haswell/simdutf8check.h
+++ b/src/haswell/simdutf8check.h
@@ -218,7 +218,7 @@ struct utf8_checker {
     __m256i any_bits_on = in.reduce([&](auto a, auto b) {
       return _mm256_or_si256(a, b);
     });
-    if ((_mm256_testz_si256(any_bits_on, high_bit)) == 1) {
+    if (likely(_mm256_testz_si256(any_bits_on, high_bit) == 1)) {
       // it is ascii, we just check continuation
       this->has_error = _mm256_or_si256(
           _mm256_cmpgt_epi8(this->previous.carried_continuations,
diff --git a/src/haswell/stage1_find_marks.h b/src/haswell/stage1_find_marks.h
index 88579cb2..b89dacd4 100644
--- a/src/haswell/stage1_find_marks.h
+++ b/src/haswell/stage1_find_marks.h
@@ -13,7 +13,7 @@ TARGET_HASWELL
 namespace simdjson::haswell {

-really_inline uint64_t compute_quote_mask(uint64_t quote_bits) {
+really_inline uint64_t compute_quote_mask(const uint64_t quote_bits) {
   // There should be no such thing with a processor supporting avx2
   // but not clmul.
uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128( @@ -21,8 +21,9 @@ really_inline uint64_t compute_quote_mask(uint64_t quote_bits) { return quote_mask; } -really_inline void find_whitespace_and_structurals(simd_input in, - uint64_t &whitespace, uint64_t &structurals) { +really_inline void find_whitespace_and_operators( + const simd_input in, + uint64_t &whitespace, uint64_t &op) { #ifdef SIMDJSON_NAIVE_STRUCTURAL @@ -34,14 +35,14 @@ really_inline void find_whitespace_and_structurals(simd_input in, const __m256i mask_close_bracket = _mm256_set1_epi8(0x5d); const __m256i mask_column = _mm256_set1_epi8(0x3a); const __m256i mask_comma = _mm256_set1_epi8(0x2c); - structurals = in.map([&](auto in) { - __m256i structurals = _mm256_cmpeq_epi8(in, mask_open_brace); - structurals = _mm256_or_si256(structurals, _mm256_cmpeq_epi8(in, mask_close_brace)); - structurals = _mm256_or_si256(structurals, _mm256_cmpeq_epi8(in, mask_open_bracket)); - structurals = _mm256_or_si256(structurals, _mm256_cmpeq_epi8(in, mask_close_bracket)); - structurals = _mm256_or_si256(structurals, _mm256_cmpeq_epi8(in, mask_column)); - structurals = _mm256_or_si256(structurals, _mm256_cmpeq_epi8(in, mask_comma)); - return structurals; + op = in.map([&](auto in) { + __m256i op = _mm256_cmpeq_epi8(in, mask_open_brace); + op = _mm256_or_si256(op, _mm256_cmpeq_epi8(in, mask_close_brace)); + op = _mm256_or_si256(op, _mm256_cmpeq_epi8(in, mask_open_bracket)); + op = _mm256_or_si256(op, _mm256_cmpeq_epi8(in, mask_close_bracket)); + op = _mm256_or_si256(op, _mm256_cmpeq_epi8(in, mask_column)); + op = _mm256_or_si256(op, _mm256_cmpeq_epi8(in, mask_comma)); + return op; }).to_bitmask(); const __m256i mask_space = _mm256_set1_epi8(0x20); @@ -60,24 +61,24 @@ really_inline void find_whitespace_and_structurals(simd_input in, #else // SIMDJSON_NAIVE_STRUCTURAL // clang-format off - const __m256i structural_table = + const __m256i operator_table = _mm256_setr_epi8(44, 125, 0, 0, 0xc0u, 0, 0, 0, 0, 0, 0, 0, 0, 0, 58, 123, 44, 125, 0, 0, 0xc0u, 0, 0, 0, 0, 0, 0, 0, 0, 0, 58, 123); const __m256i white_table = _mm256_setr_epi8( 32, 100, 100, 100, 17, 100, 113, 2, 100, 9, 10, 112, 100, 13, 100, 100, 32, 100, 100, 100, 17, 100, 113, 2, 100, 9, 10, 112, 100, 13, 100, 100); // clang-format on - const __m256i struct_offset = _mm256_set1_epi8(0xd4u); - const __m256i struct_mask = _mm256_set1_epi8(32); + const __m256i op_offset = _mm256_set1_epi8(0xd4u); + const __m256i op_mask = _mm256_set1_epi8(32); whitespace = in.map([&](auto _in) { return _mm256_cmpeq_epi8(_in, _mm256_shuffle_epi8(white_table, _in)); }).to_bitmask(); - structurals = in.map([&](auto _in) { - const __m256i r1 = _mm256_add_epi8(struct_offset, _in); - const __m256i r2 = _mm256_or_si256(_in, struct_mask); - const __m256i r3 = _mm256_shuffle_epi8(structural_table, r1); + op = in.map([&](auto _in) { + const __m256i r1 = _mm256_add_epi8(op_offset, _in); + const __m256i r2 = _mm256_or_si256(_in, op_mask); + const __m256i r3 = _mm256_shuffle_epi8(operator_table, r1); return _mm256_cmpeq_epi8(r2, r3); }).to_bitmask(); @@ -89,65 +90,43 @@ really_inline void find_whitespace_and_structurals(simd_input in, // base_ptr[base] incrementing base as we go // will potentially store extra values beyond end of valid bits, so base_ptr // needs to be large enough to handle this -really_inline void flatten_bits(uint32_t *base_ptr, uint32_t &base, uint32_t idx, uint64_t bits) { +really_inline void flatten_bits(uint32_t *&base_ptr, uint32_t idx, uint64_t bits) { // In some instances, the next 
branch is expensive because it is mispredicted. // Unfortunately, in other cases, // it helps tremendously. if (bits == 0) return; uint32_t cnt = _mm_popcnt_u64(bits); - uint32_t next_base = base + cnt; idx -= 64; - base_ptr += base; - { - base_ptr[0] = idx + trailing_zeroes(bits); - bits = _blsr_u64(bits); - base_ptr[1] = idx + trailing_zeroes(bits); - bits = _blsr_u64(bits); - base_ptr[2] = idx + trailing_zeroes(bits); - bits = _blsr_u64(bits); - base_ptr[3] = idx + trailing_zeroes(bits); - bits = _blsr_u64(bits); - base_ptr[4] = idx + trailing_zeroes(bits); - bits = _blsr_u64(bits); - base_ptr[5] = idx + trailing_zeroes(bits); - bits = _blsr_u64(bits); - base_ptr[6] = idx + trailing_zeroes(bits); - bits = _blsr_u64(bits); - base_ptr[7] = idx + trailing_zeroes(bits); - bits = _blsr_u64(bits); - base_ptr += 8; + + // Do the first 8 all together + for (int i=0; i<8; i++) { + base_ptr[i] = idx + trailing_zeroes(bits); + bits = _blsr_u64(bits); } - // We hope that the next branch is easily predicted. - if (cnt > 8) { - base_ptr[0] = idx + trailing_zeroes(bits); + + // Do the next 8 all together (we hope in most cases it won't happen at all + // and the branch is easily predicted). + if (unlikely(cnt > 8)) { + for (int i=8; i<16; i++) { + base_ptr[i] = idx + trailing_zeroes(bits); bits = _blsr_u64(bits); - base_ptr[1] = idx + trailing_zeroes(bits); - bits = _blsr_u64(bits); - base_ptr[2] = idx + trailing_zeroes(bits); - bits = _blsr_u64(bits); - base_ptr[3] = idx + trailing_zeroes(bits); - bits = _blsr_u64(bits); - base_ptr[4] = idx + trailing_zeroes(bits); - bits = _blsr_u64(bits); - base_ptr[5] = idx + trailing_zeroes(bits); - bits = _blsr_u64(bits); - base_ptr[6] = idx + trailing_zeroes(bits); - bits = _blsr_u64(bits); - base_ptr[7] = idx + trailing_zeroes(bits); - bits = _blsr_u64(bits); - base_ptr += 8; - } - if (cnt > 16) { // unluckly: we rarely get here - // since it means having one structural or pseudo-structral element - // every 4 characters (possible with inputs like "","","",...). + } + + // Most files don't have 16+ structurals per block, so we take several basically guaranteed + // branch mispredictions here. 16+ structurals per block means either punctuation ({} [] , :) + // or the start of a value ("abc" true 123) every four characters. 
+ if (unlikely(cnt > 16)) { + uint32_t i = 16; do { - base_ptr[0] = idx + trailing_zeroes(bits); - bits = _blsr_u64(bits); - base_ptr++; - } while (bits != 0); + base_ptr[i] = idx + trailing_zeroes(bits); + bits = _blsr_u64(bits); + i++; + } while (i < cnt); + } } - base = next_base; + + base_ptr += cnt; } #include "generic/stage1_find_marks.h" diff --git a/src/westmere/simd_input.h b/src/westmere/simd_input.h index 7c9a1b6d..2978eae0 100644 --- a/src/westmere/simd_input.h +++ b/src/westmere/simd_input.h @@ -10,26 +10,24 @@ namespace simdjson { template <> struct simd_input { - __m128i chunks[4]; + const __m128i chunks[4]; - really_inline simd_input(const uint8_t *ptr) { - this->chunks[0] = _mm_loadu_si128(reinterpret_cast(ptr + 0)); - this->chunks[1] = _mm_loadu_si128(reinterpret_cast(ptr + 16)); - this->chunks[2] = _mm_loadu_si128(reinterpret_cast(ptr + 32)); - this->chunks[3] = _mm_loadu_si128(reinterpret_cast(ptr + 48)); - } + really_inline simd_input() + : chunks { __m128i(), __m128i(), __m128i(), __m128i() } {} - really_inline simd_input(__m128i i0, __m128i i1, __m128i i2, __m128i i3) - { - this->chunks[0] = i0; - this->chunks[1] = i1; - this->chunks[2] = i2; - this->chunks[3] = i3; - } + really_inline simd_input(const __m128i chunk0, const __m128i chunk1, const __m128i chunk2, const __m128i chunk3) + : chunks{chunk0, chunk1, chunk2, chunk3} {} + + really_inline simd_input(const uint8_t *ptr) + : simd_input( + _mm_loadu_si128(reinterpret_cast(ptr + 0)), + _mm_loadu_si128(reinterpret_cast(ptr + 16)), + _mm_loadu_si128(reinterpret_cast(ptr + 32)), + _mm_loadu_si128(reinterpret_cast(ptr + 48)) + ) {} template - really_inline void each(F const& each_chunk) - { + really_inline void each(F const& each_chunk) const { each_chunk(this->chunks[0]); each_chunk(this->chunks[1]); each_chunk(this->chunks[2]); @@ -37,7 +35,7 @@ struct simd_input { } template - really_inline simd_input map(F const& map_chunk) { + really_inline simd_input map(F const& map_chunk) const { return simd_input( map_chunk(this->chunks[0]), map_chunk(this->chunks[1]), @@ -47,7 +45,7 @@ struct simd_input { } template - really_inline simd_input map(simd_input b, F const& map_chunk) { + really_inline simd_input map(const simd_input b, F const& map_chunk) const { return simd_input( map_chunk(this->chunks[0], b.chunks[0]), map_chunk(this->chunks[1], b.chunks[1]), @@ -57,13 +55,13 @@ struct simd_input { } template - really_inline __m128i reduce(F const& reduce_pair) { + really_inline __m128i reduce(F const& reduce_pair) const { __m128i r01 = reduce_pair(this->chunks[0], this->chunks[1]); __m128i r23 = reduce_pair(this->chunks[2], this->chunks[3]); return reduce_pair(r01, r23); } - really_inline uint64_t to_bitmask() { + really_inline uint64_t to_bitmask() const { uint64_t r0 = static_cast(_mm_movemask_epi8(this->chunks[0])); uint64_t r1 = _mm_movemask_epi8(this->chunks[1]); uint64_t r2 = _mm_movemask_epi8(this->chunks[2]); @@ -71,14 +69,21 @@ struct simd_input { return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48); } - really_inline uint64_t eq(uint8_t m) { + really_inline simd_input bit_or(const uint8_t m) const { + const __m128i mask = _mm_set1_epi8(m); + return this->map( [&](auto a) { + return _mm_or_si128(a, mask); + }); + } + + really_inline uint64_t eq(const uint8_t m) const { const __m128i mask = _mm_set1_epi8(m); return this->map( [&](auto a) { return _mm_cmpeq_epi8(a, mask); }).to_bitmask(); } - really_inline uint64_t lteq(uint8_t m) { + really_inline uint64_t lteq(const uint8_t m) const { const __m128i maxval = 
_mm_set1_epi8(m);
    return this->map( [&](auto a) {
      return _mm_cmpeq_epi8(_mm_max_epu8(maxval, a), maxval);
diff --git a/src/westmere/stage1_find_marks.h b/src/westmere/stage1_find_marks.h
index 71e5a440..7a5db2ee 100644
--- a/src/westmere/stage1_find_marks.h
+++ b/src/westmere/stage1_find_marks.h
@@ -13,29 +13,30 @@ TARGET_WESTMERE
 namespace simdjson::westmere {

-really_inline uint64_t compute_quote_mask(uint64_t quote_bits) {
+really_inline uint64_t compute_quote_mask(const uint64_t quote_bits) {
   return _mm_cvtsi128_si64(_mm_clmulepi64_si128(
       _mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFFu), 0));
 }

-really_inline void find_whitespace_and_structurals(simd_input<Architecture::WESTMERE> in,
-    uint64_t &whitespace, uint64_t &structurals) {
+really_inline void find_whitespace_and_operators(
+    const simd_input<Architecture::WESTMERE> in,
+    uint64_t &whitespace, uint64_t &op) {

-  const __m128i structural_table =
+  const __m128i operator_table =
       _mm_setr_epi8(44, 125, 0, 0, 0xc0u, 0, 0, 0, 0, 0, 0, 0, 0, 0, 58, 123);
   const __m128i white_table = _mm_setr_epi8(32, 100, 100, 100, 17, 100, 113, 2,
                                             100, 9, 10, 112, 100, 13, 100, 100);
-  const __m128i struct_offset = _mm_set1_epi8(0xd4u);
-  const __m128i struct_mask = _mm_set1_epi8(32);
+  const __m128i op_offset = _mm_set1_epi8(0xd4u);
+  const __m128i op_mask = _mm_set1_epi8(32);

   whitespace = in.map([&](auto _in) {
     return _mm_cmpeq_epi8(_in, _mm_shuffle_epi8(white_table, _in));
   }).to_bitmask();

-  structurals = in.map([&](auto _in) {
-    const __m128i r1 = _mm_add_epi8(struct_offset, _in);
-    const __m128i r2 = _mm_or_si128(_in, struct_mask);
-    const __m128i r3 = _mm_shuffle_epi8(structural_table, r1);
+  op = in.map([&](auto _in) {
+    const __m128i r1 = _mm_add_epi8(op_offset, _in);
+    const __m128i r2 = _mm_or_si128(_in, op_mask);
+    const __m128i r3 = _mm_shuffle_epi8(operator_table, r1);
     return _mm_cmpeq_epi8(r2, r3);
   }).to_bitmask();
 }
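The operator table above is a tiny perfect hash: adding 0xd4 maps each of the six operator bytes , : [ ] { } to a distinct low nibble, the shuffle looks the candidate up (pshufb returns 0 when the index's high bit is set), and byte|0x20 folds [ and ] onto { and } so a single comparison confirms the hit. A scalar model of one lane, for illustration only:

    #include <cstdint>
    #include <cstdio>
    static const uint8_t operator_table[16] = {44, 125, 0, 0, 0xc0, 0, 0, 0,
                                               0,  0,   0, 0, 0,    0, 58, 123};
    static bool is_operator(uint8_t c) {
      uint8_t r1 = static_cast<uint8_t>(c + 0xd4);               // distinct nibble per operator
      uint8_t r3 = (r1 & 0x80) ? 0 : operator_table[r1 & 0x0F];  // pshufb semantics
      return r3 == (c | 0x20);                                   // folds 0x5b/0x5d onto 0x7b/0x7d
    }
    int main() {
      for (unsigned char c : {'{', '}', '[', ']', ':', ',', 'a', ' ', '"'})
        printf("0x%02x -> %d\n", c, is_operator(c));
    }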