Combined performance patch (5% overall, 15% stage 1) (#317)

* Allow -f

* Support parse -s (force sse)

* Simplify flatten_bits

- Add directly to base instead of storing variable
- Don't modify base_ptr after beginning of function
- Eliminate base variable and increment base_ptr instead

* De-unroll the flatten_bits loops

* Decrease dependencies in stage 1

- Do all finalize_structurals work before computing the quote mask; mask
  out the quote mask later
- Join find_whitespace_and_structurals and finalize_structurals into
  single find_structurals call, to reduce variable leakage
- Rework pseudo_pred algorithm to refer to "primitive" for clarity and some
  dependency reduction
- Rename quote_mask to in_string to describe what we're trying to
  achieve ("mask" could mean many things)
- Break up find_quote_mask_and_bits into find_quote_mask and
  invalid_string_bytes to reduce data leakage (i.e. don't expose quote bits
  or odd_ends at all to find_structural_bits)
- Genericize overflow methods "follows" and "follows_odd_sequence" for
  descriptiveness and possible lifting into a generic simd parsing library

* Mark branches as likely/unlikely

* Reorder and unroll+interleave stage 1 loop

* Nest the cnt > 16 branch inside cnt > 8
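The likely/unlikely hints mentioned above are the usual __builtin_expect branch annotations; their definitions are not part of this diff, but a minimal sketch (an assumption, for illustration only) looks like:

#if defined(__GNUC__) || defined(__clang__)
#define likely(x) __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)
#else
#define likely(x) (x)
#define unlikely(x) (x)
#endif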
John Keiser 2019-10-01 09:01:09 -07:00 committed by Daniel Lemire
parent 53b6deaeae
commit de8df0a05f
13 changed files with 405 additions and 363 deletions

.gitignore

@ -14,6 +14,7 @@
/jsoncheck
/jsonpointer
/jsonstats
/integer_tests
/libsimdjson.so*
/minify
/numberparsingcheck


@ -34,6 +34,18 @@
#include "simdjson/parsedjson.h"
#include "simdjson/stage1_find_marks.h"
#include "simdjson/stage2_build_tape.h"
// Global arguments
bool find_marks_only = false;
bool verbose = false;
bool dump = false;
bool json_output = false;
bool force_one_iteration = false;
bool just_data = false;
bool force_sse = false;
int32_t iterations = -1;
int32_t warmup_iterations = -1;
namespace simdjson {
Architecture _find_best_supported_implementation() {
constexpr uint32_t haswell_flags =
@ -43,7 +55,7 @@ Architecture _find_best_supported_implementation() {
instruction_set::SSE42 | instruction_set::PCLMULQDQ;
uint32_t supports = detect_supported_architectures();
// Order from best to worst (within architecture)
if ((haswell_flags & supports) == haswell_flags) {
if ((haswell_flags & supports) == haswell_flags && !force_sse) {
return Architecture::HASWELL;
}
if ((westmere_flags & supports) == westmere_flags) {
@ -63,6 +75,9 @@ extern unified_functype *unified_ptr;
extern stage1_functype *stage1_ptr;
int unified_machine_dispatch(const uint8_t *buf, size_t len, ParsedJson &pj) {
if (find_marks_only) {
return simdjson::SUCCESS;
}
Architecture best_implementation = _find_best_supported_implementation();
// Selecting the best implementation
switch (best_implementation) {
@ -118,18 +133,11 @@ unified_functype *unified_ptr = &unified_machine_dispatch;
} // namespace simdjson
int main(int argc, char *argv[]) {
bool verbose = false;
bool dump = false;
bool json_output = false;
bool force_one_iteration = false;
bool just_data = false;
int32_t iterations = -1;
int32_t warmup_iterations = -1;
#ifndef _MSC_VER
int c;
while ((c = getopt(argc, argv, "1vdtn:w:")) != -1) {
while ((c = getopt(argc, argv, "1vdtn:w:fs")) != -1) {
switch (c) {
case 'n':
iterations = atoi(optarg);
@ -137,6 +145,9 @@ int main(int argc, char *argv[]) {
case 'w':
warmup_iterations = atoi(optarg);
break;
case 's':
force_sse = true;
break;
case 't':
just_data = true;
break;
@ -152,6 +163,9 @@ int main(int argc, char *argv[]) {
case '1':
force_one_iteration = true;
break;
case 'f':
find_marks_only = true;
break;
default:
abort();
}
@ -326,7 +340,7 @@ int main(int argc, char *argv[]) {
isok = (simdjson::stage1_ptr((const uint8_t *)p.data(), p.size(), pj) ==
simdjson::SUCCESS);
isok = isok &&
(simdjson::SUCCESS ==
(simdjson::SUCCESS ==
simdjson::unified_ptr((const uint8_t *)p.data(), p.size(), pj));
auto end = std::chrono::steady_clock::now();
std::chrono::duration<double> secs = end - start;
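With the new flags wired into getopt, a typical invocation (the file name here is only a placeholder) would be:

./parse -f -s -n 100 some.json

-f stops after stage 1 (unified_machine_dispatch returns immediately when find_marks_only is set), -s keeps the dispatcher from selecting the Haswell kernel even when AVX2 is available, and -n sets the iteration count.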


@ -17,6 +17,17 @@
#define SIMDJSON_PADDING 32
#endif
#if defined(__GNUC__)
// Marks a block with a name so that MCA analysis can see it.
#define BEGIN_DEBUG_BLOCK(name) __asm volatile("# LLVM-MCA-BEGIN " #name);
#define END_DEBUG_BLOCK(name) __asm volatile("# LLVM-MCA-END " #name);
#define DEBUG_BLOCK(name, block) BEGIN_DEBUG_BLOCK(name); block; END_DEBUG_BLOCK(name);
#else
#define BEGIN_DEBUG_BLOCK(name)
#define END_DEBUG_BLOCK(name)
#define DEBUG_BLOCK(name, block)
#endif
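// Illustrative use of these macros (not part of this diff): wrap a region of interest, e.g.
//
//   DEBUG_BLOCK(flatten, flatten_bits(base_ptr, idx, structurals);)
//
// then compile with -S and run llvm-mca on the assembly; the LLVM-MCA-BEGIN/END markers make
// the tool report throughput and port pressure for just that region.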
#ifndef _MSC_VER
// Implemented using Labels as Values which works in GCC and CLANG (and maybe
// also in Intel's compiler), but won't work in MSVC.


@ -29,5 +29,5 @@ make parse
make perfdiff
echo "Running perfdiff:"
echo ./perfdiff \"$current/parse -t $perftests\" \"$reference/parse -t $perftests\"
./perfdiff "$current/parse -t $perftests" "$reference/parse -t $perftests"
echo ./perfdiff \"$current/parse -t $perftests $CHECKPERF_ARGS\" \"$reference/parse -t $perftests $CHECKPERF_ARGS\"
./perfdiff "$current/parse -t $perftests $CHECKPERF_ARGS" "$reference/parse -t $perftests $CHECKPERF_ARGS"


@ -40,25 +40,24 @@ using namespace simdjson::arm64;
template <>
struct simd_input<Architecture::ARM64> {
uint8x16_t chunks[4];
const uint8x16_t chunks[4];
really_inline simd_input(const uint8_t *ptr) {
this->chunks[0] = vld1q_u8(ptr + 0*16);
this->chunks[1] = vld1q_u8(ptr + 1*16);
this->chunks[2] = vld1q_u8(ptr + 2*16);
this->chunks[3] = vld1q_u8(ptr + 3*16);
}
really_inline simd_input()
: chunks{uint8x16_t(), uint8x16_t(), uint8x16_t(), uint8x16_t() } {}
really_inline simd_input(uint8x16_t chunk0, uint8x16_t chunk1, uint8x16_t chunk2, uint8x16_t chunk3) {
this->chunks[0] = chunk0;
this->chunks[1] = chunk1;
this->chunks[2] = chunk2;
this->chunks[3] = chunk3;
}
really_inline simd_input(const uint8x16_t chunk0, const uint8x16_t chunk1, const uint8x16_t chunk2, const uint8x16_t chunk3)
: chunks{chunk0, chunk1, chunk2, chunk3 } {}
really_inline simd_input(const uint8_t *ptr)
: chunks{
vld1q_u8(ptr + 0*16),
vld1q_u8(ptr + 1*16),
vld1q_u8(ptr + 2*16),
vld1q_u8(ptr + 3*16)
} {}
template <typename F>
really_inline void each(F const& each_chunk)
{
really_inline void each(F const& each_chunk) const {
each_chunk(this->chunks[0]);
each_chunk(this->chunks[1]);
each_chunk(this->chunks[2]);
@ -66,7 +65,7 @@ struct simd_input<Architecture::ARM64> {
}
template <typename F>
really_inline simd_input<Architecture::ARM64> map(F const& map_chunk) {
really_inline simd_input<Architecture::ARM64> map(F const& map_chunk) const {
return simd_input<Architecture::ARM64>(
map_chunk(this->chunks[0]),
map_chunk(this->chunks[1]),
@ -76,7 +75,7 @@ struct simd_input<Architecture::ARM64> {
}
template <typename F>
really_inline simd_input<Architecture::ARM64> map(simd_input<Architecture::ARM64> b, F const& map_chunk) {
really_inline simd_input<Architecture::ARM64> map(simd_input<Architecture::ARM64> b, F const& map_chunk) const {
return simd_input<Architecture::ARM64>(
map_chunk(this->chunks[0], b.chunks[0]),
map_chunk(this->chunks[1], b.chunks[1]),
@ -86,24 +85,31 @@ struct simd_input<Architecture::ARM64> {
}
template <typename F>
really_inline uint8x16_t reduce(F const& reduce_pair) {
really_inline uint8x16_t reduce(F const& reduce_pair) const {
uint8x16_t r01 = reduce_pair(this->chunks[0], this->chunks[1]);
uint8x16_t r23 = reduce_pair(this->chunks[2], this->chunks[3]);
return reduce_pair(r01, r23);
}
really_inline uint64_t to_bitmask() {
really_inline uint64_t to_bitmask() const {
return neon_movemask_bulk(this->chunks[0], this->chunks[1], this->chunks[2], this->chunks[3]);
}
really_inline uint64_t eq(uint8_t m) {
really_inline simd_input<Architecture::ARM64> bit_or(const uint8_t m) const {
const uint8x16_t mask = vmovq_n_u8(m);
return this->map( [&](auto a) {
return vorrq_u8(a, mask);
});
}
really_inline uint64_t eq(const uint8_t m) const {
const uint8x16_t mask = vmovq_n_u8(m);
return this->map( [&](auto a) {
return vceqq_u8(a, mask);
}).to_bitmask();
}
really_inline uint64_t lteq(uint8_t m) {
really_inline uint64_t lteq(const uint8_t m) const {
const uint8x16_t mask = vmovq_n_u8(m);
return this->map( [&](auto a) {
return vcleq_u8(a, mask);


@ -12,7 +12,7 @@
namespace simdjson::arm64 {
really_inline uint64_t compute_quote_mask(uint64_t quote_bits) {
really_inline uint64_t compute_quote_mask(const uint64_t quote_bits) {
#ifdef __ARM_FEATURE_CRYPTO // some ARM processors lack this extension
return vmull_p64(-1ULL, quote_bits);
@ -21,9 +21,9 @@ really_inline uint64_t compute_quote_mask(uint64_t quote_bits) {
#endif
}
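// Worked example (illustration only, not part of the diff): if quote_bits has bits 1 and 5 set,
// the carry-less multiply by all-ones is a prefix XOR, so the result has bits 1-4 set: the
// opening quote and the string contents are covered, but not the closing quote.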
really_inline void find_whitespace_and_structurals(
simd_input<ARCHITECTURE> in, uint64_t &whitespace,
uint64_t &structurals) {
really_inline void find_whitespace_and_operators(
const simd_input<ARCHITECTURE> in,
uint64_t &whitespace, uint64_t &op) {
const uint8x16_t low_nibble_mask =
(uint8x16_t){16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0};
const uint8x16_t high_nibble_mask =
@ -38,9 +38,9 @@ really_inline void find_whitespace_and_structurals(
return vandq_u8(shuf_lo, shuf_hi);
});
const uint8x16_t structural_shufti_mask = vmovq_n_u8(0x7);
structurals = v.map([&](auto _v) {
return vtstq_u8(_v, structural_shufti_mask);
const uint8x16_t operator_shufti_mask = vmovq_n_u8(0x7);
op = v.map([&](auto _v) {
return vtstq_u8(_v, operator_shufti_mask);
}).to_bitmask();
const uint8x16_t whitespace_shufti_mask = vmovq_n_u8(0x18);


@ -12,230 +12,271 @@
// indicate whether we end an iteration on an odd-length sequence of
// backslashes, which modifies our subsequent search for odd-length
// sequences of backslashes in an obvious way.
really_inline uint64_t find_odd_backslash_sequences(
simd_input<ARCHITECTURE> in,
uint64_t &prev_iter_ends_odd_backslash) {
really_inline uint64_t follows_odd_sequence_of(const uint64_t match, uint64_t &overflow) {
const uint64_t even_bits = 0x5555555555555555ULL;
const uint64_t odd_bits = ~even_bits;
uint64_t bs_bits = in.eq('\\');
uint64_t start_edges = bs_bits & ~(bs_bits << 1);
uint64_t start_edges = match & ~(match << 1);
/* flip lowest if we have an odd-length run at the end of the prior
* iteration */
uint64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash;
uint64_t even_start_mask = even_bits ^ overflow;
uint64_t even_starts = start_edges & even_start_mask;
uint64_t odd_starts = start_edges & ~even_start_mask;
uint64_t even_carries = bs_bits + even_starts;
uint64_t even_carries = match + even_starts;
uint64_t odd_carries;
/* must record the carry-out of our odd-carries out of bit 63; this
* indicates whether the sense of any edge going to the next iteration
* should be flipped */
bool iter_ends_odd_backslash =
add_overflow(bs_bits, odd_starts, &odd_carries);
bool new_overflow = add_overflow(match, odd_starts, &odd_carries);
odd_carries |= prev_iter_ends_odd_backslash; /* push in bit zero as a
* potential end if we had an
* odd-numbered run at the
* end of the previous
* iteration */
prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL;
uint64_t even_carry_ends = even_carries & ~bs_bits;
uint64_t odd_carry_ends = odd_carries & ~bs_bits;
odd_carries |= overflow; /* push in bit zero as a
* potential end if we had an
* odd-numbered run at the
* end of the previous
* iteration */
overflow = new_overflow ? 0x1ULL : 0x0ULL;
uint64_t even_carry_ends = even_carries & ~match;
uint64_t odd_carry_ends = odd_carries & ~match;
uint64_t even_start_odd_end = even_carry_ends & odd_bits;
uint64_t odd_start_even_end = odd_carry_ends & even_bits;
uint64_t odd_ends = even_start_odd_end | odd_start_even_end;
return odd_ends;
}
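// Worked example (illustration only, not part of the diff, with overflow initially 0): for the
// input  a\"  the backslash mask has only bit 1 set (an odd-length run), and the returned mask
// has bit 2 set, marking the quote as escaped. For  \\"  the run length is 2 (even), bit 2 stays
// clear, and the quote is a real string delimiter.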
// return both the quote mask (which is a half-open mask that covers the first
// quote
// in an unescaped quote pair and everything in the quote pair) and the quote
// bits, which are the simple
// unescaped quoted bits. We also update the prev_iter_inside_quote value to
// tell the next iteration
// whether we finished the final iteration inside a quote pair; if so, this
// inverts our behavior of
// whether we're inside quotes for the next iteration.
// Note that we don't do any error checking to see if we have backslash
// sequences outside quotes; these
// backslash sequences (of any length) will be detected elsewhere.
really_inline uint64_t find_quote_mask_and_bits(
simd_input<ARCHITECTURE> in, uint64_t odd_ends,
uint64_t &prev_iter_inside_quote, uint64_t &quote_bits,
uint64_t &error_mask) {
quote_bits = in.eq('"');
quote_bits = quote_bits & ~odd_ends;
uint64_t quote_mask = compute_quote_mask(quote_bits);
quote_mask ^= prev_iter_inside_quote;
//
// Check if the current character immediately follows a matching character.
//
// For example, this checks for quotes with backslashes in front of them:
//
// const uint64_t backslashed_quote = in.eq('"') & follows(in.eq('\\'), prev_backslash);
//
really_inline uint64_t follows(const uint64_t match, uint64_t &overflow) {
const uint64_t result = match << 1 | overflow;
overflow = match >> 63;
return result;
}
//
// Check if the current character follows a matching character, with possible "filler" between.
// For example, this checks for empty curly braces, e.g.
//
// in.eq('}') & follows(in.eq('{'), in.eq(' '), prev_empty_array) // { <whitespace>* }
//
really_inline uint64_t follows(const uint64_t match, const uint64_t filler, uint64_t &overflow ) {
uint64_t follows_match = follows(match, overflow);
uint64_t result;
overflow |= add_overflow(follows_match, filler, &result);
return result;
}
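// Worked example (illustration only, not part of the diff): with match = 0b0100 and overflow = 0,
// follows() returns 0b1000 (position 3 follows the match at position 2) and leaves overflow at 0;
// if bit 63 of match were set, it would be shifted out of the result but overflow would become 1,
// so bit 0 of the next block gets marked. In the filler overload the addition lets the carry
// ripple through a run of filler bits, marking the first position past the run.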
really_inline ErrorValues detect_errors_on_eof(
uint64_t &unescaped_chars_error,
const uint64_t prev_in_string) {
if (prev_in_string) {
return UNCLOSED_STRING;
}
if (unescaped_chars_error) {
return UNESCAPED_CHARS;
}
return SUCCESS;
}
//
// Return a mask of all string characters plus end quotes.
//
// prev_escaped is overflow state indicating whether the next character is escaped.
// prev_in_string is overflow state indicating whether we are still inside a string.
//
// Backslash sequences outside of quotes will be detected in stage 2.
//
really_inline uint64_t find_strings(const simd_input<ARCHITECTURE> in, uint64_t &prev_escaped, uint64_t &prev_in_string) {
const uint64_t backslash = in.eq('\\');
const uint64_t escaped = follows_odd_sequence_of(backslash, prev_escaped);
const uint64_t quote = in.eq('"') & ~escaped;
// compute_quote_mask returns start quote plus string contents.
const uint64_t in_string = compute_quote_mask(quote) ^ prev_in_string;
/* right shift of a signed value expected to be well-defined and standard
* compliant as of C++20,
* John Regehr from Utah U. says this is fine code */
prev_in_string = static_cast<uint64_t>(static_cast<int64_t>(in_string) >> 63);
// Use ^ to turn the beginning quote off, and the end quote on.
return in_string ^ quote;
}
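// Worked example (illustration only, not part of the diff): for the input  x"ab"y  with no
// preceding string or backslashes, quote has bits 1 and 4, compute_quote_mask yields bits 1-3
// (opening quote plus "ab"), and the final XOR with quote gives bits 2-4: the string contents
// plus the closing quote, exactly the "string characters plus end quotes" described above.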
really_inline uint64_t invalid_string_bytes(const uint64_t unescaped, const uint64_t quote_mask) {
/* All Unicode characters may be placed within the
* quotation marks, except for the characters that MUST be escaped:
* quotation mark, reverse solidus, and the control characters (U+0000
* through U+001F).
* https://tools.ietf.org/html/rfc8259 */
uint64_t unescaped = in.lteq(0x1F);
error_mask |= quote_mask & unescaped;
/* right shift of a signed value expected to be well-defined and standard
* compliant as of C++20,
* John Regher from Utah U. says this is fine code */
prev_iter_inside_quote =
static_cast<uint64_t>(static_cast<int64_t>(quote_mask) >> 63);
return quote_mask;
return quote_mask & unescaped;
}
really_inline uint64_t finalize_structurals(
uint64_t structurals, uint64_t whitespace, uint64_t quote_mask,
uint64_t quote_bits, uint64_t &prev_iter_ends_pseudo_pred) {
// mask off anything inside quotes
structurals &= ~quote_mask;
// add the real quote bits back into our bit_mask as well, so we can
// quickly traverse the strings we've spent all this trouble gathering
structurals |= quote_bits;
// Now, establish "pseudo-structural characters". These are non-whitespace
// characters that are (a) outside quotes and (b) have a predecessor that's
// either whitespace or a structural character. This means that subsequent
// passes will get a chance to encounter the first character of every string
// of non-whitespace and, if we're parsing an atom like true/false/null or a
// number we can stop at the first whitespace or structural character
// following it.
//
// Determine which characters are *structural*:
// - operators: [] {} , and :
// - the start of primitives (123, true, false, null)
// - the start of invalid non-whitespace (+, &, ture, UTF-8)
//
// Also detects value sequence errors:
// - two values with no separator between ("hello" "world")
// - separators with no values ([1,] [1,,] and [,2])
//
// This method will find all of the above whether it is in a string or not.
//
// To reduce dependency on the expensive "what is in a string" computation, this method treats the
// contents of a string the same as content outside. Errors and structurals inside the string or on
// the trailing quote will need to be removed later when the correct string information is known.
//
really_inline uint64_t find_potential_structurals(const simd_input<ARCHITECTURE> in, uint64_t &prev_primitive) {
// These use SIMD so let's kick them off before running the regular 64-bit stuff ...
uint64_t whitespace, op;
find_whitespace_and_operators(in, whitespace, op);
// a qualified predecessor is something that can happen 1 position before an
// pseudo-structural character
uint64_t pseudo_pred = structurals | whitespace;
// Detect the start of a run of primitive characters. Includes numbers, booleans, and strings (").
// Everything except whitespace, braces, colon and comma.
const uint64_t primitive = ~(op | whitespace);
const uint64_t follows_primitive = follows(primitive, prev_primitive);
const uint64_t start_primitive = primitive & ~follows_primitive;
uint64_t shifted_pseudo_pred =
(pseudo_pred << 1) | prev_iter_ends_pseudo_pred;
prev_iter_ends_pseudo_pred = pseudo_pred >> 63;
uint64_t pseudo_structurals =
shifted_pseudo_pred & (~whitespace) & (~quote_mask);
structurals |= pseudo_structurals;
// now, we've used our close quotes all we need to. So let's switch them off
// they will be off in the quote mask and on in quote bits.
structurals &= ~(quote_bits & ~quote_mask);
return structurals;
// Return final structurals
return op | start_primitive;
}
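// Worked example (illustration only, not part of the diff, with prev_primitive = 0): for the
// input  [1, true]  the op mask covers '[', ',' and ']', whitespace covers the space, and every
// other byte is primitive. start_primitive keeps only '1' and 't' (primitives that do not follow
// another primitive), so the returned structurals are '[', '1', ',', 't' and ']'.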
// Find structural bits in a 64-byte chunk.
really_inline void find_structural_bits_64(
const uint8_t *buf, size_t idx, uint32_t *base_ptr, uint32_t &base,
uint64_t &prev_iter_ends_odd_backslash, uint64_t &prev_iter_inside_quote,
uint64_t &prev_iter_ends_pseudo_pred, uint64_t &structurals,
uint64_t &error_mask,
static const size_t STEP_SIZE = 128;
//
// Find the important bits of JSON in a 128-byte chunk and add them to the structural index (base_ptr).
//
// PERF NOTES:
// We pipe 2 inputs through these stages:
// 1. Load JSON into registers. This takes a long time and is highly parallelizable, so we load
// 2 inputs' worth at once so that by the time step 2 is looking for the input, it's available.
// 2. Scan the JSON for critical data: strings, primitives and operators. This is the critical path.
// The output of stage 1 depends entirely on this information. These functions don't quite use
// up enough CPU: the second half of each is highly serial, only using 1 execution core
// at a time. The second input's scan has some dependency on the first one finishing, but
// it can make a lot of progress before it needs that information.
// 3. Step 1 doesn't use up enough capacity on its own, so we run some extra work while we're
// waiting for it to finish: UTF-8 checks and generating the output from the last iteration.
//
// The reason we run 2 inputs at a time is that steps 2 and 3 are *still* not enough to soak up all
// available capacity with just one input. Running 2 at a time seems to give the CPU a good enough
// workout.
//
really_inline void find_structural_bits_128(
const uint8_t *buf, const size_t idx, uint32_t *&base_ptr,
uint64_t &prev_escaped, uint64_t &prev_in_string,
uint64_t &prev_primitive,
uint64_t &prev_structurals,
uint64_t &unescaped_chars_error,
utf8_checker<ARCHITECTURE> &utf8_state) {
simd_input<ARCHITECTURE> in(buf);
utf8_state.check_next_input(in);
/* detect odd sequences of backslashes */
uint64_t odd_ends = find_odd_backslash_sequences(
in, prev_iter_ends_odd_backslash);
//
// Load up all 128 bytes into SIMD registers
//
simd_input<ARCHITECTURE> in_1(buf);
simd_input<ARCHITECTURE> in_2(buf+64);
/* detect insides of quote pairs ("quote_mask") and also our quote_bits
* themselves */
uint64_t quote_bits;
uint64_t quote_mask = find_quote_mask_and_bits(
in, odd_ends, prev_iter_inside_quote, quote_bits, error_mask);
//
// Find the strings and potential structurals (operators / primitives).
//
// This will include false structurals that are *inside* strings--we'll filter strings out
// before we return.
//
uint64_t string_1 = find_strings(in_1, prev_escaped, prev_in_string);
uint64_t structurals_1 = find_potential_structurals(in_1, prev_primitive);
uint64_t string_2 = find_strings(in_2, prev_escaped, prev_in_string);
uint64_t structurals_2 = find_potential_structurals(in_2, prev_primitive);
/* take the previous iterations structural bits, not our current
* iteration,
* and flatten */
flatten_bits(base_ptr, base, idx, structurals);
//
// Do miscellaneous work while the processor is busy calculating strings and structurals.
//
// After that, weed out structurals that are inside strings and find invalid string characters.
//
uint64_t unescaped_1 = in_1.lteq(0x1F);
utf8_state.check_next_input(in_1);
flatten_bits(base_ptr, idx, prev_structurals); // Output *last* iteration's structurals to ParsedJson
prev_structurals = structurals_1 & ~string_1;
unescaped_chars_error |= unescaped_1 & string_1;
uint64_t whitespace;
find_whitespace_and_structurals(in, whitespace, structurals);
/* fixup structurals to reflect quotes and add pseudo-structural
* characters */
structurals = finalize_structurals(structurals, whitespace, quote_mask,
quote_bits, prev_iter_ends_pseudo_pred);
uint64_t unescaped_2 = in_2.lteq(0x1F);
utf8_state.check_next_input(in_2);
flatten_bits(base_ptr, idx+64, prev_structurals); // Output *last* iteration's structurals to ParsedJson
prev_structurals = structurals_2 & ~string_2;
unescaped_chars_error |= unescaped_2 & string_2;
}
int find_structural_bits(const uint8_t *buf, size_t len, simdjson::ParsedJson &pj) {
if (len > pj.byte_capacity) {
if (unlikely(len > pj.byte_capacity)) {
std::cerr << "Your ParsedJson object only supports documents up to "
<< pj.byte_capacity << " bytes but you are trying to process "
<< len << " bytes" << std::endl;
return simdjson::CAPACITY;
}
uint32_t *base_ptr = pj.structural_indexes;
uint32_t base = 0;
utf8_checker<ARCHITECTURE> utf8_state;
/* we have padded the input out to 64 byte multiple with the remainder
* being zeros persistent state across loop does the last iteration end
* with an odd-length sequence of backslashes? */
/* either 0 or 1, but a 64-bit value */
uint64_t prev_iter_ends_odd_backslash = 0ULL;
/* does the previous iteration end inside a double-quote pair? */
uint64_t prev_iter_inside_quote =
0ULL; /* either all zeros or all ones
* does the previous iteration end on something that is a
* predecessor of a pseudo-structural character - i.e.
* whitespace or a structural character effectively the very
* first char is considered to follow "whitespace" for the
* purposes of pseudo-structural character detection so we
* initialize to 1 */
uint64_t prev_iter_ends_pseudo_pred = 1ULL;
/* structurals are persistent state across loop as we flatten them on the
* subsequent iteration into our array pointed to be base_ptr.
* This is harmless on the first iteration as structurals==0
* and is done for performance reasons; we can hide some of the latency of
* the
* expensive carryless multiply in the previous step with this work */
// Whether the first character of the next iteration is escaped.
uint64_t prev_escaped = 0ULL;
// Whether the last iteration was still inside a string (all 1's = true, all 0's = false).
uint64_t prev_in_string = 0ULL;
// Whether the last character of the previous iteration is a primitive value character
// (anything except whitespace, braces, comma or colon).
uint64_t prev_primitive = 0ULL;
// Mask of structural characters from the last iteration.
// Kept around for performance reasons, so we can call flatten_bits to soak up some unused
// CPU capacity while the next iteration is busy with an expensive clmul in compute_quote_mask.
uint64_t structurals = 0;
size_t lenminus64 = len < 64 ? 0 : len - 64;
size_t lenminusstep = len < STEP_SIZE ? 0 : len - STEP_SIZE;
size_t idx = 0;
uint64_t error_mask = 0; /* for unescaped characters within strings (ASCII
code points < 0x20) */
// Errors with unescaped characters in strings (ASCII codepoints < 0x20)
uint64_t unescaped_chars_error = 0;
for (; idx < lenminus64; idx += 64) {
find_structural_bits_64(&buf[idx], idx, base_ptr, base,
prev_iter_ends_odd_backslash,
prev_iter_inside_quote, prev_iter_ends_pseudo_pred,
structurals, error_mask, utf8_state);
for (; idx < lenminusstep; idx += STEP_SIZE) {
find_structural_bits_128(&buf[idx], idx, base_ptr,
prev_escaped, prev_in_string, prev_primitive,
structurals, unescaped_chars_error, utf8_state);
}
/* If we have a final chunk of less than 64 bytes, pad it to 64 with
* spaces before processing it (otherwise, we risk invalidating the UTF-8
* checks). */
if (idx < len) {
uint8_t tmp_buf[64];
memset(tmp_buf, 0x20, 64);
if (likely(idx < len)) {
uint8_t tmp_buf[STEP_SIZE];
memset(tmp_buf, 0x20, STEP_SIZE);
memcpy(tmp_buf, buf + idx, len - idx);
find_structural_bits_64(&tmp_buf[0], idx, base_ptr, base,
prev_iter_ends_odd_backslash,
prev_iter_inside_quote, prev_iter_ends_pseudo_pred,
structurals, error_mask, utf8_state);
idx += 64;
find_structural_bits_128(&tmp_buf[0], idx, base_ptr,
prev_escaped, prev_in_string, prev_primitive,
structurals, unescaped_chars_error, utf8_state);
idx += STEP_SIZE;
}
/* is last string quote closed? */
if (prev_iter_inside_quote) {
return simdjson::UNCLOSED_STRING;
/* finally, flatten out the remaining structurals from the last iteration */
flatten_bits(base_ptr, idx, structurals);
simdjson::ErrorValues error = detect_errors_on_eof(unescaped_chars_error, prev_in_string);
if (unlikely(error != simdjson::SUCCESS)) {
return error;
}
/* finally, flatten out the remaining structurals from the last iteration
*/
flatten_bits(base_ptr, base, idx, structurals);
pj.n_structural_indexes = base;
pj.n_structural_indexes = base_ptr - pj.structural_indexes;
/* a valid JSON file cannot have zero structural indexes - we should have
* found something */
if (pj.n_structural_indexes == 0u) {
if (unlikely(pj.n_structural_indexes == 0u)) {
return simdjson::EMPTY;
}
if (base_ptr[pj.n_structural_indexes - 1] > len) {
if (unlikely(pj.structural_indexes[pj.n_structural_indexes - 1] > len)) {
return simdjson::UNEXPECTED_ERROR;
}
if (len != base_ptr[pj.n_structural_indexes - 1]) {
if (len != pj.structural_indexes[pj.n_structural_indexes - 1]) {
/* the string might not be NULL terminated, but we add a virtual NULL
* ending character. */
base_ptr[pj.n_structural_indexes++] = len;
pj.structural_indexes[pj.n_structural_indexes++] = len;
}
/* make it safe to dereference one beyond this array */
base_ptr[pj.n_structural_indexes] = 0;
if (error_mask) {
return simdjson::UNESCAPED_CHARS;
}
pj.structural_indexes[pj.n_structural_indexes] = 0;
return utf8_state.errors();
}
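A minimal sketch of how a caller consumes the result (an assumption, not part of this diff; pj must already be allocated with capacity for len bytes):

if (find_structural_bits(buf, len, pj) == simdjson::SUCCESS) {
  // pj.structural_indexes[0 .. n_structural_indexes) holds byte offsets of structural characters
  for (uint32_t i = 0; i < pj.n_structural_indexes; i++) {
    uint32_t offset = pj.structural_indexes[i];
    (void)offset; // e.g. hand these offsets to stage 2
  }
}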


@ -26,64 +26,42 @@ really_inline void flatten_bits(uint32_t *base_ptr, uint32_t &base, uint32_t idx
// base_ptr[base] incrementing base as we go
// will potentially store extra values beyond end of valid bits, so base_ptr
// needs to be large enough to handle this
really_inline void flatten_bits(uint32_t *base_ptr, uint32_t &base, uint32_t idx, uint64_t bits) {
really_inline void flatten_bits(uint32_t *&base_ptr, uint32_t idx, uint64_t bits) {
// In some instances, the next branch is expensive because it is mispredicted.
// Unfortunately, in other cases,
// it helps tremendously.
if (bits == 0)
return;
uint32_t cnt = hamming(bits);
uint32_t next_base = base + cnt;
idx -= 64;
base_ptr += base;
{
base_ptr[0] = idx + trailing_zeroes(bits);
// Do the first 8 all together
for (int i=0; i<8; i++) {
base_ptr[i] = idx + trailing_zeroes(bits);
bits = bits & (bits - 1);
base_ptr[1] = idx + trailing_zeroes(bits);
bits = bits & (bits - 1);
base_ptr[2] = idx + trailing_zeroes(bits);
bits = bits & (bits - 1);
base_ptr[3] = idx + trailing_zeroes(bits);
bits = bits & (bits - 1);
base_ptr[4] = idx + trailing_zeroes(bits);
bits = bits & (bits - 1);
base_ptr[5] = idx + trailing_zeroes(bits);
bits = bits & (bits - 1);
base_ptr[6] = idx + trailing_zeroes(bits);
bits = bits & (bits - 1);
base_ptr[7] = idx + trailing_zeroes(bits);
bits = bits & (bits - 1);
base_ptr += 8;
}
// We hope that the next branch is easily predicted.
if (cnt > 8) {
base_ptr[0] = idx + trailing_zeroes(bits);
bits = bits & (bits - 1);
base_ptr[1] = idx + trailing_zeroes(bits);
bits = bits & (bits - 1);
base_ptr[2] = idx + trailing_zeroes(bits);
bits = bits & (bits - 1);
base_ptr[3] = idx + trailing_zeroes(bits);
bits = bits & (bits - 1);
base_ptr[4] = idx + trailing_zeroes(bits);
bits = bits & (bits - 1);
base_ptr[5] = idx + trailing_zeroes(bits);
bits = bits & (bits - 1);
base_ptr[6] = idx + trailing_zeroes(bits);
bits = bits & (bits - 1);
base_ptr[7] = idx + trailing_zeroes(bits);
bits = bits & (bits - 1);
base_ptr += 8;
}
if (cnt > 16) { // unluckly: we rarely get here
// since it means having one structural or pseudo-structral element
// every 4 characters (possible with inputs like "","","",...).
do {
base_ptr[0] = idx + trailing_zeroes(bits);
// Do the next 8 all together (we hope in most cases it won't happen at all
// and the branch is easily predicted).
if (unlikely(cnt > 8)) {
for (int i=8; i<16; i++) {
base_ptr[i] = idx + trailing_zeroes(bits);
bits = bits & (bits - 1);
base_ptr++;
} while (bits != 0);
}
// Most files don't have 16+ structurals per block, so we take several basically guaranteed
// branch mispredictions here. 16+ structurals per block means either punctuation ({} [] , :)
// or the start of a value ("abc" true 123) every 4 characters.
if (unlikely(cnt > 16)) {
uint32_t i = 16;
do {
base_ptr[i] = idx + trailing_zeroes(bits);
bits = bits & (bits - 1);
i++;
} while (i < cnt);
}
}
base = next_base;
base_ptr += cnt;
}
#endif // SIMDJSON_NAIVE_FLATTEN
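As a concrete illustration (not part of the diff): if bits has set bits at positions 2, 5 and 63 and idx is 128, the function first rewinds idx to 64 (these are the previous block's structurals), writes 66, 69 and 127 into the first three slots, unconditionally fills the rest of the first group of 8 with junk values, and then advances base_ptr by cnt = 3 so the junk is overwritten on the next call. This is why base_ptr must point at a buffer with slack beyond the last valid index.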


@ -10,29 +10,28 @@ namespace simdjson {
template <>
struct simd_input<Architecture::HASWELL> {
__m256i chunks[2];
const __m256i chunks[2];
really_inline simd_input() : chunks{__m256i(), __m256i()} {}
really_inline simd_input(const __m256i chunk0, const __m256i chunk1)
: chunks{chunk0, chunk1} {}
really_inline simd_input(const uint8_t *ptr)
{
this->chunks[0] = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr + 0*32));
this->chunks[1] = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr + 1*32));
}
really_inline simd_input(__m256i chunk0, __m256i chunk1)
{
this->chunks[0] = chunk0;
this->chunks[1] = chunk1;
}
: chunks{
_mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr + 0*32)),
_mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr + 1*32))
} {}
template <typename F>
really_inline void each(F const& each_chunk)
really_inline void each(F const& each_chunk) const
{
each_chunk(this->chunks[0]);
each_chunk(this->chunks[1]);
}
template <typename F>
really_inline simd_input<Architecture::HASWELL> map(F const& map_chunk) {
really_inline simd_input<Architecture::HASWELL> map(F const& map_chunk) const {
return simd_input<Architecture::HASWELL>(
map_chunk(this->chunks[0]),
map_chunk(this->chunks[1])
@ -40,7 +39,7 @@ struct simd_input<Architecture::HASWELL> {
}
template <typename F>
really_inline simd_input<Architecture::HASWELL> map(simd_input<Architecture::HASWELL> b, F const& map_chunk) {
really_inline simd_input<Architecture::HASWELL> map(const simd_input<Architecture::HASWELL> b, F const& map_chunk) const {
return simd_input<Architecture::HASWELL>(
map_chunk(this->chunks[0], b.chunks[0]),
map_chunk(this->chunks[1], b.chunks[1])
@ -48,24 +47,31 @@ struct simd_input<Architecture::HASWELL> {
}
template <typename F>
really_inline __m256i reduce(F const& reduce_pair) {
really_inline __m256i reduce(F const& reduce_pair) const {
return reduce_pair(this->chunks[0], this->chunks[1]);
}
really_inline uint64_t to_bitmask() {
really_inline uint64_t to_bitmask() const {
uint64_t r_lo = static_cast<uint32_t>(_mm256_movemask_epi8(this->chunks[0]));
uint64_t r_hi = _mm256_movemask_epi8(this->chunks[1]);
return r_lo | (r_hi << 32);
}
really_inline uint64_t eq(uint8_t m) {
really_inline simd_input<Architecture::HASWELL> bit_or(const uint8_t m) const {
const __m256i mask = _mm256_set1_epi8(m);
return this->map( [&](auto a) {
return _mm256_or_si256(a, mask);
});
}
really_inline uint64_t eq(const uint8_t m) const {
const __m256i mask = _mm256_set1_epi8(m);
return this->map( [&](auto a) {
return _mm256_cmpeq_epi8(a, mask);
}).to_bitmask();
}
really_inline uint64_t lteq(uint8_t m) {
really_inline uint64_t lteq(const uint8_t m) const {
const __m256i maxval = _mm256_set1_epi8(m);
return this->map( [&](auto a) {
return _mm256_cmpeq_epi8(_mm256_max_epu8(maxval, a), maxval);


@ -218,7 +218,7 @@ struct utf8_checker<Architecture::HASWELL> {
__m256i any_bits_on = in.reduce([&](auto a, auto b) {
return _mm256_or_si256(a, b);
});
if ((_mm256_testz_si256(any_bits_on, high_bit)) == 1) {
if (likely(_mm256_testz_si256(any_bits_on, high_bit) == 1)) {
// it is ascii, we just check continuation
this->has_error = _mm256_or_si256(
_mm256_cmpgt_epi8(this->previous.carried_continuations,


@ -13,7 +13,7 @@
TARGET_HASWELL
namespace simdjson::haswell {
really_inline uint64_t compute_quote_mask(uint64_t quote_bits) {
really_inline uint64_t compute_quote_mask(const uint64_t quote_bits) {
// There should be no such thing as a processor supporting avx2
// but not clmul.
uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128(
@ -21,8 +21,9 @@ really_inline uint64_t compute_quote_mask(uint64_t quote_bits) {
return quote_mask;
}
really_inline void find_whitespace_and_structurals(simd_input<ARCHITECTURE> in,
uint64_t &whitespace, uint64_t &structurals) {
really_inline void find_whitespace_and_operators(
const simd_input<ARCHITECTURE> in,
uint64_t &whitespace, uint64_t &op) {
#ifdef SIMDJSON_NAIVE_STRUCTURAL
@ -34,14 +35,14 @@ really_inline void find_whitespace_and_structurals(simd_input<ARCHITECTURE> in,
const __m256i mask_close_bracket = _mm256_set1_epi8(0x5d);
const __m256i mask_column = _mm256_set1_epi8(0x3a);
const __m256i mask_comma = _mm256_set1_epi8(0x2c);
structurals = in.map([&](auto in) {
__m256i structurals = _mm256_cmpeq_epi8(in, mask_open_brace);
structurals = _mm256_or_si256(structurals, _mm256_cmpeq_epi8(in, mask_close_brace));
structurals = _mm256_or_si256(structurals, _mm256_cmpeq_epi8(in, mask_open_bracket));
structurals = _mm256_or_si256(structurals, _mm256_cmpeq_epi8(in, mask_close_bracket));
structurals = _mm256_or_si256(structurals, _mm256_cmpeq_epi8(in, mask_column));
structurals = _mm256_or_si256(structurals, _mm256_cmpeq_epi8(in, mask_comma));
return structurals;
op = in.map([&](auto in) {
__m256i op = _mm256_cmpeq_epi8(in, mask_open_brace);
op = _mm256_or_si256(op, _mm256_cmpeq_epi8(in, mask_close_brace));
op = _mm256_or_si256(op, _mm256_cmpeq_epi8(in, mask_open_bracket));
op = _mm256_or_si256(op, _mm256_cmpeq_epi8(in, mask_close_bracket));
op = _mm256_or_si256(op, _mm256_cmpeq_epi8(in, mask_column));
op = _mm256_or_si256(op, _mm256_cmpeq_epi8(in, mask_comma));
return op;
}).to_bitmask();
const __m256i mask_space = _mm256_set1_epi8(0x20);
@ -60,24 +61,24 @@ really_inline void find_whitespace_and_structurals(simd_input<ARCHITECTURE> in,
#else // SIMDJSON_NAIVE_STRUCTURAL
// clang-format off
const __m256i structural_table =
const __m256i operator_table =
_mm256_setr_epi8(44, 125, 0, 0, 0xc0u, 0, 0, 0, 0, 0, 0, 0, 0, 0, 58, 123,
44, 125, 0, 0, 0xc0u, 0, 0, 0, 0, 0, 0, 0, 0, 0, 58, 123);
const __m256i white_table = _mm256_setr_epi8(
32, 100, 100, 100, 17, 100, 113, 2, 100, 9, 10, 112, 100, 13, 100, 100,
32, 100, 100, 100, 17, 100, 113, 2, 100, 9, 10, 112, 100, 13, 100, 100);
// clang-format on
const __m256i struct_offset = _mm256_set1_epi8(0xd4u);
const __m256i struct_mask = _mm256_set1_epi8(32);
const __m256i op_offset = _mm256_set1_epi8(0xd4u);
const __m256i op_mask = _mm256_set1_epi8(32);
whitespace = in.map([&](auto _in) {
return _mm256_cmpeq_epi8(_in, _mm256_shuffle_epi8(white_table, _in));
}).to_bitmask();
structurals = in.map([&](auto _in) {
const __m256i r1 = _mm256_add_epi8(struct_offset, _in);
const __m256i r2 = _mm256_or_si256(_in, struct_mask);
const __m256i r3 = _mm256_shuffle_epi8(structural_table, r1);
op = in.map([&](auto _in) {
const __m256i r1 = _mm256_add_epi8(op_offset, _in);
const __m256i r2 = _mm256_or_si256(_in, op_mask);
const __m256i r3 = _mm256_shuffle_epi8(operator_table, r1);
return _mm256_cmpeq_epi8(r2, r3);
}).to_bitmask();
@ -89,65 +90,43 @@ really_inline void find_whitespace_and_structurals(simd_input<ARCHITECTURE> in,
// base_ptr[base] incrementing base as we go
// will potentially store extra values beyond end of valid bits, so base_ptr
// needs to be large enough to handle this
really_inline void flatten_bits(uint32_t *base_ptr, uint32_t &base, uint32_t idx, uint64_t bits) {
really_inline void flatten_bits(uint32_t *&base_ptr, uint32_t idx, uint64_t bits) {
// In some instances, the next branch is expensive because it is mispredicted.
// Unfortunately, in other cases,
// it helps tremendously.
if (bits == 0)
return;
uint32_t cnt = _mm_popcnt_u64(bits);
uint32_t next_base = base + cnt;
idx -= 64;
base_ptr += base;
{
base_ptr[0] = idx + trailing_zeroes(bits);
bits = _blsr_u64(bits);
base_ptr[1] = idx + trailing_zeroes(bits);
bits = _blsr_u64(bits);
base_ptr[2] = idx + trailing_zeroes(bits);
bits = _blsr_u64(bits);
base_ptr[3] = idx + trailing_zeroes(bits);
bits = _blsr_u64(bits);
base_ptr[4] = idx + trailing_zeroes(bits);
bits = _blsr_u64(bits);
base_ptr[5] = idx + trailing_zeroes(bits);
bits = _blsr_u64(bits);
base_ptr[6] = idx + trailing_zeroes(bits);
bits = _blsr_u64(bits);
base_ptr[7] = idx + trailing_zeroes(bits);
bits = _blsr_u64(bits);
base_ptr += 8;
// Do the first 8 all together
for (int i=0; i<8; i++) {
base_ptr[i] = idx + trailing_zeroes(bits);
bits = _blsr_u64(bits);
}
// We hope that the next branch is easily predicted.
if (cnt > 8) {
base_ptr[0] = idx + trailing_zeroes(bits);
// Do the next 8 all together (we hope in most cases it won't happen at all
// and the branch is easily predicted).
if (unlikely(cnt > 8)) {
for (int i=8; i<16; i++) {
base_ptr[i] = idx + trailing_zeroes(bits);
bits = _blsr_u64(bits);
base_ptr[1] = idx + trailing_zeroes(bits);
bits = _blsr_u64(bits);
base_ptr[2] = idx + trailing_zeroes(bits);
bits = _blsr_u64(bits);
base_ptr[3] = idx + trailing_zeroes(bits);
bits = _blsr_u64(bits);
base_ptr[4] = idx + trailing_zeroes(bits);
bits = _blsr_u64(bits);
base_ptr[5] = idx + trailing_zeroes(bits);
bits = _blsr_u64(bits);
base_ptr[6] = idx + trailing_zeroes(bits);
bits = _blsr_u64(bits);
base_ptr[7] = idx + trailing_zeroes(bits);
bits = _blsr_u64(bits);
base_ptr += 8;
}
if (cnt > 16) { // unluckly: we rarely get here
// since it means having one structural or pseudo-structral element
// every 4 characters (possible with inputs like "","","",...).
}
// Most files don't have 16+ structurals per block, so we take several basically guaranteed
// branch mispredictions here. 16+ structurals per block means either punctuation ({} [] , :)
// or the start of a value ("abc" true 123) every four characters.
if (unlikely(cnt > 16)) {
uint32_t i = 16;
do {
base_ptr[0] = idx + trailing_zeroes(bits);
bits = _blsr_u64(bits);
base_ptr++;
} while (bits != 0);
base_ptr[i] = idx + trailing_zeroes(bits);
bits = _blsr_u64(bits);
i++;
} while (i < cnt);
}
}
base = next_base;
base_ptr += cnt;
}
#include "generic/stage1_find_marks.h"


@ -10,26 +10,24 @@ namespace simdjson {
template <>
struct simd_input<Architecture::WESTMERE> {
__m128i chunks[4];
const __m128i chunks[4];
really_inline simd_input(const uint8_t *ptr) {
this->chunks[0] = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 0));
this->chunks[1] = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 16));
this->chunks[2] = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 32));
this->chunks[3] = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 48));
}
really_inline simd_input()
: chunks { __m128i(), __m128i(), __m128i(), __m128i() } {}
really_inline simd_input(__m128i i0, __m128i i1, __m128i i2, __m128i i3)
{
this->chunks[0] = i0;
this->chunks[1] = i1;
this->chunks[2] = i2;
this->chunks[3] = i3;
}
really_inline simd_input(const __m128i chunk0, const __m128i chunk1, const __m128i chunk2, const __m128i chunk3)
: chunks{chunk0, chunk1, chunk2, chunk3} {}
really_inline simd_input(const uint8_t *ptr)
: simd_input(
_mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 0)),
_mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 16)),
_mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 32)),
_mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 48))
) {}
template <typename F>
really_inline void each(F const& each_chunk)
{
really_inline void each(F const& each_chunk) const {
each_chunk(this->chunks[0]);
each_chunk(this->chunks[1]);
each_chunk(this->chunks[2]);
@ -37,7 +35,7 @@ struct simd_input<Architecture::WESTMERE> {
}
template <typename F>
really_inline simd_input<Architecture::WESTMERE> map(F const& map_chunk) {
really_inline simd_input<Architecture::WESTMERE> map(F const& map_chunk) const {
return simd_input<Architecture::WESTMERE>(
map_chunk(this->chunks[0]),
map_chunk(this->chunks[1]),
@ -47,7 +45,7 @@ struct simd_input<Architecture::WESTMERE> {
}
template <typename F>
really_inline simd_input<Architecture::WESTMERE> map(simd_input<Architecture::WESTMERE> b, F const& map_chunk) {
really_inline simd_input<Architecture::WESTMERE> map(const simd_input<Architecture::WESTMERE> b, F const& map_chunk) const {
return simd_input<Architecture::WESTMERE>(
map_chunk(this->chunks[0], b.chunks[0]),
map_chunk(this->chunks[1], b.chunks[1]),
@ -57,13 +55,13 @@ struct simd_input<Architecture::WESTMERE> {
}
template <typename F>
really_inline __m128i reduce(F const& reduce_pair) {
really_inline __m128i reduce(F const& reduce_pair) const {
__m128i r01 = reduce_pair(this->chunks[0], this->chunks[1]);
__m128i r23 = reduce_pair(this->chunks[2], this->chunks[3]);
return reduce_pair(r01, r23);
}
really_inline uint64_t to_bitmask() {
really_inline uint64_t to_bitmask() const {
uint64_t r0 = static_cast<uint32_t>(_mm_movemask_epi8(this->chunks[0]));
uint64_t r1 = _mm_movemask_epi8(this->chunks[1]);
uint64_t r2 = _mm_movemask_epi8(this->chunks[2]);
@ -71,14 +69,21 @@ struct simd_input<Architecture::WESTMERE> {
return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48);
}
really_inline uint64_t eq(uint8_t m) {
really_inline simd_input<Architecture::WESTMERE> bit_or(const uint8_t m) const {
const __m128i mask = _mm_set1_epi8(m);
return this->map( [&](auto a) {
return _mm_or_si128(a, mask);
});
}
really_inline uint64_t eq(const uint8_t m) const {
const __m128i mask = _mm_set1_epi8(m);
return this->map( [&](auto a) {
return _mm_cmpeq_epi8(a, mask);
}).to_bitmask();
}
really_inline uint64_t lteq(uint8_t m) {
really_inline uint64_t lteq(const uint8_t m) const {
const __m128i maxval = _mm_set1_epi8(m);
return this->map( [&](auto a) {
return _mm_cmpeq_epi8(_mm_max_epu8(maxval, a), maxval);


@ -13,29 +13,30 @@
TARGET_WESTMERE
namespace simdjson::westmere {
really_inline uint64_t compute_quote_mask(uint64_t quote_bits) {
really_inline uint64_t compute_quote_mask(const uint64_t quote_bits) {
return _mm_cvtsi128_si64(_mm_clmulepi64_si128(
_mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFFu), 0));
}
really_inline void find_whitespace_and_structurals(simd_input<ARCHITECTURE> in,
uint64_t &whitespace, uint64_t &structurals) {
really_inline void find_whitespace_and_operators(
const simd_input<ARCHITECTURE> in,
uint64_t &whitespace, uint64_t &op) {
const __m128i structural_table =
const __m128i operator_table =
_mm_setr_epi8(44, 125, 0, 0, 0xc0u, 0, 0, 0, 0, 0, 0, 0, 0, 0, 58, 123);
const __m128i white_table = _mm_setr_epi8(32, 100, 100, 100, 17, 100, 113, 2,
100, 9, 10, 112, 100, 13, 100, 100);
const __m128i struct_offset = _mm_set1_epi8(0xd4u);
const __m128i struct_mask = _mm_set1_epi8(32);
const __m128i op_offset = _mm_set1_epi8(0xd4u);
const __m128i op_mask = _mm_set1_epi8(32);
whitespace = in.map([&](auto _in) {
return _mm_cmpeq_epi8(_in, _mm_shuffle_epi8(white_table, _in));
}).to_bitmask();
structurals = in.map([&](auto _in) {
const __m128i r1 = _mm_add_epi8(struct_offset, _in);
const __m128i r2 = _mm_or_si128(_in, struct_mask);
const __m128i r3 = _mm_shuffle_epi8(structural_table, r1);
op = in.map([&](auto _in) {
const __m128i r1 = _mm_add_epi8(op_offset, _in);
const __m128i r2 = _mm_or_si128(_in, op_mask);
const __m128i r3 = _mm_shuffle_epi8(operator_table, r1);
return _mm_cmpeq_epi8(r2, r3);
}).to_bitmask();
}
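// Worked example of the table lookup (illustration only, not part of the diff): for '[' (0x5b),
// adding op_offset 0xd4 wraps to 0x2f, whose low nibble selects operator_table entry 15, which
// holds 123 ('{'); OR-ing the input with op_mask 0x20 also gives 0x7b ('{'), so the compare
// reports a match. The same trick maps ']' onto '}', while ',', ':', '{' and '}' match their
// own table entries directly.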