Combined performance patch (5% overall, 15% stage 1) (#317)
* Allow -f (find marks only)
* Support parse -s (force SSE)
* Simplify flatten_bits
  - Add directly to base instead of storing variable
  - Don't modify base_ptr after beginning of function
  - Eliminate base variable and increment base_ptr instead
* De-unroll the flatten_bits loops
* Decrease dependencies in stage 1
  - Do all finalize_structurals work before computing the quote mask; mask out the quote mask later
  - Join find_whitespace_and_structurals and finalize_structurals into a single find_structurals call, to reduce variable leakage
  - Rework the pseudo_pred algorithm to refer to "primitive" for clarity and some dependency reduction
  - Rename quote_mask to in_string to describe what we're trying to achieve ("mask" could mean many things)
  - Break up find_quote_mask_and_bits into find_quote_mask and invalid_string_bytes to reduce data leakage (i.e. don't expose quote bits or odd_ends at all to find_structural_bits)
  - Genericize the overflow methods "follows" and "follows_odd_sequence" for descriptiveness and possible lifting into a generic SIMD parsing library (see the sketch after the commit metadata below)
* Mark branches as likely/unlikely
* Reorder and unroll+interleave the stage 1 loop
* Nest the cnt > 16 branch inside cnt > 8
This commit is contained in: parent 53b6deaeae, commit de8df0a05f.
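The genericized overflow helper is the heart of the dependency-reduction work. Here is a minimal, self-contained sketch of `follows` (the function body matches the patch; the `main` driver is ours) showing how `overflow` carries bit 63 across successive 64-byte blocks:

    #include <cassert>
    #include <cstdint>

    // The "follows" primitive, body as in the patch: bit i of the result is
    // set when bit i-1 of `match` was set, and `overflow` carries bit 63 into
    // bit 0 of the next 64-byte block.
    static uint64_t follows(const uint64_t match, uint64_t &overflow) {
      const uint64_t result = match << 1 | overflow;
      overflow = match >> 63;
      return result;
    }

    int main() {
      uint64_t overflow = 0;
      // Block 1: a match in the top bit produces no follower here...
      assert(follows(0x8000000000000000ULL, overflow) == 0);
      // ...but block 2's bit 0 now "follows" the match that ended block 1.
      assert(follows(0, overflow) == 1);
      return 0;
    }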
@@ -14,6 +14,7 @@
 /jsoncheck
 /jsonpointer
 /jsonstats
+/integer_tests
 /libsimdjson.so*
 /minify
 /numberparsingcheck

@@ -34,6 +34,18 @@
 #include "simdjson/parsedjson.h"
 #include "simdjson/stage1_find_marks.h"
 #include "simdjson/stage2_build_tape.h"
+
+// Global arguments
+bool find_marks_only = false;
+bool verbose = false;
+bool dump = false;
+bool json_output = false;
+bool force_one_iteration = false;
+bool just_data = false;
+bool force_sse = false;
+int32_t iterations = -1;
+int32_t warmup_iterations = -1;
+
 namespace simdjson {
 Architecture _find_best_supported_implementation() {
   constexpr uint32_t haswell_flags =
@@ -43,7 +55,7 @@ Architecture _find_best_supported_implementation() {
       instruction_set::SSE42 | instruction_set::PCLMULQDQ;
   uint32_t supports = detect_supported_architectures();
   // Order from best to worst (within architecture)
-  if ((haswell_flags & supports) == haswell_flags) {
+  if ((haswell_flags & supports) == haswell_flags && !force_sse) {
     return Architecture::HASWELL;
   }
   if ((westmere_flags & supports) == westmere_flags) {
@@ -63,6 +75,9 @@ extern unified_functype *unified_ptr;
 extern stage1_functype *stage1_ptr;

 int unified_machine_dispatch(const uint8_t *buf, size_t len, ParsedJson &pj) {
+  if (find_marks_only) {
+    return simdjson::SUCCESS;
+  }
   Architecture best_implementation = _find_best_supported_implementation();
   // Selecting the best implementation
   switch (best_implementation) {
@@ -118,18 +133,11 @@ unified_functype *unified_ptr = &unified_machine_dispatch;
 } // namespace simdjson

 int main(int argc, char *argv[]) {
-  bool verbose = false;
-  bool dump = false;
-  bool json_output = false;
-  bool force_one_iteration = false;
-  bool just_data = false;
-  int32_t iterations = -1;
-  int32_t warmup_iterations = -1;

 #ifndef _MSC_VER
   int c;

-  while ((c = getopt(argc, argv, "1vdtn:w:")) != -1) {
+  while ((c = getopt(argc, argv, "1vdtn:w:fs")) != -1) {
     switch (c) {
     case 'n':
       iterations = atoi(optarg);
@@ -137,6 +145,9 @@ int main(int argc, char *argv[]) {
     case 'w':
       warmup_iterations = atoi(optarg);
       break;
+    case 's':
+      force_sse = true;
+      break;
     case 't':
       just_data = true;
       break;
@@ -152,6 +163,9 @@ int main(int argc, char *argv[]) {
     case '1':
       force_one_iteration = true;
       break;
+    case 'f':
+      find_marks_only = true;
+      break;
     default:
       abort();
     }
@@ -326,7 +340,7 @@ int main(int argc, char *argv[]) {
     isok = (simdjson::stage1_ptr((const uint8_t *)p.data(), p.size(), pj) ==
             simdjson::SUCCESS);
     isok = isok &&
-           (simdjson::SUCCESS ==
+           (simdjson::SUCCESS ==
             simdjson::unified_ptr((const uint8_t *)p.data(), p.size(), pj));
     auto end = std::chrono::steady_clock::now();
     std::chrono::duration<double> secs = end - start;

@@ -17,6 +17,17 @@
 #define SIMDJSON_PADDING 32
 #endif

+#if defined(__GNUC__)
+// Marks a block with a name so that MCA analysis can see it.
+#define BEGIN_DEBUG_BLOCK(name) __asm volatile("# LLVM-MCA-BEGIN " #name);
+#define END_DEBUG_BLOCK(name) __asm volatile("# LLVM-MCA-END " #name);
+#define DEBUG_BLOCK(name, block) BEGIN_DEBUG_BLOCK(name); block; END_DEBUG_BLOCK(name);
+#else
+#define BEGIN_DEBUG_BLOCK(name)
+#define END_DEBUG_BLOCK(name)
+#define DEBUG_BLOCK(name, block)
+#endif
+
 #ifndef _MSC_VER
 // Implemented using Labels as Values which works in GCC and CLANG (and maybe
 // also in Intel's compiler), but won't work in MSVC.

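The new DEBUG_BLOCK macros exist so llvm-mca can measure one region in isolation: compile to assembly and llvm-mca reports throughput between the BEGIN/END markers. A hypothetical usage sketch (the macros are copied from the patch; the function below is ours, purely illustrative):

    #include <cstdint>

    // Macros copied from the patch; on GCC/Clang they emit LLVM-MCA markers,
    // elsewhere they compile away. Note the non-GCC fallback discards the
    // block entirely, so DEBUG_BLOCK is meant for analysis builds only.
    #if defined(__GNUC__)
    #define BEGIN_DEBUG_BLOCK(name) __asm volatile("# LLVM-MCA-BEGIN " #name);
    #define END_DEBUG_BLOCK(name) __asm volatile("# LLVM-MCA-END " #name);
    #define DEBUG_BLOCK(name, block) BEGIN_DEBUG_BLOCK(name); block; END_DEBUG_BLOCK(name);
    #else
    #define BEGIN_DEBUG_BLOCK(name)
    #define END_DEBUG_BLOCK(name)
    #define DEBUG_BLOCK(name, block)
    #endif

    // Hypothetical function (not from the patch): bracket the hot loop so
    //   clang++ -O2 -S example.cpp && llvm-mca example.s
    // reports throughput for just the marked region.
    uint64_t sum_bit_positions(uint64_t bits) {
      uint64_t sum = 0;
      DEBUG_BLOCK(flatten_loop, {
        while (bits != 0) {
          sum += __builtin_ctzll(bits); // index of lowest set bit
          bits &= bits - 1;             // clear lowest set bit
        }
      });
      return sum;
    }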
@@ -29,5 +29,5 @@ make parse
 make perfdiff

 echo "Running perfdiff:"
-echo ./perfdiff \"$current/parse -t $perftests\" \"$reference/parse -t $perftests\"
-./perfdiff "$current/parse -t $perftests" "$reference/parse -t $perftests"
+echo ./perfdiff \"$current/parse -t $perftests $CHECKPERF_ARGS\" \"$reference/parse -t $perftests $CHECKPERF_ARGS\"
+./perfdiff "$current/parse -t $perftests $CHECKPERF_ARGS" "$reference/parse -t $perftests $CHECKPERF_ARGS"

@@ -40,25 +40,24 @@ using namespace simdjson::arm64;

 template <>
 struct simd_input<Architecture::ARM64> {
-  uint8x16_t chunks[4];
+  const uint8x16_t chunks[4];

-  really_inline simd_input(const uint8_t *ptr) {
-    this->chunks[0] = vld1q_u8(ptr + 0*16);
-    this->chunks[1] = vld1q_u8(ptr + 1*16);
-    this->chunks[2] = vld1q_u8(ptr + 2*16);
-    this->chunks[3] = vld1q_u8(ptr + 3*16);
-  }
+  really_inline simd_input()
+      : chunks{uint8x16_t(), uint8x16_t(), uint8x16_t(), uint8x16_t()} {}

-  really_inline simd_input(uint8x16_t chunk0, uint8x16_t chunk1, uint8x16_t chunk2, uint8x16_t chunk3) {
-    this->chunks[0] = chunk0;
-    this->chunks[1] = chunk1;
-    this->chunks[2] = chunk2;
-    this->chunks[3] = chunk3;
-  }
+  really_inline simd_input(const uint8x16_t chunk0, const uint8x16_t chunk1, const uint8x16_t chunk2, const uint8x16_t chunk3)
+      : chunks{chunk0, chunk1, chunk2, chunk3} {}
+
+  really_inline simd_input(const uint8_t *ptr)
+      : chunks{
+        vld1q_u8(ptr + 0*16),
+        vld1q_u8(ptr + 1*16),
+        vld1q_u8(ptr + 2*16),
+        vld1q_u8(ptr + 3*16)
+      } {}

   template <typename F>
-  really_inline void each(F const& each_chunk)
-  {
+  really_inline void each(F const& each_chunk) const {
     each_chunk(this->chunks[0]);
     each_chunk(this->chunks[1]);
     each_chunk(this->chunks[2]);
@@ -66,7 +65,7 @@ struct simd_input<Architecture::ARM64> {
   }

   template <typename F>
-  really_inline simd_input<Architecture::ARM64> map(F const& map_chunk) {
+  really_inline simd_input<Architecture::ARM64> map(F const& map_chunk) const {
     return simd_input<Architecture::ARM64>(
       map_chunk(this->chunks[0]),
       map_chunk(this->chunks[1]),
@@ -76,7 +75,7 @@ struct simd_input<Architecture::ARM64> {
   }

   template <typename F>
-  really_inline simd_input<Architecture::ARM64> map(simd_input<Architecture::ARM64> b, F const& map_chunk) {
+  really_inline simd_input<Architecture::ARM64> map(simd_input<Architecture::ARM64> b, F const& map_chunk) const {
     return simd_input<Architecture::ARM64>(
       map_chunk(this->chunks[0], b.chunks[0]),
       map_chunk(this->chunks[1], b.chunks[1]),
@@ -86,24 +85,31 @@ struct simd_input<Architecture::ARM64> {
   }

   template <typename F>
-  really_inline uint8x16_t reduce(F const& reduce_pair) {
+  really_inline uint8x16_t reduce(F const& reduce_pair) const {
     uint8x16_t r01 = reduce_pair(this->chunks[0], this->chunks[1]);
     uint8x16_t r23 = reduce_pair(this->chunks[2], this->chunks[3]);
     return reduce_pair(r01, r23);
   }

-  really_inline uint64_t to_bitmask() {
+  really_inline uint64_t to_bitmask() const {
     return neon_movemask_bulk(this->chunks[0], this->chunks[1], this->chunks[2], this->chunks[3]);
   }

-  really_inline uint64_t eq(uint8_t m) {
+  really_inline simd_input<Architecture::ARM64> bit_or(const uint8_t m) const {
+    const uint8x16_t mask = vmovq_n_u8(m);
+    return this->map( [&](auto a) {
+      return vorrq_u8(a, mask);
+    });
+  }
+
+  really_inline uint64_t eq(const uint8_t m) const {
     const uint8x16_t mask = vmovq_n_u8(m);
     return this->map( [&](auto a) {
       return vceqq_u8(a, mask);
     }).to_bitmask();
   }

-  really_inline uint64_t lteq(uint8_t m) {
+  really_inline uint64_t lteq(const uint8_t m) const {
     const uint8x16_t mask = vmovq_n_u8(m);
     return this->map( [&](auto a) {
       return vcleq_u8(a, mask);

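The rewritten simd_input makes every helper a const method built from a generic `map` over the chunks. A toy, architecture-free model of the same pattern (every name here is an illustrative stand-in, not simdjson's API):

    #include <cstdint>

    // Toy 16-byte "vector" standing in for uint8x16_t/__m128i, to show the
    // map(...).to_bitmask() composition the patch uses for eq()/lteq()/bit_or().
    struct toy_simd {
      uint8_t lanes[16];
    };

    struct toy_input {
      const toy_simd chunks[4];

      // Apply a per-chunk function and collect the results into a new input.
      template <typename F>
      toy_input map(F const &map_chunk) const {
        return toy_input{{map_chunk(chunks[0]), map_chunk(chunks[1]),
                          map_chunk(chunks[2]), map_chunk(chunks[3])}};
      }

      // One bit per lane: set when the lane is nonzero.
      uint64_t to_bitmask() const {
        uint64_t r = 0;
        for (int c = 0; c < 4; c++)
          for (int i = 0; i < 16; i++)
            r |= uint64_t(chunks[c].lanes[i] != 0) << (c * 16 + i);
        return r;
      }

      // eq() is just a map (lane-wise compare) followed by to_bitmask().
      uint64_t eq(uint8_t m) const {
        return map([&](toy_simd a) {
          toy_simd out{};
          for (int i = 0; i < 16; i++) out.lanes[i] = (a.lanes[i] == m) ? 0xFF : 0;
          return out;
        }).to_bitmask();
      }
    };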
@@ -12,7 +12,7 @@

 namespace simdjson::arm64 {

-really_inline uint64_t compute_quote_mask(uint64_t quote_bits) {
+really_inline uint64_t compute_quote_mask(const uint64_t quote_bits) {

 #ifdef __ARM_FEATURE_CRYPTO // some ARM processors lack this extension
   return vmull_p64(-1ULL, quote_bits);
@@ -21,9 +21,9 @@ really_inline uint64_t compute_quote_mask(const uint64_t quote_bits) {
 #endif
 }

-really_inline void find_whitespace_and_structurals(
-    simd_input<ARCHITECTURE> in, uint64_t &whitespace,
-    uint64_t &structurals) {
+really_inline void find_whitespace_and_operators(
+    const simd_input<ARCHITECTURE> in,
+    uint64_t &whitespace, uint64_t &op) {
   const uint8x16_t low_nibble_mask =
       (uint8x16_t){16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0};
   const uint8x16_t high_nibble_mask =
@@ -38,9 +38,9 @@ really_inline void find_whitespace_and_operators(
     return vandq_u8(shuf_lo, shuf_hi);
   });

-  const uint8x16_t structural_shufti_mask = vmovq_n_u8(0x7);
-  structurals = v.map([&](auto _v) {
-    return vtstq_u8(_v, structural_shufti_mask);
+  const uint8x16_t operator_shufti_mask = vmovq_n_u8(0x7);
+  op = v.map([&](auto _v) {
+    return vtstq_u8(_v, operator_shufti_mask);
   }).to_bitmask();

   const uint8x16_t whitespace_shufti_mask = vmovq_n_u8(0x18);

@@ -12,230 +12,271 @@
 // indicate whether we end an iteration on an odd-length sequence of
 // backslashes, which modifies our subsequent search for odd-length
 // sequences of backslashes in an obvious way.
-really_inline uint64_t find_odd_backslash_sequences(
-    simd_input<ARCHITECTURE> in,
-    uint64_t &prev_iter_ends_odd_backslash) {
+really_inline uint64_t follows_odd_sequence_of(const uint64_t match, uint64_t &overflow) {
   const uint64_t even_bits = 0x5555555555555555ULL;
   const uint64_t odd_bits = ~even_bits;
-  uint64_t bs_bits = in.eq('\\');
-  uint64_t start_edges = bs_bits & ~(bs_bits << 1);
+  uint64_t start_edges = match & ~(match << 1);
   /* flip lowest if we have an odd-length run at the end of the prior
    * iteration */
-  uint64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash;
+  uint64_t even_start_mask = even_bits ^ overflow;
   uint64_t even_starts = start_edges & even_start_mask;
   uint64_t odd_starts = start_edges & ~even_start_mask;
-  uint64_t even_carries = bs_bits + even_starts;
+  uint64_t even_carries = match + even_starts;

   uint64_t odd_carries;
   /* must record the carry-out of our odd-carries out of bit 63; this
    * indicates whether the sense of any edge going to the next iteration
    * should be flipped */
-  bool iter_ends_odd_backslash =
-      add_overflow(bs_bits, odd_starts, &odd_carries);
+  bool new_overflow = add_overflow(match, odd_starts, &odd_carries);

-  odd_carries |= prev_iter_ends_odd_backslash; /* push in bit zero as a
-                                                * potential end if we had an
-                                                * odd-numbered run at the
-                                                * end of the previous
-                                                * iteration */
-  prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL;
-  uint64_t even_carry_ends = even_carries & ~bs_bits;
-  uint64_t odd_carry_ends = odd_carries & ~bs_bits;
+  odd_carries |= overflow; /* push in bit zero as a
+                            * potential end if we had an
+                            * odd-numbered run at the
+                            * end of the previous
+                            * iteration */
+  overflow = new_overflow ? 0x1ULL : 0x0ULL;
+  uint64_t even_carry_ends = even_carries & ~match;
+  uint64_t odd_carry_ends = odd_carries & ~match;
   uint64_t even_start_odd_end = even_carry_ends & odd_bits;
   uint64_t odd_start_even_end = odd_carry_ends & even_bits;
   uint64_t odd_ends = even_start_odd_end | odd_start_even_end;
   return odd_ends;
 }

-// return both the quote mask (which is a half-open mask that covers the first
-// quote in an unescaped quote pair and everything in the quote pair) and the
-// quote bits, which are the simple unescaped quoted bits. We also update the
-// prev_iter_inside_quote value to tell the next iteration whether we finished
-// the final iteration inside a quote pair; if so, this inverts our behavior of
-// whether we're inside quotes for the next iteration.
-// Note that we don't do any error checking to see if we have backslash
-// sequences outside quotes; these backslash sequences (of any length) will be
-// detected elsewhere.
-really_inline uint64_t find_quote_mask_and_bits(
-    simd_input<ARCHITECTURE> in, uint64_t odd_ends,
-    uint64_t &prev_iter_inside_quote, uint64_t &quote_bits,
-    uint64_t &error_mask) {
-  quote_bits = in.eq('"');
-  quote_bits = quote_bits & ~odd_ends;
-  uint64_t quote_mask = compute_quote_mask(quote_bits);
-  quote_mask ^= prev_iter_inside_quote;
+//
+// Check if the current character immediately follows a matching character.
+//
+// For example, this checks for quotes with backslashes in front of them:
+//
+//     const uint64_t backslashed_quote = in.eq('"') & follows(in.eq('\\'), prev_backslash);
+//
+really_inline uint64_t follows(const uint64_t match, uint64_t &overflow) {
+  const uint64_t result = match << 1 | overflow;
+  overflow = match >> 63;
+  return result;
+}
+
+//
+// Check if the current character follows a matching character, with possible "filler" between.
+// For example, this checks for empty curly braces, e.g.
+//
+//     in.eq('}') & follows(in.eq('{'), in.eq(' '), prev_empty_object) // { <whitespace>* }
+//
+really_inline uint64_t follows(const uint64_t match, const uint64_t filler, uint64_t &overflow) {
+  uint64_t follows_match = follows(match, overflow);
+  uint64_t result;
+  overflow |= add_overflow(follows_match, filler, &result);
+  return result;
+}
+
+really_inline ErrorValues detect_errors_on_eof(
+    uint64_t &unescaped_chars_error,
+    const uint64_t prev_in_string) {
+  if (prev_in_string) {
+    return UNCLOSED_STRING;
+  }
+  if (unescaped_chars_error) {
+    return UNESCAPED_CHARS;
+  }
+  return SUCCESS;
+}
+
+//
+// Return a mask of all string characters plus end quotes.
+//
+// prev_escaped is overflow saying whether the next character is escaped.
+// prev_in_string is overflow saying whether we're still in a string.
+//
+// Backslash sequences outside of quotes will be detected in stage 2.
+//
+really_inline uint64_t find_strings(const simd_input<ARCHITECTURE> in, uint64_t &prev_escaped, uint64_t &prev_in_string) {
+  const uint64_t backslash = in.eq('\\');
+  const uint64_t escaped = follows_odd_sequence_of(backslash, prev_escaped);
+  const uint64_t quote = in.eq('"') & ~escaped;
+  // compute_quote_mask returns start quote plus string contents.
+  const uint64_t in_string = compute_quote_mask(quote) ^ prev_in_string;
+  /* right shift of a signed value expected to be well-defined and standard
+   * compliant as of C++20; John Regehr from Utah U. says this is fine code */
+  prev_in_string = static_cast<uint64_t>(static_cast<int64_t>(in_string) >> 63);
+  // Use ^ to turn the beginning quote off, and the end quote on.
+  return in_string ^ quote;
+}
+
+really_inline uint64_t invalid_string_bytes(const uint64_t unescaped, const uint64_t quote_mask) {
   /* All Unicode characters may be placed within the
    * quotation marks, except for the characters that MUST be escaped:
    * quotation mark, reverse solidus, and the control characters (U+0000
    * through U+001F).
    * https://tools.ietf.org/html/rfc8259 */
-  uint64_t unescaped = in.lteq(0x1F);
-  error_mask |= quote_mask & unescaped;
-  /* right shift of a signed value expected to be well-defined and standard
-   * compliant as of C++20; John Regehr from Utah U. says this is fine code */
-  prev_iter_inside_quote =
-      static_cast<uint64_t>(static_cast<int64_t>(quote_mask) >> 63);
-  return quote_mask;
+  return quote_mask & unescaped;
 }

-really_inline uint64_t finalize_structurals(
-    uint64_t structurals, uint64_t whitespace, uint64_t quote_mask,
-    uint64_t quote_bits, uint64_t &prev_iter_ends_pseudo_pred) {
-  // mask off anything inside quotes
-  structurals &= ~quote_mask;
-  // add the real quote bits back into our bit_mask as well, so we can
-  // quickly traverse the strings we've spent all this trouble gathering
-  structurals |= quote_bits;
-  // Now, establish "pseudo-structural characters". These are non-whitespace
-  // characters that are (a) outside quotes and (b) have a predecessor that's
-  // either whitespace or a structural character. This means that subsequent
-  // passes will get a chance to encounter the first character of every string
-  // of non-whitespace and, if we're parsing an atom like true/false/null or a
-  // number we can stop at the first whitespace or structural character
-  // following it.
+//
+// Determine which characters are *structural*:
+// - braces: [] and {}
+// - the start of primitives (123, true, false, null)
+// - the start of invalid non-whitespace (+, &, ture, UTF-8)
+//
+// Also detects value sequence errors:
+// - two values with no separator between ("hello" "world")
+// - separators with no values ([1,] [1,,] and [,2])
+//
+// This method will find all of the above whether it is in a string or not.
+//
+// To reduce dependency on the expensive "what is in a string" computation, this method treats the
+// contents of a string the same as content outside. Errors and structurals inside the string or on
+// the trailing quote will need to be removed later when the correct string information is known.
+//
+really_inline uint64_t find_potential_structurals(const simd_input<ARCHITECTURE> in, uint64_t &prev_primitive) {
+  // These use SIMD so let's kick them off before running the regular 64-bit stuff ...
+  uint64_t whitespace, op;
+  find_whitespace_and_operators(in, whitespace, op);

-  // a qualified predecessor is something that can happen 1 position before a
-  // pseudo-structural character
-  uint64_t pseudo_pred = structurals | whitespace;
+  // Detect the start of a run of primitive characters. Includes numbers, booleans, and strings (").
+  // Everything except whitespace, braces, colon and comma.
+  const uint64_t primitive = ~(op | whitespace);
+  const uint64_t follows_primitive = follows(primitive, prev_primitive);
+  const uint64_t start_primitive = primitive & ~follows_primitive;

-  uint64_t shifted_pseudo_pred =
-      (pseudo_pred << 1) | prev_iter_ends_pseudo_pred;
-  prev_iter_ends_pseudo_pred = pseudo_pred >> 63;
-  uint64_t pseudo_structurals =
-      shifted_pseudo_pred & (~whitespace) & (~quote_mask);
-  structurals |= pseudo_structurals;
-
-  // now, we've used our close quotes all we need to. So let's switch them off
-  // they will be off in the quote mask and on in quote bits.
-  structurals &= ~(quote_bits & ~quote_mask);
-  return structurals;
+  // Return final structurals
+  return op | start_primitive;
 }

-// Find structural bits in a 64-byte chunk.
-really_inline void find_structural_bits_64(
-    const uint8_t *buf, size_t idx, uint32_t *base_ptr, uint32_t &base,
-    uint64_t &prev_iter_ends_odd_backslash, uint64_t &prev_iter_inside_quote,
-    uint64_t &prev_iter_ends_pseudo_pred, uint64_t &structurals,
-    uint64_t &error_mask,
-    utf8_checker<ARCHITECTURE> &utf8_state) {
-  simd_input<ARCHITECTURE> in(buf);
-  utf8_state.check_next_input(in);
-  /* detect odd sequences of backslashes */
-  uint64_t odd_ends = find_odd_backslash_sequences(
-      in, prev_iter_ends_odd_backslash);
+static const size_t STEP_SIZE = 128;
+
+//
+// Find the important bits of JSON in a 128-byte chunk, and add them to:
+//
+// PERF NOTES:
+// We pipe 2 inputs through these stages:
+// 1. Load JSON into registers. This takes a long time and is highly parallelizable, so we load
+//    2 inputs' worth at once so that by the time step 2 is looking for the input, it's available.
+// 2. Scan the JSON for critical data: strings, primitives and operators. This is the critical path.
+//    The output of step 1 depends entirely on this information. These functions don't quite use
+//    up enough CPU: the second half of the functions is highly serial, only using 1 execution core
+//    at a time. The second input's scans have some dependency on the first ones finishing, but
+//    they can make a lot of progress before they need that information.
+// 3. Step 1 doesn't use enough capacity, so we run some extra stuff while we're waiting for that
+//    to finish: utf-8 checks and generating the output from the last iteration.
+//
+// The reason we run 2 inputs at a time is that steps 2 and 3 are *still* not enough to soak up all
+// available capacity with just one input. Running 2 at a time seems to give the CPU a good enough
+// workout.
+//
+really_inline void find_structural_bits_128(
+    const uint8_t *buf, const size_t idx, uint32_t *&base_ptr,
+    uint64_t &prev_escaped, uint64_t &prev_in_string,
+    uint64_t &prev_primitive,
+    uint64_t &prev_structurals,
+    uint64_t &unescaped_chars_error,
+    utf8_checker<ARCHITECTURE> &utf8_state) {
+  //
+  // Load up all 128 bytes into SIMD registers
+  //
+  simd_input<ARCHITECTURE> in_1(buf);
+  simd_input<ARCHITECTURE> in_2(buf+64);

-  /* detect insides of quote pairs ("quote_mask") and also our quote_bits
-   * themselves */
-  uint64_t quote_bits;
-  uint64_t quote_mask = find_quote_mask_and_bits(
-      in, odd_ends, prev_iter_inside_quote, quote_bits, error_mask);
+  //
+  // Find the strings and potential structurals (operators / primitives).
+  //
+  // This will include false structurals that are *inside* strings--we'll filter strings out
+  // before we return.
+  //
+  uint64_t string_1 = find_strings(in_1, prev_escaped, prev_in_string);
+  uint64_t structurals_1 = find_potential_structurals(in_1, prev_primitive);
+  uint64_t string_2 = find_strings(in_2, prev_escaped, prev_in_string);
+  uint64_t structurals_2 = find_potential_structurals(in_2, prev_primitive);

-  /* take the previous iteration's structural bits, not our current
-   * iteration, and flatten */
-  flatten_bits(base_ptr, base, idx, structurals);
+  //
+  // Do miscellaneous work while the processor is busy calculating strings and structurals.
+  //
+  // After that, weed out structurals that are inside strings and find invalid string characters.
+  //
+  uint64_t unescaped_1 = in_1.lteq(0x1F);
+  utf8_state.check_next_input(in_1);
+  flatten_bits(base_ptr, idx, prev_structurals); // Output *last* iteration's structurals to ParsedJson
+  prev_structurals = structurals_1 & ~string_1;
+  unescaped_chars_error |= unescaped_1 & string_1;

-  uint64_t whitespace;
-  find_whitespace_and_structurals(in, whitespace, structurals);
-
-  /* fixup structurals to reflect quotes and add pseudo-structural
-   * characters */
-  structurals = finalize_structurals(structurals, whitespace, quote_mask,
-                                     quote_bits, prev_iter_ends_pseudo_pred);
+  uint64_t unescaped_2 = in_2.lteq(0x1F);
+  utf8_state.check_next_input(in_2);
+  flatten_bits(base_ptr, idx+64, prev_structurals); // Output *last* iteration's structurals to ParsedJson
+  prev_structurals = structurals_2 & ~string_2;
+  unescaped_chars_error |= unescaped_2 & string_2;
 }

 int find_structural_bits(const uint8_t *buf, size_t len, simdjson::ParsedJson &pj) {
-  if (len > pj.byte_capacity) {
+  if (unlikely(len > pj.byte_capacity)) {
     std::cerr << "Your ParsedJson object only supports documents up to "
               << pj.byte_capacity << " bytes but you are trying to process "
               << len << " bytes" << std::endl;
     return simdjson::CAPACITY;
   }
   uint32_t *base_ptr = pj.structural_indexes;
-  uint32_t base = 0;
   utf8_checker<ARCHITECTURE> utf8_state;

-  /* we have padded the input out to a 64 byte multiple with the remainder
-   * being zeros. persistent state across the loop: does the last iteration
-   * end with an odd-length sequence of backslashes? */
-
-  /* either 0 or 1, but a 64-bit value */
-  uint64_t prev_iter_ends_odd_backslash = 0ULL;
-  /* does the previous iteration end inside a double-quote pair? */
-  uint64_t prev_iter_inside_quote =
-      0ULL; /* either all zeros or all ones.
-             * does the previous iteration end on something that is a
-             * predecessor of a pseudo-structural character - i.e.
-             * whitespace or a structural character? effectively the very
-             * first char is considered to follow "whitespace" for the
-             * purposes of pseudo-structural character detection so we
-             * initialize to 1 */
-  uint64_t prev_iter_ends_pseudo_pred = 1ULL;
-
-  /* structurals are persistent state across the loop as we flatten them on
-   * the subsequent iteration into our array pointed to by base_ptr.
-   * This is harmless on the first iteration as structurals==0
-   * and is done for performance reasons; we can hide some of the latency of
-   * the expensive carryless multiply in the previous step with this work */
+  // Whether the first character of the next iteration is escaped.
+  uint64_t prev_escaped = 0ULL;
+  // Whether the last iteration was still inside a string (all 1's = true, all 0's = false).
+  uint64_t prev_in_string = 0ULL;
+  // Whether the last character of the previous iteration is a primitive value character
+  // (anything except whitespace, braces, comma or colon).
+  uint64_t prev_primitive = 0ULL;
+  // Mask of structural characters from the last iteration.
+  // Kept around for performance reasons, so we can call flatten_bits to soak up some unused
+  // CPU capacity while the next iteration is busy with an expensive clmul in compute_quote_mask.
   uint64_t structurals = 0;

-  size_t lenminus64 = len < 64 ? 0 : len - 64;
+  size_t lenminusstep = len < STEP_SIZE ? 0 : len - STEP_SIZE;
   size_t idx = 0;
-  uint64_t error_mask = 0; /* for unescaped characters within strings (ASCII
-                              code points < 0x20) */
+  // Errors with unescaped characters in strings (ASCII codepoints < 0x20)
+  uint64_t unescaped_chars_error = 0;

-  for (; idx < lenminus64; idx += 64) {
-    find_structural_bits_64(&buf[idx], idx, base_ptr, base,
-                            prev_iter_ends_odd_backslash,
-                            prev_iter_inside_quote, prev_iter_ends_pseudo_pred,
-                            structurals, error_mask, utf8_state);
+  for (; idx < lenminusstep; idx += STEP_SIZE) {
+    find_structural_bits_128(&buf[idx], idx, base_ptr,
+                             prev_escaped, prev_in_string, prev_primitive,
+                             structurals, unescaped_chars_error, utf8_state);
   }

   /* If we have a final chunk of less than 64 bytes, pad it to 64 with
    * spaces before processing it (otherwise, we risk invalidating the UTF-8
    * checks). */
-  if (idx < len) {
-    uint8_t tmp_buf[64];
-    memset(tmp_buf, 0x20, 64);
+  if (likely(idx < len)) {
+    uint8_t tmp_buf[STEP_SIZE];
+    memset(tmp_buf, 0x20, STEP_SIZE);
     memcpy(tmp_buf, buf + idx, len - idx);
-    find_structural_bits_64(&tmp_buf[0], idx, base_ptr, base,
-                            prev_iter_ends_odd_backslash,
-                            prev_iter_inside_quote, prev_iter_ends_pseudo_pred,
-                            structurals, error_mask, utf8_state);
-    idx += 64;
+    find_structural_bits_128(&tmp_buf[0], idx, base_ptr,
+                             prev_escaped, prev_in_string, prev_primitive,
+                             structurals, unescaped_chars_error, utf8_state);
+    idx += STEP_SIZE;
   }

-  /* is last string quote closed? */
-  if (prev_iter_inside_quote) {
-    return simdjson::UNCLOSED_STRING;
+  /* finally, flatten out the remaining structurals from the last iteration */
+  flatten_bits(base_ptr, idx, structurals);
+
+  simdjson::ErrorValues error = detect_errors_on_eof(unescaped_chars_error, prev_in_string);
+  if (unlikely(error != simdjson::SUCCESS)) {
+    return error;
   }

-  /* finally, flatten out the remaining structurals from the last iteration */
-  flatten_bits(base_ptr, base, idx, structurals);
-
-  pj.n_structural_indexes = base;
+  pj.n_structural_indexes = base_ptr - pj.structural_indexes;
   /* a valid JSON file cannot have zero structural indexes - we should have
    * found something */
-  if (pj.n_structural_indexes == 0u) {
+  if (unlikely(pj.n_structural_indexes == 0u)) {
     return simdjson::EMPTY;
   }
-  if (base_ptr[pj.n_structural_indexes - 1] > len) {
+  if (unlikely(pj.structural_indexes[pj.n_structural_indexes - 1] > len)) {
     return simdjson::UNEXPECTED_ERROR;
   }
-  if (len != base_ptr[pj.n_structural_indexes - 1]) {
+  if (len != pj.structural_indexes[pj.n_structural_indexes - 1]) {
     /* the string might not be NULL terminated, but we add a virtual NULL
      * ending character. */
-    base_ptr[pj.n_structural_indexes++] = len;
+    pj.structural_indexes[pj.n_structural_indexes++] = len;
   }
   /* make it safe to dereference one beyond this array */
-  base_ptr[pj.n_structural_indexes] = 0;
-  if (error_mask) {
-    return simdjson::UNESCAPED_CHARS;
-  }
+  pj.structural_indexes[pj.n_structural_indexes] = 0;
   return utf8_state.errors();
 }

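Two bit tricks in the new find_strings are worth spelling out: compute_quote_mask is a prefix XOR (done with one carryless multiply in the real code), and the in-string state crosses blocks by sign-extending bit 63. A scalar sketch of both (illustrative stand-ins, not the SIMD implementations):

    #include <cstdint>

    // Scalar stand-in for compute_quote_mask: a prefix XOR turns isolated
    // quote bits into a mask covering everything from each opening quote up
    // to (but not including) its closing quote. The real code gets this in
    // one clmul against an all-ones operand.
    static uint64_t prefix_xor(uint64_t quote) {
      for (int shift = 1; shift < 64; shift *= 2) {
        quote ^= quote << shift;
      }
      return quote;
    }

    // The carry trick from find_strings: if bit 63 of in_string is set, the
    // block ended inside a string; the arithmetic right shift broadcasts that
    // bit into an all-zeros or all-ones mask that flips the next block's
    // polarity.
    static uint64_t carry_in_string(uint64_t in_string) {
      return static_cast<uint64_t>(static_cast<int64_t>(in_string) >> 63);
    }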
@@ -26,64 +26,42 @@ really_inline void flatten_bits(uint32_t *base_ptr, uint32_t &base, uint32_t idx
 // base_ptr[base] incrementing base as we go
 // will potentially store extra values beyond end of valid bits, so base_ptr
 // needs to be large enough to handle this
-really_inline void flatten_bits(uint32_t *base_ptr, uint32_t &base, uint32_t idx, uint64_t bits) {
+really_inline void flatten_bits(uint32_t *&base_ptr, uint32_t idx, uint64_t bits) {
   // In some instances, the next branch is expensive because it is mispredicted.
   // Unfortunately, in other cases, it helps tremendously.
   if (bits == 0)
     return;
   uint32_t cnt = hamming(bits);
-  uint32_t next_base = base + cnt;
   idx -= 64;
-  base_ptr += base;
-  {
-    base_ptr[0] = idx + trailing_zeroes(bits);
-    bits = bits & (bits - 1);
-    base_ptr[1] = idx + trailing_zeroes(bits);
-    bits = bits & (bits - 1);
-    base_ptr[2] = idx + trailing_zeroes(bits);
-    bits = bits & (bits - 1);
-    base_ptr[3] = idx + trailing_zeroes(bits);
-    bits = bits & (bits - 1);
-    base_ptr[4] = idx + trailing_zeroes(bits);
-    bits = bits & (bits - 1);
-    base_ptr[5] = idx + trailing_zeroes(bits);
-    bits = bits & (bits - 1);
-    base_ptr[6] = idx + trailing_zeroes(bits);
-    bits = bits & (bits - 1);
-    base_ptr[7] = idx + trailing_zeroes(bits);
-    bits = bits & (bits - 1);
-    base_ptr += 8;
-  }
-  // We hope that the next branch is easily predicted.
-  if (cnt > 8) {
-    base_ptr[0] = idx + trailing_zeroes(bits);
-    bits = bits & (bits - 1);
-    base_ptr[1] = idx + trailing_zeroes(bits);
-    bits = bits & (bits - 1);
-    base_ptr[2] = idx + trailing_zeroes(bits);
-    bits = bits & (bits - 1);
-    base_ptr[3] = idx + trailing_zeroes(bits);
-    bits = bits & (bits - 1);
-    base_ptr[4] = idx + trailing_zeroes(bits);
-    bits = bits & (bits - 1);
-    base_ptr[5] = idx + trailing_zeroes(bits);
-    bits = bits & (bits - 1);
-    base_ptr[6] = idx + trailing_zeroes(bits);
-    bits = bits & (bits - 1);
-    base_ptr[7] = idx + trailing_zeroes(bits);
-    bits = bits & (bits - 1);
-    base_ptr += 8;
-  }
-  if (cnt > 16) { // unlucky: we rarely get here
-    // since it means having one structural or pseudo-structural element
-    // every 4 characters (possible with inputs like "","","",...).
-    do {
-      base_ptr[0] = idx + trailing_zeroes(bits);
-      bits = bits & (bits - 1);
-      base_ptr++;
-    } while (bits != 0);
-  }
-  base = next_base;
+
+  // Do the first 8 all together
+  for (int i=0; i<8; i++) {
+    base_ptr[i] = idx + trailing_zeroes(bits);
+    bits = bits & (bits - 1);
+  }
+
+  // Do the next 8 all together (we hope in most cases it won't happen at all
+  // and the branch is easily predicted).
+  if (unlikely(cnt > 8)) {
+    for (int i=8; i<16; i++) {
+      base_ptr[i] = idx + trailing_zeroes(bits);
+      bits = bits & (bits - 1);
+    }
+
+    // Most files don't have 16+ structurals per block, so we take several basically guaranteed
+    // branch mispredictions here. 16+ structurals per block means either punctuation ({} [] , :)
+    // or the start of a value ("abc" true 123) every 4 characters.
+    if (unlikely(cnt > 16)) {
+      uint32_t i = 16;
+      do {
+        base_ptr[i] = idx + trailing_zeroes(bits);
+        bits = bits & (bits - 1);
+        i++;
+      } while (i < cnt);
+    }
+  }
+
+  base_ptr += cnt;
 }
 #endif // SIMDJSON_NAIVE_FLATTEN

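flatten_bits turns each 64-bit structural mask into absolute byte indexes. A plain scalar reference of the same contract (one slot per set bit, base_ptr advanced by the popcount; the patched version writes 8 slots at a time and may overshoot, which is why the structural index buffer needs slack):

    #include <cstdint>

    // Reference behavior of flatten_bits: append the absolute position of
    // every set bit in `bits` (which describes the 64 bytes *before* idx) and
    // advance base_ptr. This sketch writes exactly one slot per set bit;
    // __builtin_ctzll stands in for simdjson's trailing_zeroes.
    static void flatten_bits_reference(uint32_t *&base_ptr, uint32_t idx, uint64_t bits) {
      idx -= 64;
      while (bits != 0) {
        *base_ptr++ = idx + static_cast<uint32_t>(__builtin_ctzll(bits)); // lowest set bit
        bits &= bits - 1;                                                 // clear it
      }
    }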
@@ -10,29 +10,28 @@ namespace simdjson {

 template <>
 struct simd_input<Architecture::HASWELL> {
-  __m256i chunks[2];
+  const __m256i chunks[2];

-  really_inline simd_input(const uint8_t *ptr)
-  {
-    this->chunks[0] = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr + 0*32));
-    this->chunks[1] = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr + 1*32));
-  }
+  really_inline simd_input() : chunks{__m256i(), __m256i()} {}

-  really_inline simd_input(__m256i chunk0, __m256i chunk1)
-  {
-    this->chunks[0] = chunk0;
-    this->chunks[1] = chunk1;
-  }
+  really_inline simd_input(const __m256i chunk0, const __m256i chunk1)
+      : chunks{chunk0, chunk1} {}
+
+  really_inline simd_input(const uint8_t *ptr)
+      : chunks{
+        _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr + 0*32)),
+        _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr + 1*32))
+      } {}

   template <typename F>
-  really_inline void each(F const& each_chunk)
+  really_inline void each(F const& each_chunk) const
   {
     each_chunk(this->chunks[0]);
     each_chunk(this->chunks[1]);
   }

   template <typename F>
-  really_inline simd_input<Architecture::HASWELL> map(F const& map_chunk) {
+  really_inline simd_input<Architecture::HASWELL> map(F const& map_chunk) const {
     return simd_input<Architecture::HASWELL>(
       map_chunk(this->chunks[0]),
       map_chunk(this->chunks[1])
@@ -40,7 +39,7 @@ struct simd_input<Architecture::HASWELL> {
   }

   template <typename F>
-  really_inline simd_input<Architecture::HASWELL> map(simd_input<Architecture::HASWELL> b, F const& map_chunk) {
+  really_inline simd_input<Architecture::HASWELL> map(const simd_input<Architecture::HASWELL> b, F const& map_chunk) const {
     return simd_input<Architecture::HASWELL>(
       map_chunk(this->chunks[0], b.chunks[0]),
       map_chunk(this->chunks[1], b.chunks[1])
@@ -48,24 +47,31 @@ struct simd_input<Architecture::HASWELL> {
   }

   template <typename F>
-  really_inline __m256i reduce(F const& reduce_pair) {
+  really_inline __m256i reduce(F const& reduce_pair) const {
     return reduce_pair(this->chunks[0], this->chunks[1]);
   }

-  really_inline uint64_t to_bitmask() {
+  really_inline uint64_t to_bitmask() const {
     uint64_t r_lo = static_cast<uint32_t>(_mm256_movemask_epi8(this->chunks[0]));
     uint64_t r_hi = _mm256_movemask_epi8(this->chunks[1]);
     return r_lo | (r_hi << 32);
   }

-  really_inline uint64_t eq(uint8_t m) {
+  really_inline simd_input<Architecture::HASWELL> bit_or(const uint8_t m) const {
+    const __m256i mask = _mm256_set1_epi8(m);
+    return this->map( [&](auto a) {
+      return _mm256_or_si256(a, mask);
+    });
+  }
+
+  really_inline uint64_t eq(const uint8_t m) const {
     const __m256i mask = _mm256_set1_epi8(m);
     return this->map( [&](auto a) {
       return _mm256_cmpeq_epi8(a, mask);
     }).to_bitmask();
   }

-  really_inline uint64_t lteq(uint8_t m) {
+  really_inline uint64_t lteq(const uint8_t m) const {
     const __m256i maxval = _mm256_set1_epi8(m);
     return this->map( [&](auto a) {
       return _mm256_cmpeq_epi8(_mm256_max_epu8(maxval, a), maxval);

@@ -218,7 +218,7 @@ struct utf8_checker<Architecture::HASWELL> {
   __m256i any_bits_on = in.reduce([&](auto a, auto b) {
     return _mm256_or_si256(a, b);
   });
-  if ((_mm256_testz_si256(any_bits_on, high_bit)) == 1) {
+  if (likely(_mm256_testz_si256(any_bits_on, high_bit) == 1)) {
     // it is ascii, we just check continuation
     this->has_error = _mm256_or_si256(
       _mm256_cmpgt_epi8(this->previous.carried_continuations,

@@ -13,7 +13,7 @@
 TARGET_HASWELL
 namespace simdjson::haswell {

-really_inline uint64_t compute_quote_mask(uint64_t quote_bits) {
+really_inline uint64_t compute_quote_mask(const uint64_t quote_bits) {
   // There should be no such thing as a processor supporting avx2
   // but not clmul.
   uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128(
@@ -21,8 +21,9 @@ really_inline uint64_t compute_quote_mask(const uint64_t quote_bits) {
   return quote_mask;
 }

-really_inline void find_whitespace_and_structurals(simd_input<ARCHITECTURE> in,
-    uint64_t &whitespace, uint64_t &structurals) {
+really_inline void find_whitespace_and_operators(
+    const simd_input<ARCHITECTURE> in,
+    uint64_t &whitespace, uint64_t &op) {

 #ifdef SIMDJSON_NAIVE_STRUCTURAL

@@ -34,14 +35,14 @@ really_inline void find_whitespace_and_operators(
   const __m256i mask_close_bracket = _mm256_set1_epi8(0x5d);
   const __m256i mask_column = _mm256_set1_epi8(0x3a);
   const __m256i mask_comma = _mm256_set1_epi8(0x2c);
-  structurals = in.map([&](auto in) {
-    __m256i structurals = _mm256_cmpeq_epi8(in, mask_open_brace);
-    structurals = _mm256_or_si256(structurals, _mm256_cmpeq_epi8(in, mask_close_brace));
-    structurals = _mm256_or_si256(structurals, _mm256_cmpeq_epi8(in, mask_open_bracket));
-    structurals = _mm256_or_si256(structurals, _mm256_cmpeq_epi8(in, mask_close_bracket));
-    structurals = _mm256_or_si256(structurals, _mm256_cmpeq_epi8(in, mask_column));
-    structurals = _mm256_or_si256(structurals, _mm256_cmpeq_epi8(in, mask_comma));
-    return structurals;
+  op = in.map([&](auto in) {
+    __m256i op = _mm256_cmpeq_epi8(in, mask_open_brace);
+    op = _mm256_or_si256(op, _mm256_cmpeq_epi8(in, mask_close_brace));
+    op = _mm256_or_si256(op, _mm256_cmpeq_epi8(in, mask_open_bracket));
+    op = _mm256_or_si256(op, _mm256_cmpeq_epi8(in, mask_close_bracket));
+    op = _mm256_or_si256(op, _mm256_cmpeq_epi8(in, mask_column));
+    op = _mm256_or_si256(op, _mm256_cmpeq_epi8(in, mask_comma));
+    return op;
   }).to_bitmask();

   const __m256i mask_space = _mm256_set1_epi8(0x20);

@@ -60,24 +61,24 @@ really_inline void find_whitespace_and_operators(
 #else // SIMDJSON_NAIVE_STRUCTURAL

 // clang-format off
-  const __m256i structural_table =
+  const __m256i operator_table =
       _mm256_setr_epi8(44, 125, 0, 0, 0xc0u, 0, 0, 0, 0, 0, 0, 0, 0, 0, 58, 123,
                        44, 125, 0, 0, 0xc0u, 0, 0, 0, 0, 0, 0, 0, 0, 0, 58, 123);
   const __m256i white_table = _mm256_setr_epi8(
       32, 100, 100, 100, 17, 100, 113, 2, 100, 9, 10, 112, 100, 13, 100, 100,
       32, 100, 100, 100, 17, 100, 113, 2, 100, 9, 10, 112, 100, 13, 100, 100);
 // clang-format on
-  const __m256i struct_offset = _mm256_set1_epi8(0xd4u);
-  const __m256i struct_mask = _mm256_set1_epi8(32);
+  const __m256i op_offset = _mm256_set1_epi8(0xd4u);
+  const __m256i op_mask = _mm256_set1_epi8(32);

   whitespace = in.map([&](auto _in) {
     return _mm256_cmpeq_epi8(_in, _mm256_shuffle_epi8(white_table, _in));
   }).to_bitmask();

-  structurals = in.map([&](auto _in) {
-    const __m256i r1 = _mm256_add_epi8(struct_offset, _in);
-    const __m256i r2 = _mm256_or_si256(_in, struct_mask);
-    const __m256i r3 = _mm256_shuffle_epi8(structural_table, r1);
+  op = in.map([&](auto _in) {
+    const __m256i r1 = _mm256_add_epi8(op_offset, _in);
+    const __m256i r2 = _mm256_or_si256(_in, op_mask);
+    const __m256i r3 = _mm256_shuffle_epi8(operator_table, r1);
     return _mm256_cmpeq_epi8(r2, r3);
   }).to_bitmask();

@@ -89,65 +90,43 @@ really_inline void find_whitespace_and_operators(
 // base_ptr[base] incrementing base as we go
 // will potentially store extra values beyond end of valid bits, so base_ptr
 // needs to be large enough to handle this
-really_inline void flatten_bits(uint32_t *base_ptr, uint32_t &base, uint32_t idx, uint64_t bits) {
+really_inline void flatten_bits(uint32_t *&base_ptr, uint32_t idx, uint64_t bits) {
   // In some instances, the next branch is expensive because it is mispredicted.
   // Unfortunately, in other cases, it helps tremendously.
   if (bits == 0)
     return;
   uint32_t cnt = _mm_popcnt_u64(bits);
-  uint32_t next_base = base + cnt;
   idx -= 64;
-  base_ptr += base;
-  {
-    base_ptr[0] = idx + trailing_zeroes(bits);
-    bits = _blsr_u64(bits);
-    base_ptr[1] = idx + trailing_zeroes(bits);
-    bits = _blsr_u64(bits);
-    base_ptr[2] = idx + trailing_zeroes(bits);
-    bits = _blsr_u64(bits);
-    base_ptr[3] = idx + trailing_zeroes(bits);
-    bits = _blsr_u64(bits);
-    base_ptr[4] = idx + trailing_zeroes(bits);
-    bits = _blsr_u64(bits);
-    base_ptr[5] = idx + trailing_zeroes(bits);
-    bits = _blsr_u64(bits);
-    base_ptr[6] = idx + trailing_zeroes(bits);
-    bits = _blsr_u64(bits);
-    base_ptr[7] = idx + trailing_zeroes(bits);
-    bits = _blsr_u64(bits);
-    base_ptr += 8;
-  }
-  // We hope that the next branch is easily predicted.
-  if (cnt > 8) {
-    base_ptr[0] = idx + trailing_zeroes(bits);
-    bits = _blsr_u64(bits);
-    base_ptr[1] = idx + trailing_zeroes(bits);
-    bits = _blsr_u64(bits);
-    base_ptr[2] = idx + trailing_zeroes(bits);
-    bits = _blsr_u64(bits);
-    base_ptr[3] = idx + trailing_zeroes(bits);
-    bits = _blsr_u64(bits);
-    base_ptr[4] = idx + trailing_zeroes(bits);
-    bits = _blsr_u64(bits);
-    base_ptr[5] = idx + trailing_zeroes(bits);
-    bits = _blsr_u64(bits);
-    base_ptr[6] = idx + trailing_zeroes(bits);
-    bits = _blsr_u64(bits);
-    base_ptr[7] = idx + trailing_zeroes(bits);
-    bits = _blsr_u64(bits);
-    base_ptr += 8;
-  }
-  if (cnt > 16) { // unlucky: we rarely get here
-    // since it means having one structural or pseudo-structural element
-    // every 4 characters (possible with inputs like "","","",...).
-    do {
-      base_ptr[0] = idx + trailing_zeroes(bits);
-      bits = _blsr_u64(bits);
-      base_ptr++;
-    } while (bits != 0);
-  }
-  base = next_base;
+
+  // Do the first 8 all together
+  for (int i=0; i<8; i++) {
+    base_ptr[i] = idx + trailing_zeroes(bits);
+    bits = _blsr_u64(bits);
+  }
+
+  // Do the next 8 all together (we hope in most cases it won't happen at all
+  // and the branch is easily predicted).
+  if (unlikely(cnt > 8)) {
+    for (int i=8; i<16; i++) {
+      base_ptr[i] = idx + trailing_zeroes(bits);
+      bits = _blsr_u64(bits);
+    }
+
+    // Most files don't have 16+ structurals per block, so we take several basically guaranteed
+    // branch mispredictions here. 16+ structurals per block means either punctuation ({} [] , :)
+    // or the start of a value ("abc" true 123) every four characters.
+    if (unlikely(cnt > 16)) {
+      uint32_t i = 16;
+      do {
+        base_ptr[i] = idx + trailing_zeroes(bits);
+        bits = _blsr_u64(bits);
+        i++;
+      } while (i < cnt);
+    }
+  }
+
+  base_ptr += cnt;
 }

 #include "generic/stage1_find_marks.h"

@@ -10,26 +10,24 @@ namespace simdjson {

 template <>
 struct simd_input<Architecture::WESTMERE> {
-  __m128i chunks[4];
+  const __m128i chunks[4];

-  really_inline simd_input(const uint8_t *ptr) {
-    this->chunks[0] = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 0));
-    this->chunks[1] = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 16));
-    this->chunks[2] = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 32));
-    this->chunks[3] = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 48));
-  }
+  really_inline simd_input()
+      : chunks { __m128i(), __m128i(), __m128i(), __m128i() } {}

-  really_inline simd_input(__m128i i0, __m128i i1, __m128i i2, __m128i i3)
-  {
-    this->chunks[0] = i0;
-    this->chunks[1] = i1;
-    this->chunks[2] = i2;
-    this->chunks[3] = i3;
-  }
+  really_inline simd_input(const __m128i chunk0, const __m128i chunk1, const __m128i chunk2, const __m128i chunk3)
+      : chunks{chunk0, chunk1, chunk2, chunk3} {}
+
+  really_inline simd_input(const uint8_t *ptr)
+      : simd_input(
+        _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 0)),
+        _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 16)),
+        _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 32)),
+        _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 48))
+      ) {}

   template <typename F>
-  really_inline void each(F const& each_chunk)
-  {
+  really_inline void each(F const& each_chunk) const {
     each_chunk(this->chunks[0]);
     each_chunk(this->chunks[1]);
     each_chunk(this->chunks[2]);
@@ -37,7 +35,7 @@ struct simd_input<Architecture::WESTMERE> {
   }

   template <typename F>
-  really_inline simd_input<Architecture::WESTMERE> map(F const& map_chunk) {
+  really_inline simd_input<Architecture::WESTMERE> map(F const& map_chunk) const {
     return simd_input<Architecture::WESTMERE>(
       map_chunk(this->chunks[0]),
       map_chunk(this->chunks[1]),
@@ -47,7 +45,7 @@ struct simd_input<Architecture::WESTMERE> {
   }

   template <typename F>
-  really_inline simd_input<Architecture::WESTMERE> map(simd_input<Architecture::WESTMERE> b, F const& map_chunk) {
+  really_inline simd_input<Architecture::WESTMERE> map(const simd_input<Architecture::WESTMERE> b, F const& map_chunk) const {
     return simd_input<Architecture::WESTMERE>(
       map_chunk(this->chunks[0], b.chunks[0]),
       map_chunk(this->chunks[1], b.chunks[1]),
@@ -57,13 +55,13 @@ struct simd_input<Architecture::WESTMERE> {
   }

   template <typename F>
-  really_inline __m128i reduce(F const& reduce_pair) {
+  really_inline __m128i reduce(F const& reduce_pair) const {
     __m128i r01 = reduce_pair(this->chunks[0], this->chunks[1]);
     __m128i r23 = reduce_pair(this->chunks[2], this->chunks[3]);
     return reduce_pair(r01, r23);
   }

-  really_inline uint64_t to_bitmask() {
+  really_inline uint64_t to_bitmask() const {
     uint64_t r0 = static_cast<uint32_t>(_mm_movemask_epi8(this->chunks[0]));
     uint64_t r1 = _mm_movemask_epi8(this->chunks[1]);
     uint64_t r2 = _mm_movemask_epi8(this->chunks[2]);
@@ -71,14 +69,21 @@ struct simd_input<Architecture::WESTMERE> {
     return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48);
   }

-  really_inline uint64_t eq(uint8_t m) {
+  really_inline simd_input<Architecture::WESTMERE> bit_or(const uint8_t m) const {
+    const __m128i mask = _mm_set1_epi8(m);
+    return this->map( [&](auto a) {
+      return _mm_or_si128(a, mask);
+    });
+  }
+
+  really_inline uint64_t eq(const uint8_t m) const {
     const __m128i mask = _mm_set1_epi8(m);
     return this->map( [&](auto a) {
       return _mm_cmpeq_epi8(a, mask);
     }).to_bitmask();
   }

-  really_inline uint64_t lteq(uint8_t m) {
+  really_inline uint64_t lteq(const uint8_t m) const {
     const __m128i maxval = _mm_set1_epi8(m);
     return this->map( [&](auto a) {
       return _mm_cmpeq_epi8(_mm_max_epu8(maxval, a), maxval);

@@ -13,29 +13,30 @@
 TARGET_WESTMERE
 namespace simdjson::westmere {

-really_inline uint64_t compute_quote_mask(uint64_t quote_bits) {
+really_inline uint64_t compute_quote_mask(const uint64_t quote_bits) {
   return _mm_cvtsi128_si64(_mm_clmulepi64_si128(
       _mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFFu), 0));
 }

-really_inline void find_whitespace_and_structurals(simd_input<ARCHITECTURE> in,
-    uint64_t &whitespace, uint64_t &structurals) {
+really_inline void find_whitespace_and_operators(
+    const simd_input<ARCHITECTURE> in,
+    uint64_t &whitespace, uint64_t &op) {

-  const __m128i structural_table =
+  const __m128i operator_table =
       _mm_setr_epi8(44, 125, 0, 0, 0xc0u, 0, 0, 0, 0, 0, 0, 0, 0, 0, 58, 123);
   const __m128i white_table = _mm_setr_epi8(32, 100, 100, 100, 17, 100, 113, 2,
                                             100, 9, 10, 112, 100, 13, 100, 100);
-  const __m128i struct_offset = _mm_set1_epi8(0xd4u);
-  const __m128i struct_mask = _mm_set1_epi8(32);
+  const __m128i op_offset = _mm_set1_epi8(0xd4u);
+  const __m128i op_mask = _mm_set1_epi8(32);

   whitespace = in.map([&](auto _in) {
     return _mm_cmpeq_epi8(_in, _mm_shuffle_epi8(white_table, _in));
   }).to_bitmask();

-  structurals = in.map([&](auto _in) {
-    const __m128i r1 = _mm_add_epi8(struct_offset, _in);
-    const __m128i r2 = _mm_or_si128(_in, struct_mask);
-    const __m128i r3 = _mm_shuffle_epi8(structural_table, r1);
+  op = in.map([&](auto _in) {
+    const __m128i r1 = _mm_add_epi8(op_offset, _in);
+    const __m128i r2 = _mm_or_si128(_in, op_mask);
+    const __m128i r3 = _mm_shuffle_epi8(operator_table, r1);
     return _mm_cmpeq_epi8(r2, r3);
   }).to_bitmask();
 }

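The table-driven classification above relies on pshufb semantics: each byte's low nibble indexes the 16-entry table, and a set high bit forces the lookup to zero. A scalar model of the whitespace half, using the exact table from the patch (the function name is ours, for illustration only):

    #include <cstdint>

    // Scalar model of the _mm_shuffle_epi8 whitespace test: look up the table
    // entry by the byte's low nibble (bytes with the high bit set yield 0);
    // the byte is JSON whitespace iff the entry equals the byte itself.
    // Table from the patch: entry[nibble] holds the one whitespace byte with
    // that low nibble (' '=0x20, '\t'=0x09, '\n'=0x0a, '\r'=0x0d); the values
    // 100/112/113/17/2 are fillers chosen so they can never match.
    static bool is_json_whitespace(uint8_t b) {
      static const uint8_t white_table[16] = {32,  100, 100, 100, 17,  100, 113, 2,
                                              100, 9,   10,  112, 100, 13,  100, 100};
      const uint8_t looked_up = (b & 0x80) ? 0 : white_table[b & 0x0F];
      return looked_up == b;
    }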