Genericize bitmask building to make algorithms clearer
This commit is contained in:
parent
2060cf8a70
commit
b01222518d
|
@ -18,22 +18,25 @@ struct simd_input<Architecture::HASWELL> {
|
|||
this->hi = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr + 32));
|
||||
}
|
||||
|
||||
template <typename F>
|
||||
really_inline uint64_t build_bitmask(F const& chunk_to_mask) {
|
||||
uint64_t r0 = static_cast<uint32_t>(_mm256_movemask_epi8(chunk_to_mask(this->lo)));
|
||||
uint64_t r1 = _mm256_movemask_epi8(chunk_to_mask(this->hi));
|
||||
return r0 | (r1 << 32);
|
||||
}
|
||||
|
||||
really_inline uint64_t eq(uint8_t m) {
|
||||
const __m256i mask = _mm256_set1_epi8(m);
|
||||
__m256i cmp_res_0 = _mm256_cmpeq_epi8(this->lo, mask);
|
||||
uint64_t res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(cmp_res_0));
|
||||
__m256i cmp_res_1 = _mm256_cmpeq_epi8(this->hi, mask);
|
||||
uint64_t res_1 = _mm256_movemask_epi8(cmp_res_1);
|
||||
return res_0 | (res_1 << 32);
|
||||
return this->build_bitmask([&] (auto chunk) {
|
||||
return _mm256_cmpeq_epi8(chunk, mask);
|
||||
});
|
||||
}
|
||||
|
||||
really_inline uint64_t lteq(uint8_t m) {
|
||||
const __m256i maxval = _mm256_set1_epi8(m);
|
||||
__m256i cmp_res_0 = _mm256_cmpeq_epi8(_mm256_max_epu8(maxval, this->lo), maxval);
|
||||
uint64_t res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(cmp_res_0));
|
||||
__m256i cmp_res_1 = _mm256_cmpeq_epi8(_mm256_max_epu8(maxval, this->hi), maxval);
|
||||
uint64_t res_1 = _mm256_movemask_epi8(cmp_res_1);
|
||||
return res_0 | (res_1 << 32);
|
||||
return this->build_bitmask([&] (auto chunk) {
|
||||
return _mm256_cmpeq_epi8(_mm256_max_epu8(maxval, chunk), maxval);
|
||||
});
|
||||
}
|
||||
|
||||
}; // struct simd_input
|
||||
|
|
|
@ -25,77 +25,60 @@ static really_inline void find_whitespace_and_structurals(simd_input<ARCHITECTUR
|
|||
uint64_t &whitespace, uint64_t &structurals) {
|
||||
|
||||
#ifdef SIMDJSON_NAIVE_STRUCTURAL
|
||||
// You should never need this naive approach, but it can be useful
|
||||
// for research purposes
|
||||
const __m256i mask_open_brace = _mm256_set1_epi8(0x7b);
|
||||
__m256i struct_lo = _mm256_cmpeq_epi8(in.lo, mask_open_brace);
|
||||
__m256i struct_hi = _mm256_cmpeq_epi8(in.hi, mask_open_brace);
|
||||
const __m256i mask_close_brace = _mm256_set1_epi8(0x7d);
|
||||
struct_lo = _mm256_or_si256(struct_lo, _mm256_cmpeq_epi8(in.lo, mask_close_brace));
|
||||
struct_hi = _mm256_or_si256(struct_hi, _mm256_cmpeq_epi8(in.hi, mask_close_brace));
|
||||
const __m256i mask_open_bracket = _mm256_set1_epi8(0x5b);
|
||||
struct_lo = _mm256_or_si256(struct_lo, _mm256_cmpeq_epi8(in.lo, mask_open_bracket));
|
||||
struct_hi = _mm256_or_si256(struct_hi, _mm256_cmpeq_epi8(in.hi, mask_open_bracket));
|
||||
const __m256i mask_close_bracket = _mm256_set1_epi8(0x5d);
|
||||
struct_lo = _mm256_or_si256(struct_lo, _mm256_cmpeq_epi8(in.lo, mask_close_bracket));
|
||||
struct_hi = _mm256_or_si256(struct_hi, _mm256_cmpeq_epi8(in.hi, mask_close_bracket));
|
||||
const __m256i mask_column = _mm256_set1_epi8(0x3a);
|
||||
struct_lo = _mm256_or_si256(struct_lo, _mm256_cmpeq_epi8(in.lo, mask_column));
|
||||
struct_hi = _mm256_or_si256(struct_hi, _mm256_cmpeq_epi8(in.hi, mask_column));
|
||||
const __m256i mask_comma = _mm256_set1_epi8(0x2c);
|
||||
struct_lo = _mm256_or_si256(struct_lo, _mm256_cmpeq_epi8(in.lo, mask_comma));
|
||||
struct_hi = _mm256_or_si256(struct_hi, _mm256_cmpeq_epi8(in.hi, mask_comma));
|
||||
uint64_t structural_res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(struct_lo));
|
||||
uint64_t structural_res_1 = _mm256_movemask_epi8(struct_hi);
|
||||
structurals = (structural_res_0 | (structural_res_1 << 32));
|
||||
|
||||
const __m256i mask_space = _mm256_set1_epi8(0x20);
|
||||
__m256i space_lo = _mm256_cmpeq_epi8(in.lo, mask_space);
|
||||
__m256i space_hi = _mm256_cmpeq_epi8(in.hi, mask_space);
|
||||
const __m256i mask_linefeed = _mm256_set1_epi8(0x0a);
|
||||
space_lo = _mm256_or_si256(space_lo, _mm256_cmpeq_epi8(in.lo, mask_linefeed));
|
||||
space_hi = _mm256_or_si256(space_hi, _mm256_cmpeq_epi8(in.hi, mask_linefeed));
|
||||
const __m256i mask_tab = _mm256_set1_epi8(0x09);
|
||||
space_lo = _mm256_or_si256(space_lo, _mm256_cmpeq_epi8(in.lo, mask_tab));
|
||||
space_hi = _mm256_or_si256(space_hi, _mm256_cmpeq_epi8(in.hi, mask_tab));
|
||||
const __m256i mask_carriage = _mm256_set1_epi8(0x0d);
|
||||
space_lo = _mm256_or_si256(space_lo, _mm256_cmpeq_epi8(in.lo, mask_carriage));
|
||||
space_hi = _mm256_or_si256(space_hi, _mm256_cmpeq_epi8(in.hi, mask_carriage));
|
||||
// You should never need this naive approach, but it can be useful
|
||||
// for research purposes
|
||||
const __m256i mask_open_brace = _mm256_set1_epi8(0x7b);
|
||||
const __m256i mask_close_brace = _mm256_set1_epi8(0x7d);
|
||||
const __m256i mask_open_bracket = _mm256_set1_epi8(0x5b);
|
||||
const __m256i mask_close_bracket = _mm256_set1_epi8(0x5d);
|
||||
const __m256i mask_column = _mm256_set1_epi8(0x3a);
|
||||
const __m256i mask_comma = _mm256_set1_epi8(0x2c);
|
||||
structurals = in->build_bitmask([&](auto in) {
|
||||
__m256i structurals = _mm256_cmpeq_epi8(in, mask_open_brace);
|
||||
structurals = _mm256_or_si256(structurals, _mm256_cmpeq_epi8(in, mask_close_brace));
|
||||
structurals = _mm256_or_si256(structurals, _mm256_cmpeq_epi8(in, mask_open_bracket));
|
||||
structurals = _mm256_or_si256(structurals, _mm256_cmpeq_epi8(in, mask_close_bracket));
|
||||
structurals = _mm256_or_si256(structurals, _mm256_cmpeq_epi8(in, mask_column));
|
||||
structurals = _mm256_or_si256(structurals, _mm256_cmpeq_epi8(in, mask_comma));
|
||||
return structurals;
|
||||
});
|
||||
|
||||
uint64_t ws_res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(space_lo));
|
||||
uint64_t ws_res_1 = _mm256_movemask_epi8(space_hi);
|
||||
whitespace = (ws_res_0 | (ws_res_1 << 32));
|
||||
// end of naive approach
|
||||
const __m256i mask_space = _mm256_set1_epi8(0x20);
|
||||
const __m256i mask_linefeed = _mm256_set1_epi8(0x0a);
|
||||
const __m256i mask_tab = _mm256_set1_epi8(0x09);
|
||||
const __m256i mask_carriage = _mm256_set1_epi8(0x0d);
|
||||
whitespace = in->build_bitmask([&](auto in) {
|
||||
__m256i space = _mm256_cmpeq_epi8(in, mask_space);
|
||||
space = _mm256_or_si256(space, _mm256_cmpeq_epi8(in, mask_linefeed));
|
||||
space = _mm256_or_si256(space, _mm256_cmpeq_epi8(in, mask_tab));
|
||||
space = _mm256_or_si256(space, _mm256_cmpeq_epi8(in, mask_carriage));
|
||||
});
|
||||
// end of naive approach
|
||||
|
||||
#else // SIMDJSON_NAIVE_STRUCTURAL
|
||||
// clang-format off
|
||||
const __m256i structural_table =
|
||||
_mm256_setr_epi8(44, 125, 0, 0, 0xc0u, 0, 0, 0, 0, 0, 0, 0, 0, 0, 58, 123,
|
||||
44, 125, 0, 0, 0xc0u, 0, 0, 0, 0, 0, 0, 0, 0, 0, 58, 123);
|
||||
const __m256i white_table = _mm256_setr_epi8(
|
||||
32, 100, 100, 100, 17, 100, 113, 2, 100, 9, 10, 112, 100, 13, 100, 100,
|
||||
32, 100, 100, 100, 17, 100, 113, 2, 100, 9, 10, 112, 100, 13, 100, 100);
|
||||
// clang-format on
|
||||
const __m256i struct_offset = _mm256_set1_epi8(0xd4u);
|
||||
const __m256i struct_mask = _mm256_set1_epi8(32);
|
||||
|
||||
__m256i lo_white = _mm256_cmpeq_epi8(in.lo, _mm256_shuffle_epi8(white_table, in.lo));
|
||||
__m256i hi_white = _mm256_cmpeq_epi8(in.hi, _mm256_shuffle_epi8(white_table, in.hi));
|
||||
uint64_t ws_res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(lo_white));
|
||||
uint64_t ws_res_1 = _mm256_movemask_epi8(hi_white);
|
||||
whitespace = (ws_res_0 | (ws_res_1 << 32));
|
||||
__m256i lo_struct_r1 = _mm256_add_epi8(struct_offset, in.lo);
|
||||
__m256i hi_struct_r1 = _mm256_add_epi8(struct_offset, in.hi);
|
||||
__m256i lo_struct_r2 = _mm256_or_si256(in.lo, struct_mask);
|
||||
__m256i hi_struct_r2 = _mm256_or_si256(in.hi, struct_mask);
|
||||
__m256i lo_struct_r3 = _mm256_shuffle_epi8(structural_table, lo_struct_r1);
|
||||
__m256i hi_struct_r3 = _mm256_shuffle_epi8(structural_table, hi_struct_r1);
|
||||
__m256i lo_struct = _mm256_cmpeq_epi8(lo_struct_r2, lo_struct_r3);
|
||||
__m256i hi_struct = _mm256_cmpeq_epi8(hi_struct_r2, hi_struct_r3);
|
||||
// clang-format off
|
||||
const __m256i structural_table =
|
||||
_mm256_setr_epi8(44, 125, 0, 0, 0xc0u, 0, 0, 0, 0, 0, 0, 0, 0, 0, 58, 123,
|
||||
44, 125, 0, 0, 0xc0u, 0, 0, 0, 0, 0, 0, 0, 0, 0, 58, 123);
|
||||
const __m256i white_table = _mm256_setr_epi8(
|
||||
32, 100, 100, 100, 17, 100, 113, 2, 100, 9, 10, 112, 100, 13, 100, 100,
|
||||
32, 100, 100, 100, 17, 100, 113, 2, 100, 9, 10, 112, 100, 13, 100, 100);
|
||||
// clang-format on
|
||||
const __m256i struct_offset = _mm256_set1_epi8(0xd4u);
|
||||
const __m256i struct_mask = _mm256_set1_epi8(32);
|
||||
|
||||
whitespace = in.build_bitmask([&](auto chunk) {
|
||||
return _mm256_cmpeq_epi8(chunk, _mm256_shuffle_epi8(white_table, chunk));
|
||||
});
|
||||
structurals = in.build_bitmask([&](auto chunk) {
|
||||
__m256i struct_r1 = _mm256_add_epi8(struct_offset, chunk);
|
||||
__m256i struct_r2 = _mm256_or_si256(chunk, struct_mask);
|
||||
__m256i struct_r3 = _mm256_shuffle_epi8(structural_table, struct_r1);
|
||||
return _mm256_cmpeq_epi8(struct_r2, struct_r3);
|
||||
});
|
||||
|
||||
uint64_t structural_res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(lo_struct));
|
||||
uint64_t structural_res_1 = _mm256_movemask_epi8(hi_struct);
|
||||
structurals = (structural_res_0 | (structural_res_1 << 32));
|
||||
#endif // else SIMDJSON_NAIVE_STRUCTURAL
|
||||
}
|
||||
|
||||
|
|
|
@ -22,30 +22,27 @@ struct simd_input<Architecture::WESTMERE> {
|
|||
this->v3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 48));
|
||||
}
|
||||
|
||||
template <typename F>
|
||||
really_inline uint64_t build_bitmask(F const& chunk_to_mask) {
|
||||
uint64_t r0 = static_cast<uint32_t>(_mm_movemask_epi8(chunk_to_mask(this->v0)));
|
||||
uint64_t r1 = _mm_movemask_epi8(chunk_to_mask(this->v1));
|
||||
uint64_t r2 = _mm_movemask_epi8(chunk_to_mask(this->v2));
|
||||
uint64_t r3 = _mm_movemask_epi8(chunk_to_mask(this->v3));
|
||||
return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48);
|
||||
}
|
||||
|
||||
really_inline uint64_t eq(uint8_t m) {
|
||||
const __m128i mask = _mm_set1_epi8(m);
|
||||
__m128i cmp_res_0 = _mm_cmpeq_epi8(this->v0, mask);
|
||||
uint64_t res_0 = _mm_movemask_epi8(cmp_res_0);
|
||||
__m128i cmp_res_1 = _mm_cmpeq_epi8(this->v1, mask);
|
||||
uint64_t res_1 = _mm_movemask_epi8(cmp_res_1);
|
||||
__m128i cmp_res_2 = _mm_cmpeq_epi8(this->v2, mask);
|
||||
uint64_t res_2 = _mm_movemask_epi8(cmp_res_2);
|
||||
__m128i cmp_res_3 = _mm_cmpeq_epi8(this->v3, mask);
|
||||
uint64_t res_3 = _mm_movemask_epi8(cmp_res_3);
|
||||
return res_0 | (res_1 << 16) | (res_2 << 32) | (res_3 << 48);
|
||||
return this->build_bitmask([&](auto chunk) {
|
||||
return _mm_cmpeq_epi8(chunk, mask);
|
||||
});
|
||||
}
|
||||
|
||||
really_inline uint64_t lteq(uint8_t m) {
|
||||
const __m128i maxval = _mm_set1_epi8(m);
|
||||
__m128i cmp_res_0 = _mm_cmpeq_epi8(_mm_max_epu8(maxval, this->v0), maxval);
|
||||
uint64_t res_0 = _mm_movemask_epi8(cmp_res_0);
|
||||
__m128i cmp_res_1 = _mm_cmpeq_epi8(_mm_max_epu8(maxval, this->v1), maxval);
|
||||
uint64_t res_1 = _mm_movemask_epi8(cmp_res_1);
|
||||
__m128i cmp_res_2 = _mm_cmpeq_epi8(_mm_max_epu8(maxval, this->v2), maxval);
|
||||
uint64_t res_2 = _mm_movemask_epi8(cmp_res_2);
|
||||
__m128i cmp_res_3 = _mm_cmpeq_epi8(_mm_max_epu8(maxval, this->v3), maxval);
|
||||
uint64_t res_3 = _mm_movemask_epi8(cmp_res_3);
|
||||
return res_0 | (res_1 << 16) | (res_2 << 32) | (res_3 << 48);
|
||||
return this->build_bitmask([&](auto chunk) {
|
||||
return _mm_cmpeq_epi8(_mm_max_epu8(maxval, chunk), maxval);
|
||||
});
|
||||
}
|
||||
|
||||
}; // struct simd_input
|
||||
|
|
|
@ -28,45 +28,16 @@ static really_inline void find_whitespace_and_structurals(simd_input<ARCHITECTUR
|
|||
const __m128i struct_offset = _mm_set1_epi8(0xd4u);
|
||||
const __m128i struct_mask = _mm_set1_epi8(32);
|
||||
|
||||
__m128i white0 = _mm_cmpeq_epi8(in.v0, _mm_shuffle_epi8(white_table, in.v0));
|
||||
__m128i white1 = _mm_cmpeq_epi8(in.v1, _mm_shuffle_epi8(white_table, in.v1));
|
||||
__m128i white2 = _mm_cmpeq_epi8(in.v2, _mm_shuffle_epi8(white_table, in.v2));
|
||||
__m128i white3 = _mm_cmpeq_epi8(in.v3, _mm_shuffle_epi8(white_table, in.v3));
|
||||
uint64_t ws_res_0 = _mm_movemask_epi8(white0);
|
||||
uint64_t ws_res_1 = _mm_movemask_epi8(white1);
|
||||
uint64_t ws_res_2 = _mm_movemask_epi8(white2);
|
||||
uint64_t ws_res_3 = _mm_movemask_epi8(white3);
|
||||
whitespace = in.build_bitmask([&](auto chunk) {
|
||||
return _mm_cmpeq_epi8(chunk, _mm_shuffle_epi8(white_table, chunk));
|
||||
});
|
||||
|
||||
whitespace =
|
||||
(ws_res_0 | (ws_res_1 << 16) | (ws_res_2 << 32) | (ws_res_3 << 48));
|
||||
|
||||
__m128i struct1_r1 = _mm_add_epi8(struct_offset, in.v0);
|
||||
__m128i struct2_r1 = _mm_add_epi8(struct_offset, in.v1);
|
||||
__m128i struct3_r1 = _mm_add_epi8(struct_offset, in.v2);
|
||||
__m128i struct4_r1 = _mm_add_epi8(struct_offset, in.v3);
|
||||
|
||||
__m128i struct1_r2 = _mm_or_si128(in.v0, struct_mask);
|
||||
__m128i struct2_r2 = _mm_or_si128(in.v1, struct_mask);
|
||||
__m128i struct3_r2 = _mm_or_si128(in.v2, struct_mask);
|
||||
__m128i struct4_r2 = _mm_or_si128(in.v3, struct_mask);
|
||||
|
||||
__m128i struct1_r3 = _mm_shuffle_epi8(structural_table, struct1_r1);
|
||||
__m128i struct2_r3 = _mm_shuffle_epi8(structural_table, struct2_r1);
|
||||
__m128i struct3_r3 = _mm_shuffle_epi8(structural_table, struct3_r1);
|
||||
__m128i struct4_r3 = _mm_shuffle_epi8(structural_table, struct4_r1);
|
||||
|
||||
__m128i struct1 = _mm_cmpeq_epi8(struct1_r2, struct1_r3);
|
||||
__m128i struct2 = _mm_cmpeq_epi8(struct2_r2, struct2_r3);
|
||||
__m128i struct3 = _mm_cmpeq_epi8(struct3_r2, struct3_r3);
|
||||
__m128i struct4 = _mm_cmpeq_epi8(struct4_r2, struct4_r3);
|
||||
|
||||
uint64_t structural_res_0 = _mm_movemask_epi8(struct1);
|
||||
uint64_t structural_res_1 = _mm_movemask_epi8(struct2);
|
||||
uint64_t structural_res_2 = _mm_movemask_epi8(struct3);
|
||||
uint64_t structural_res_3 = _mm_movemask_epi8(struct4);
|
||||
|
||||
structurals = (structural_res_0 | (structural_res_1 << 16) |
|
||||
(structural_res_2 << 32) | (structural_res_3 << 48));
|
||||
structurals = in.build_bitmask([&](auto chunk) {
|
||||
__m128i struct_r1 = _mm_add_epi8(struct_offset, chunk);
|
||||
__m128i struct_r2 = _mm_or_si128(chunk, struct_mask);
|
||||
__m128i struct_r3 = _mm_shuffle_epi8(structural_table, struct_r1);
|
||||
return _mm_cmpeq_epi8(struct_r2, struct_r3);
|
||||
});
|
||||
}
|
||||
|
||||
#include "generic/stage1_find_marks_flatten.h"
|
||||
|
|
Loading…
Reference in New Issue