Use map() to interleave instructions for parallelism
This commit is contained in:
parent
9cc4ddfc88
commit
169568ca47
|
@ -69,16 +69,12 @@ struct simd_input<Architecture::ARM64> {
|
|||
|
||||
really_inline uint64_t eq(uint8_t m) {
|
||||
const uint8x16_t mask = vmovq_n_u8(m);
|
||||
return this->map([&](uint8x16_t chunk) {
|
||||
return vceqq_u8(chunk, mask);
|
||||
}).to_bitmask();
|
||||
return this->MAP_BITMASK( vceqq_u8(chunk, mask) );
|
||||
}
|
||||
|
||||
really_inline uint64_t lteq(uint8_t m) {
|
||||
const uint8x16_t mask = vmovq_n_u8(m);
|
||||
return this->map([&](uint8x16_t chunk) {
|
||||
return vcleq_u8(chunk, mask);
|
||||
}).to_bitmask();
|
||||
return this->MAP_BITMASK( vcleq_u8(chunk, mask) );
|
||||
}
|
||||
|
||||
}; // struct simd_input
|
||||
|
|
|
@ -39,14 +39,10 @@ really_inline void find_whitespace_and_structurals(
|
|||
});
|
||||
|
||||
const uint8x16_t structural_shufti_mask = vmovq_n_u8(0x7);
|
||||
structurals = v.map([&](auto chunk) {
|
||||
return vtstq_u8(chunk, structural_shufti_mask);
|
||||
}).to_bitmask();
|
||||
structurals = v.MAP_BITMASK( vtstq_u8(chunk, structural_shufti_mask) );
|
||||
|
||||
const uint8x16_t whitespace_shufti_mask = vmovq_n_u8(0x18);
|
||||
whitespace = v.map([&](auto chunk) {
|
||||
return vtstq_u8(chunk, whitespace_shufti_mask);
|
||||
}).to_bitmask();
|
||||
whitespace = v.MAP_BITMASK( vtstq_u8(chunk, whitespace_shufti_mask) );
|
||||
}
|
||||
|
||||
#include "generic/stage1_find_marks_flatten.h"
|
||||
|
|
|
@ -18,9 +18,9 @@ struct simd_input<Architecture::HASWELL> {
|
|||
this->hi = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr + 32));
|
||||
}
|
||||
|
||||
really_inline simd_input(__m256i i0, __m256i i1) {
|
||||
this->lo = i0;
|
||||
this->hi = i1;
|
||||
really_inline simd_input(__m256i a_lo, __m256i a_hi) {
|
||||
this->lo = a_lo;
|
||||
this->hi = a_hi;
|
||||
}
|
||||
|
||||
template <typename F>
|
||||
|
@ -32,23 +32,19 @@ struct simd_input<Architecture::HASWELL> {
|
|||
}
|
||||
|
||||
really_inline uint64_t to_bitmask() {
|
||||
uint64_t r0 = static_cast<uint32_t>(_mm256_movemask_epi8(this->lo));
|
||||
uint64_t r1 = _mm256_movemask_epi8(this->hi);
|
||||
return r0 | (r1 << 32);
|
||||
uint64_t r_lo = static_cast<uint32_t>(_mm256_movemask_epi8(this->lo));
|
||||
uint64_t r_hi = _mm256_movemask_epi8(this->hi);
|
||||
return r_lo | (r_hi << 32);
|
||||
}
|
||||
|
||||
really_inline uint64_t eq(uint8_t m) {
|
||||
const __m256i mask = _mm256_set1_epi8(m);
|
||||
return this->map([&] (auto chunk) {
|
||||
return _mm256_cmpeq_epi8(chunk, mask);
|
||||
}).to_bitmask();
|
||||
return this->MAP_BITMASK( _mm256_cmpeq_epi8(chunk, mask) );
|
||||
}
|
||||
|
||||
really_inline uint64_t lteq(uint8_t m) {
|
||||
const __m256i maxval = _mm256_set1_epi8(m);
|
||||
return this->map([&] (auto chunk) {
|
||||
return _mm256_cmpeq_epi8(_mm256_max_epu8(maxval, chunk), maxval);
|
||||
}).to_bitmask();
|
||||
return this->MAP_BITMASK( _mm256_cmpeq_epi8(_mm256_max_epu8(maxval, chunk), maxval) );
|
||||
}
|
||||
|
||||
}; // struct simd_input
|
||||
|
|
|
@ -53,6 +53,7 @@ really_inline void find_whitespace_and_structurals(simd_input<ARCHITECTURE> in,
|
|||
space = _mm256_or_si256(space, _mm256_cmpeq_epi8(in, mask_linefeed));
|
||||
space = _mm256_or_si256(space, _mm256_cmpeq_epi8(in, mask_tab));
|
||||
space = _mm256_or_si256(space, _mm256_cmpeq_epi8(in, mask_carriage));
|
||||
return space;
|
||||
}).to_bitmask();
|
||||
// end of naive approach
|
||||
|
||||
|
@ -69,15 +70,14 @@ really_inline void find_whitespace_and_structurals(simd_input<ARCHITECTURE> in,
|
|||
const __m256i struct_offset = _mm256_set1_epi8(0xd4u);
|
||||
const __m256i struct_mask = _mm256_set1_epi8(32);
|
||||
|
||||
whitespace = in.map([&](auto chunk) {
|
||||
return _mm256_cmpeq_epi8(chunk, _mm256_shuffle_epi8(white_table, chunk));
|
||||
}).to_bitmask();
|
||||
structurals = in.map([&](auto chunk) {
|
||||
__m256i struct_r1 = _mm256_add_epi8(struct_offset, chunk);
|
||||
__m256i struct_r2 = _mm256_or_si256(chunk, struct_mask);
|
||||
__m256i struct_r3 = _mm256_shuffle_epi8(structural_table, struct_r1);
|
||||
return _mm256_cmpeq_epi8(struct_r2, struct_r3);
|
||||
}).to_bitmask();
|
||||
whitespace = in.MAP_BITMASK( _mm256_cmpeq_epi8(chunk, _mm256_shuffle_epi8(white_table, chunk)) );
|
||||
auto struct_r1 = in.MAP_CHUNKS( _mm256_add_epi8(struct_offset, chunk) );
|
||||
auto struct_r2 = in.MAP_CHUNKS( _mm256_or_si256(chunk, struct_mask) );
|
||||
auto struct_r3 = struct_r1.MAP_CHUNKS( _mm256_shuffle_epi8(structural_table, chunk) );
|
||||
structurals = simd_input<ARCHITECTURE>(
|
||||
_mm256_cmpeq_epi8(struct_r2.lo, struct_r3.lo),
|
||||
_mm256_cmpeq_epi8(struct_r2.hi, struct_r3.hi)
|
||||
).to_bitmask();
|
||||
|
||||
#endif // else SIMDJSON_NAIVE_STRUCTURAL
|
||||
}
|
||||
|
|
|
@ -8,15 +8,23 @@
|
|||
|
||||
namespace simdjson {
|
||||
|
||||
template <Architecture>
|
||||
template <Architecture T>
|
||||
struct simd_input {
|
||||
simd_input(const uint8_t *ptr);
|
||||
// Map through each simd register in this input, producing another simd_input.
|
||||
template <typename F>
|
||||
really_inline simd_input<T> map(F const& map_chunk);
|
||||
// turn this bytemask (usually the result of a simd comparison operation) into a bitmask.
|
||||
uint64_t to_bitmask();
|
||||
// a straightforward comparison of a mask against input.
|
||||
uint64_t eq(uint8_t m);
|
||||
// find all values less than or equal than the content of maxval (using unsigned arithmetic)
|
||||
uint64_t lteq(uint8_t m);
|
||||
}; // struct simd_input
|
||||
|
||||
#define MAP_CHUNKS(EXPR) map([&](auto chunk) { return EXPR; })
|
||||
#define MAP_BITMASK(EXPR) map([&](auto chunk) { return EXPR; }).to_bitmask()
|
||||
|
||||
} // namespace simdjson
|
||||
|
||||
#endif
|
||||
|
|
|
@ -42,7 +42,7 @@ struct simd_input<Architecture::WESTMERE> {
|
|||
|
||||
really_inline uint64_t to_bitmask() {
|
||||
uint64_t r0 = static_cast<uint32_t>(_mm_movemask_epi8(this->v0));
|
||||
uint64_t r1 = _mm_movemask_epi8(this->v0);
|
||||
uint64_t r1 = _mm_movemask_epi8(this->v1);
|
||||
uint64_t r2 = _mm_movemask_epi8(this->v2);
|
||||
uint64_t r3 = _mm_movemask_epi8(this->v3);
|
||||
return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48);
|
||||
|
@ -50,16 +50,12 @@ struct simd_input<Architecture::WESTMERE> {
|
|||
|
||||
really_inline uint64_t eq(uint8_t m) {
|
||||
const __m128i mask = _mm_set1_epi8(m);
|
||||
return this->map([&](auto chunk) {
|
||||
return _mm_cmpeq_epi8(chunk, mask);
|
||||
}).to_bitmask();
|
||||
return this->MAP_BITMASK( _mm_cmpeq_epi8(chunk, mask) );
|
||||
}
|
||||
|
||||
really_inline uint64_t lteq(uint8_t m) {
|
||||
const __m128i maxval = _mm_set1_epi8(m);
|
||||
return this->map([&](auto chunk) {
|
||||
return _mm_cmpeq_epi8(_mm_max_epu8(maxval, chunk), maxval);
|
||||
}).to_bitmask();
|
||||
return this->MAP_BITMASK( _mm_cmpeq_epi8(_mm_max_epu8(maxval, chunk), maxval) );
|
||||
}
|
||||
|
||||
}; // struct simd_input
|
||||
|
|
|
@ -28,16 +28,17 @@ really_inline void find_whitespace_and_structurals(simd_input<ARCHITECTURE> in,
|
|||
const __m128i struct_offset = _mm_set1_epi8(0xd4u);
|
||||
const __m128i struct_mask = _mm_set1_epi8(32);
|
||||
|
||||
whitespace = in.map([&](auto chunk) {
|
||||
return _mm_cmpeq_epi8(chunk, _mm_shuffle_epi8(white_table, chunk));
|
||||
}).to_bitmask();
|
||||
whitespace = in.MAP_BITMASK( _mm_cmpeq_epi8(chunk, _mm_shuffle_epi8(white_table, chunk)) );
|
||||
|
||||
structurals = in.map([&](auto chunk) {
|
||||
__m128i struct_r1 = _mm_add_epi8(struct_offset, chunk);
|
||||
__m128i struct_r2 = _mm_or_si128(chunk, struct_mask);
|
||||
__m128i struct_r3 = _mm_shuffle_epi8(structural_table, struct_r1);
|
||||
return _mm_cmpeq_epi8(struct_r2, struct_r3);
|
||||
}).to_bitmask();
|
||||
auto r1 = in.MAP_CHUNKS( _mm_add_epi8(struct_offset, chunk) );
|
||||
auto r2 = in.MAP_CHUNKS( _mm_or_si128(chunk, struct_mask) );
|
||||
auto r3 = r1.MAP_CHUNKS( _mm_shuffle_epi8(structural_table, chunk) );
|
||||
structurals = simd_input<ARCHITECTURE>(
|
||||
_mm_cmpeq_epi8(r2.v0, r3.v0),
|
||||
_mm_cmpeq_epi8(r2.v1, r3.v1),
|
||||
_mm_cmpeq_epi8(r2.v2, r3.v2),
|
||||
_mm_cmpeq_epi8(r2.v3, r3.v3)
|
||||
).to_bitmask();
|
||||
}
|
||||
|
||||
#include "generic/stage1_find_marks_flatten.h"
|
||||
|
|
Loading…
Reference in New Issue