Use map() to interleave instructions for parallelism

This commit is contained in:
John Keiser 2019-08-23 12:31:50 -07:00
parent 9cc4ddfc88
commit 169568ca47
7 changed files with 43 additions and 50 deletions

View File

@ -69,16 +69,12 @@ struct simd_input<Architecture::ARM64> {
really_inline uint64_t eq(uint8_t m) {
const uint8x16_t mask = vmovq_n_u8(m);
return this->map([&](uint8x16_t chunk) {
return vceqq_u8(chunk, mask);
}).to_bitmask();
return this->MAP_BITMASK( vceqq_u8(chunk, mask) );
}
really_inline uint64_t lteq(uint8_t m) {
const uint8x16_t mask = vmovq_n_u8(m);
return this->map([&](uint8x16_t chunk) {
return vcleq_u8(chunk, mask);
}).to_bitmask();
return this->MAP_BITMASK( vcleq_u8(chunk, mask) );
}
}; // struct simd_input

View File

@ -39,14 +39,10 @@ really_inline void find_whitespace_and_structurals(
});
const uint8x16_t structural_shufti_mask = vmovq_n_u8(0x7);
structurals = v.map([&](auto chunk) {
return vtstq_u8(chunk, structural_shufti_mask);
}).to_bitmask();
structurals = v.MAP_BITMASK( vtstq_u8(chunk, structural_shufti_mask) );
const uint8x16_t whitespace_shufti_mask = vmovq_n_u8(0x18);
whitespace = v.map([&](auto chunk) {
return vtstq_u8(chunk, whitespace_shufti_mask);
}).to_bitmask();
whitespace = v.MAP_BITMASK( vtstq_u8(chunk, whitespace_shufti_mask) );
}
#include "generic/stage1_find_marks_flatten.h"

View File

@ -18,9 +18,9 @@ struct simd_input<Architecture::HASWELL> {
this->hi = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr + 32));
}
really_inline simd_input(__m256i i0, __m256i i1) {
this->lo = i0;
this->hi = i1;
really_inline simd_input(__m256i a_lo, __m256i a_hi) {
this->lo = a_lo;
this->hi = a_hi;
}
template <typename F>
@ -32,23 +32,19 @@ struct simd_input<Architecture::HASWELL> {
}
really_inline uint64_t to_bitmask() {
uint64_t r0 = static_cast<uint32_t>(_mm256_movemask_epi8(this->lo));
uint64_t r1 = _mm256_movemask_epi8(this->hi);
return r0 | (r1 << 32);
uint64_t r_lo = static_cast<uint32_t>(_mm256_movemask_epi8(this->lo));
uint64_t r_hi = _mm256_movemask_epi8(this->hi);
return r_lo | (r_hi << 32);
}
really_inline uint64_t eq(uint8_t m) {
const __m256i mask = _mm256_set1_epi8(m);
return this->map([&] (auto chunk) {
return _mm256_cmpeq_epi8(chunk, mask);
}).to_bitmask();
return this->MAP_BITMASK( _mm256_cmpeq_epi8(chunk, mask) );
}
really_inline uint64_t lteq(uint8_t m) {
const __m256i maxval = _mm256_set1_epi8(m);
return this->map([&] (auto chunk) {
return _mm256_cmpeq_epi8(_mm256_max_epu8(maxval, chunk), maxval);
}).to_bitmask();
return this->MAP_BITMASK( _mm256_cmpeq_epi8(_mm256_max_epu8(maxval, chunk), maxval) );
}
}; // struct simd_input

View File

@ -53,6 +53,7 @@ really_inline void find_whitespace_and_structurals(simd_input<ARCHITECTURE> in,
space = _mm256_or_si256(space, _mm256_cmpeq_epi8(in, mask_linefeed));
space = _mm256_or_si256(space, _mm256_cmpeq_epi8(in, mask_tab));
space = _mm256_or_si256(space, _mm256_cmpeq_epi8(in, mask_carriage));
return space;
}).to_bitmask();
// end of naive approach
@ -69,15 +70,14 @@ really_inline void find_whitespace_and_structurals(simd_input<ARCHITECTURE> in,
const __m256i struct_offset = _mm256_set1_epi8(0xd4u);
const __m256i struct_mask = _mm256_set1_epi8(32);
whitespace = in.map([&](auto chunk) {
return _mm256_cmpeq_epi8(chunk, _mm256_shuffle_epi8(white_table, chunk));
}).to_bitmask();
structurals = in.map([&](auto chunk) {
__m256i struct_r1 = _mm256_add_epi8(struct_offset, chunk);
__m256i struct_r2 = _mm256_or_si256(chunk, struct_mask);
__m256i struct_r3 = _mm256_shuffle_epi8(structural_table, struct_r1);
return _mm256_cmpeq_epi8(struct_r2, struct_r3);
}).to_bitmask();
whitespace = in.MAP_BITMASK( _mm256_cmpeq_epi8(chunk, _mm256_shuffle_epi8(white_table, chunk)) );
auto struct_r1 = in.MAP_CHUNKS( _mm256_add_epi8(struct_offset, chunk) );
auto struct_r2 = in.MAP_CHUNKS( _mm256_or_si256(chunk, struct_mask) );
auto struct_r3 = struct_r1.MAP_CHUNKS( _mm256_shuffle_epi8(structural_table, chunk) );
structurals = simd_input<ARCHITECTURE>(
_mm256_cmpeq_epi8(struct_r2.lo, struct_r3.lo),
_mm256_cmpeq_epi8(struct_r2.hi, struct_r3.hi)
).to_bitmask();
#endif // else SIMDJSON_NAIVE_STRUCTURAL
}

View File

@ -8,15 +8,23 @@
namespace simdjson {
template <Architecture>
template <Architecture T>
struct simd_input {
simd_input(const uint8_t *ptr);
// Map through each simd register in this input, producing another simd_input.
template <typename F>
really_inline simd_input<T> map(F const& map_chunk);
// turn this bytemask (usually the result of a simd comparison operation) into a bitmask.
uint64_t to_bitmask();
// a straightforward comparison of a mask against input.
uint64_t eq(uint8_t m);
// find all values less than or equal than the content of maxval (using unsigned arithmetic)
uint64_t lteq(uint8_t m);
}; // struct simd_input
#define MAP_CHUNKS(EXPR) map([&](auto chunk) { return EXPR; })
#define MAP_BITMASK(EXPR) map([&](auto chunk) { return EXPR; }).to_bitmask()
} // namespace simdjson
#endif

View File

@ -42,7 +42,7 @@ struct simd_input<Architecture::WESTMERE> {
really_inline uint64_t to_bitmask() {
uint64_t r0 = static_cast<uint32_t>(_mm_movemask_epi8(this->v0));
uint64_t r1 = _mm_movemask_epi8(this->v0);
uint64_t r1 = _mm_movemask_epi8(this->v1);
uint64_t r2 = _mm_movemask_epi8(this->v2);
uint64_t r3 = _mm_movemask_epi8(this->v3);
return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48);
@ -50,16 +50,12 @@ struct simd_input<Architecture::WESTMERE> {
really_inline uint64_t eq(uint8_t m) {
const __m128i mask = _mm_set1_epi8(m);
return this->map([&](auto chunk) {
return _mm_cmpeq_epi8(chunk, mask);
}).to_bitmask();
return this->MAP_BITMASK( _mm_cmpeq_epi8(chunk, mask) );
}
really_inline uint64_t lteq(uint8_t m) {
const __m128i maxval = _mm_set1_epi8(m);
return this->map([&](auto chunk) {
return _mm_cmpeq_epi8(_mm_max_epu8(maxval, chunk), maxval);
}).to_bitmask();
return this->MAP_BITMASK( _mm_cmpeq_epi8(_mm_max_epu8(maxval, chunk), maxval) );
}
}; // struct simd_input

View File

@ -28,16 +28,17 @@ really_inline void find_whitespace_and_structurals(simd_input<ARCHITECTURE> in,
const __m128i struct_offset = _mm_set1_epi8(0xd4u);
const __m128i struct_mask = _mm_set1_epi8(32);
whitespace = in.map([&](auto chunk) {
return _mm_cmpeq_epi8(chunk, _mm_shuffle_epi8(white_table, chunk));
}).to_bitmask();
whitespace = in.MAP_BITMASK( _mm_cmpeq_epi8(chunk, _mm_shuffle_epi8(white_table, chunk)) );
structurals = in.map([&](auto chunk) {
__m128i struct_r1 = _mm_add_epi8(struct_offset, chunk);
__m128i struct_r2 = _mm_or_si128(chunk, struct_mask);
__m128i struct_r3 = _mm_shuffle_epi8(structural_table, struct_r1);
return _mm_cmpeq_epi8(struct_r2, struct_r3);
}).to_bitmask();
auto r1 = in.MAP_CHUNKS( _mm_add_epi8(struct_offset, chunk) );
auto r2 = in.MAP_CHUNKS( _mm_or_si128(chunk, struct_mask) );
auto r3 = r1.MAP_CHUNKS( _mm_shuffle_epi8(structural_table, chunk) );
structurals = simd_input<ARCHITECTURE>(
_mm_cmpeq_epi8(r2.v0, r3.v0),
_mm_cmpeq_epi8(r2.v1, r3.v1),
_mm_cmpeq_epi8(r2.v2, r3.v2),
_mm_cmpeq_epi8(r2.v3, r3.v3)
).to_bitmask();
}
#include "generic/stage1_find_marks_flatten.h"