Use map() to interleave instructions for parallelism

2019-08-23 12:31:50 -07:00 · 2019-08-23 12:31:50 -07:00 · 169568ca47
parent 9cc4ddfc88
commit 169568ca47
7 changed files with 43 additions and 50 deletions
--- a/src/arm64/simd_input.h
+++ b/src/arm64/simd_input.h
@ -69,16 +69,12 @@ struct simd_input<Architecture::ARM64> {

  really_inline uint64_t eq(uint8_t m) {
    const uint8x16_t mask = vmovq_n_u8(m);
-    return this->map([&](uint8x16_t chunk) {
-      return vceqq_u8(chunk, mask);
-    }).to_bitmask();
+    return this->MAP_BITMASK( vceqq_u8(chunk, mask) );
  }

  really_inline uint64_t lteq(uint8_t m) {
    const uint8x16_t mask = vmovq_n_u8(m);
-    return this->map([&](uint8x16_t chunk) {
-      return vcleq_u8(chunk, mask);
-    }).to_bitmask();
+    return this->MAP_BITMASK( vcleq_u8(chunk, mask) );
  }

 }; // struct simd_input
--- a/src/arm64/stage1_find_marks.h
+++ b/src/arm64/stage1_find_marks.h
@ -39,14 +39,10 @@ really_inline void find_whitespace_and_structurals(
  });

  const uint8x16_t structural_shufti_mask = vmovq_n_u8(0x7);
-  structurals = v.map([&](auto chunk) {
-    return vtstq_u8(chunk, structural_shufti_mask);
-  }).to_bitmask();
+  structurals = v.MAP_BITMASK( vtstq_u8(chunk, structural_shufti_mask) );

  const uint8x16_t whitespace_shufti_mask = vmovq_n_u8(0x18);
-  whitespace = v.map([&](auto chunk) {
-    return vtstq_u8(chunk, whitespace_shufti_mask);
-  }).to_bitmask();
+  whitespace = v.MAP_BITMASK( vtstq_u8(chunk, whitespace_shufti_mask) );
 }

 #include "generic/stage1_find_marks_flatten.h"
--- a/src/haswell/simd_input.h
+++ b/src/haswell/simd_input.h
@ -18,9 +18,9 @@ struct simd_input<Architecture::HASWELL> {
    this->hi = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr + 32));
  }

-  really_inline simd_input(__m256i i0, __m256i i1) {
-    this->lo = i0;
-    this->hi = i1;
+  really_inline simd_input(__m256i a_lo, __m256i a_hi) {
+    this->lo = a_lo;
+    this->hi = a_hi;
  }

  template <typename F>
@ -32,23 +32,19 @@ struct simd_input<Architecture::HASWELL> {
  }

  really_inline uint64_t to_bitmask() {
-    uint64_t r0 = static_cast<uint32_t>(_mm256_movemask_epi8(this->lo));
-    uint64_t r1 =                       _mm256_movemask_epi8(this->hi);
-    return r0 | (r1 << 32);
+    uint64_t r_lo = static_cast<uint32_t>(_mm256_movemask_epi8(this->lo));
+    uint64_t r_hi =                       _mm256_movemask_epi8(this->hi);
+    return r_lo | (r_hi << 32);
  }

  really_inline uint64_t eq(uint8_t m) {
    const __m256i mask = _mm256_set1_epi8(m);
-    return this->map([&] (auto chunk) {
-      return _mm256_cmpeq_epi8(chunk, mask);
-    }).to_bitmask();
+    return this->MAP_BITMASK( _mm256_cmpeq_epi8(chunk, mask) );
  }

  really_inline uint64_t lteq(uint8_t m) {
    const __m256i maxval = _mm256_set1_epi8(m);
-    return this->map([&] (auto chunk) {
-      return _mm256_cmpeq_epi8(_mm256_max_epu8(maxval, chunk), maxval);
-    }).to_bitmask();
+    return this->MAP_BITMASK( _mm256_cmpeq_epi8(_mm256_max_epu8(maxval, chunk), maxval) );
  }

 }; // struct simd_input
--- a/src/haswell/stage1_find_marks.h
+++ b/src/haswell/stage1_find_marks.h
@ -53,6 +53,7 @@ really_inline void find_whitespace_and_structurals(simd_input<ARCHITECTURE> in,
      space = _mm256_or_si256(space, _mm256_cmpeq_epi8(in, mask_linefeed));
      space = _mm256_or_si256(space, _mm256_cmpeq_epi8(in, mask_tab));
      space = _mm256_or_si256(space, _mm256_cmpeq_epi8(in, mask_carriage));
+      return space;
    }).to_bitmask();
    // end of naive approach

@ -69,15 +70,14 @@ really_inline void find_whitespace_and_structurals(simd_input<ARCHITECTURE> in,
    const __m256i struct_offset = _mm256_set1_epi8(0xd4u);
    const __m256i struct_mask = _mm256_set1_epi8(32);

-    whitespace = in.map([&](auto chunk) {
-        return _mm256_cmpeq_epi8(chunk, _mm256_shuffle_epi8(white_table, chunk));
-    }).to_bitmask();
-    structurals = in.map([&](auto chunk) {
-      __m256i struct_r1 = _mm256_add_epi8(struct_offset, chunk);
-      __m256i struct_r2 = _mm256_or_si256(chunk, struct_mask);
-      __m256i struct_r3 = _mm256_shuffle_epi8(structural_table, struct_r1);
-      return _mm256_cmpeq_epi8(struct_r2, struct_r3);
-    }).to_bitmask();
+    whitespace = in.MAP_BITMASK( _mm256_cmpeq_epi8(chunk, _mm256_shuffle_epi8(white_table, chunk)) );
+    auto struct_r1 = in.MAP_CHUNKS( _mm256_add_epi8(struct_offset, chunk) );
+    auto struct_r2 = in.MAP_CHUNKS( _mm256_or_si256(chunk, struct_mask) );
+    auto struct_r3 = struct_r1.MAP_CHUNKS( _mm256_shuffle_epi8(structural_table, chunk) );
+    structurals = simd_input<ARCHITECTURE>(
+      _mm256_cmpeq_epi8(struct_r2.lo, struct_r3.lo),
+      _mm256_cmpeq_epi8(struct_r2.hi, struct_r3.hi)
+    ).to_bitmask();

  #endif // else SIMDJSON_NAIVE_STRUCTURAL
 }
--- a/src/simd_input.h
+++ b/src/simd_input.h
@ -8,15 +8,23 @@

 namespace simdjson {

-template <Architecture>
+template <Architecture T>
 struct simd_input {
    simd_input(const uint8_t *ptr);
+    // Map through each simd register in this input, producing another simd_input.
+    template <typename F>
+    really_inline simd_input<T> map(F const& map_chunk);
+    // turn this bytemask (usually the result of a simd comparison operation) into a bitmask.
+    uint64_t to_bitmask();
    // a straightforward comparison of a mask against input.
    uint64_t eq(uint8_t m);
    // find all values less than or equal than the content of maxval (using unsigned arithmetic)
    uint64_t lteq(uint8_t m);
 }; // struct simd_input

+#define MAP_CHUNKS(EXPR) map([&](auto chunk) { return EXPR; })
+#define MAP_BITMASK(EXPR) map([&](auto chunk) { return EXPR; }).to_bitmask()
+
 } // namespace simdjson

 #endif
--- a/src/westmere/simd_input.h
+++ b/src/westmere/simd_input.h
@ -42,7 +42,7 @@ struct simd_input<Architecture::WESTMERE> {

  really_inline uint64_t to_bitmask() {
    uint64_t r0 = static_cast<uint32_t>(_mm_movemask_epi8(this->v0));
-    uint64_t r1 =                       _mm_movemask_epi8(this->v0);
+    uint64_t r1 =                       _mm_movemask_epi8(this->v1);
    uint64_t r2 =                       _mm_movemask_epi8(this->v2);
    uint64_t r3 =                       _mm_movemask_epi8(this->v3);
    return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48);
@ -50,16 +50,12 @@ struct simd_input<Architecture::WESTMERE> {

  really_inline uint64_t eq(uint8_t m) {
    const __m128i mask = _mm_set1_epi8(m);
-    return this->map([&](auto chunk) {
-      return _mm_cmpeq_epi8(chunk, mask);
-    }).to_bitmask();
+    return this->MAP_BITMASK( _mm_cmpeq_epi8(chunk, mask) );
  }

  really_inline uint64_t lteq(uint8_t m) {
    const __m128i maxval = _mm_set1_epi8(m);
-    return this->map([&](auto chunk) {
-      return _mm_cmpeq_epi8(_mm_max_epu8(maxval, chunk), maxval);
-    }).to_bitmask();
+    return this->MAP_BITMASK( _mm_cmpeq_epi8(_mm_max_epu8(maxval, chunk), maxval) );
  }

 }; // struct simd_input
--- a/src/westmere/stage1_find_marks.h
+++ b/src/westmere/stage1_find_marks.h
@ -28,16 +28,17 @@ really_inline void find_whitespace_and_structurals(simd_input<ARCHITECTURE> in,
  const __m128i struct_offset = _mm_set1_epi8(0xd4u);
  const __m128i struct_mask = _mm_set1_epi8(32);

-  whitespace = in.map([&](auto chunk) {
-      return _mm_cmpeq_epi8(chunk, _mm_shuffle_epi8(white_table, chunk));
-  }).to_bitmask();
+  whitespace = in.MAP_BITMASK( _mm_cmpeq_epi8(chunk, _mm_shuffle_epi8(white_table, chunk)) );

-  structurals = in.map([&](auto chunk) {
-    __m128i struct_r1 = _mm_add_epi8(struct_offset, chunk);
-    __m128i struct_r2 = _mm_or_si128(chunk, struct_mask);
-    __m128i struct_r3 = _mm_shuffle_epi8(structural_table, struct_r1);
-    return _mm_cmpeq_epi8(struct_r2, struct_r3);
-  }).to_bitmask();
+  auto r1 = in.MAP_CHUNKS( _mm_add_epi8(struct_offset, chunk) );
+  auto r2 = in.MAP_CHUNKS( _mm_or_si128(chunk, struct_mask) );
+  auto r3 = r1.MAP_CHUNKS( _mm_shuffle_epi8(structural_table, chunk) );
+  structurals = simd_input<ARCHITECTURE>(
+    _mm_cmpeq_epi8(r2.v0, r3.v0),
+    _mm_cmpeq_epi8(r2.v1, r3.v1),
+    _mm_cmpeq_epi8(r2.v2, r3.v2),
+    _mm_cmpeq_epi8(r2.v3, r3.v3)
+  ).to_bitmask();
 }

 #include "generic/stage1_find_marks_flatten.h"