Add AMD64 build_bitmask

John Keiser 2019-08-23 10:49:26 -07:00
parent cf4ae61ac6
commit 441963c84c
5 changed files with 92 additions and 93 deletions
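The diff below factors the repeated four-chunk NEON code in simd_input<Architecture::ARM64> into two helpers: build_bitmask(), which applies a caller-supplied per-chunk comparison and folds the four 16-byte results into one 64-bit bitmask via neon_movemask_bulk(), and map(), which applies a per-chunk transform and returns a new simd_input. eq(), lteq(), and find_whitespace_and_structurals() then reduce to short lambdas. As a rough, portable sketch of the same pattern (illustrative only, not part of this commit), the chunks below are plain 16-byte arrays rather than uint8x16_t registers and the fold is a simple shift-and-or rather than neon_movemask_bulk():

/* Illustrative sketch only -- not part of this commit. It mirrors the new
 * build_bitmask() helper with plain 16-byte arrays instead of uint8x16_t
 * NEON registers, so it compiles and runs anywhere. */
#include <cstdint>
#include <cstdio>
#include <cstring>

struct chunked_input {
  uint8_t chunk[4][16]; // 64 input bytes, split into four 16-byte chunks

  // Apply chunk_to_mask to each chunk (it returns a 16-bit mask, one bit per
  // byte) and concatenate the four results into a single 64-bit bitmask.
  template <typename F>
  uint64_t build_bitmask(F const &chunk_to_mask) const {
    uint64_t r0 = chunk_to_mask(chunk[0]);
    uint64_t r1 = chunk_to_mask(chunk[1]);
    uint64_t r2 = chunk_to_mask(chunk[2]);
    uint64_t r3 = chunk_to_mask(chunk[3]);
    return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48);
  }

  // eq() becomes a one-liner over build_bitmask, as in the ARM64 diff below.
  uint64_t eq(uint8_t m) const {
    return build_bitmask([&](const uint8_t *c) {
      uint64_t mask = 0;
      for (int i = 0; i < 16; i++) {
        if (c[i] == m) { mask |= uint64_t(1) << i; }
      }
      return mask;
    });
  }
};

int main() {
  chunked_input in;
  std::memset(in.chunk, 0, sizeof(in.chunk));
  in.chunk[0][0] = '{';  // sets bit 0 of the result
  in.chunk[3][15] = '{'; // sets bit 63 of the result
  std::printf("0x%016llx\n", (unsigned long long)in.eq('{')); // 0x8000000000000001
  return 0;
}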

View File

@@ -1,4 +1,4 @@
/* auto-generated on Fri Aug 23 10:23:28 DST 2019. Do not edit! */
/* auto-generated on Fri Aug 23 11:02:39 DST 2019. Do not edit! */
#include <iostream>
#include "simdjson.h"

View File

@@ -1,4 +1,4 @@
/* auto-generated on Fri Aug 23 10:23:28 DST 2019. Do not edit! */
/* auto-generated on Fri Aug 23 11:02:39 DST 2019. Do not edit! */
#include "simdjson.h"
/* used for http://dmalloc.com/ Dmalloc - Debug Malloc Library */
@@ -574,22 +574,38 @@ struct simd_input<Architecture::ARM64> {
this->i3 = vld1q_u8(ptr + 48);
}
template <typename F>
really_inline uint64_t build_bitmask(F const& chunk_to_mask) {
uint8x16_t r0 = chunk_to_mask(this->i0);
uint8x16_t r1 = chunk_to_mask(this->i1);
uint8x16_t r2 = chunk_to_mask(this->i2);
uint8x16_t r3 = chunk_to_mask(this->i3);
return neon_movemask_bulk(r0, r1, r2, r3);
}
template <typename F>
really_inline simd_input<Architecture::ARM64> map(F const& map_chunk) {
simd_input<Architecture::ARM64> result = {
map_chunk(this->i0),
map_chunk(this->i1),
map_chunk(this->i2),
map_chunk(this->i3)
};
return result;
}
really_inline uint64_t eq(uint8_t m) {
const uint8x16_t mask = vmovq_n_u8(m);
uint8x16_t cmp_res_0 = vceqq_u8(this->i0, mask);
uint8x16_t cmp_res_1 = vceqq_u8(this->i1, mask);
uint8x16_t cmp_res_2 = vceqq_u8(this->i2, mask);
uint8x16_t cmp_res_3 = vceqq_u8(this->i3, mask);
return neon_movemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3);
return this->build_bitmask([&](uint8x16_t chunk) {
return vceqq_u8(chunk, mask);
});
}
really_inline uint64_t lteq(uint8_t m) {
const uint8x16_t mask = vmovq_n_u8(m);
uint8x16_t cmp_res_0 = vcleq_u8(this->i0, mask);
uint8x16_t cmp_res_1 = vcleq_u8(this->i1, mask);
uint8x16_t cmp_res_2 = vcleq_u8(this->i2, mask);
uint8x16_t cmp_res_3 = vcleq_u8(this->i3, mask);
return neon_movemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3);
return this->build_bitmask([&](uint8x16_t chunk) {
return vcleq_u8(chunk, mask);
});
}
}; // struct simd_input
@@ -1467,45 +1483,25 @@ really_inline void find_whitespace_and_structurals(
const uint8x16_t low_nibble_mask =
(uint8x16_t){16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0};
const uint8x16_t high_nibble_mask =
(uint8x16_t){8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0};
const uint8x16_t structural_shufti_mask = vmovq_n_u8(0x7);
const uint8x16_t whitespace_shufti_mask = vmovq_n_u8(0x18);
const uint8x16_t low_nib_and_mask = vmovq_n_u8(0xf);
uint8x16_t nib_0_lo = vandq_u8(in.i0, low_nib_and_mask);
uint8x16_t nib_0_hi = vshrq_n_u8(in.i0, 4);
uint8x16_t shuf_0_lo = vqtbl1q_u8(low_nibble_mask, nib_0_lo);
uint8x16_t shuf_0_hi = vqtbl1q_u8(high_nibble_mask, nib_0_hi);
uint8x16_t v_0 = vandq_u8(shuf_0_lo, shuf_0_hi);
simd_input<ARCHITECTURE> v = in.map([&](auto chunk) {
uint8x16_t nib_lo = vandq_u8(chunk, low_nib_and_mask);
uint8x16_t nib_hi = vshrq_n_u8(chunk, 4);
uint8x16_t shuf_lo = vqtbl1q_u8(low_nibble_mask, nib_lo);
uint8x16_t shuf_hi = vqtbl1q_u8(high_nibble_mask, nib_hi);
return vandq_u8(shuf_lo, shuf_hi);
});
uint8x16_t nib_1_lo = vandq_u8(in.i1, low_nib_and_mask);
uint8x16_t nib_1_hi = vshrq_n_u8(in.i1, 4);
uint8x16_t shuf_1_lo = vqtbl1q_u8(low_nibble_mask, nib_1_lo);
uint8x16_t shuf_1_hi = vqtbl1q_u8(high_nibble_mask, nib_1_hi);
uint8x16_t v_1 = vandq_u8(shuf_1_lo, shuf_1_hi);
const uint8x16_t structural_shufti_mask = vmovq_n_u8(0x7);
structurals = v.build_bitmask([&](auto chunk) {
return vtstq_u8(chunk, structural_shufti_mask);
});
uint8x16_t nib_2_lo = vandq_u8(in.i2, low_nib_and_mask);
uint8x16_t nib_2_hi = vshrq_n_u8(in.i2, 4);
uint8x16_t shuf_2_lo = vqtbl1q_u8(low_nibble_mask, nib_2_lo);
uint8x16_t shuf_2_hi = vqtbl1q_u8(high_nibble_mask, nib_2_hi);
uint8x16_t v_2 = vandq_u8(shuf_2_lo, shuf_2_hi);
uint8x16_t nib_3_lo = vandq_u8(in.i3, low_nib_and_mask);
uint8x16_t nib_3_hi = vshrq_n_u8(in.i3, 4);
uint8x16_t shuf_3_lo = vqtbl1q_u8(low_nibble_mask, nib_3_lo);
uint8x16_t shuf_3_hi = vqtbl1q_u8(high_nibble_mask, nib_3_hi);
uint8x16_t v_3 = vandq_u8(shuf_3_lo, shuf_3_hi);
uint8x16_t tmp_0 = vtstq_u8(v_0, structural_shufti_mask);
uint8x16_t tmp_1 = vtstq_u8(v_1, structural_shufti_mask);
uint8x16_t tmp_2 = vtstq_u8(v_2, structural_shufti_mask);
uint8x16_t tmp_3 = vtstq_u8(v_3, structural_shufti_mask);
structurals = neon_movemask_bulk(tmp_0, tmp_1, tmp_2, tmp_3);
uint8x16_t tmp_ws_0 = vtstq_u8(v_0, whitespace_shufti_mask);
uint8x16_t tmp_ws_1 = vtstq_u8(v_1, whitespace_shufti_mask);
uint8x16_t tmp_ws_2 = vtstq_u8(v_2, whitespace_shufti_mask);
uint8x16_t tmp_ws_3 = vtstq_u8(v_3, whitespace_shufti_mask);
whitespace = neon_movemask_bulk(tmp_ws_0, tmp_ws_1, tmp_ws_2, tmp_ws_3);
const uint8x16_t whitespace_shufti_mask = vmovq_n_u8(0x18);
whitespace = v.build_bitmask([&](auto chunk) {
return vtstq_u8(chunk, whitespace_shufti_mask);
});
}
// This file contains a non-architecture-specific version of "flatten" used in stage1.

View File

@@ -1,4 +1,4 @@
/* auto-generated on Fri Aug 23 10:23:28 DST 2019. Do not edit! */
/* auto-generated on Fri Aug 23 11:02:39 DST 2019. Do not edit! */
/* begin file include/simdjson/simdjson_version.h */
// /include/simdjson/simdjson_version.h automatically generated by release.py,
// do not change by hand

View File

@@ -46,22 +46,45 @@ struct simd_input<Architecture::ARM64> {
this->i3 = vld1q_u8(ptr + 48);
}
really_inline simd_input(uint8x16_t i0, uint8x16_t i1, uint8x16_t i2, uint8x16_t i3) {
this->i0 = i0;
this->i1 = i1;
this->i2 = i2;
this->i3 = i3;
}
template <typename F>
really_inline uint64_t build_bitmask(F const& chunk_to_mask) {
uint8x16_t r0 = chunk_to_mask(this->i0);
uint8x16_t r1 = chunk_to_mask(this->i1);
uint8x16_t r2 = chunk_to_mask(this->i2);
uint8x16_t r3 = chunk_to_mask(this->i3);
return neon_movemask_bulk(r0, r1, r2, r3);
}
template <typename F>
really_inline simd_input<Architecture::ARM64> map(F const& map_chunk) {
simd_input<Architecture::ARM64> result = {
map_chunk(this->i0),
map_chunk(this->i1),
map_chunk(this->i2),
map_chunk(this->i3)
};
return result;
}
really_inline uint64_t eq(uint8_t m) {
const uint8x16_t mask = vmovq_n_u8(m);
uint8x16_t cmp_res_0 = vceqq_u8(this->i0, mask);
uint8x16_t cmp_res_1 = vceqq_u8(this->i1, mask);
uint8x16_t cmp_res_2 = vceqq_u8(this->i2, mask);
uint8x16_t cmp_res_3 = vceqq_u8(this->i3, mask);
return neon_movemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3);
return this->build_bitmask([&](uint8x16_t chunk) {
return vceqq_u8(chunk, mask);
});
}
really_inline uint64_t lteq(uint8_t m) {
const uint8x16_t mask = vmovq_n_u8(m);
uint8x16_t cmp_res_0 = vcleq_u8(this->i0, mask);
uint8x16_t cmp_res_1 = vcleq_u8(this->i1, mask);
uint8x16_t cmp_res_2 = vcleq_u8(this->i2, mask);
uint8x16_t cmp_res_3 = vcleq_u8(this->i3, mask);
return neon_movemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3);
return this->build_bitmask([&](uint8x16_t chunk) {
return vcleq_u8(chunk, mask);
});
}
}; // struct simd_input

View File

@@ -28,45 +28,25 @@ really_inline void find_whitespace_and_structurals(
const uint8x16_t low_nibble_mask =
(uint8x16_t){16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0};
const uint8x16_t high_nibble_mask =
(uint8x16_t){8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0};
const uint8x16_t structural_shufti_mask = vmovq_n_u8(0x7);
const uint8x16_t whitespace_shufti_mask = vmovq_n_u8(0x18);
const uint8x16_t low_nib_and_mask = vmovq_n_u8(0xf);
uint8x16_t nib_0_lo = vandq_u8(in.i0, low_nib_and_mask);
uint8x16_t nib_0_hi = vshrq_n_u8(in.i0, 4);
uint8x16_t shuf_0_lo = vqtbl1q_u8(low_nibble_mask, nib_0_lo);
uint8x16_t shuf_0_hi = vqtbl1q_u8(high_nibble_mask, nib_0_hi);
uint8x16_t v_0 = vandq_u8(shuf_0_lo, shuf_0_hi);
simd_input<ARCHITECTURE> v = in.map([&](auto chunk) {
uint8x16_t nib_lo = vandq_u8(chunk, low_nib_and_mask);
uint8x16_t nib_hi = vshrq_n_u8(chunk, 4);
uint8x16_t shuf_lo = vqtbl1q_u8(low_nibble_mask, nib_lo);
uint8x16_t shuf_hi = vqtbl1q_u8(high_nibble_mask, nib_hi);
return vandq_u8(shuf_lo, shuf_hi);
});
uint8x16_t nib_1_lo = vandq_u8(in.i1, low_nib_and_mask);
uint8x16_t nib_1_hi = vshrq_n_u8(in.i1, 4);
uint8x16_t shuf_1_lo = vqtbl1q_u8(low_nibble_mask, nib_1_lo);
uint8x16_t shuf_1_hi = vqtbl1q_u8(high_nibble_mask, nib_1_hi);
uint8x16_t v_1 = vandq_u8(shuf_1_lo, shuf_1_hi);
const uint8x16_t structural_shufti_mask = vmovq_n_u8(0x7);
structurals = v.build_bitmask([&](auto chunk) {
return vtstq_u8(chunk, structural_shufti_mask);
});
uint8x16_t nib_2_lo = vandq_u8(in.i2, low_nib_and_mask);
uint8x16_t nib_2_hi = vshrq_n_u8(in.i2, 4);
uint8x16_t shuf_2_lo = vqtbl1q_u8(low_nibble_mask, nib_2_lo);
uint8x16_t shuf_2_hi = vqtbl1q_u8(high_nibble_mask, nib_2_hi);
uint8x16_t v_2 = vandq_u8(shuf_2_lo, shuf_2_hi);
uint8x16_t nib_3_lo = vandq_u8(in.i3, low_nib_and_mask);
uint8x16_t nib_3_hi = vshrq_n_u8(in.i3, 4);
uint8x16_t shuf_3_lo = vqtbl1q_u8(low_nibble_mask, nib_3_lo);
uint8x16_t shuf_3_hi = vqtbl1q_u8(high_nibble_mask, nib_3_hi);
uint8x16_t v_3 = vandq_u8(shuf_3_lo, shuf_3_hi);
uint8x16_t tmp_0 = vtstq_u8(v_0, structural_shufti_mask);
uint8x16_t tmp_1 = vtstq_u8(v_1, structural_shufti_mask);
uint8x16_t tmp_2 = vtstq_u8(v_2, structural_shufti_mask);
uint8x16_t tmp_3 = vtstq_u8(v_3, structural_shufti_mask);
structurals = neon_movemask_bulk(tmp_0, tmp_1, tmp_2, tmp_3);
uint8x16_t tmp_ws_0 = vtstq_u8(v_0, whitespace_shufti_mask);
uint8x16_t tmp_ws_1 = vtstq_u8(v_1, whitespace_shufti_mask);
uint8x16_t tmp_ws_2 = vtstq_u8(v_2, whitespace_shufti_mask);
uint8x16_t tmp_ws_3 = vtstq_u8(v_3, whitespace_shufti_mask);
whitespace = neon_movemask_bulk(tmp_ws_0, tmp_ws_1, tmp_ws_2, tmp_ws_3);
const uint8x16_t whitespace_shufti_mask = vmovq_n_u8(0x18);
whitespace = v.build_bitmask([&](auto chunk) {
return vtstq_u8(chunk, whitespace_shufti_mask);
});
}
#include "generic/stage1_find_marks_flatten.h"