Use generic simd operators for find_whitespace_and_operators

parent c89d6bf68b
commit e383b7a6ab
@@ -30,7 +30,7 @@ namespace simdjson::arm64::simd {
   really_inline simd8<T> operator&(const simd8<T> other) const { return vandq_u8(*this, other); }
   really_inline simd8<T> operator^(const simd8<T> other) const { return veorq_u8(*this, other); }
   really_inline simd8<T> bit_andnot(const simd8<T> other) const { return vbicq_u8(*this, other); }
-  really_inline simd8<T> operator~() const { return this ^ 0xFFu; }
+  really_inline simd8<T> operator~() const { return *this ^ 0xFFu; }
   really_inline simd8<T>& operator|=(const simd8<T> other) { auto this_cast = (simd8<T>*)this; *this_cast = *this_cast | other; return *this_cast; }
   really_inline simd8<T>& operator&=(const simd8<T> other) { auto this_cast = (simd8<T>*)this; *this_cast = *this_cast & other; return *this_cast; }
   really_inline simd8<T>& operator^=(const simd8<T> other) { auto this_cast = (simd8<T>*)this; *this_cast = *this_cast ^ other; return *this_cast; }

@@ -107,16 +107,17 @@ namespace simdjson::arm64::simd {
   really_inline simd8<bool> operator<=(const simd8<uint8_t> other) const { return vcleq_u8(*this, other); }

   // Bit-specific operations
-  really_inline bool any_bits_set() const { return vmaxvq_u8(*this) != 0; }
-  really_inline bool any_bits_set(simd8<uint8_t> bits) const { return (*this & bits).any_bits_set(); }
+  really_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const { return vtstq_u8(*this, bits); }
+  really_inline bool any_bits_set_anywhere() const { return vmaxvq_u8(*this) != 0; }
+  really_inline bool any_bits_set_anywhere(simd8<uint8_t> bits) const { return (*this & bits).any_bits_set_anywhere(); }
   template<int N>
   really_inline simd8<uint8_t> shr() const { return vshrq_n_u8(*this, N); }
   template<int N>
   really_inline simd8<uint8_t> shl() const { return vshlq_n_u8(*this, N); }

-  // Perform a lookup of the lower 4 bits
+  // Perform a lookup assuming no value is larger than 16
   template<typename L>
-  really_inline simd8<L> lookup4(
+  really_inline simd8<L> lookup_16(
     L replace0, L replace1, L replace2, L replace3,
     L replace4, L replace5, L replace6, L replace7,
     L replace8, L replace9, L replace10, L replace11,
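Note (not part of the patch): this hunk splits the old any_bits_set into two distinct operations. any_bits_set(bits) now returns a per-lane mask (a simd8<bool>, via vtstq_u8), while the old whole-register test is renamed any_bits_set_anywhere() and still returns a single bool. A minimal scalar model of the two semantics, purely for illustration:

#include <cstdint>
#include <cstddef>

// Scalar model of the two operations on a 16-byte "register" (illustrative only;
// the real code works on NEON/SSE/AVX2 registers, not arrays).
constexpr size_t LANES = 16;

// any_bits_set(bits): per-lane test, produces a lane-wise boolean mask
// (all-ones where the lane has any of `bits` set, zero otherwise).
void any_bits_set(const uint8_t in[LANES], uint8_t bits, uint8_t mask_out[LANES]) {
  for (size_t i = 0; i < LANES; i++) {
    mask_out[i] = (in[i] & bits) ? 0xFF : 0x00;   // mirrors vtstq_u8 on NEON
  }
}

// any_bits_set_anywhere(bits): whole-register test, collapses to one bool.
bool any_bits_set_anywhere(const uint8_t in[LANES], uint8_t bits) {
  for (size_t i = 0; i < LANES; i++) {
    if (in[i] & bits) { return true; }            // mirrors vmaxvq_u8(...) != 0 after masking
  }
  return false;
}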
@@ -127,10 +128,25 @@ namespace simdjson::arm64::simd {
     replace8, replace9, replace10, replace11,
     replace12, replace13, replace14, replace15
   );
-  return lookup_table.apply_lookup4_to(*this);
+  return lookup_table.apply_lookup_16_to(*this);
  }

- really_inline simd8<uint8_t> apply_lookup4_to(const simd8<uint8_t> original) {
+ // Perform a lookup of the lower 4 bits
+ template<typename L>
+ really_inline simd8<L> lookup_lower_4_bits(
+   L replace0, L replace1, L replace2, L replace3,
+   L replace4, L replace5, L replace6, L replace7,
+   L replace8, L replace9, L replace10, L replace11,
+   L replace12, L replace13, L replace14, L replace15) const {
+   return (*this & 0xF).lookup_16(
+     replace0, replace1, replace2, replace3,
+     replace4, replace5, replace6, replace7,
+     replace8, replace9, replace10, replace11,
+     replace12, replace13, replace14, replace15
+   );
+ }
+
+ really_inline simd8<uint8_t> apply_lookup_16_to(const simd8<uint8_t> original) {
    return vqtbl1q_u8(*this, original);
  }
 };
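Note (not part of the patch): on NEON, vqtbl1q_u8 already yields 0 for any index of 16 or more, so lookup_16 can feed indices straight to the table, while lookup_lower_4_bits first masks with 0xF so only the low nibble selects an entry. A scalar sketch of the distinction, with illustrative helper names:

#include <cstdint>

// lookup_16: mirrors TBL's out-of-range-to-zero rule on ARM.
uint8_t lookup_16_scalar(const uint8_t table[16], uint8_t idx) {
  return (idx < 16) ? table[idx] : 0;
}

// lookup_lower_4_bits: the high nibble is ignored entirely.
uint8_t lookup_lower_4_bits_scalar(const uint8_t table[16], uint8_t idx) {
  return table[idx & 0xF];
}

On x86 the roles are reversed: pshufb natively ignores the upper nibble (and zeroes a lane when bit 7 of the index is set), so there lookup_lower_4_bits wraps the shuffle directly and lookup_16 simply forwards to it, as the Haswell and Westmere hunks below show.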
@@ -187,12 +203,12 @@ namespace simdjson::arm64::simd {

  // Perform a lookup of the lower 4 bits
  template<typename L>
- really_inline simd8<L> lookup4(
+ really_inline simd8<L> lookup_16(
    L replace0, L replace1, L replace2, L replace3,
    L replace4, L replace5, L replace6, L replace7,
    L replace8, L replace9, L replace10, L replace11,
    L replace12, L replace13, L replace14, L replace15) const {
-   return simd8<uint8_t>(*this).lookup4(
+   return simd8<uint8_t>(*this).lookup_16(
      replace0, replace1, replace2, replace3,
      replace4, replace5, replace6, replace7,
      replace8, replace9, replace10, replace11,

@@ -200,7 +216,7 @@ namespace simdjson::arm64::simd {
    );
  }

- really_inline simd8<int8_t> apply_lookup4_to(const simd8<uint8_t> original) {
+ really_inline simd8<int8_t> apply_lookup_16_to(const simd8<uint8_t> original) {
    return vqtbl1q_s8(*this, original);
  }
 };

@@ -222,7 +238,7 @@ namespace simdjson::arm64::simd {
      each_chunk(this->chunks[3]);
    }

-   template <typename F, typename R=bool>
+   template <typename R=bool, typename F>
    really_inline simd8x64<R> map(F const& map_chunk) const {
      return simd8x64<R>(
        map_chunk(this->chunks[0]),

@@ -232,7 +248,7 @@ namespace simdjson::arm64::simd {
      );
    }

-   template <typename F, typename R=bool>
+   template <typename R=bool, typename F>
    really_inline simd8x64<R> map(const simd8x64<T> b, F const& map_chunk) const {
      return simd8x64<R>(
        map_chunk(this->chunks[0], b.chunks[0]),
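Note (not part of the patch): swapping the template parameters of map, from <typename F, typename R=bool> to <typename R=bool, typename F>, lets a caller name the chunk result type explicitly while the functor type is still deduced, which is what the new in.map<uint8_t>(...) call in find_whitespace_and_operators relies on. A self-contained sketch of the ordering issue, with made-up names:

#include <array>
#include <cstdint>

// Standalone illustration of why the result type moved in front of the functor type.
template <typename R = bool, typename F>
std::array<R, 4> map_chunks(const std::array<uint8_t, 4>& chunks, F const& map_chunk) {
  return { map_chunk(chunks[0]), map_chunk(chunks[1]), map_chunk(chunks[2]), map_chunk(chunks[3]) };
}

int main() {
  std::array<uint8_t, 4> chunks = {1, 2, 3, 4};
  // R can be named explicitly while F is deduced from the lambda:
  auto nibbles = map_chunks<uint8_t>(chunks, [](uint8_t c) { return uint8_t(c & 0x0F); });
  // With the old ordering template <typename F, typename R = bool>, the explicit
  // argument in map_chunks<uint8_t>(...) would bind to F (the functor type), so the
  // caller could not choose R without spelling out the unnameable lambda type first.
  return nibbles[0] == 1 ? 0 : 1;
}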
@@ -10,6 +10,8 @@

 namespace simdjson::arm64 {

+using namespace simd;
+
 really_inline uint64_t compute_quote_mask(const uint64_t quote_bits) {

 #ifdef __ARM_FEATURE_CRYPTO // some ARM processors lack this extension

@@ -20,31 +22,19 @@ really_inline uint64_t compute_quote_mask(const uint64_t quote_bits) {
 }

 really_inline void find_whitespace_and_operators(
-  const simd::simd8x64<uint8_t> in,
-  uint64_t &whitespace, uint64_t &op) {
-  const uint8x16_t low_nibble_mask =
-      (uint8x16_t){16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0};
-  const uint8x16_t high_nibble_mask =
-      (uint8x16_t){8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0};
-  const uint8x16_t low_nib_and_mask = vmovq_n_u8(0xf);
+    const simd::simd8x64<uint8_t> in,
+    uint64_t &whitespace, uint64_t &op) {
+
-  auto v = in.map([&](auto chunk) {
-    uint8x16_t nib_lo = vandq_u8(chunk, low_nib_and_mask);
-    uint8x16_t nib_hi = vshrq_n_u8(chunk, 4);
-    uint8x16_t shuf_lo = vqtbl1q_u8(low_nibble_mask, nib_lo);
-    uint8x16_t shuf_hi = vqtbl1q_u8(high_nibble_mask, nib_hi);
-    return vandq_u8(shuf_lo, shuf_hi);
+  auto v = in.map<uint8_t>([&](simd8<uint8_t> chunk) {
+    auto nib_lo = chunk & 0xf;
+    auto nib_hi = chunk.shr<4>();
+    auto shuf_lo = nib_lo.lookup_16<uint8_t>(16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0);
+    auto shuf_hi = nib_hi.lookup_16<uint8_t>(8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0);
+    return shuf_lo & shuf_hi;
   });

-  const uint8x16_t operator_shufti_mask = vmovq_n_u8(0x7);
-  op = v.map([&](auto _v) {
-    return vtstq_u8(_v, operator_shufti_mask);
-  }).to_bitmask();
-
-  const uint8x16_t whitespace_shufti_mask = vmovq_n_u8(0x18);
-  whitespace = v.map([&](auto _v) {
-    return vtstq_u8(_v, whitespace_shufti_mask);
-  }).to_bitmask();
+  op = v.map([&](simd8<uint8_t> _v) { return _v.any_bits_set(0x7); }).to_bitmask();
+  whitespace = v.map([&](simd8<uint8_t> _v) { return _v.any_bits_set(0x18); }).to_bitmask();
 }

 #include "generic/simdutf8check.h"
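Note (not part of the patch): the rewrite keeps the classic two-table nibble classification, only the spelling changes. Each byte's low nibble and high nibble index a 16-entry table, the two lookups are ANDed, and the surviving bits classify the byte: bits 0x7 mark structural characters, bits 0x18 mark whitespace. A scalar restatement of the same classification, handy for checking the tables (helper names are illustrative):

#include <cstdint>
#include <cstdio>

static const uint8_t low_nibble_table[16]  = {16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0};
static const uint8_t high_nibble_table[16] = {8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0};

static uint8_t classify(uint8_t c) {
  return low_nibble_table[c & 0xF] & high_nibble_table[c >> 4];
}
static bool is_operator(uint8_t c)   { return (classify(c) & 0x07) != 0; }
static bool is_whitespace(uint8_t c) { return (classify(c) & 0x18) != 0; }

int main() {
  // Structural characters and JSON whitespace land in the expected classes:
  printf("',' operator: %d\n", is_operator(','));            // 1
  printf("'{' operator: %d\n", is_operator('{'));            // 1
  printf("':' operator: %d\n", is_operator(':'));            // 1
  printf("' ' whitespace: %d\n", is_whitespace(' '));        // 1
  printf("'\\n' whitespace: %d\n", is_whitespace('\n'));     // 1
  printf("'a' neither: %d %d\n", is_operator('a'), is_whitespace('a')); // 0 0
  return 0;
}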
@@ -36,7 +36,7 @@ struct utf8_checker {
  }

  really_inline simd8<int8_t> continuation_lengths(simd8<int8_t> high_nibbles) {
-   return high_nibbles.lookup4<int8_t>(
+   return high_nibbles.lookup_16<int8_t>(
      1, 1, 1, 1, 1, 1, 1, 1, // 0xxx (ASCII)
      0, 0, 0, 0,             // 10xx (continuation)
      2, 2,                   // 110x

@@ -102,7 +102,7 @@ struct utf8_checker {
    // Two-byte characters must start with at least C2
    // Three-byte characters must start with at least E1
    // Four-byte characters must start with at least F1
-   simd8<int8_t> initial_mins = off1_high_nibbles.lookup4<int8_t>(
+   simd8<int8_t> initial_mins = off1_high_nibbles.lookup_16<int8_t>(
      -128, -128, -128, -128, -128, -128, -128, -128, // 0xxx -> false
      -128, -128, -128, -128, // 10xx -> false
      0xC2, -128, // 1100 -> C2

@@ -114,7 +114,7 @@ struct utf8_checker {
    // Two-byte characters starting with at least C2 are always OK
    // Three-byte characters starting with at least E1 must be followed by at least A0
    // Four-byte characters starting with at least F1 must be followed by at least 90
-   simd8<int8_t> second_mins = off1_high_nibbles.lookup4<int8_t>(
+   simd8<int8_t> second_mins = off1_high_nibbles.lookup_16<int8_t>(
      -128, -128, -128, -128, -128, -128, -128, -128, -128, // 0xxx => false
      -128, -128, -128, // 10xx => false
      127, 127, // 110x => true

@@ -152,7 +152,7 @@ struct utf8_checker {
  }

  really_inline void check_next_input(simd8<uint8_t> in) {
-   if (likely(!in.any_bits_set(0x80u))) {
+   if (likely(!in.any_bits_set_anywhere(0x80u))) {
      this->check_carried_continuations();
    } else {
      this->check_utf8_bytes(in);

@@ -161,7 +161,7 @@ struct utf8_checker {

  really_inline void check_next_input(simd8x64<uint8_t> in) {
    simd8<uint8_t> bits = in.reduce([&](auto a, auto b) { return a | b; });
-   if (likely(!bits.any_bits_set(0x80u))) {
+   if (likely(!bits.any_bits_set_anywhere(0x80u))) {
      // it is ascii, we just check carried continuations.
      this->check_carried_continuations();
    } else {

@@ -171,6 +171,6 @@ struct utf8_checker {
  }

  really_inline ErrorValues errors() {
-   return this->has_error.any_bits_set() ? simdjson::UTF8_ERROR : simdjson::SUCCESS;
+   return this->has_error.any_bits_set_anywhere() ? simdjson::UTF8_ERROR : simdjson::SUCCESS;
  }
 }; // struct utf8_checker
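Note (not part of the patch): the renames in the UTF-8 checker are mechanical, but they make the ASCII fast path read naturally as a whole-register question: OR the input chunks down to one register, then ask whether bit 0x80 is set anywhere. A scalar sketch of that fast path, operating on a plain buffer:

#include <cstdint>
#include <cstddef>

// Scalar sketch of the ASCII fast-path test: OR the block together, then ask
// whether the high bit is set anywhere (the question any_bits_set_anywhere(0x80u)
// answers in the SIMD code).
bool block_is_ascii(const uint8_t *buf, size_t len) {
  uint8_t bits = 0;
  for (size_t i = 0; i < len; i++) {
    bits |= buf[i];                 // mirrors in.reduce([](a, b) { return a | b; })
  }
  return (bits & 0x80u) == 0;       // no byte had its top bit set
}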
@@ -29,7 +29,7 @@ namespace simdjson::haswell::simd {
  really_inline Child operator&(const Child other) const { return _mm256_and_si256(*this, other); }
  really_inline Child operator^(const Child other) const { return _mm256_xor_si256(*this, other); }
  really_inline Child bit_andnot(const Child other) const { return _mm256_andnot_si256(*this, other); }
- really_inline Child operator~() const { return this ^ 0xFFu; }
+ really_inline Child operator~() const { return *this ^ 0xFFu; }
  really_inline Child& operator|=(const Child other) { auto this_cast = (Child*)this; *this_cast = *this_cast | other; return *this_cast; }
  really_inline Child& operator&=(const Child other) { auto this_cast = (Child*)this; *this_cast = *this_cast & other; return *this_cast; }
  really_inline Child& operator^=(const Child other) { auto this_cast = (Child*)this; *this_cast = *this_cast ^ other; return *this_cast; }

@@ -88,20 +88,38 @@ namespace simdjson::haswell::simd {

  // Perform a lookup of the lower 4 bits
  template<typename L>
- really_inline simd8<L> lookup4(
+ really_inline simd8<L> lookup_lower_4_bits(
    L replace0, L replace1, L replace2, L replace3,
    L replace4, L replace5, L replace6, L replace7,
    L replace8, L replace9, L replace10, L replace11,
    L replace12, L replace13, L replace14, L replace15) const {
    simd8<L> lookup_table(
-     replace0, replace1, replace2, replace3, replace4, replace5, replace6, replace7,
-     replace8, replace9, replace10, replace11, replace12, replace13, replace14, replace15,
-     replace0, replace1, replace2, replace3, replace4, replace5, replace6, replace7,
-     replace8, replace9, replace10, replace11, replace12, replace13, replace14, replace15
+     replace0, replace1, replace2, replace3,
+     replace4, replace5, replace6, replace7,
+     replace8, replace9, replace10, replace11,
+     replace12, replace13, replace14, replace15,
+     replace0, replace1, replace2, replace3,
+     replace4, replace5, replace6, replace7,
+     replace8, replace9, replace10, replace11,
+     replace12, replace13, replace14, replace15
    );
    return _mm256_shuffle_epi8(lookup_table, *this);
  }

+ // Perform a lookup assuming the value is between 0 and 16
+ template<typename L>
+ really_inline simd8<L> lookup_16(
+   L replace0, L replace1, L replace2, L replace3,
+   L replace4, L replace5, L replace6, L replace7,
+   L replace8, L replace9, L replace10, L replace11,
+   L replace12, L replace13, L replace14, L replace15) const {
+   return lookup_lower_4_bits(
+     replace0, replace1, replace2, replace3,
+     replace4, replace5, replace6, replace7,
+     replace8, replace9, replace10, replace11,
+     replace12, replace13, replace14, replace15
+   );
+ }
 };

 // Signed bytes
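Note (not part of the patch): the AVX2 lookup table repeats the 16 replace values in both 128-bit halves because _mm256_shuffle_epi8 shuffles each half independently; an index byte in the upper lane can only select from the upper 16 table bytes. A scalar model of that per-lane behaviour:

#include <cstdint>
#include <cstddef>

// Scalar model of _mm256_shuffle_epi8 (vpshufb): each output byte is taken from the
// 16-byte lane that the index byte itself lives in; an index with bit 7 set yields 0.
void shuffle_epi8_256_model(const uint8_t table[32], const uint8_t idx[32], uint8_t out[32]) {
  for (size_t i = 0; i < 32; i++) {
    size_t lane_base = (i < 16) ? 0 : 16;             // lanes never cross
    uint8_t b = idx[i];
    out[i] = (b & 0x80) ? 0 : table[lane_base + (b & 0x0F)];
  }
}
// Duplicating the 16 replace values into both halves of lookup_table makes the two
// lanes behave identically, so lookup_lower_4_bits works the same for all 32 bytes.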
@@ -164,8 +182,10 @@ namespace simdjson::haswell::simd {
  really_inline simd8<bool> operator<=(const simd8<uint8_t> other) const { return this->max(other) == other; }

  // Bit-specific operations
- really_inline bool any_bits_set(simd8<uint8_t> bits) const { return !_mm256_testz_si256(*this, bits); }
- really_inline bool any_bits_set() const { return !_mm256_testz_si256(*this, *this); }
+ really_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const { return (*this & bits).any_bits_set(); }
+ really_inline simd8<bool> any_bits_set() const { return ~(*this == u8'\0'); }
+ really_inline bool any_bits_set_anywhere(simd8<uint8_t> bits) const { return !_mm256_testz_si256(*this, bits); }
+ really_inline bool any_bits_set_anywhere() const { return !_mm256_testz_si256(*this, *this); }
  template<int N>
  really_inline simd8<uint8_t> shr() const { return simd8<uint8_t>(_mm256_srli_epi16(*this, N)) & uint8_t(0xFFu >> N); }
  template<int N>

@@ -189,7 +209,7 @@ namespace simdjson::haswell::simd {
    each_chunk(this->chunks[1]);
  }

- template <typename F, typename R=bool>
+ template <typename R=bool, typename F>
  really_inline simd8x64<R> map(F const& map_chunk) const {
    return simd8x64<R>(
      map_chunk(this->chunks[0]),

@@ -197,7 +217,7 @@ namespace simdjson::haswell::simd {
    );
  }

- template <typename F, typename R=bool>
+ template <typename R=bool, typename F>
  really_inline simd8x64<R> map(const simd8x64<uint8_t> b, F const& map_chunk) const {
    return simd8x64<R>(
      map_chunk(this->chunks[0], b.chunks[0]),

@@ -11,6 +11,8 @@
 TARGET_HASWELL
 namespace simdjson::haswell {

+using namespace simd;
+
 really_inline uint64_t compute_quote_mask(const uint64_t quote_bits) {
   // There should be no such thing with a processing supporting avx2
   // but not clmul.

@@ -23,64 +25,13 @@ really_inline void find_whitespace_and_operators(
     const simd::simd8x64<uint8_t> in,
     uint64_t &whitespace, uint64_t &op) {

-#ifdef SIMDJSON_NAIVE_STRUCTURAL
+  whitespace = in.map([&](simd8<uint8_t> _in) {
+    return _in == _in.lookup_lower_4_bits<uint8_t>(' ', 100, 100, 100, 17, 100, 113, 2, 100, '\t', '\n', 112, 100, '\r', 100, 100);
+  }).to_bitmask();
+
-  // You should never need this naive approach, but it can be useful
-  // for research purposes
-  const __m256i mask_open_brace = _mm256_set1_epi8(0x7b);
-  const __m256i mask_close_brace = _mm256_set1_epi8(0x7d);
-  const __m256i mask_open_bracket = _mm256_set1_epi8(0x5b);
-  const __m256i mask_close_bracket = _mm256_set1_epi8(0x5d);
-  const __m256i mask_column = _mm256_set1_epi8(0x3a);
-  const __m256i mask_comma = _mm256_set1_epi8(0x2c);
-  op = in.map([&](auto in) {
-    __m256i op = _mm256_cmpeq_epi8(in, mask_open_brace);
-    op = _mm256_or_si256(op, _mm256_cmpeq_epi8(in, mask_close_brace));
-    op = _mm256_or_si256(op, _mm256_cmpeq_epi8(in, mask_open_bracket));
-    op = _mm256_or_si256(op, _mm256_cmpeq_epi8(in, mask_close_bracket));
-    op = _mm256_or_si256(op, _mm256_cmpeq_epi8(in, mask_column));
-    op = _mm256_or_si256(op, _mm256_cmpeq_epi8(in, mask_comma));
-    return op;
-  }).to_bitmask();
-
-  const __m256i mask_space = _mm256_set1_epi8(0x20);
-  const __m256i mask_linefeed = _mm256_set1_epi8(0x0a);
-  const __m256i mask_tab = _mm256_set1_epi8(0x09);
-  const __m256i mask_carriage = _mm256_set1_epi8(0x0d);
-  whitespace = in.map([&](auto in) {
-    __m256i space = _mm256_cmpeq_epi8(in, mask_space);
-    space = _mm256_or_si256(space, _mm256_cmpeq_epi8(in, mask_linefeed));
-    space = _mm256_or_si256(space, _mm256_cmpeq_epi8(in, mask_tab));
-    space = _mm256_or_si256(space, _mm256_cmpeq_epi8(in, mask_carriage));
-    return space;
-  }).to_bitmask();
-  // end of naive approach
-
-#else // SIMDJSON_NAIVE_STRUCTURAL
-
-  // clang-format off
-  const __m256i operator_table =
-      _mm256_setr_epi8(',', '}', 0, 0, 0xc0u, 0, 0, 0, 0, 0, 0, 0, 0, 0, ':', '{',
-                       ',', '}', 0, 0, 0xc0u, 0, 0, 0, 0, 0, 0, 0, 0, 0, ':', '{');
-  const __m256i white_table = _mm256_setr_epi8(
-      ' ', 100, 100, 100, 17, 100, 113, 2, 100, '\t', '\n', 112, 100, '\r', 100, 100,
-      ' ', 100, 100, 100, 17, 100, 113, 2, 100, '\t', '\n', 112, 100, '\r', 100, 100);
-  // clang-format on
-  const __m256i op_offset = _mm256_set1_epi8(0xd4u);
-  const __m256i op_mask = _mm256_set1_epi8(32);
-
-  whitespace = in.map([&](auto _in) {
-    return _mm256_cmpeq_epi8(_in, _mm256_shuffle_epi8(white_table, _in));
-  }).to_bitmask();
-
-  op = in.map([&](auto _in) {
-    const __m256i r1 = _mm256_add_epi8(op_offset, _in);
-    const __m256i r2 = _in | op_mask;
-    const __m256i r3 = _mm256_shuffle_epi8(operator_table, r1);
-    return _mm256_cmpeq_epi8(r2, r3);
-  }).to_bitmask();
-
-#endif // else SIMDJSON_NAIVE_STRUCTURAL
+  op = in.map([&](simd8<uint8_t> _in) {
+    return (_in | 32) == (_in+0xd4u).lookup_lower_4_bits<uint8_t>(',', '}', 0, 0, 0xc0u, 0, 0, 0, 0, 0, 0, 0, 0, 0, ':', '{');
+  }).to_bitmask();
 }

 #include "generic/simdutf8check.h"
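Note (not part of the patch): with the naive branch gone, each raw-intrinsic sequence collapses into one comparison. The whitespace test compares a byte against a table indexed by its own low nibble (each whitespace character sits at its own low-nibble slot; the filler values can never equal the input). The operator test first adds 0xd4 so that , : [ ] { } land on table slots holding , : { } { }, then compares against _in | 32, which folds [ and ] onto { and }. A scalar check of that arithmetic, with pshufb modeled as in the earlier sketch:

#include <cstdint>
#include <cstdio>

// Scalar check of the operator trick: (_in | 32) == pshufb(operator_table, _in + 0xd4).
static const uint8_t operator_table[16] =
    {',', '}', 0, 0, 0xc0u, 0, 0, 0, 0, 0, 0, 0, 0, 0, ':', '{'};

static uint8_t pshufb_model(const uint8_t table[16], uint8_t idx) {
  return (idx & 0x80) ? 0 : table[idx & 0x0F];   // pshufb: zero if bit 7 set, else low nibble selects
}

static bool is_operator(uint8_t c) {
  return uint8_t(c | 32) == pshufb_model(operator_table, uint8_t(c + 0xd4u));
}

int main() {
  const char structural[] = {',', ':', '[', ']', '{', '}'};
  for (char c : structural) {
    printf("'%c' -> %d\n", c, is_operator(uint8_t(c)));   // all 1
  }
  printf("'a' -> %d\n", is_operator('a'));                // 0
  printf("' ' -> %d\n", is_operator(' '));                // 0
  return 0;
}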
@@ -29,7 +29,7 @@ namespace simdjson::westmere::simd {
  really_inline Child operator&(const Child other) const { return _mm_and_si128(*this, other); }
  really_inline Child operator^(const Child other) const { return _mm_xor_si128(*this, other); }
  really_inline Child bit_andnot(const Child other) const { return _mm_andnot_si128(*this, other); }
- really_inline Child operator~() const { return this ^ 0xFFu; }
+ really_inline Child operator~() const { return *this ^ 0xFFu; }
  really_inline Child& operator|=(const Child other) { auto this_cast = (Child*)this; *this_cast = *this_cast | other; return *this_cast; }
  really_inline Child& operator&=(const Child other) { auto this_cast = (Child*)this; *this_cast = *this_cast & other; return *this_cast; }
  really_inline Child& operator^=(const Child other) { auto this_cast = (Child*)this; *this_cast = *this_cast ^ other; return *this_cast; }

@@ -67,7 +67,7 @@ namespace simdjson::westmere::simd {
  really_inline simd8<bool>(bool _value) : base8<bool>(splat(_value)) {}

  really_inline bitmask_t to_bitmask() const { return _mm_movemask_epi8(*this); }
- really_inline bool any() const { return !_mm_testz_si128(*this, *this) == 0; }
+ really_inline bool any() const { return !_mm_testz_si128(*this, *this); }
 };

 template<typename T>

@@ -89,7 +89,7 @@ namespace simdjson::westmere::simd {

  // Perform a lookup of the lower 4 bits
  template<typename L>
- really_inline simd8<L> lookup4(
+ really_inline simd8<L> lookup_lower_4_bits(
    L replace0, L replace1, L replace2, L replace3,
    L replace4, L replace5, L replace6, L replace7,
    L replace8, L replace9, L replace10, L replace11,

@@ -103,6 +103,21 @@ namespace simdjson::westmere::simd {
    );
    return _mm_shuffle_epi8(lookup_table, *this);
  }
+
+ // Perform a lookup assuming the value is between 0 and 16
+ template<typename L>
+ really_inline simd8<L> lookup_16(
+   L replace0, L replace1, L replace2, L replace3,
+   L replace4, L replace5, L replace6, L replace7,
+   L replace8, L replace9, L replace10, L replace11,
+   L replace12, L replace13, L replace14, L replace15) const {
+   return lookup_lower_4_bits(
+     replace0, replace1, replace2, replace3,
+     replace4, replace5, replace6, replace7,
+     replace8, replace9, replace10, replace11,
+     replace12, replace13, replace14, replace15
+   );
+ }
 };

 // Signed bytes

@@ -157,8 +172,10 @@ namespace simdjson::westmere::simd {
  really_inline simd8<bool> operator<=(const simd8<uint8_t> other) const { return this->max(other) == other; }

  // Bit-specific operations
- really_inline bool any_bits_set(simd8<uint8_t> bits) const { return !_mm_testz_si128(*this, bits); }
- really_inline bool any_bits_set() const { return !_mm_testz_si128(*this, *this); }
+ really_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const { return (*this & bits).any_bits_set(); }
+ really_inline simd8<bool> any_bits_set() const { return ~(*this == u8'\0'); }
+ really_inline bool any_bits_set_anywhere(simd8<uint8_t> bits) const { return !_mm_testz_si128(*this, bits); }
+ really_inline bool any_bits_set_anywhere() const { return !_mm_testz_si128(*this, *this); }
  template<int N>
  really_inline simd8<uint8_t> shr() const { return simd8<uint8_t>(_mm_srli_epi16(*this, N)) & uint8_t(0xFFu >> N); }
  template<int N>

@@ -11,31 +11,23 @@
 TARGET_WESTMERE
 namespace simdjson::westmere {

+using namespace simd;
+
 really_inline uint64_t compute_quote_mask(const uint64_t quote_bits) {
   return _mm_cvtsi128_si64(_mm_clmulepi64_si128(
       _mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFFu), 0));
 }

 really_inline void find_whitespace_and_operators(
-    const simd::simd8x64<uint8_t> in,
+    const simd8x64<uint8_t> in,
     uint64_t &whitespace, uint64_t &op) {

-  const __m128i operator_table =
-      _mm_setr_epi8(44, 125, 0, 0, 0xc0u, 0, 0, 0, 0, 0, 0, 0, 0, 0, 58, 123);
-  const __m128i white_table = _mm_setr_epi8(32, 100, 100, 100, 17, 100, 113, 2,
-                                            100, 9, 10, 112, 100, 13, 100, 100);
-  const __m128i op_offset = _mm_set1_epi8(0xd4u);
-  const __m128i op_mask = _mm_set1_epi8(32);
-
-  whitespace = in.map([&](auto _in) {
-    return _mm_cmpeq_epi8(_in, _mm_shuffle_epi8(white_table, _in));
+  whitespace = in.map([&](simd8<uint8_t> _in) {
+    return _in == _in.lookup_lower_4_bits<uint8_t>(' ', 100, 100, 100, 17, 100, 113, 2, 100, '\t', '\n', 112, 100, '\r', 100, 100);
   }).to_bitmask();

-  op = in.map([&](auto _in) {
-    const __m128i r1 = _mm_add_epi8(op_offset, _in);
-    const __m128i r2 = _mm_or_si128(_in, op_mask);
-    const __m128i r3 = _mm_shuffle_epi8(operator_table, r1);
-    return _mm_cmpeq_epi8(r2, r3);
+  op = in.map([&](simd8<uint8_t> _in) {
+    return (_in | 32) == (_in+0xd4u).lookup_lower_4_bits<uint8_t>(',', '}', 0, 0, 0xc0u, 0, 0, 0, 0, 0, 0, 0, 0, 0, ':', '{');
   }).to_bitmask();
 }