Merge pull request #346 from lemire/jkeiser/simd_u8

Genericize SIMD arch code with `simd8<T>`
2019-11-05 19:49:14 -08:00 · 2019-11-05 19:49:14 -08:00 · b7c18df540
parent 52640518d3 74799134b1
commit b7c18df540
27 changed files with 1240 additions and 1186 deletions
--- a/.drone.yml
+++ b/.drone.yml
@ -22,7 +22,7 @@ steps:
  image: gcc:8
  environment:
    CHECKPERF_REPOSITORY: https://github.com/lemire/simdjson
-  commands: [ make checkperf ]
+  commands: [ cat /proc/cpuinfo, make checkperf ]
 ---
 kind: pipeline
 name: x64-build
@ -72,7 +72,7 @@ steps:
  image: gcc:8
  environment:
    CHECKPERF_REPOSITORY: https://github.com/lemire/simdjson
-  commands: [ make checkperf ]
+  commands: [ cat /proc/cpuinfo, make checkperf ]
 ---
 kind: pipeline
 name: arm64-build
--- a/8
+++ b/8
@ -64,7 +64,7 @@ COMPARISONEXECUTABLES=minifiercompetition parsingcompetition parseandstatcompeti
 SUPPLEMENTARYEXECUTABLES=parse_noutf8validation parse_nonumberparsing parse_nostringparsing

 # Load headers and sources
-LIBHEADERS=src/simdprune_tables.h src/numberparsing.h src/jsoncharutils.h src/arm64/simd_input.h src/arm64/simdutf8check.h src/arm64/stage1_find_marks.h src/arm64/stage2_build_tape.h src/arm64/stringparsing.h src/generic/stage1_find_marks.h src/generic/stage2_build_tape.h src/generic/stringparsing.h src/haswell/simd_input.h src/haswell/simdutf8check.h src/haswell/stage1_find_marks.h src/haswell/stage2_build_tape.h src/haswell/stringparsing.h src/westmere/simd_input.h src/westmere/simdutf8check.h src/westmere/stage1_find_marks.h src/westmere/stage2_build_tape.h src/westmere/stringparsing.h
+LIBHEADERS=src/simdprune_tables.h src/numberparsing.h src/jsoncharutils.h src/arm64/bitmask.h src/arm64/simd.h src/arm64/stage1_find_marks.h src/arm64/stage2_build_tape.h src/arm64/stringparsing.h src/generic/stage1_find_marks.h src/generic/stage2_build_tape.h src/generic/stringparsing.h src/haswell/bitmask.h src/haswell/simd.h src/generic/simdutf8check.h src/haswell/stage1_find_marks.h src/haswell/stage2_build_tape.h src/haswell/stringparsing.h src/westmere/bitmask.h src/westmere/simd.h src/westmere/stage1_find_marks.h src/westmere/stage2_build_tape.h src/westmere/stringparsing.h
 PUBHEADERS=include/simdjson/common_defs.h include/simdjson/isadetection.h include/simdjson/jsonformatutils.h include/simdjson/jsonioutil.h include/simdjson/jsonminifier.h include/simdjson/jsonparser.h include/simdjson/padded_string.h include/simdjson/parsedjson.h include/simdjson/parsedjsoniterator.h include/simdjson/portability.h include/simdjson/simdjson.h include/simdjson/simdjson_version.h include/simdjson/stage1_find_marks.h include/simdjson/stage2_build_tape.h
 HEADERS=$(PUBHEADERS) $(LIBHEADERS)

@ -120,12 +120,12 @@ run_issue150_sh: allparserscheckfile
 run_testjson2json_sh: minify json2json
 	./scripts/testjson2json.sh

-test: run_basictests run_numberparsingcheck run_integer_tests run_stringparsingcheck run_jsoncheck run_pointercheck run_testjson2json_sh run_issue150_sh
+test: run_basictests run_jsoncheck run_numberparsingcheck run_integer_tests run_stringparsingcheck run_pointercheck run_testjson2json_sh run_issue150_sh
 	@echo "It looks like the code is good!"

-quiettest: run_basictests run_numberparsingcheck run_integer_tests run_stringparsingcheck run_jsoncheck run_pointercheck run_testjson2json_sh run_issue150_sh
+quiettest: run_basictests run_jsoncheck run_numberparsingcheck run_integer_tests run_stringparsingcheck run_pointercheck run_testjson2json_sh run_issue150_sh

-quicktests: run_basictests run_numberparsingcheck run_integer_tests run_stringparsingcheck run_jsoncheck run_pointercheck
+quicktests: run_basictests run_jsoncheck run_numberparsingcheck run_integer_tests run_stringparsingcheck run_pointercheck

 slowtests: run_testjson2json_sh run_issue150_sh

--- a/amalgamation.sh
+++ b/amalgamation.sh
@ -20,12 +20,12 @@ $SCRIPTPATH/src/simdjson.cpp
 $SCRIPTPATH/src/jsonioutil.cpp
 $SCRIPTPATH/src/jsonminifier.cpp
 $SCRIPTPATH/src/jsonparser.cpp
-$SCRIPTPATH/src/arm64/simd_input.h
-$SCRIPTPATH/src/haswell/simd_input.h
-$SCRIPTPATH/src/westmere/simd_input.h
-$SCRIPTPATH/src/arm64/simdutf8check.h
-$SCRIPTPATH/src/haswell/simdutf8check.h
-$SCRIPTPATH/src/westmere/simdutf8check.h
+$SCRIPTPATH/src/arm64/bitmask.h
+$SCRIPTPATH/src/haswell/bitmask.h
+$SCRIPTPATH/src/westmere/bitmask.h
+$SCRIPTPATH/src/arm64/simd.h
+$SCRIPTPATH/src/haswell/simd.h
+$SCRIPTPATH/src/westmere/simd.h
 $SCRIPTPATH/src/arm64/stage1_find_marks.h
 $SCRIPTPATH/src/haswell/stage1_find_marks.h
 $SCRIPTPATH/src/westmere/stage1_find_marks.h
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@ -37,21 +37,22 @@ set(SIMDJSON_SRC_HEADERS
  jsoncharutils.h
  numberparsing.h
  simdprune_tables.h
-  arm64/simd_input.h
-  arm64/simdutf8check.h
+  arm64/bitmask.h
+  arm64/simd.h
  arm64/stage1_find_marks.h
  arm64/stage2_build_tape.h
  arm64/stringparsing.h
  generic/stage1_find_marks.h
  generic/stage2_build_tape.h
  generic/stringparsing.h
-  haswell/simd_input.h
-  haswell/simdutf8check.h
+  generic/simdutf8check.h
+  haswell/bitmask.h
+  haswell/simd.h
  haswell/stage1_find_marks.h
  haswell/stage2_build_tape.h
  haswell/stringparsing.h
-  westmere/simd_input.h
-  westmere/simdutf8check.h
+  westmere/bitmask.h
+  westmere/simd.h
  westmere/stage1_find_marks.h
  westmere/stage2_build_tape.h
  westmere/stringparsing.h
--- a/src/arm64/bitmask.h
+++ b/src/arm64/bitmask.h
@ -0,0 +1,38 @@
+#ifndef SIMDJSON_ARM64_BITMASK_H
+#define SIMDJSON_ARM64_BITMASK_H
+
+#include "simdjson/portability.h"
+
+#ifdef IS_ARM64
+
+#include "haswell/bitmask.h"
+#include "simdjson/common_defs.h"
+
+namespace simdjson::arm64 {
+
+//
+// Perform a "cumulative bitwise xor," flipping bits each time a 1 is encountered.
+//
+// For example, prefix_xor(00100100) == 00011100
+//
+really_inline uint64_t prefix_xor(uint64_t bitmask) {
+
+#ifdef __ARM_FEATURE_CRYPTO // some ARM processors lack this extension
+  return vmull_p64(-1ULL, bitmask);
+#else
+  bitmask ^= bitmask << 1;
+  bitmask ^= bitmask << 2;
+  bitmask ^= bitmask << 4;
+  bitmask ^= bitmask << 8;
+  bitmask ^= bitmask << 16;
+  bitmask ^= bitmask << 32;
+  return bitmask;
+#endif
+
+}
+
+} // namespace simdjson::arm64
+UNTARGET_REGION
+
+#endif // IS_ARM64
+#endif
--- a/src/arm64/simd.h
+++ b/src/arm64/simd.h
@ -0,0 +1,316 @@
+#ifndef SIMDJSON_ARM64_SIMD_H
+#define SIMDJSON_ARM64_SIMD_H
+
+#include "simdjson/portability.h"
+
+#ifdef IS_ARM64
+
+#include "simdjson/common_defs.h"
+#include "simdjson/simdjson.h"
+
+namespace simdjson::arm64::simd {
+
+  template<typename T>
+  struct simd8;
+
+  //
+  // Base class of simd8<uint8_t> and simd8<bool>, both of which use uint8x16_t internally.
+  //
+  template<typename T, typename Mask=simd8<bool>>
+  struct base_u8 {
+    uint8x16_t value;
+    static const int SIZE = sizeof(value);
+
+    // Conversion from/to SIMD register
+    really_inline base_u8(const uint8x16_t _value) : value(_value) {}
+    really_inline operator const uint8x16_t&() const { return this->value; }
+    really_inline operator uint8x16_t&() { return this->value; }
+
+    // Bit operations
+    really_inline simd8<T> operator|(const simd8<T> other) const { return vorrq_u8(*this, other); }
+    really_inline simd8<T> operator&(const simd8<T> other) const { return vandq_u8(*this, other); }
+    really_inline simd8<T> operator^(const simd8<T> other) const { return veorq_u8(*this, other); }
+    really_inline simd8<T> bit_andnot(const simd8<T> other) const { return vbicq_u8(*this, other); }
+    really_inline simd8<T> operator~() const { return *this ^ 0xFFu; }
+    really_inline simd8<T>& operator|=(const simd8<T> other) { auto this_cast = (simd8<T>*)this; *this_cast = *this_cast | other; return *this_cast; }
+    really_inline simd8<T>& operator&=(const simd8<T> other) { auto this_cast = (simd8<T>*)this; *this_cast = *this_cast & other; return *this_cast; }
+    really_inline simd8<T>& operator^=(const simd8<T> other) { auto this_cast = (simd8<T>*)this; *this_cast = *this_cast ^ other; return *this_cast; }
+
+    really_inline Mask operator==(const simd8<T> other) const { return vceqq_u8(*this, other); }
+
+    template<int N=1>
+    really_inline simd8<T> prev(const simd8<T> prev_chunk) const {
+      return vextq_u8(prev_chunk, *this, 16 - N);
+    }
+  };
+
+  // SIMD byte mask type (returned by things like eq and gt)
+  template<>
+  struct simd8<bool>: base_u8<bool> {
+    typedef uint32_t bitmask_t;
+
+    static really_inline simd8<bool> splat(bool _value) { return vmovq_n_u8(-(!!_value)); }
+
+    really_inline simd8(const uint8x16_t _value) : base_u8<bool>(_value) {}
+    // False constructor
+    really_inline simd8() : simd8(vdupq_n_u8(0)) {}
+    // Splat constructor
+    really_inline simd8(bool _value) : simd8(splat(_value)) {}
+
+    really_inline simd8<bool>::bitmask_t to_bitmask() const {
+      const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
+                                   0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
+      auto minput = *this & bit_mask;
+      uint8x16_t tmp = vpaddq_u8(minput, minput);
+      tmp = vpaddq_u8(tmp, tmp);
+      tmp = vpaddq_u8(tmp, tmp);
+      return vgetq_lane_u16(vreinterpretq_u16_u8(tmp), 0);
+    }
+    really_inline bool any() const { return vmaxvq_u8(*this) != 0; }
+  };
+
+  // Unsigned bytes
+  template<>
+  struct simd8<uint8_t>: base_u8<uint8_t> {
+    static really_inline uint8x16_t splat(uint8_t _value) { return vmovq_n_u8(_value); }
+    static really_inline uint8x16_t zero() { return vdupq_n_u8(0); }
+    static really_inline uint8x16_t load(const uint8_t* values) { return vld1q_u8(values); }
+
+    really_inline simd8(const uint8x16_t _value) : base_u8<uint8_t>(_value) {}
+    // Zero constructor
+    really_inline simd8() : simd8(zero()) {}
+    // Array constructor
+    really_inline simd8(const uint8_t values[16]) : simd8(load(values)) {}
+    // Splat constructor
+    really_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
+    // Member-by-member initialization
+    really_inline simd8(
+      uint8_t v0,  uint8_t v1,  uint8_t v2,  uint8_t v3,  uint8_t v4,  uint8_t v5,  uint8_t v6,  uint8_t v7,
+      uint8_t v8,  uint8_t v9,  uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
+    ) : simd8(uint8x16_t{
+      v0, v1, v2, v3, v4, v5, v6, v7,
+      v8, v9, v10,v11,v12,v13,v14,v15
+    }) {}
+
+    // Store to array
+    really_inline void store(uint8_t dst[16]) { return vst1q_u8(dst, *this); }
+
+    // Saturated math
+    really_inline simd8<uint8_t> saturating_add(const simd8<uint8_t> other) const { return vqaddq_u8(*this, other); }
+    really_inline simd8<uint8_t> saturating_sub(const simd8<uint8_t> other) const { return vqsubq_u8(*this, other); }
+
+    // Addition/subtraction are the same for signed and unsigned
+    really_inline simd8<uint8_t> operator+(const simd8<uint8_t> other) const { return vaddq_u8(*this, other); }
+    really_inline simd8<uint8_t> operator-(const simd8<uint8_t> other) const { return vsubq_u8(*this, other); }
+    really_inline simd8<uint8_t>& operator+=(const simd8<uint8_t> other) { *this = *this + other; return *this; }
+    really_inline simd8<uint8_t>& operator-=(const simd8<uint8_t> other) { *this = *this - other; return *this; }
+
+    // Order-specific operations
+    really_inline simd8<uint8_t> max(const simd8<uint8_t> other) const { return vmaxq_u8(*this, other); }
+    really_inline simd8<uint8_t> min(const simd8<uint8_t> other) const { return vminq_u8(*this, other); }
+    really_inline simd8<bool> operator<=(const simd8<uint8_t> other) const { return vcleq_u8(*this, other); }
+
+    // Bit-specific operations
+    really_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const { return vtstq_u8(*this, bits); }
+    really_inline bool any_bits_set_anywhere() const { return vmaxvq_u8(*this) != 0; }
+    really_inline bool any_bits_set_anywhere(simd8<uint8_t> bits) const { return (*this & bits).any_bits_set_anywhere(); }
+    template<int N>
+    really_inline simd8<uint8_t> shr() const { return vshrq_n_u8(*this, N); }
+    template<int N>
+    really_inline simd8<uint8_t> shl() const { return vshlq_n_u8(*this, N); }
+
+    // Perform a lookup assuming no value is larger than 16
+    template<typename L>
+    really_inline simd8<L> lookup_16(
+        L replace0,  L replace1,  L replace2,  L replace3,
+        L replace4,  L replace5,  L replace6,  L replace7,
+        L replace8,  L replace9,  L replace10, L replace11,
+        L replace12, L replace13, L replace14, L replace15) const {
+      simd8<L> lookup_table(
+        replace0,  replace1,  replace2,  replace3,
+        replace4,  replace5,  replace6,  replace7,
+        replace8,  replace9,  replace10, replace11,
+        replace12, replace13, replace14, replace15
+      );
+      return lookup_table.apply_lookup_16_to(*this);
+    }
+
+    // Perform a lookup of the lower 4 bits
+    template<typename L>
+    really_inline simd8<L> lookup_lower_4_bits(
+        L replace0,  L replace1,  L replace2,  L replace3,
+        L replace4,  L replace5,  L replace6,  L replace7,
+        L replace8,  L replace9,  L replace10, L replace11,
+        L replace12, L replace13, L replace14, L replace15) const {
+      return (*this & 0xF).lookup_16(
+        replace0,  replace1,  replace2,  replace3,
+        replace4,  replace5,  replace6,  replace7,
+        replace8,  replace9,  replace10, replace11,
+        replace12, replace13, replace14, replace15
+      );
+    }
+
+    really_inline simd8<uint8_t> apply_lookup_16_to(const simd8<uint8_t> original) {
+      return vqtbl1q_u8(*this, original);
+    }
+  };
+
+  // Signed bytes
+  template<>
+  struct simd8<int8_t> {
+    int8x16_t value;
+
+    static really_inline simd8<int8_t> splat(int8_t _value) { return vmovq_n_s8(_value); }
+    static really_inline simd8<int8_t> zero() { return vdupq_n_s8(0); }
+    static really_inline simd8<int8_t> load(const int8_t values[16]) { return vld1q_s8(values); }
+
+    // Conversion from/to SIMD register
+    really_inline simd8(const int8x16_t _value) : value{_value} {}
+    really_inline operator const int8x16_t&() const { return this->value; }
+    really_inline operator int8x16_t&() { return this->value; }
+
+    // Zero constructor
+    really_inline simd8() : simd8(zero()) {}
+    // Splat constructor
+    really_inline simd8(int8_t _value) : simd8(splat(_value)) {}
+    // Array constructor
+    really_inline simd8(const int8_t* values) : simd8(load(values)) {}
+    // Member-by-member initialization
+    really_inline simd8(
+      int8_t v0,  int8_t v1,  int8_t v2,  int8_t v3, int8_t v4,  int8_t v5,  int8_t v6,  int8_t v7,
+      int8_t v8,  int8_t v9,  int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
+    ) : simd8(int8x16_t{
+      v0, v1, v2, v3, v4, v5, v6, v7,
+      v8, v9, v10,v11,v12,v13,v14,v15
+     }) {}
+
+    // Store to array
+    really_inline void store(int8_t dst[16]) { return vst1q_s8(dst, *this); }
+
+    // Explicit conversion to/from unsigned
+    really_inline explicit simd8(const uint8x16_t other): simd8(vreinterpretq_s8_u8(other)) {}
+    really_inline explicit operator simd8<uint8_t>() const { return vreinterpretq_u8_s8(*this); }
+
+    // Math
+    really_inline simd8<int8_t> operator+(const simd8<int8_t> other) const { return vaddq_s8(*this, other); }
+    really_inline simd8<int8_t> operator-(const simd8<int8_t> other) const { return vsubq_s8(*this, other); }
+    really_inline simd8<int8_t>& operator+=(const simd8<int8_t> other) { *this = *this + other; return *this; }
+    really_inline simd8<int8_t>& operator-=(const simd8<int8_t> other) { *this = *this - other; return *this; }
+
+    // Order-sensitive comparisons
+    really_inline simd8<int8_t> max(const simd8<int8_t> other) const { return vmaxq_s8(*this, other); }
+    really_inline simd8<int8_t> min(const simd8<int8_t> other) const { return vminq_s8(*this, other); }
+    really_inline simd8<bool> operator>(const simd8<int8_t> other) const { return vcgtq_s8(*this, other); }
+    really_inline simd8<bool> operator==(const simd8<int8_t> other) const { return vceqq_s8(*this, other); }
+
+    template<int N=1>
+    really_inline simd8<int8_t> prev(const simd8<int8_t> prev_chunk) const {
+      return vextq_s8(prev_chunk, *this, 16 - N);
+    }
+
+    // Perform a lookup of the lower 4 bits
+    template<typename L>
+    really_inline simd8<L> lookup_16(
+        L replace0,  L replace1,  L replace2,  L replace3,
+        L replace4,  L replace5,  L replace6,  L replace7,
+        L replace8,  L replace9,  L replace10, L replace11,
+        L replace12, L replace13, L replace14, L replace15) const {
+      return simd8<uint8_t>(*this).lookup_16(
+        replace0,  replace1,  replace2,  replace3,
+        replace4,  replace5,  replace6,  replace7,
+        replace8,  replace9,  replace10, replace11,
+        replace12, replace13, replace14, replace15
+      );
+    }
+
+    really_inline simd8<int8_t> apply_lookup_16_to(const simd8<uint8_t> original) {
+      return vqtbl1q_s8(*this, original);
+    }
+  };
+
+  template<typename T>
+  struct simd8x64 {
+    const simd8<T> chunks[4];
+
+    really_inline simd8x64() : chunks{simd8<T>(), simd8<T>(), simd8<T>(), simd8<T>()} {}
+    really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1, const simd8<T> chunk2, const simd8<T> chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {}
+    really_inline simd8x64(const T ptr[64]) : chunks{simd8<T>::load(ptr), simd8<T>::load(ptr+16), simd8<T>::load(ptr+32), simd8<T>::load(ptr+48)} {}
+
+    really_inline void store(T ptr[64]) {
+      this->chunks[0].store(ptr);
+      this->chunks[0].store(ptr+16);
+      this->chunks[0].store(ptr+32);
+      this->chunks[0].store(ptr+48);
+    }
+
+    template <typename F>
+    really_inline void each(F const& each_chunk) const
+    {
+      each_chunk(this->chunks[0]);
+      each_chunk(this->chunks[1]);
+      each_chunk(this->chunks[2]);
+      each_chunk(this->chunks[3]);
+    }
+
+    template <typename R=bool, typename F>
+    really_inline simd8x64<R> map(F const& map_chunk) const {
+      return simd8x64<R>(
+        map_chunk(this->chunks[0]),
+        map_chunk(this->chunks[1]),
+        map_chunk(this->chunks[2]),
+        map_chunk(this->chunks[3])
+      );
+    }
+
+    template <typename R=bool, typename F>
+    really_inline simd8x64<R> map(const simd8x64<T> b, F const& map_chunk) const {
+      return simd8x64<R>(
+        map_chunk(this->chunks[0], b.chunks[0]),
+        map_chunk(this->chunks[1], b.chunks[1]),
+        map_chunk(this->chunks[2], b.chunks[2]),
+        map_chunk(this->chunks[3], b.chunks[3])
+      );
+    }
+
+    template <typename F>
+    really_inline simd8<T> reduce(F const& reduce_pair) const {
+      return reduce_pair(
+        reduce_pair(this->chunks[0], this->chunks[1]),
+        reduce_pair(this->chunks[2], this->chunks[3])
+      );
+    }
+
+    really_inline uint64_t to_bitmask() const {
+      const uint8x16_t bit_mask = {
+        0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
+        0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
+      };
+      // Add each of the elements next to each other, successively, to stuff each 8 byte mask into one.
+      uint8x16_t sum0 = vpaddq_u8(this->chunks[0] & bit_mask, this->chunks[1] & bit_mask);
+      uint8x16_t sum1 = vpaddq_u8(this->chunks[2] & bit_mask, this->chunks[3] & bit_mask);
+      sum0 = vpaddq_u8(sum0, sum1);
+      sum0 = vpaddq_u8(sum0, sum0);
+      return vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0);
+    }
+
+    really_inline simd8x64<T> bit_or(const T m) const {
+      const simd8<T> mask = simd8<T>::splat(m);
+      return this->map( [&](auto a) { return a | mask; } );
+    }
+
+    really_inline uint64_t eq(const T m) const {
+      const simd8<T> mask = simd8<T>::splat(m);
+      return this->map( [&](auto a) { return a == mask; } ).to_bitmask();
+    }
+
+    really_inline uint64_t lteq(const T m) const {
+      const simd8<T> mask = simd8<T>::splat(m);
+      return this->map( [&](auto a) { return a <= mask; } ).to_bitmask();
+    }
+
+  }; // struct simd8x64<T>
+
+} // namespace simdjson::arm64::simd
+
+#endif // IS_ARM64
+#endif // SIMDJSON_ARM64_SIMD_H
--- a/src/arm64/simd_input.h
+++ b/src/arm64/simd_input.h
@ -1,119 +0,0 @@
-#ifndef SIMDJSON_ARM64_SIMD_INPUT_H
-#define SIMDJSON_ARM64_SIMD_INPUT_H
-
-#include "simdjson/common_defs.h"
-#include "simdjson/portability.h"
-#include "simdjson/simdjson.h"
-
-#ifdef IS_ARM64
-
-namespace simdjson::arm64 {
-
-really_inline uint16_t neon_movemask(uint8x16_t input) {
-  const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
-                              0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
-  uint8x16_t minput = vandq_u8(input, bit_mask);
-  uint8x16_t tmp = vpaddq_u8(minput, minput);
-  tmp = vpaddq_u8(tmp, tmp);
-  tmp = vpaddq_u8(tmp, tmp);
-  return vgetq_lane_u16(vreinterpretq_u16_u8(tmp), 0);
-}
-
-really_inline uint64_t neon_movemask_bulk(uint8x16_t p0, uint8x16_t p1,
-                                          uint8x16_t p2, uint8x16_t p3) {
-  const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
-                              0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
-  uint8x16_t t0 = vandq_u8(p0, bit_mask);
-  uint8x16_t t1 = vandq_u8(p1, bit_mask);
-  uint8x16_t t2 = vandq_u8(p2, bit_mask);
-  uint8x16_t t3 = vandq_u8(p3, bit_mask);
-  uint8x16_t sum0 = vpaddq_u8(t0, t1);
-  uint8x16_t sum1 = vpaddq_u8(t2, t3);
-  sum0 = vpaddq_u8(sum0, sum1);
-  sum0 = vpaddq_u8(sum0, sum0);
-  return vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0);
-}
-
-struct simd_input {
-  const uint8x16_t chunks[4];
-
-  really_inline simd_input()
-    : chunks{uint8x16_t(), uint8x16_t(), uint8x16_t(), uint8x16_t() } {}
-
-  really_inline simd_input(const uint8x16_t chunk0, const uint8x16_t chunk1, const uint8x16_t chunk2, const uint8x16_t chunk3)
-    : chunks{chunk0, chunk1, chunk2, chunk3 } {}
-
-  really_inline simd_input(const uint8_t *ptr)
-      : chunks{
-        vld1q_u8(ptr + 0*16),
-        vld1q_u8(ptr + 1*16),
-        vld1q_u8(ptr + 2*16),
-        vld1q_u8(ptr + 3*16)
-       } {}
-
-  template <typename F>
-  really_inline void each(F const& each_chunk) const {
-    each_chunk(this->chunks[0]);
-    each_chunk(this->chunks[1]);
-    each_chunk(this->chunks[2]);
-    each_chunk(this->chunks[3]);
-  }
-
-  template <typename F>
-  really_inline simd_input map(F const& map_chunk) const {
-    return simd_input(
-      map_chunk(this->chunks[0]),
-      map_chunk(this->chunks[1]),
-      map_chunk(this->chunks[2]),
-      map_chunk(this->chunks[3])
-    );
-  }
-
-  template <typename F>
-  really_inline simd_input map(simd_input b, F const& map_chunk) const {
-    return simd_input(
-      map_chunk(this->chunks[0], b.chunks[0]),
-      map_chunk(this->chunks[1], b.chunks[1]),
-      map_chunk(this->chunks[2], b.chunks[2]),
-      map_chunk(this->chunks[3], b.chunks[3])
-    );
-  }
-
-  template <typename F>
-  really_inline uint8x16_t reduce(F const& reduce_pair) const {
-    uint8x16_t r01 = reduce_pair(this->chunks[0], this->chunks[1]);
-    uint8x16_t r23 = reduce_pair(this->chunks[2], this->chunks[3]);
-    return reduce_pair(r01, r23);
-  }
-
-  really_inline uint64_t to_bitmask() const {
-    return neon_movemask_bulk(this->chunks[0], this->chunks[1], this->chunks[2], this->chunks[3]);
-  }
-
-  really_inline simd_input bit_or(const uint8_t m) const {
-    const uint8x16_t mask = vmovq_n_u8(m);
-    return this->map( [&](auto a) {
-      return vorrq_u8(a, mask);
-    });
-  }
-
-  really_inline uint64_t eq(const uint8_t m) const {
-    const uint8x16_t mask = vmovq_n_u8(m);
-    return this->map( [&](auto a) {
-      return vceqq_u8(a, mask);
-    }).to_bitmask();
-  }
-
-  really_inline uint64_t lteq(const uint8_t m) const {
-    const uint8x16_t mask = vmovq_n_u8(m);
-    return this->map( [&](auto a) {
-      return vcleq_u8(a, mask);
-    }).to_bitmask();
-  }
-
-}; // struct simd_input
-
-} // namespace simdjson::arm64
-
-#endif // IS_ARM64
-#endif // SIMDJSON_ARM64_SIMD_INPUT_H
--- a/src/arm64/simdutf8check.h
+++ b/src/arm64/simdutf8check.h
@ -1,212 +0,0 @@
-// From https://github.com/cyb70289/utf8/blob/master/lemire-neon.c
-// Adapted from https://github.com/lemire/fastvalidate-utf-8
-
-#ifndef SIMDJSON_ARM64_SIMDUTF8CHECK_H
-#define SIMDJSON_ARM64_SIMDUTF8CHECK_H
-
-// TODO this is different from IS_ARM64 in portability.h, which we use in other places ...
-#if defined(_ARM_NEON) || defined(__aarch64__) ||                              \
-    (defined(_MSC_VER) && defined(_M_ARM64))
-
-#include "simdjson/simdjson.h"
-#include "arm64/simd_input.h"
-#include <arm_neon.h>
-#include <cinttypes>
-#include <cstddef>
-#include <cstdint>
-#include <cstdio>
-#include <cstring>
-
-/*
- * legal utf-8 byte sequence
- * http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94
- *
- *  Code Points        1st       2s       3s       4s
- * U+0000..U+007F     00..7F
- * U+0080..U+07FF     C2..DF   80..BF
- * U+0800..U+0FFF     E0       A0..BF   80..BF
- * U+1000..U+CFFF     E1..EC   80..BF   80..BF
- * U+D000..U+D7FF     ED       80..9F   80..BF
- * U+E000..U+FFFF     EE..EF   80..BF   80..BF
- * U+10000..U+3FFFF   F0       90..BF   80..BF   80..BF
- * U+40000..U+FFFFF   F1..F3   80..BF   80..BF   80..BF
- * U+100000..U+10FFFF F4       80..8F   80..BF   80..BF
- *
- */
-namespace simdjson::arm64 {
-
-static const int8_t _nibbles[] = {
-    1, 1, 1, 1, 1, 1, 1, 1, // 0xxx (ASCII)
-    0, 0, 0, 0,             // 10xx (continuation)
-    2, 2,                   // 110x
-    3,                      // 1110
-    4,                      // 1111, next should be 0 (not checked here)
-};
-
-static const int8_t _initial_mins[] = {
-    -128,         -128, -128, -128, -128, -128,
-    -128,         -128, -128, -128, -128, -128, // 10xx => false
-    (int8_t)0xC2, -128,                         // 110x
-    (int8_t)0xE1,                               // 1110
-    (int8_t)0xF1,
-};
-
-static const int8_t _second_mins[] = {
-    -128,         -128, -128, -128, -128, -128,
-    -128,         -128, -128, -128, -128, -128, // 10xx => false
-    127,          127,                          // 110x => true
-    (int8_t)0xA0,                               // 1110
-    (int8_t)0x90,
-};
-
-struct processed_utf_bytes {
-  int8x16_t raw_bytes;
-  int8x16_t high_nibbles;
-  int8x16_t carried_continuations;
-};
-
-struct utf8_checker {
-  int8x16_t has_error{vdupq_n_s8(0)};
-  processed_utf_bytes previous{vdupq_n_s8(0), vdupq_n_s8(0), vdupq_n_s8(0)};
-
-  really_inline void add_errors(int8x16_t errors) {
-    this->has_error = vorrq_s8(this->has_error, errors);
-  }
-
-  // all byte values must be no larger than 0xF4
-  really_inline void check_smaller_than_0xF4(int8x16_t current_bytes) {
-    // unsigned, saturates to 0 below max
-    this->add_errors( vreinterpretq_s8_u8(vqsubq_u8(
-                        vreinterpretq_u8_s8(current_bytes), vdupq_n_u8(0xF4))) );
-  }
-
-  really_inline int8x16_t continuation_lengths(int8x16_t high_nibbles) {
-    return vqtbl1q_s8(vld1q_s8(_nibbles), vreinterpretq_u8_s8(high_nibbles));
-  }
-
-  really_inline int8x16_t carry_continuations(int8x16_t initial_lengths) {
-    int8x16_t right1 = vreinterpretq_s8_u8(vqsubq_u8(
-        vreinterpretq_u8_s8(vextq_s8(this->previous.carried_continuations, initial_lengths, 16 - 1)),
-        vdupq_n_u8(1)));
-    int8x16_t sum = vaddq_s8(initial_lengths, right1);
-
-    int8x16_t right2 = vreinterpretq_s8_u8(
-        vqsubq_u8(vreinterpretq_u8_s8(vextq_s8(this->previous.carried_continuations, sum, 16 - 2)),
-                  vdupq_n_u8(2)));
-    return vaddq_s8(sum, right2);
-  }
-
-  really_inline void check_continuations(int8x16_t initial_lengths, int8x16_t carries) {
-
-    // overlap || underlap
-    // carry > length && length > 0 || !(carry > length) && !(length > 0)
-    // (carries > length) == (lengths > 0)
-    uint8x16_t overunder = vceqq_u8(vcgtq_s8(carries, initial_lengths),
-                                    vcgtq_s8(initial_lengths, vdupq_n_s8(0)));
-
-    this->add_errors( vreinterpretq_s8_u8(overunder) );
-  }
-
-  // when 0xED is found, next byte must be no larger than 0x9F
-  // when 0xF4 is found, next byte must be no larger than 0x8F
-  // next byte must be continuation, ie sign bit is set, so signed < is ok
-  really_inline void check_first_continuation_max(int8x16_t current_bytes, int8x16_t off1_current_bytes) {
-    uint8x16_t maskED = vceqq_s8(off1_current_bytes, vdupq_n_s8(0xED));
-    uint8x16_t maskF4 = vceqq_s8(off1_current_bytes, vdupq_n_s8(0xF4));
-
-    uint8x16_t badfollowED = vandq_u8(vcgtq_s8(current_bytes, vdupq_n_s8(0x9F)), maskED);
-    uint8x16_t badfollowF4 = vandq_u8(vcgtq_s8(current_bytes, vdupq_n_s8(0x8F)), maskF4);
-
-    this->add_errors( vreinterpretq_s8_u8(vorrq_u8(badfollowED, badfollowF4)) );
-  }
-
-  // map off1_hibits => error condition
-  // hibits     off1    cur
-  // C       => < C2 && true
-  // E       => < E1 && < A0
-  // F       => < F1 && < 90
-  // else      false && false
-  really_inline void check_overlong(int8x16_t current_bytes,
-                                    int8x16_t off1_current_bytes,
-                                    int8x16_t high_nibbles) {
-    int8x16_t off1_high_nibbles = vextq_s8(this->previous.high_nibbles, high_nibbles, 16 - 1);
-    int8x16_t initial_mins =
-        vqtbl1q_s8(vld1q_s8(_initial_mins), vreinterpretq_u8_s8(off1_high_nibbles));
-
-    uint8x16_t initial_under = vcgtq_s8(initial_mins, off1_current_bytes);
-
-    int8x16_t second_mins = vqtbl1q_s8(vld1q_s8(_second_mins), vreinterpretq_u8_s8(off1_high_nibbles));
-    uint8x16_t second_under = vcgtq_s8(second_mins, current_bytes);
-    this->add_errors( vreinterpretq_s8_u8(vandq_u8(initial_under, second_under)) );
-  }
-
-  really_inline int8x16_t count_nibbles(int8x16_t bytes) {
-    return vreinterpretq_s8_u8(vshrq_n_u8(vreinterpretq_u8_s8(bytes), 4));
-  }
-
-  // check whether the current bytes are valid UTF-8
-  // at the end of the function, previous gets updated
-  really_inline void check_utf8_bytes(int8x16_t current_bytes) {
-    struct processed_utf_bytes pb;
-    pb.raw_bytes = current_bytes;
-    pb.high_nibbles = this->count_nibbles(current_bytes);
-
-    this->check_smaller_than_0xF4(current_bytes);
-
-    int8x16_t initial_lengths = this->continuation_lengths(pb.high_nibbles);
-
-    pb.carried_continuations = this->carry_continuations(initial_lengths);
-
-    this->check_continuations(initial_lengths, pb.carried_continuations);
-
-    int8x16_t off1_current_bytes = vextq_s8(this->previous.raw_bytes, pb.raw_bytes, 16 - 1);
-    this->check_first_continuation_max(current_bytes, off1_current_bytes);
-
-    this->check_overlong(current_bytes, off1_current_bytes, pb.high_nibbles);
-    this->previous = pb;
-  }
-
-  // Checks that all bytes are ascii
-  really_inline bool check_ascii_neon(simd_input in) {
-    // checking if the most significant bit is always equal to 0.
-    uint8x16_t high_bit = vdupq_n_u8(0x80);
-    uint8x16_t any_bits_on = in.reduce([&](auto a, auto b) {
-      return vorrq_u8(a, b);
-    });
-    uint8x16_t high_bit_on = vandq_u8(any_bits_on, high_bit);
-    uint64x2_t v64 = vreinterpretq_u64_u8(high_bit_on);
-    uint32x2_t v32 = vqmovn_u64(v64);
-    uint64x1_t result = vreinterpret_u64_u32(v32);
-    return vget_lane_u64(result, 0) == 0;
-  }
-
-  really_inline void check_next_input(simd_input in) {
-    if (check_ascii_neon(in)) {
-      // All bytes are ascii. Therefore the byte that was just before must be
-      // ascii too. We only check the byte that was just before simd_input. Nines
-      // are arbitrary values.
-      const int8x16_t verror =
-          (int8x16_t){9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1};
-      this->add_errors(vreinterpretq_s8_u8(
-                         vcgtq_s8(this->previous.carried_continuations, verror)));
-    } else {
-      // it is not ascii so we have to do heavy work
-      in.each([&](auto _in) {
-        this->check_utf8_bytes(vreinterpretq_s8_u8(_in));
-      });
-    }
-  }
-
-  really_inline ErrorValues errors() {
-    uint64x2_t v64 = vreinterpretq_u64_s8(this->has_error);
-    uint32x2_t v32 = vqmovn_u64(v64);
-    uint64x1_t result = vreinterpret_u64_u32(v32);
-    return vget_lane_u64(result, 0) != 0 ? simdjson::UTF8_ERROR
-                                        : simdjson::SUCCESS;
-  }
-
-}; // struct utf8_checker
-
-} // namespace simdjson::arm64
-#endif
-#endif
--- a/src/arm64/stage1_find_marks.h
+++ b/src/arm64/stage1_find_marks.h
@ -5,49 +5,31 @@

 #ifdef IS_ARM64

-#include "arm64/simd_input.h"
-#include "arm64/simdutf8check.h"
+#include "arm64/bitmask.h"
+#include "arm64/simd.h"
 #include "simdjson/stage1_find_marks.h"

 namespace simdjson::arm64 {

-really_inline uint64_t compute_quote_mask(const uint64_t quote_bits) {
-
-#ifdef __ARM_FEATURE_CRYPTO // some ARM processors lack this extension
-  return vmull_p64(-1ULL, quote_bits);
-#else
-  return portable_compute_quote_mask(quote_bits);
-#endif
-}
+using namespace simd;

 really_inline void find_whitespace_and_operators(
-    const simd_input in,
-    uint64_t &whitespace, uint64_t &op) {
-  const uint8x16_t low_nibble_mask =
-      (uint8x16_t){16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0};
-  const uint8x16_t high_nibble_mask =
-      (uint8x16_t){8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0};
-  const uint8x16_t low_nib_and_mask = vmovq_n_u8(0xf);
+  const simd::simd8x64<uint8_t> in,
+  uint64_t &whitespace, uint64_t &op) {

-  auto v = in.map([&](auto chunk) {
-    uint8x16_t nib_lo = vandq_u8(chunk, low_nib_and_mask);
-    uint8x16_t nib_hi = vshrq_n_u8(chunk, 4);
-    uint8x16_t shuf_lo = vqtbl1q_u8(low_nibble_mask, nib_lo);
-    uint8x16_t shuf_hi = vqtbl1q_u8(high_nibble_mask, nib_hi);
-    return vandq_u8(shuf_lo, shuf_hi);
+  auto v = in.map<uint8_t>([&](simd8<uint8_t> chunk) {
+    auto nib_lo = chunk & 0xf;
+    auto nib_hi = chunk.shr<4>();
+    auto shuf_lo = nib_lo.lookup_16<uint8_t>(16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0);
+    auto shuf_hi = nib_hi.lookup_16<uint8_t>(8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0);
+    return shuf_lo & shuf_hi;
  });

-  const uint8x16_t operator_shufti_mask = vmovq_n_u8(0x7);
-  op = v.map([&](auto _v) {
-    return vtstq_u8(_v, operator_shufti_mask);
-  }).to_bitmask();
-
-  const uint8x16_t whitespace_shufti_mask = vmovq_n_u8(0x18);
-  whitespace = v.map([&](auto _v) {
-    return vtstq_u8(_v, whitespace_shufti_mask);
-  }).to_bitmask();
+  op = v.map([&](simd8<uint8_t> _v) { return _v.any_bits_set(0x7); }).to_bitmask();
+  whitespace = v.map([&](simd8<uint8_t> _v) { return _v.any_bits_set(0x18); }).to_bitmask();
 }

+#include "generic/simdutf8check.h"
 #include "generic/stage1_find_marks.h"

 } // namespace simdjson::arm64
--- a/src/arm64/stringparsing.h
+++ b/src/arm64/stringparsing.h
@ -1,59 +1,42 @@
 #ifndef SIMDJSON_ARM64_STRINGPARSING_H
 #define SIMDJSON_ARM64_STRINGPARSING_H

+#include "simdjson/portability.h"
+
 #ifdef IS_ARM64

+#include "arm64/simd.h"
 #include "simdjson/common_defs.h"
 #include "simdjson/parsedjson.h"
 #include "jsoncharutils.h"

-#ifdef JSON_TEST_STRINGS
-void found_string(const uint8_t *buf, const uint8_t *parsed_begin,
-                  const uint8_t *parsed_end);
-void found_bad_string(const uint8_t *buf);
-#endif
-
 namespace simdjson::arm64 {

+using namespace simd;
+
 // Holds backslashes and quotes locations.
 struct parse_string_helper {
  uint32_t bs_bits;
  uint32_t quote_bits;
-  really_inline uint32_t bytes_processed() const { return sizeof(uint8x16_t)*2; }
+  static const uint32_t BYTES_PROCESSED = 32;
 };

 really_inline parse_string_helper find_bs_bits_and_quote_bits(const uint8_t *src, uint8_t *dst) {
  // this can read up to 31 bytes beyond the buffer size, but we require
  // SIMDJSON_PADDING of padding
-  static_assert(2 * sizeof(uint8x16_t) - 1 <= SIMDJSON_PADDING);
-  uint8x16_t v0 = vld1q_u8(src);
-  uint8x16_t v1 = vld1q_u8(src + 16);
-  vst1q_u8(dst, v0);
-  vst1q_u8(dst + 16, v1);
+  static_assert(SIMDJSON_PADDING >= (parse_string_helper::BYTES_PROCESSED - 1));
+  simd8<uint8_t> v0(src);
+  simd8<uint8_t> v1(src + sizeof(v0));
+  v0.store(dst);
+  v1.store(dst + sizeof(v0));

-  uint8x16_t bs_mask = vmovq_n_u8('\\');
-  uint8x16_t qt_mask = vmovq_n_u8('"');
-  const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
-                               0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
-  uint8x16_t cmp_bs_0 = vceqq_u8(v0, bs_mask);
-  uint8x16_t cmp_bs_1 = vceqq_u8(v1, bs_mask);
-  uint8x16_t cmp_qt_0 = vceqq_u8(v0, qt_mask);
-  uint8x16_t cmp_qt_1 = vceqq_u8(v1, qt_mask);
-
-  cmp_bs_0 = vandq_u8(cmp_bs_0, bit_mask);
-  cmp_bs_1 = vandq_u8(cmp_bs_1, bit_mask);
-  cmp_qt_0 = vandq_u8(cmp_qt_0, bit_mask);
-  cmp_qt_1 = vandq_u8(cmp_qt_1, bit_mask);
-
-  uint8x16_t sum0 = vpaddq_u8(cmp_bs_0, cmp_bs_1);
-  uint8x16_t sum1 = vpaddq_u8(cmp_qt_0, cmp_qt_1);
-  sum0 = vpaddq_u8(sum0, sum1);
-  sum0 = vpaddq_u8(sum0, sum0);
+  // Getting a 64-bit bitmask is much cheaper than multiple 16-bit bitmasks on ARM; therefore, we
+  // smash them together into a 64-byte mask and get the bitmask from there.
+  uint64_t bs_and_quote = simd8x64<bool>(v0 == '\\', v1 == '\\', v0 == '"', v1 == '"').to_bitmask();
  return {
-      vgetq_lane_u32(vreinterpretq_u32_u8(sum0), 0), // bs_bits
-      vgetq_lane_u32(vreinterpretq_u32_u8(sum0), 1)  // quote_bits
+    static_cast<uint32_t>(bs_and_quote),      // bs_bits
+    static_cast<uint32_t>(bs_and_quote >> 32) // quote_bits
  };
-
 }

 #include "generic/stringparsing.h"
--- a/src/generic/simdutf8check.h
+++ b/src/generic/simdutf8check.h
@ -0,0 +1,176 @@
+/*
+ * legal utf-8 byte sequence
+ * http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94
+ *
+ *  Code Points        1st       2s       3s       4s
+ * U+0000..U+007F     00..7F
+ * U+0080..U+07FF     C2..DF   80..BF
+ * U+0800..U+0FFF     E0       A0..BF   80..BF
+ * U+1000..U+CFFF     E1..EC   80..BF   80..BF
+ * U+D000..U+D7FF     ED       80..9F   80..BF
+ * U+E000..U+FFFF     EE..EF   80..BF   80..BF
+ * U+10000..U+3FFFF   F0       90..BF   80..BF   80..BF
+ * U+40000..U+FFFFF   F1..F3   80..BF   80..BF   80..BF
+ * U+100000..U+10FFFF F4       80..8F   80..BF   80..BF
+ *
+ */
+
+// all byte values must be no larger than 0xF4
+
+using namespace simd;
+
+struct processed_utf_bytes {
+  simd8<uint8_t> raw_bytes;
+  simd8<int8_t> high_nibbles;
+  simd8<int8_t> carried_continuations;
+};
+
+struct utf8_checker {
+  simd8<uint8_t> has_error;
+  processed_utf_bytes previous;
+
+  // all byte values must be no larger than 0xF4
+  really_inline void check_smaller_than_0xF4(simd8<uint8_t> current_bytes) {
+    // unsigned, saturates to 0 below max
+    this->has_error |= current_bytes.saturating_sub(0xF4u);
+  }
+
+  really_inline simd8<int8_t> continuation_lengths(simd8<int8_t> high_nibbles) {
+    return high_nibbles.lookup_16<int8_t>(
+      1, 1, 1, 1, 1, 1, 1, 1, // 0xxx (ASCII)
+      0, 0, 0, 0,             // 10xx (continuation)
+      2, 2,                   // 110x
+      3,                      // 1110
+      4);                     // 1111, next should be 0 (not checked here)
+  }
+
+  really_inline simd8<int8_t> carry_continuations(simd8<int8_t> initial_lengths) {
+    simd8<int8_t> prev_carried_continuations = initial_lengths.prev(this->previous.carried_continuations);
+    simd8<int8_t> right1 = simd8<int8_t>(simd8<uint8_t>(prev_carried_continuations).saturating_sub(1));
+    simd8<int8_t> sum = initial_lengths + right1;
+
+    simd8<int8_t> prev2_carried_continuations = sum.prev<2>(this->previous.carried_continuations);
+    simd8<int8_t> right2 = simd8<int8_t>(simd8<uint8_t>(prev2_carried_continuations).saturating_sub(2));
+    return sum + right2;
+  }
+
+  really_inline void check_continuations(simd8<int8_t> initial_lengths, simd8<int8_t> carries) {
+    // overlap || underlap
+    // carry > length && length > 0 || !(carry > length) && !(length > 0)
+    // (carries > length) == (lengths > 0)
+    // (carries > current) == (current > 0)
+    this->has_error |= simd8<uint8_t>(
+      (carries > initial_lengths) == (initial_lengths > simd8<int8_t>::zero()));
+  }
+
+  really_inline void check_carried_continuations() {
+    static const int8_t last_1[32] = {
+      9, 9, 9, 9, 9, 9, 9, 9,
+      9, 9, 9, 9, 9, 9, 9, 9,
+      9, 9, 9, 9, 9, 9, 9, 9,
+      9, 9, 9, 9, 9, 9, 9, 1
+    };
+    this->has_error |= simd8<uint8_t>(this->previous.carried_continuations > simd8<int8_t>(last_1 + 32 - sizeof(simd8<int8_t>)));
+  }
+
+  // when 0xED is found, next byte must be no larger than 0x9F
+  // when 0xF4 is found, next byte must be no larger than 0x8F
+  // next byte must be continuation, ie sign bit is set, so signed < is ok
+  really_inline void check_first_continuation_max(simd8<uint8_t> current_bytes,
+                                                  simd8<uint8_t> off1_current_bytes) {
+    simd8<bool> prev_ED = off1_current_bytes == 0xEDu;
+    simd8<bool> prev_F4 = off1_current_bytes == 0xF4u;
+    // Check if ED is followed by A0 or greater
+    simd8<bool> ED_too_large = (simd8<int8_t>(current_bytes) > simd8<int8_t>::splat(0x9Fu)) & prev_ED;
+    // Check if F4 is followed by 90 or greater
+    simd8<bool> F4_too_large = (simd8<int8_t>(current_bytes) > simd8<int8_t>::splat(0x8Fu)) & prev_F4;
+    // These will also error if ED or F4 is followed by ASCII, but that's an error anyway
+    this->has_error |= simd8<uint8_t>(ED_too_large | F4_too_large);
+  }
+
+  // map off1_hibits => error condition
+  // hibits     off1    cur
+  // C       => < C2 && true
+  // E       => < E1 && < A0
+  // F       => < F1 && < 90
+  // else      false && false
+  really_inline void check_overlong(simd8<uint8_t> current_bytes,
+                                    simd8<uint8_t> off1_current_bytes,
+                                    simd8<int8_t> high_nibbles) {
+    simd8<int8_t> off1_high_nibbles = high_nibbles.prev(this->previous.high_nibbles);
+
+    // Two-byte characters must start with at least C2
+    // Three-byte characters must start with at least E1
+    // Four-byte characters must start with at least F1
+    simd8<int8_t> initial_mins = off1_high_nibbles.lookup_16<int8_t>(
+      -128, -128, -128, -128, -128, -128, -128, -128, // 0xxx -> false
+      -128, -128, -128, -128,                         // 10xx -> false
+      0xC2, -128,                                     // 1100 -> C2
+      0xE1,                                           // 1110
+      0xF1                                            // 1111
+    );
+    simd8<bool> initial_under = initial_mins > simd8<int8_t>(off1_current_bytes);
+
+    // Two-byte characters starting with at least C2 are always OK
+    // Three-byte characters starting with at least E1 must be followed by at least A0
+    // Four-byte characters starting with at least F1 must be followed by at least 90
+    simd8<int8_t> second_mins = off1_high_nibbles.lookup_16<int8_t>(
+      -128, -128, -128, -128, -128, -128, -128, -128, -128, // 0xxx => false
+      -128, -128, -128,                                     // 10xx => false
+      127, 127,                                             // 110x => true
+      0xA0,                                                 // 1110
+      0x90                                                  // 1111
+    );
+    simd8<bool> second_under = second_mins > simd8<int8_t>(current_bytes);
+    this->has_error |= simd8<uint8_t>(initial_under & second_under);
+  }
+
+  really_inline void count_nibbles(simd8<uint8_t> bytes, struct processed_utf_bytes *answer) {
+    answer->raw_bytes = bytes;
+    answer->high_nibbles = simd8<int8_t>(bytes.shr<4>());
+  }
+
+  // check whether the current bytes are valid UTF-8
+  // at the end of the function, previous gets updated
+  really_inline void check_utf8_bytes(simd8<uint8_t> current_bytes) {
+    struct processed_utf_bytes pb {};
+    this->count_nibbles(current_bytes, &pb);
+
+    this->check_smaller_than_0xF4(current_bytes);
+
+    simd8<int8_t> initial_lengths = this->continuation_lengths(pb.high_nibbles);
+
+    pb.carried_continuations = this->carry_continuations(initial_lengths);
+
+    this->check_continuations(initial_lengths, pb.carried_continuations);
+
+    simd8<uint8_t> off1_current_bytes = pb.raw_bytes.prev(this->previous.raw_bytes);
+    this->check_first_continuation_max(current_bytes, off1_current_bytes);
+
+    this->check_overlong(current_bytes, off1_current_bytes, pb.high_nibbles);
+    this->previous = pb;
+  }
+
+  really_inline void check_next_input(simd8<uint8_t> in) {
+    if (likely(!in.any_bits_set_anywhere(0x80u))) {
+      this->check_carried_continuations();
+    } else {
+      this->check_utf8_bytes(in);
+    }
+  }
+
+  really_inline void check_next_input(simd8x64<uint8_t> in) {
+    simd8<uint8_t> bits = in.reduce([&](auto a, auto b) { return a | b; });
+    if (likely(!bits.any_bits_set_anywhere(0x80u))) {
+      // it is ascii, we just check carried continuations.
+      this->check_carried_continuations();
+    } else {
+      // it is not ascii so we have to do heavy work
+      in.each([&](auto _in) { this->check_utf8_bytes(_in); });
+    }
+  }
+
+  really_inline ErrorValues errors() {
+    return this->has_error.any_bits_set_anywhere() ? simdjson::UTF8_ERROR : simdjson::SUCCESS;
+  }
+}; // struct utf8_checker
--- a/src/generic/stage1_find_marks.h
+++ b/src/generic/stage1_find_marks.h
@ -72,7 +72,6 @@ public:
  uint64_t unescaped_chars_error = 0;
  bit_indexer structural_indexes;

-
  json_structural_scanner(uint32_t *_structural_indexes) : structural_indexes{_structural_indexes} {}

  // return a bitvector indicating where we have characters that end an odd-length
@ -159,12 +158,12 @@ public:
  //
  // Backslash sequences outside of quotes will be detected in stage 2.
  //
-  really_inline uint64_t find_strings(const simd_input in) {
+  really_inline uint64_t find_strings(const simd::simd8x64<uint8_t> in) {
    const uint64_t backslash = in.eq('\\');
    const uint64_t escaped = follows_odd_sequence_of(backslash, prev_escaped);
    const uint64_t quote = in.eq('"') & ~escaped;
-    // compute_quote_mask returns start quote plus string contents.
-    const uint64_t in_string = compute_quote_mask(quote) ^ prev_in_string;
+    // prefix_xor flips on bits inside the string (and flips off the end quote).
+    const uint64_t in_string = prefix_xor(quote) ^ prev_in_string;
    /* right shift of a signed value expected to be well-defined and standard
    * compliant as of C++20,
    * John Regher from Utah U. says this is fine code */
@ -198,7 +197,7 @@ public:
  // contents of a string the same as content outside. Errors and structurals inside the string or on
  // the trailing quote will need to be removed later when the correct string information is known.
  //
-  really_inline uint64_t find_potential_structurals(const simd_input in) {
+  really_inline uint64_t find_potential_structurals(const simd::simd8x64<uint8_t> in) {
    // These use SIMD so let's kick them off before running the regular 64-bit stuff ...
    uint64_t whitespace, op;
    find_whitespace_and_operators(in, whitespace, op);
@ -236,8 +235,8 @@ public:
    //
    // Load up all 128 bytes into SIMD registers
    //
-    simd_input in_1(buf);
-    simd_input in_2(buf+64);
+    simd::simd8x64<uint8_t> in_1(buf);
+    simd::simd8x64<uint8_t> in_2(buf+64);

    //
    // Find the strings and potential structurals (operators / primitives).
--- a/src/generic/stringparsing.h
+++ b/src/generic/stringparsing.h
@ -84,7 +84,7 @@ WARN_UNUSED really_inline bool parse_string(UNUSED const uint8_t *buf,
       */

      /* find out where the quote is... */
-      uint32_t quote_dist = trailing_zeroes(helper.quote_bits);
+      auto quote_dist = trailing_zeroes(helper.quote_bits);

      /* NULL termination is still handy if you expect all your strings to
       * be NULL terminated? */
@ -92,7 +92,7 @@ WARN_UNUSED really_inline bool parse_string(UNUSED const uint8_t *buf,
      dst[quote_dist] = 0;

      uint32_t str_length = (dst - start_of_string) + quote_dist;
-      memcpy(pj.current_string_buf_loc, &str_length, sizeof(uint32_t));
+      memcpy(pj.current_string_buf_loc, &str_length, sizeof(str_length));
      /*****************************
       * Above, check for overflow in case someone has a crazy string
       * (>=4GB?)                 _
@ -109,7 +109,7 @@ WARN_UNUSED really_inline bool parse_string(UNUSED const uint8_t *buf,
    }
    if (((helper.quote_bits - 1) & helper.bs_bits) != 0) {
      /* find out where the backspace is */
-      uint32_t bs_dist = trailing_zeroes(helper.bs_bits);
+      auto bs_dist = trailing_zeroes(helper.bs_bits);
      uint8_t escape_char = src[bs_dist + 1];
      /* we encountered backslash first. Handle backslash */
      if (escape_char == 'u') {
@ -136,8 +136,8 @@ WARN_UNUSED really_inline bool parse_string(UNUSED const uint8_t *buf,
    } else {
      /* they are the same. Since they can't co-occur, it means we
       * encountered neither. */
-      src += helper.bytes_processed();
-      dst += helper.bytes_processed();
+      src += parse_string_helper::BYTES_PROCESSED;
+      dst += parse_string_helper::BYTES_PROCESSED;
    }
  }
  /* can't be reached */
--- a/src/haswell/bitmask.h
+++ b/src/haswell/bitmask.h
@ -0,0 +1,30 @@
+#ifndef SIMDJSON_HASWELL_BITMASK_H
+#define SIMDJSON_HASWELL_BITMASK_H
+
+#include "simdjson/portability.h"
+
+#ifdef IS_X86_64
+
+#include "simdjson/common_defs.h"
+
+TARGET_HASWELL
+namespace simdjson::haswell {
+
+//
+// Perform a "cumulative bitwise xor," flipping bits each time a 1 is encountered.
+//
+// For example, prefix_xor(00100100) == 00011100
+//
+really_inline uint64_t prefix_xor(const uint64_t bitmask) {
+  // There should be no such thing with a processing supporting avx2
+  // but not clmul.
+  __m128i all_ones = _mm_set1_epi8('\xFF');
+  __m128i result = _mm_clmulepi64_si128(_mm_set_epi64x(0ULL, bitmask), all_ones, 0);
+  return _mm_cvtsi128_si64(result);
+}
+
+} // namespace simdjson::haswell
+UNTARGET_REGION
+
+#endif // IS_X86_64
+#endif
--- a/src/haswell/simd.h
+++ b/src/haswell/simd.h
@ -0,0 +1,267 @@
+#ifndef SIMDJSON_HASWELL_SIMD_H
+#define SIMDJSON_HASWELL_SIMD_H
+
+#include "simdjson/portability.h"
+
+#ifdef IS_X86_64
+
+#include "simdjson/common_defs.h"
+
+TARGET_HASWELL
+namespace simdjson::haswell::simd {
+
+  // Forward-declared so they can be used by splat and friends.
+  template<typename Child>
+  struct base {
+    __m256i value;
+
+    // Zero constructor
+    really_inline base() : value{__m256i()} {}
+
+    // Conversion from SIMD register
+    really_inline base(const __m256i _value) : value(_value) {}
+
+    // Conversion to SIMD register
+    really_inline operator const __m256i&() const { return this->value; }
+    really_inline operator __m256i&() { return this->value; }
+
+    // Bit operations
+    really_inline Child operator|(const Child other) const { return _mm256_or_si256(*this, other); }
+    really_inline Child operator&(const Child other) const { return _mm256_and_si256(*this, other); }
+    really_inline Child operator^(const Child other) const { return _mm256_xor_si256(*this, other); }
+    really_inline Child bit_andnot(const Child other) const { return _mm256_andnot_si256(*this, other); }
+    really_inline Child operator~() const { return *this ^ 0xFFu; }
+    really_inline Child& operator|=(const Child other) { auto this_cast = (Child*)this; *this_cast = *this_cast | other; return *this_cast; }
+    really_inline Child& operator&=(const Child other) { auto this_cast = (Child*)this; *this_cast = *this_cast & other; return *this_cast; }
+    really_inline Child& operator^=(const Child other) { auto this_cast = (Child*)this; *this_cast = *this_cast ^ other; return *this_cast; }
+  };
+
+  // Forward-declared so they can be used by splat and friends.
+  template<typename T>
+  struct simd8;
+
+  template<typename T, typename Mask=simd8<bool>>
+  struct base8: base<simd8<T>> {
+    really_inline base8() : base<simd8<T>>() {}
+    really_inline base8(const __m256i _value) : base<simd8<T>>(_value) {}
+
+    really_inline Mask operator==(const simd8<T> other) const { return _mm256_cmpeq_epi8(*this, other); }
+
+    static const int SIZE = sizeof(base<T>::value);
+
+    template<int N=1>
+    really_inline simd8<T> prev(const simd8<T> prev_chunk) const {
+      return _mm256_alignr_epi8(*this, _mm256_permute2x128_si256(prev_chunk, *this, 0x21), 16 - N);
+    }
+  };
+
+  // SIMD byte mask type (returned by things like eq and gt)
+  template<>
+  struct simd8<bool>: base8<bool> {
+    typedef int bitmask_t;
+    static really_inline simd8<bool> splat(bool _value) { return _mm256_set1_epi8(-(!!_value)); }
+
+    really_inline simd8<bool>() : base8() {}
+    really_inline simd8<bool>(const __m256i _value) : base8<bool>(_value) {}
+    // Splat constructor
+    really_inline simd8<bool>(bool _value) : base8<bool>(splat(_value)) {}
+
+    really_inline bitmask_t to_bitmask() const { return _mm256_movemask_epi8(*this); }
+    really_inline bool any() const { return !_mm256_testz_si256(*this, *this); }
+  };
+
+  template<typename T>
+  struct base8_numeric: base8<T> {
+    static really_inline simd8<T> splat(T _value) { return _mm256_set1_epi8(_value); }
+    static really_inline simd8<T> zero() { return _mm256_setzero_si256(); }
+    static really_inline simd8<T> load(const T values[32]) {
+      return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(values));
+    }
+
+    really_inline base8_numeric() : base8<T>() {}
+    really_inline base8_numeric(const __m256i _value) : base8<T>(_value) {}
+
+    // Store to array
+    really_inline void store(T dst[32]) { return _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), *this); }
+
+    // Addition/subtraction are the same for signed and unsigned
+    really_inline simd8<T> operator+(const simd8<T> other) const { return _mm256_add_epi8(*this, other); }
+    really_inline simd8<T> operator-(const simd8<T> other) const { return _mm256_sub_epi8(*this, other); }
+    really_inline simd8<T>& operator+=(const simd8<T> other) { *this = *this + other; return *this; }
+    really_inline simd8<T>& operator-=(const simd8<T> other) { *this = *this - other; return *this; }
+
+    // Perform a lookup of the lower 4 bits
+    template<typename L>
+    really_inline simd8<L> lookup_lower_4_bits(
+        L replace0,  L replace1,  L replace2,  L replace3,
+        L replace4,  L replace5,  L replace6,  L replace7,
+        L replace8,  L replace9,  L replace10, L replace11,
+        L replace12, L replace13, L replace14, L replace15) const {
+      simd8<L> lookup_table(
+        replace0,  replace1,  replace2,  replace3,
+        replace4,  replace5,  replace6,  replace7,
+        replace8,  replace9,  replace10, replace11,
+        replace12, replace13, replace14, replace15,
+        replace0,  replace1,  replace2,  replace3,
+        replace4,  replace5,  replace6,  replace7,
+        replace8,  replace9,  replace10, replace11,
+        replace12, replace13, replace14, replace15
+      );
+      return _mm256_shuffle_epi8(lookup_table, *this);
+    }
+
+    // Perform a lookup assuming the value is between 0 and 16
+    template<typename L>
+    really_inline simd8<L> lookup_16(
+        L replace0,  L replace1,  L replace2,  L replace3,
+        L replace4,  L replace5,  L replace6,  L replace7,
+        L replace8,  L replace9,  L replace10, L replace11,
+        L replace12, L replace13, L replace14, L replace15) const {
+      return lookup_lower_4_bits(
+        replace0,  replace1,  replace2,  replace3,
+        replace4,  replace5,  replace6,  replace7,
+        replace8,  replace9,  replace10, replace11,
+        replace12, replace13, replace14, replace15
+      );
+    }
+  };
+
+  // Signed bytes
+  template<>
+  struct simd8<int8_t> : base8_numeric<int8_t> {
+    really_inline simd8() : base8_numeric<int8_t>() {}
+    really_inline simd8(const __m256i _value) : base8_numeric<int8_t>(_value) {}
+    // Splat constructor
+    really_inline simd8(int8_t _value) : simd8(splat(_value)) {}
+    // Array constructor
+    really_inline simd8(const int8_t values[32]) : simd8(load(values)) {}
+    // Member-by-member initialization
+    really_inline simd8(
+      int8_t v0,  int8_t v1,  int8_t v2,  int8_t v3,  int8_t v4,  int8_t v5,  int8_t v6,  int8_t v7,
+      int8_t v8,  int8_t v9,  int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15,
+      int8_t v16, int8_t v17, int8_t v18, int8_t v19, int8_t v20, int8_t v21, int8_t v22, int8_t v23,
+      int8_t v24, int8_t v25, int8_t v26, int8_t v27, int8_t v28, int8_t v29, int8_t v30, int8_t v31
+    ) : simd8(_mm256_setr_epi8(
+      v0, v1, v2, v3, v4, v5, v6, v7,
+      v8, v9, v10,v11,v12,v13,v14,v15,
+      v16,v17,v18,v19,v20,v21,v22,v23,
+      v24,v25,v26,v27,v28,v29,v30,v31
+    )) {}
+
+    // Order-sensitive comparisons
+    really_inline simd8<int8_t> max(const simd8<int8_t> other) const { return _mm256_max_epi8(*this, other); }
+    really_inline simd8<int8_t> min(const simd8<int8_t> other) const { return _mm256_min_epi8(*this, other); }
+    really_inline simd8<bool> operator>(const simd8<int8_t> other) const { return _mm256_cmpgt_epi8(*this, other); }
+  };
+
+  // Unsigned bytes
+  template<>
+  struct simd8<uint8_t>: base8_numeric<uint8_t> {
+    really_inline simd8() : base8_numeric<uint8_t>() {}
+    really_inline simd8(const __m256i _value) : base8_numeric<uint8_t>(_value) {}
+    // Splat constructor
+    really_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
+    // Array constructor
+    really_inline simd8(const uint8_t values[32]) : simd8(load(values)) {}
+    // Member-by-member initialization
+    really_inline simd8(
+      uint8_t v0,  uint8_t v1,  uint8_t v2,  uint8_t v3,  uint8_t v4,  uint8_t v5,  uint8_t v6,  uint8_t v7,
+      uint8_t v8,  uint8_t v9,  uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15,
+      uint8_t v16, uint8_t v17, uint8_t v18, uint8_t v19, uint8_t v20, uint8_t v21, uint8_t v22, uint8_t v23,
+      uint8_t v24, uint8_t v25, uint8_t v26, uint8_t v27, uint8_t v28, uint8_t v29, uint8_t v30, uint8_t v31
+    ) : simd8(_mm256_setr_epi8(
+      v0, v1, v2, v3, v4, v5, v6, v7,
+      v8, v9, v10,v11,v12,v13,v14,v15,
+      v16,v17,v18,v19,v20,v21,v22,v23,
+      v24,v25,v26,v27,v28,v29,v30,v31
+    )) {}
+
+    // Saturated math
+    really_inline simd8<uint8_t> saturating_add(const simd8<uint8_t> other) const { return _mm256_adds_epu8(*this, other); }
+    really_inline simd8<uint8_t> saturating_sub(const simd8<uint8_t> other) const { return _mm256_subs_epu8(*this, other); }
+
+    // Order-specific operations
+    really_inline simd8<uint8_t> max(const simd8<uint8_t> other) const { return _mm256_max_epu8(*this, other); }
+    really_inline simd8<uint8_t> min(const simd8<uint8_t> other) const { return _mm256_min_epu8(*this, other); }
+    really_inline simd8<bool> operator<=(const simd8<uint8_t> other) const { return other.max(*this) == other; }
+
+    // Bit-specific operations
+    really_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const { return (*this & bits).any_bits_set(); }
+    really_inline simd8<bool> any_bits_set() const { return ~(*this == uint8_t(0)); }
+    really_inline bool any_bits_set_anywhere(simd8<uint8_t> bits) const { return !_mm256_testz_si256(*this, bits); }
+    really_inline bool any_bits_set_anywhere() const { return !_mm256_testz_si256(*this, *this); }
+    template<int N>
+    really_inline simd8<uint8_t> shr() const { return simd8<uint8_t>(_mm256_srli_epi16(*this, N)) & uint8_t(0xFFu >> N); }
+    template<int N>
+    really_inline simd8<uint8_t> shl() const { return simd8<uint8_t>(_mm256_slli_epi16(*this, N)) & uint8_t(0xFFu << N); }
+  };
+
+  template<typename T>
+  struct simd8x64 {
+    const simd8<T> chunks[2];
+
+    really_inline simd8x64() : chunks{simd8<T>(), simd8<T>()} {}
+    really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1) : chunks{chunk0, chunk1} {}
+    really_inline simd8x64(const T ptr[64]) : chunks{simd8<T>::load(ptr), simd8<T>::load(ptr+32)} {}
+
+    really_inline void store(T *ptr) {
+      this->chunks[0].store(ptr);
+      this->chunks[0].store(ptr+sizeof(simd8<T>));
+    }
+
+    template <typename F>
+    really_inline void each(F const& each_chunk) const
+    {
+      each_chunk(this->chunks[0]);
+      each_chunk(this->chunks[1]);
+    }
+
+    template <typename R=bool, typename F>
+    really_inline simd8x64<R> map(F const& map_chunk) const {
+      return simd8x64<R>(
+        map_chunk(this->chunks[0]),
+        map_chunk(this->chunks[1])
+      );
+    }
+
+    template <typename R=bool, typename F>
+    really_inline simd8x64<R> map(const simd8x64<uint8_t> b, F const& map_chunk) const {
+      return simd8x64<R>(
+        map_chunk(this->chunks[0], b.chunks[0]),
+        map_chunk(this->chunks[1], b.chunks[1])
+      );
+    }
+
+    template <typename F>
+    really_inline simd8<T> reduce(F const& reduce_pair) const {
+      return reduce_pair(this->chunks[0], this->chunks[1]);
+    }
+
+    really_inline uint64_t to_bitmask() const {
+      uint64_t r_lo = static_cast<uint32_t>(this->chunks[0].to_bitmask());
+      uint64_t r_hi =                       this->chunks[1].to_bitmask();
+      return r_lo | (r_hi << 32);
+    }
+
+    really_inline simd8x64<T> bit_or(const T m) const {
+      const simd8<T> mask = simd8<T>::splat(m);
+      return this->map( [&](auto a) { return a | mask; } );
+    }
+
+    really_inline uint64_t eq(const T m) const {
+      const simd8<T> mask = simd8<T>::splat(m);
+      return this->map( [&](auto a) { return a == mask; } ).to_bitmask();
+    }
+
+    really_inline uint64_t lteq(const T m) const {
+      const simd8<T> mask = simd8<T>::splat(m);
+      return this->map( [&](auto a) { return a <= mask; } ).to_bitmask();
+    }
+
+  }; // struct simd8x64<T>
+
+} // namespace simdjson::haswell::simd
+UNTARGET_REGION
+
+#endif // IS_X86_64
+#endif // SIMDJSON_HASWELL_SIMD_H
--- a/src/haswell/simd_input.h
+++ b/src/haswell/simd_input.h
@ -1,88 +0,0 @@
-#ifndef SIMDJSON_HASWELL_SIMD_INPUT_H
-#define SIMDJSON_HASWELL_SIMD_INPUT_H
-
-#include "simdjson/common_defs.h"
-#include "simdjson/portability.h"
-#include "simdjson/simdjson.h"
-
-#ifdef IS_X86_64
-
-TARGET_HASWELL
-namespace simdjson::haswell {
-
-struct simd_input {
-  const __m256i chunks[2];
-
-  really_inline simd_input() : chunks{__m256i(), __m256i()} {}
-
-  really_inline simd_input(const __m256i chunk0, const __m256i chunk1)
-      : chunks{chunk0, chunk1} {}
-
-  really_inline simd_input(const uint8_t *ptr)
-      : chunks{
-        _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr + 0*32)),
-        _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr + 1*32))
-      } {}
-
-  template <typename F>
-  really_inline void each(F const& each_chunk) const
-  {
-    each_chunk(this->chunks[0]);
-    each_chunk(this->chunks[1]);
-  }
-
-  template <typename F>
-  really_inline simd_input map(F const& map_chunk) const {
-    return simd_input(
-      map_chunk(this->chunks[0]),
-      map_chunk(this->chunks[1])
-    );
-  }
-
-  template <typename F>
-  really_inline simd_input map(const simd_input b, F const& map_chunk) const {
-    return simd_input(
-      map_chunk(this->chunks[0], b.chunks[0]),
-      map_chunk(this->chunks[1], b.chunks[1])
-    );
-  }
-
-  template <typename F>
-  really_inline __m256i reduce(F const& reduce_pair) const {
-    return reduce_pair(this->chunks[0], this->chunks[1]);
-  }
-
-  really_inline uint64_t to_bitmask() const {
-    uint64_t r_lo = static_cast<uint32_t>(_mm256_movemask_epi8(this->chunks[0]));
-    uint64_t r_hi =                       _mm256_movemask_epi8(this->chunks[1]);
-    return r_lo | (r_hi << 32);
-  }
-
-  really_inline simd_input bit_or(const uint8_t m) const {
-    const __m256i mask = _mm256_set1_epi8(m);
-    return this->map( [&](auto a) {
-      return _mm256_or_si256(a, mask);
-    });
-  }
-
-  really_inline uint64_t eq(const uint8_t m) const {
-    const __m256i mask = _mm256_set1_epi8(m);
-    return this->map( [&](auto a) {
-      return _mm256_cmpeq_epi8(a, mask);
-    }).to_bitmask();
-  }
-
-  really_inline uint64_t lteq(const uint8_t m) const {
-    const __m256i maxval = _mm256_set1_epi8(m);
-    return this->map( [&](auto a) {
-      return _mm256_cmpeq_epi8(_mm256_max_epu8(maxval, a), maxval);
-    }).to_bitmask();
-  }
-
-}; // struct simd_input
-
-} // namespace simdjson::haswell
-UNTARGET_REGION
-
-#endif // IS_X86_64
-#endif // SIMDJSON_HASWELL_SIMD_INPUT_H
--- a/src/haswell/simdutf8check.h
+++ b/src/haswell/simdutf8check.h
@ -1,233 +0,0 @@
-#ifndef SIMDJSON_HASWELL_SIMDUTF8CHECK_H
-#define SIMDJSON_HASWELL_SIMDUTF8CHECK_H
-
-#include "simdjson/portability.h"
-#include "simdjson/simdjson.h"
-#include "haswell/simd_input.h"
-#include <stddef.h>
-#include <stdint.h>
-#include <string.h>
-
-#ifdef IS_X86_64
-/*
- * legal utf-8 byte sequence
- * http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94
- *
- *  Code Points        1st       2s       3s       4s
- * U+0000..U+007F     00..7F
- * U+0080..U+07FF     C2..DF   80..BF
- * U+0800..U+0FFF     E0       A0..BF   80..BF
- * U+1000..U+CFFF     E1..EC   80..BF   80..BF
- * U+D000..U+D7FF     ED       80..9F   80..BF
- * U+E000..U+FFFF     EE..EF   80..BF   80..BF
- * U+10000..U+3FFFF   F0       90..BF   80..BF   80..BF
- * U+40000..U+FFFFF   F1..F3   80..BF   80..BF   80..BF
- * U+100000..U+10FFFF F4       80..8F   80..BF   80..BF
- *
- */
-
-// all byte values must be no larger than 0xF4
-
-TARGET_HASWELL
-namespace simdjson::haswell {
-
-static inline __m256i push_last_byte_of_a_to_b(__m256i a, __m256i b) {
-  return _mm256_alignr_epi8(b, _mm256_permute2x128_si256(a, b, 0x21), 15);
-}
-
-static inline __m256i push_last_2bytes_of_a_to_b(__m256i a, __m256i b) {
-  return _mm256_alignr_epi8(b, _mm256_permute2x128_si256(a, b, 0x21), 14);
-}
-
-struct processed_utf_bytes {
-  __m256i raw_bytes;
-  __m256i high_nibbles;
-  __m256i carried_continuations;
-};
-
-struct utf8_checker {
-  __m256i has_error;
-  processed_utf_bytes previous;
-
-  utf8_checker() :
-      has_error{_mm256_setzero_si256()},
-      previous{_mm256_setzero_si256(), _mm256_setzero_si256(), _mm256_setzero_si256()} {}
-
-  really_inline void add_errors(__m256i errors) {
-    this->has_error = _mm256_or_si256(this->has_error, errors);
-  }
-
-  // all byte values must be no larger than 0xF4
-  really_inline void check_smaller_than_0xF4(__m256i current_bytes) {
-    // unsigned, saturates to 0 below max
-    this->add_errors( _mm256_subs_epu8(current_bytes, _mm256_set1_epi8(0xF4u)) );
-  }
-
-  really_inline __m256i continuation_lengths(__m256i high_nibbles) {
-    return _mm256_shuffle_epi8(
-        _mm256_setr_epi8(1, 1, 1, 1, 1, 1, 1, 1, // 0xxx (ASCII)
-                         0, 0, 0, 0,             // 10xx (continuation)
-                         2, 2,                   // 110x
-                         3,                      // 1110
-                         4, // 1111, next should be 0 (not checked here)
-                         1, 1, 1, 1, 1, 1, 1, 1, // 0xxx (ASCII)
-                         0, 0, 0, 0,             // 10xx (continuation)
-                         2, 2,                   // 110x
-                         3,                      // 1110
-                         4), // 1111, next should be 0 (not checked here)
-                         
-        high_nibbles);
-  }
-
-  really_inline __m256i carry_continuations(__m256i initial_lengths) {
-    __m256i right1 = _mm256_subs_epu8(
-        push_last_byte_of_a_to_b(this->previous.carried_continuations, initial_lengths),
-        _mm256_set1_epi8(1));
-    __m256i sum = _mm256_add_epi8(initial_lengths, right1);
-
-    __m256i right2 = _mm256_subs_epu8(
-        push_last_2bytes_of_a_to_b(this->previous.carried_continuations, sum), _mm256_set1_epi8(2));
-    return _mm256_add_epi8(sum, right2);
-  }
-
-  really_inline void check_continuations(__m256i initial_lengths, __m256i carries) {
-    // overlap || underlap
-    // carry > length && length > 0 || !(carry > length) && !(length > 0)
-    // (carries > length) == (lengths > 0)
-    // (carries > current) == (current > 0)
-    __m256i overunder = _mm256_cmpeq_epi8(
-        _mm256_cmpgt_epi8(carries, initial_lengths),
-        _mm256_cmpgt_epi8(initial_lengths, _mm256_setzero_si256()));
-
-    this->add_errors( overunder );
-  }
-
-  really_inline void check_carried_continuations() {
-    this->add_errors(
-      _mm256_cmpgt_epi8(this->previous.carried_continuations,
-      _mm256_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-                       9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-                       9, 9, 9, 9, 9, 9, 9, 1))
-    );
-  }
-
-  // when 0xED is found, next byte must be no larger than 0x9F
-  // when 0xF4 is found, next byte must be no larger than 0x8F
-  // next byte must be continuation, ie sign bit is set, so signed < is ok
-  really_inline void check_first_continuation_max(__m256i current_bytes,
-                                                      __m256i off1_current_bytes) {
-    __m256i maskED =
-        _mm256_cmpeq_epi8(off1_current_bytes, _mm256_set1_epi8(0xEDu));
-    __m256i maskF4 =
-        _mm256_cmpeq_epi8(off1_current_bytes, _mm256_set1_epi8(0xF4u));
-
-    __m256i badfollowED = _mm256_and_si256(
-        _mm256_cmpgt_epi8(current_bytes, _mm256_set1_epi8(0x9Fu)), maskED);
-    __m256i badfollowF4 = _mm256_and_si256(
-        _mm256_cmpgt_epi8(current_bytes, _mm256_set1_epi8(0x8Fu)), maskF4);
-
-    this->add_errors( _mm256_or_si256(badfollowED, badfollowF4) );
-  }
-
-  // map off1_hibits => error condition
-  // hibits     off1    cur
-  // C       => < C2 && true
-  // E       => < E1 && < A0
-  // F       => < F1 && < 90
-  // else      false && false
-  really_inline void check_overlong(__m256i current_bytes,
-                                        __m256i off1_current_bytes,
-                                        __m256i high_nibbles) {
-    __m256i off1_high_nibbles = push_last_byte_of_a_to_b(this->previous.high_nibbles, high_nibbles);
-    __m256i initial_mins = _mm256_shuffle_epi8(
-        _mm256_setr_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128,
-                         -128, -128, -128, // 10xx => false
-                         0xC2u, -128,      // 110x
-                         0xE1u,            // 1110
-                         0xF1u,            // 1111
-                         -128, -128, -128, -128, -128, -128, -128, -128, -128,
-                         -128, -128, -128, // 10xx => false
-                         0xC2u, -128,      // 110x
-                         0xE1u,            // 1110
-                         0xF1u),           // 1111
-        off1_high_nibbles);
-
-    __m256i initial_under = _mm256_cmpgt_epi8(initial_mins, off1_current_bytes);
-
-    __m256i second_mins = _mm256_shuffle_epi8(
-        _mm256_setr_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128,
-                         -128, -128, -128, // 10xx => false
-                         127, 127,         // 110x => true
-                         0xA0u,            // 1110
-                         0x90u,            // 1111
-                         -128, -128, -128, -128, -128, -128, -128, -128, -128,
-                         -128, -128, -128, // 10xx => false
-                         127, 127,         // 110x => true
-                         0xA0u,            // 1110
-                         0x90u),           // 1111
-        off1_high_nibbles);
-    __m256i second_under = _mm256_cmpgt_epi8(second_mins, current_bytes);
-    this->add_errors( _mm256_and_si256(initial_under, second_under) );
-  }
-
-  really_inline void count_nibbles(__m256i bytes, struct processed_utf_bytes *answer) {
-    answer->raw_bytes = bytes;
-    answer->high_nibbles = _mm256_and_si256(_mm256_srli_epi16(bytes, 4), _mm256_set1_epi8(0x0F));
-  }
-
-  // check whether the current bytes are valid UTF-8
-  // at the end of the function, previous gets updated
-  really_inline void check_utf8_bytes(__m256i current_bytes) {
-    struct processed_utf_bytes pb {};
-    this->count_nibbles(current_bytes, &pb);
-
-    this->check_smaller_than_0xF4(current_bytes);
-
-    __m256i initial_lengths = this->continuation_lengths(pb.high_nibbles);
-
-    pb.carried_continuations = this->carry_continuations(initial_lengths);
-
-    this->check_continuations(initial_lengths, pb.carried_continuations);
-
-    __m256i off1_current_bytes =
-        push_last_byte_of_a_to_b(this->previous.raw_bytes, pb.raw_bytes);
-    this->check_first_continuation_max(current_bytes, off1_current_bytes);
-
-    this->check_overlong(current_bytes, off1_current_bytes, pb.high_nibbles);
-    this->previous = pb;
-  }
-
-  really_inline void check_next_input(__m256i in) {
-    __m256i high_bit = _mm256_set1_epi8(0x80u);
-    if (likely(_mm256_testz_si256(in, high_bit) == 1)) {
-      this->check_carried_continuations();
-    } else {
-      this->check_utf8_bytes(in);
-    }
-  }
-
-  really_inline void check_next_input(simd_input in) {
-    __m256i high_bit = _mm256_set1_epi8(0x80u);
-    __m256i any_bits_on = in.reduce([&](auto a, auto b) {
-      return _mm256_or_si256(a, b);
-    });
-    if (likely(_mm256_testz_si256(any_bits_on, high_bit) == 1)) {
-      // it is ascii, we just check carried continuations.
-      this->check_carried_continuations();
-    } else {
-      // it is not ascii so we have to do heavy work
-      in.each([&](auto _in) { check_utf8_bytes(_in); });
-    }
-  }
-
-  really_inline ErrorValues errors() {
-    return _mm256_testz_si256(this->has_error, this->has_error) == 0 ? simdjson::UTF8_ERROR : simdjson::SUCCESS;
-  }
-}; // struct utf8_checker
-
-}; // namespace simdjson::haswell
-UNTARGET_REGION // haswell
-
-#endif // IS_X86_64
-
-#endif // SIMDJSON_HASWELL_SIMDUTF8CHECK_H
--- a/src/haswell/stage1_find_marks.h
+++ b/src/haswell/stage1_find_marks.h
@ -5,85 +5,29 @@

 #ifdef IS_X86_64

-#include "haswell/simd_input.h"
-#include "haswell/simdutf8check.h"
+#include "haswell/bitmask.h"
+#include "haswell/simd.h"
 #include "simdjson/stage1_find_marks.h"

 TARGET_HASWELL
 namespace simdjson::haswell {

-really_inline uint64_t compute_quote_mask(const uint64_t quote_bits) {
-  // There should be no such thing with a processing supporting avx2
-  // but not clmul.
-  uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128(
-      _mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFFu), 0));
-  return quote_mask;
-}
+using namespace simd;

 really_inline void find_whitespace_and_operators(
-  const simd_input in,
+  const simd::simd8x64<uint8_t> in,
  uint64_t &whitespace, uint64_t &op) {

-  #ifdef SIMDJSON_NAIVE_STRUCTURAL
+  whitespace = in.map([&](simd8<uint8_t> _in) {
+    return _in == _in.lookup_lower_4_bits<uint8_t>(' ', 100, 100, 100, 17, 100, 113, 2, 100, '\t', '\n', 112, 100, '\r', 100, 100);
+  }).to_bitmask();

-    // You should never need this naive approach, but it can be useful
-    // for research purposes
-    const __m256i mask_open_brace = _mm256_set1_epi8(0x7b);
-    const __m256i mask_close_brace = _mm256_set1_epi8(0x7d);
-    const __m256i mask_open_bracket = _mm256_set1_epi8(0x5b);
-    const __m256i mask_close_bracket = _mm256_set1_epi8(0x5d);
-    const __m256i mask_column = _mm256_set1_epi8(0x3a);
-    const __m256i mask_comma = _mm256_set1_epi8(0x2c);
-    op = in.map([&](auto in) {
-      __m256i op = _mm256_cmpeq_epi8(in, mask_open_brace);
-      op = _mm256_or_si256(op, _mm256_cmpeq_epi8(in, mask_close_brace));
-      op = _mm256_or_si256(op, _mm256_cmpeq_epi8(in, mask_open_bracket));
-      op = _mm256_or_si256(op, _mm256_cmpeq_epi8(in, mask_close_bracket));
-      op = _mm256_or_si256(op, _mm256_cmpeq_epi8(in, mask_column));
-      op = _mm256_or_si256(op, _mm256_cmpeq_epi8(in, mask_comma));
-      return op;
-    }).to_bitmask();
-
-    const __m256i mask_space = _mm256_set1_epi8(0x20);
-    const __m256i mask_linefeed = _mm256_set1_epi8(0x0a);
-    const __m256i mask_tab = _mm256_set1_epi8(0x09);
-    const __m256i mask_carriage = _mm256_set1_epi8(0x0d);
-    whitespace = in.map([&](auto in) {
-      __m256i space = _mm256_cmpeq_epi8(in, mask_space);
-      space = _mm256_or_si256(space, _mm256_cmpeq_epi8(in, mask_linefeed));
-      space = _mm256_or_si256(space, _mm256_cmpeq_epi8(in, mask_tab));
-      space = _mm256_or_si256(space, _mm256_cmpeq_epi8(in, mask_carriage));
-      return space;
-    }).to_bitmask();
-    // end of naive approach
-
-  #else  // SIMDJSON_NAIVE_STRUCTURAL
-
-    // clang-format off
-    const __m256i operator_table =
-        _mm256_setr_epi8(44, 125, 0, 0, 0xc0u, 0, 0, 0, 0, 0, 0, 0, 0, 0, 58, 123,
-                         44, 125, 0, 0, 0xc0u, 0, 0, 0, 0, 0, 0, 0, 0, 0, 58, 123);
-    const __m256i white_table = _mm256_setr_epi8(
-        32, 100, 100, 100, 17, 100, 113, 2, 100, 9, 10, 112, 100, 13, 100, 100,
-        32, 100, 100, 100, 17, 100, 113, 2, 100, 9, 10, 112, 100, 13, 100, 100);
-    // clang-format on
-    const __m256i op_offset = _mm256_set1_epi8(0xd4u);
-    const __m256i op_mask = _mm256_set1_epi8(32);
-
-    whitespace = in.map([&](auto _in) {
-      return _mm256_cmpeq_epi8(_in, _mm256_shuffle_epi8(white_table, _in));
-    }).to_bitmask();
-
-    op = in.map([&](auto _in) {
-      const __m256i r1 = _mm256_add_epi8(op_offset, _in);
-      const __m256i r2 = _mm256_or_si256(_in, op_mask);
-      const __m256i r3 = _mm256_shuffle_epi8(operator_table, r1);
-      return _mm256_cmpeq_epi8(r2, r3);
-    }).to_bitmask();
-
-  #endif // else SIMDJSON_NAIVE_STRUCTURAL
+  op = in.map([&](simd8<uint8_t> _in) {
+    return (_in | 32) == (_in+0xd4u).lookup_lower_4_bits<uint8_t>(',', '}', 0, 0, 0xc0u, 0, 0, 0, 0, 0, 0, 0, 0, 0, ':', '{');
+  }).to_bitmask();
 }

+#include "generic/simdutf8check.h"
 #include "generic/stage1_find_marks.h"

 } // namespace haswell
--- a/src/haswell/stringparsing.h
+++ b/src/haswell/stringparsing.h
@ -1,41 +1,37 @@
 #ifndef SIMDJSON_HASWELL_STRINGPARSING_H
 #define SIMDJSON_HASWELL_STRINGPARSING_H

+#include "simdjson/portability.h"
+
 #ifdef IS_X86_64

+#include "haswell/simd.h"
 #include "simdjson/common_defs.h"
 #include "simdjson/parsedjson.h"
 #include "jsoncharutils.h"

-#ifdef JSON_TEST_STRINGS
-void found_string(const uint8_t *buf, const uint8_t *parsed_begin,
-                  const uint8_t *parsed_end);
-void found_bad_string(const uint8_t *buf);
-#endif
-
 TARGET_HASWELL
 namespace simdjson::haswell {

+using namespace simd;
+
 // Holds backslashes and quotes locations.
 struct parse_string_helper {
  uint32_t bs_bits;
  uint32_t quote_bits;
-  really_inline uint32_t bytes_processed() const { return sizeof(__m256i); }
+  static const uint32_t BYTES_PROCESSED = 32;
 };

 really_inline parse_string_helper find_bs_bits_and_quote_bits(const uint8_t *src, uint8_t *dst) {
-  // this can read up to 31 bytes beyond the buffer size, but we require
+  // this can read up to 15 bytes beyond the buffer size, but we require
  // SIMDJSON_PADDING of padding
-  static_assert(sizeof(__m256i) - 1 <= SIMDJSON_PADDING);
-  __m256i v = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src));
-  // store to dest unconditionally - we can overwrite the bits we don't like
-  // later
-  _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), v);
-  auto quote_mask = _mm256_cmpeq_epi8(v, _mm256_set1_epi8('"'));
+  static_assert(SIMDJSON_PADDING >= (parse_string_helper::BYTES_PROCESSED - 1));
+  simd8<uint8_t> v(src);
+  // store to dest unconditionally - we can overwrite the bits we don't like later
+  v.store(dst);
  return {
-      static_cast<uint32_t>(_mm256_movemask_epi8(
-          _mm256_cmpeq_epi8(v, _mm256_set1_epi8('\\')))),     // bs_bits
-      static_cast<uint32_t>(_mm256_movemask_epi8(quote_mask)) // quote_bits
+      (uint32_t)(v == '\\').to_bitmask(),     // bs_bits
+      (uint32_t)(v == '"').to_bitmask(), // quote_bits
  };
 }

--- a/src/stage1_find_marks.cpp
+++ b/src/stage1_find_marks.cpp
@ -1,19 +1,3 @@
-#include "simdjson/portability.h"
-#include "simdjson/common_defs.h"
-
-namespace {
-// for when clmul is unavailable
-[[maybe_unused]] really_inline uint64_t portable_compute_quote_mask(uint64_t quote_bits) {
-  uint64_t quote_mask = quote_bits ^ (quote_bits << 1);
-  quote_mask = quote_mask ^ (quote_mask << 2);
-  quote_mask = quote_mask ^ (quote_mask << 4);
-  quote_mask = quote_mask ^ (quote_mask << 8);
-  quote_mask = quote_mask ^ (quote_mask << 16);
-  quote_mask = quote_mask ^ (quote_mask << 32);
-  return quote_mask;
-}
-} // namespace
-
 #include "arm64/stage1_find_marks.h"
 #include "haswell/stage1_find_marks.h"
 #include "westmere/stage1_find_marks.h"
--- a/src/stage2_build_tape.cpp
+++ b/src/stage2_build_tape.cpp
@ -65,6 +65,12 @@ really_inline bool is_valid_null_atom(const uint8_t *loc) {
  return error == 0;
 }

+#ifdef JSON_TEST_STRINGS
+void found_string(const uint8_t *buf, const uint8_t *parsed_begin,
+                  const uint8_t *parsed_end);
+void found_bad_string(const uint8_t *buf);
+#endif
+
 #include "arm64/stage2_build_tape.h"
 #include "haswell/stage2_build_tape.h"
 #include "westmere/stage2_build_tape.h"
--- a/src/westmere/bitmask.h
+++ b/src/westmere/bitmask.h
@ -0,0 +1,30 @@
+#ifndef SIMDJSON_WESTMERE_BITMASK_H
+#define SIMDJSON_WESTMERE_BITMASK_H
+
+#include "simdjson/portability.h"
+
+#ifdef IS_X86_64
+
+#include "simdjson/common_defs.h"
+
+TARGET_WESTMERE
+namespace simdjson::westmere {
+
+//
+// Perform a "cumulative bitwise xor," flipping bits each time a 1 is encountered.
+//
+// For example, prefix_xor(00100100) == 00011100
+//
+really_inline uint64_t prefix_xor(const uint64_t bitmask) {
+  // There should be no such thing with a processing supporting avx2
+  // but not clmul.
+  __m128i all_ones = _mm_set1_epi8('\xFF');
+  __m128i result = _mm_clmulepi64_si128(_mm_set_epi64x(0ULL, bitmask), all_ones, 0);
+  return _mm_cvtsi128_si64(result);
+}
+
+} // namespace simdjson::westmere
+UNTARGET_REGION
+
+#endif // IS_X86_64
+#endif
--- a/src/westmere/simd.h
+++ b/src/westmere/simd.h
@ -0,0 +1,270 @@
+#ifndef SIMDJSON_WESTMERE_SIMD_H
+#define SIMDJSON_WESTMERE_SIMD_H
+
+#include "simdjson/portability.h"
+
+#ifdef IS_X86_64
+
+#include "simdjson/common_defs.h"
+#include "simdjson/simdjson.h"
+
+TARGET_WESTMERE
+namespace simdjson::westmere::simd {
+
+  template<typename Child>
+  struct base {
+    __m128i value;
+
+    // Zero constructor
+    really_inline base() : value{__m128i()} {}
+
+    // Conversion from SIMD register
+    really_inline base(const __m128i _value) : value(_value) {}
+
+    // Conversion to SIMD register
+    really_inline operator const __m128i&() const { return this->value; }
+    really_inline operator __m128i&() { return this->value; }
+
+    // Bit operations
+    really_inline Child operator|(const Child other) const { return _mm_or_si128(*this, other); }
+    really_inline Child operator&(const Child other) const { return _mm_and_si128(*this, other); }
+    really_inline Child operator^(const Child other) const { return _mm_xor_si128(*this, other); }
+    really_inline Child bit_andnot(const Child other) const { return _mm_andnot_si128(*this, other); }
+    really_inline Child operator~() const { return *this ^ 0xFFu; }
+    really_inline Child& operator|=(const Child other) { auto this_cast = (Child*)this; *this_cast = *this_cast | other; return *this_cast; }
+    really_inline Child& operator&=(const Child other) { auto this_cast = (Child*)this; *this_cast = *this_cast & other; return *this_cast; }
+    really_inline Child& operator^=(const Child other) { auto this_cast = (Child*)this; *this_cast = *this_cast ^ other; return *this_cast; }
+  };
+
+  // Forward-declared so they can be used by splat and friends.
+  template<typename T>
+  struct simd8;
+
+  template<typename T, typename Mask=simd8<bool>>
+  struct base8: base<simd8<T>> {
+    typedef int bitmask_t;
+
+    really_inline base8() : base<simd8<T>>() {}
+    really_inline base8(const __m128i _value) : base<simd8<T>>(_value) {}
+
+    really_inline Mask operator==(const simd8<T> other) const { return _mm_cmpeq_epi8(*this, other); }
+
+    static const int SIZE = sizeof(base<simd8<T>>::value);
+
+    template<int N=1>
+    really_inline simd8<T> prev(const simd8<T> prev_chunk) const {
+      return _mm_alignr_epi8(*this, prev_chunk, 16 - N);
+    }
+  };
+
+  // SIMD byte mask type (returned by things like eq and gt)
+  template<>
+  struct simd8<bool>: base8<bool> {
+    static really_inline simd8<bool> splat(bool _value) { return _mm_set1_epi8(-(!!_value)); }
+
+    really_inline simd8<bool>() : base8() {}
+    really_inline simd8<bool>(const __m128i _value) : base8<bool>(_value) {}
+    // Splat constructor
+    really_inline simd8<bool>(bool _value) : base8<bool>(splat(_value)) {}
+
+    really_inline bitmask_t to_bitmask() const { return _mm_movemask_epi8(*this); }
+    really_inline bool any() const { return !_mm_testz_si128(*this, *this); }
+  };
+
+  template<typename T>
+  struct base8_numeric: base8<T> {
+    static really_inline simd8<T> splat(T _value) { return _mm_set1_epi8(_value); }
+    static really_inline simd8<T> zero() { return _mm_setzero_si128(); }
+    static really_inline simd8<T> load(const T values[16]) {
+      return _mm_loadu_si128(reinterpret_cast<const __m128i *>(values));
+    }
+
+    really_inline base8_numeric() : base8<T>() {}
+    really_inline base8_numeric(const __m128i _value) : base8<T>(_value) {}
+
+    // Store to array
+    really_inline void store(T dst[16]) { return _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), *this); }
+
+    // Addition/subtraction are the same for signed and unsigned
+    really_inline simd8<T> operator+(const simd8<T> other) const { return _mm_add_epi8(*this, other); }
+    really_inline simd8<T> operator-(const simd8<T> other) const { return _mm_sub_epi8(*this, other); }
+    really_inline simd8<T>& operator+=(const simd8<T> other) { *this = *this + other; return *this; }
+    really_inline simd8<T>& operator-=(const simd8<T> other) { *this = *this - other; return *this; }
+
+    // Perform a lookup of the lower 4 bits
+    template<typename L>
+    really_inline simd8<L> lookup_lower_4_bits(
+        L replace0,  L replace1,  L replace2,  L replace3,
+        L replace4,  L replace5,  L replace6,  L replace7,
+        L replace8,  L replace9,  L replace10, L replace11,
+        L replace12, L replace13, L replace14, L replace15) const {
+
+      simd8<L> lookup_table(
+        replace0,  replace1,  replace2,  replace3,
+        replace4,  replace5,  replace6,  replace7,
+        replace8,  replace9,  replace10, replace11,
+        replace12, replace13, replace14, replace15
+      );
+      return _mm_shuffle_epi8(lookup_table, *this);
+    }
+
+    // Perform a lookup assuming the value is between 0 and 16
+    template<typename L>
+    really_inline simd8<L> lookup_16(
+        L replace0,  L replace1,  L replace2,  L replace3,
+        L replace4,  L replace5,  L replace6,  L replace7,
+        L replace8,  L replace9,  L replace10, L replace11,
+        L replace12, L replace13, L replace14, L replace15) const {
+      return lookup_lower_4_bits(
+        replace0,  replace1,  replace2,  replace3,
+        replace4,  replace5,  replace6,  replace7,
+        replace8,  replace9,  replace10, replace11,
+        replace12, replace13, replace14, replace15
+      );
+    }
+  };
+
+  // Signed bytes
+  template<>
+  struct simd8<int8_t> : base8_numeric<int8_t> {
+    really_inline simd8() : base8_numeric<int8_t>() {}
+    really_inline simd8(const __m128i _value) : base8_numeric<int8_t>(_value) {}
+    // Splat constructor
+    really_inline simd8(int8_t _value) : simd8(splat(_value)) {}
+    // Array constructor
+    really_inline simd8(const int8_t* values) : simd8(load(values)) {}
+    // Member-by-member initialization
+    really_inline simd8(
+      int8_t v0,  int8_t v1,  int8_t v2,  int8_t v3,  int8_t v4,  int8_t v5,  int8_t v6,  int8_t v7,
+      int8_t v8,  int8_t v9,  int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
+    ) : simd8(_mm_setr_epi8(
+      v0, v1, v2, v3, v4, v5, v6, v7,
+      v8, v9, v10,v11,v12,v13,v14,v15
+    )) {}
+
+    // Order-sensitive comparisons
+    really_inline simd8<int8_t> max(const simd8<int8_t> other) const { return _mm_max_epi8(*this, other); }
+    really_inline simd8<int8_t> min(const simd8<int8_t> other) const { return _mm_min_epi8(*this, other); }
+    really_inline simd8<bool> operator>(const simd8<int8_t> other) const { return _mm_cmpgt_epi8(*this, other); }
+  };
+
+  // Unsigned bytes
+  template<>
+  struct simd8<uint8_t>: base8_numeric<uint8_t> {
+    really_inline simd8() : base8_numeric<uint8_t>() {}
+    really_inline simd8(const __m128i _value) : base8_numeric<uint8_t>(_value) {}
+    // Splat constructor
+    really_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
+    // Array constructor
+    really_inline simd8(const uint8_t* values) : simd8(load(values)) {}
+    // Member-by-member initialization
+    really_inline simd8(
+      uint8_t v0,  uint8_t v1,  uint8_t v2,  uint8_t v3,  uint8_t v4,  uint8_t v5,  uint8_t v6,  uint8_t v7,
+      uint8_t v8,  uint8_t v9,  uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
+    ) : simd8(_mm_setr_epi8(
+      v0, v1, v2, v3, v4, v5, v6, v7,
+      v8, v9, v10,v11,v12,v13,v14,v15
+    )) {}
+
+    // Saturated math
+    really_inline simd8<uint8_t> saturating_add(const simd8<uint8_t> other) const { return _mm_adds_epu8(*this, other); }
+    really_inline simd8<uint8_t> saturating_sub(const simd8<uint8_t> other) const { return _mm_subs_epu8(*this, other); }
+
+    // Order-specific operations
+    really_inline simd8<uint8_t> max(const simd8<uint8_t> other) const { return _mm_max_epu8(*this, other); }
+    really_inline simd8<uint8_t> min(const simd8<uint8_t> other) const { return _mm_min_epu8(*this, other); }
+    really_inline simd8<bool> operator<=(const simd8<uint8_t> other) const { return other.max(*this) == other; }
+
+    // Bit-specific operations
+    really_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const { return (*this & bits).any_bits_set(); }
+    really_inline simd8<bool> any_bits_set() const { return ~(*this == uint8_t(0)); }
+    really_inline bool any_bits_set_anywhere(simd8<uint8_t> bits) const { return !_mm_testz_si128(*this, bits); }
+    really_inline bool any_bits_set_anywhere() const { return !_mm_testz_si128(*this, *this); }
+    template<int N>
+    really_inline simd8<uint8_t> shr() const { return simd8<uint8_t>(_mm_srli_epi16(*this, N)) & uint8_t(0xFFu >> N); }
+    template<int N>
+    really_inline simd8<uint8_t> shl() const { return simd8<uint8_t>(_mm_slli_epi16(*this, N)) & uint8_t(0xFFu << N); }
+  };
+
+  template<typename T>
+  struct simd8x64 {
+    const simd8<T> chunks[4];
+
+    really_inline simd8x64() : chunks{simd8<T>(), simd8<T>(), simd8<T>(), simd8<T>()} {}
+    really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1, const simd8<T> chunk2, const simd8<T> chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {}
+    really_inline simd8x64(const T ptr[64]) : chunks{simd8<T>::load(ptr), simd8<T>::load(ptr+16), simd8<T>::load(ptr+32), simd8<T>::load(ptr+48)} {}
+
+    really_inline void store(T ptr[64]) {
+      this->chunks[0].store(ptr);
+      this->chunks[0].store(ptr+16);
+      this->chunks[0].store(ptr+32);
+      this->chunks[0].store(ptr+48);
+    }
+
+    template <typename F>
+    really_inline void each(F const& each_chunk) const
+    {
+      each_chunk(this->chunks[0]);
+      each_chunk(this->chunks[1]);
+      each_chunk(this->chunks[2]);
+      each_chunk(this->chunks[3]);
+    }
+
+    template <typename F, typename R=bool>
+    really_inline simd8x64<R> map(F const& map_chunk) const {
+      return simd8x64<R>(
+        map_chunk(this->chunks[0]),
+        map_chunk(this->chunks[1]),
+        map_chunk(this->chunks[2]),
+        map_chunk(this->chunks[3])
+      );
+    }
+
+    template <typename F, typename R=bool>
+    really_inline simd8x64<R> map(const simd8x64<uint8_t> b, F const& map_chunk) const {
+      return simd8x64<R>(
+        map_chunk(this->chunks[0], b.chunks[0]),
+        map_chunk(this->chunks[1], b.chunks[1]),
+        map_chunk(this->chunks[2], b.chunks[2]),
+        map_chunk(this->chunks[3], b.chunks[3])
+      );
+    }
+
+    template <typename F>
+    really_inline simd8<T> reduce(F const& reduce_pair) const {
+      return reduce_pair(
+        reduce_pair(this->chunks[0], this->chunks[1]),
+        reduce_pair(this->chunks[2], this->chunks[3])
+      );
+    }
+
+    really_inline uint64_t to_bitmask() const {
+      uint64_t r0 = static_cast<uint32_t>(this->chunks[0].to_bitmask());
+      uint64_t r1 =                       this->chunks[1].to_bitmask();
+      uint64_t r2 =                       this->chunks[2].to_bitmask();
+      uint64_t r3 =                       this->chunks[3].to_bitmask();
+      return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48);
+    }
+
+    really_inline simd8x64<T> bit_or(const T m) const {
+      const simd8<T> mask = simd8<T>::splat(m);
+      return this->map( [&](auto a) { return a | mask; } );
+    }
+
+    really_inline uint64_t eq(const T m) const {
+      const simd8<T> mask = simd8<T>::splat(m);
+      return this->map( [&](auto a) { return a == mask; } ).to_bitmask();
+    }
+
+    really_inline uint64_t lteq(const T m) const {
+      const simd8<T> mask = simd8<T>::splat(m);
+      return this->map( [&](auto a) { return a <= mask; } ).to_bitmask();
+    }
+
+  }; // struct simd8x64<T>
+
+} // namespace simdjson::westmere::simd
+UNTARGET_REGION
+
+#endif // IS_X86_64
+#endif // SIMDJSON_WESTMERE_SIMD_INPUT_H
--- a/src/westmere/simd_input.h
+++ b/src/westmere/simd_input.h
@ -1,100 +0,0 @@
-#ifndef SIMDJSON_WESTMERE_SIMD_INPUT_H
-#define SIMDJSON_WESTMERE_SIMD_INPUT_H
-
-#include "simdjson/common_defs.h"
-#include "simdjson/portability.h"
-#include "simdjson/simdjson.h"
-
-#ifdef IS_X86_64
-
-TARGET_WESTMERE
-namespace simdjson::westmere {
-
-struct simd_input {
-  const __m128i chunks[4];
-
-  really_inline simd_input()
-      : chunks { __m128i(), __m128i(), __m128i(), __m128i() } {}
-
-  really_inline simd_input(const __m128i chunk0, const __m128i chunk1, const __m128i chunk2, const __m128i chunk3)
-      : chunks{chunk0, chunk1, chunk2, chunk3} {}
-
-  really_inline simd_input(const uint8_t *ptr)
-      : simd_input(
-        _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 0)),
-        _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 16)),
-        _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 32)),
-        _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 48))
-      ) {}
-
-  template <typename F>
-  really_inline void each(F const& each_chunk) const {
-    each_chunk(this->chunks[0]);
-    each_chunk(this->chunks[1]);
-    each_chunk(this->chunks[2]);
-    each_chunk(this->chunks[3]);
-  }
-
-  template <typename F>
-  really_inline simd_input map(F const& map_chunk) const {
-    return simd_input(
-      map_chunk(this->chunks[0]),
-      map_chunk(this->chunks[1]),
-      map_chunk(this->chunks[2]),
-      map_chunk(this->chunks[3])
-    );
-  }
-
-  template <typename F>
-  really_inline simd_input map(const simd_input b, F const& map_chunk) const {
-    return simd_input(
-      map_chunk(this->chunks[0], b.chunks[0]),
-      map_chunk(this->chunks[1], b.chunks[1]),
-      map_chunk(this->chunks[2], b.chunks[2]),
-      map_chunk(this->chunks[3], b.chunks[3])
-    );
-  }
-
-  template <typename F>
-  really_inline __m128i reduce(F const& reduce_pair) const {
-    __m128i r01 = reduce_pair(this->chunks[0], this->chunks[1]);
-    __m128i r23 = reduce_pair(this->chunks[2], this->chunks[3]);
-    return reduce_pair(r01, r23);
-  }
-
-  really_inline uint64_t to_bitmask() const {
-    uint64_t r0 = static_cast<uint32_t>(_mm_movemask_epi8(this->chunks[0]));
-    uint64_t r1 =                       _mm_movemask_epi8(this->chunks[1]);
-    uint64_t r2 =                       _mm_movemask_epi8(this->chunks[2]);
-    uint64_t r3 =                       _mm_movemask_epi8(this->chunks[3]);
-    return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48);
-  }
-
-  really_inline simd_input bit_or(const uint8_t m) const {
-    const __m128i mask = _mm_set1_epi8(m);
-    return this->map( [&](auto a) {
-      return _mm_or_si128(a, mask);
-    });
-  }
-
-  really_inline uint64_t eq(const uint8_t m) const {
-    const __m128i mask = _mm_set1_epi8(m);
-    return this->map( [&](auto a) {
-      return _mm_cmpeq_epi8(a, mask);
-    }).to_bitmask();
-  }
-
-  really_inline uint64_t lteq(const uint8_t m) const {
-    const __m128i maxval = _mm_set1_epi8(m);
-    return this->map( [&](auto a) {
-      return _mm_cmpeq_epi8(_mm_max_epu8(maxval, a), maxval);
-    }).to_bitmask();
-  }
-
-}; // struct simd_input
-
-} // namespace simdjson::westmere
-UNTARGET_REGION
-
-#endif // IS_X86_64
-#endif // SIMDJSON_WESTMERE_SIMD_INPUT_H
--- a/src/westmere/simdutf8check.h
+++ b/src/westmere/simdutf8check.h
@ -1,203 +0,0 @@
-#ifndef SIMDJSON_WESTMERE_SIMDUTF8CHECK_H
-#define SIMDJSON_WESTMERE_SIMDUTF8CHECK_H
-
-#include "simdjson/portability.h"
-#include "simdjson/simdjson.h"
-#include "westmere/simd_input.h"
-#include <stddef.h>
-#include <stdint.h>
-#include <string.h>
-#ifdef IS_X86_64
-
-/*
- * legal utf-8 byte sequence
- * http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94
- *
- *  Code Points        1st       2s       3s       4s
- * U+0000..U+007F     00..7F
- * U+0080..U+07FF     C2..DF   80..BF
- * U+0800..U+0FFF     E0       A0..BF   80..BF
- * U+1000..U+CFFF     E1..EC   80..BF   80..BF
- * U+D000..U+D7FF     ED       80..9F   80..BF
- * U+E000..U+FFFF     EE..EF   80..BF   80..BF
- * U+10000..U+3FFFF   F0       90..BF   80..BF   80..BF
- * U+40000..U+FFFFF   F1..F3   80..BF   80..BF   80..BF
- * U+100000..U+10FFFF F4       80..8F   80..BF   80..BF
- *
- */
-
-// all byte values must be no larger than 0xF4
-
-/********** sse code **********/
-TARGET_WESTMERE
-namespace simdjson::westmere {
-
-struct processed_utf_bytes {
-  __m128i raw_bytes;
-  __m128i high_nibbles;
-  __m128i carried_continuations;
-};
-
-struct utf8_checker {
-  __m128i has_error{_mm_setzero_si128()};
-  processed_utf_bytes previous{_mm_setzero_si128(), _mm_setzero_si128(), _mm_setzero_si128()};
-
-  really_inline void add_errors(__m128i errors) {
-    this->has_error = _mm_or_si128(errors, this->has_error);
-  }
-
-  // all byte values must be no larger than 0xF4
-  really_inline void check_smaller_than_0xF4(__m128i current_bytes) {
-    // unsigned, saturates to 0 below max
-    this->add_errors( _mm_subs_epu8(current_bytes, _mm_set1_epi8(0xF4u)) );
-  }
-
-  really_inline __m128i continuation_lengths(__m128i high_nibbles) {
-    return _mm_shuffle_epi8(
-        _mm_setr_epi8(1, 1, 1, 1, 1, 1, 1, 1, // 0xxx (ASCII)
-                      0, 0, 0, 0,             // 10xx (continuation)
-                      2, 2,                   // 110x
-                      3,                      // 1110
-                      4), // 1111, next should be 0 (not checked here)
-        high_nibbles);
-  }
-
-  really_inline __m128i carry_continuations(__m128i initial_lengths) {
-
-    __m128i right1 =
-        _mm_subs_epu8(_mm_alignr_epi8(initial_lengths, this->previous.carried_continuations, 16 - 1),
-                      _mm_set1_epi8(1));
-    __m128i sum = _mm_add_epi8(initial_lengths, right1);
-
-    __m128i right2 = _mm_subs_epu8(_mm_alignr_epi8(sum, this->previous.carried_continuations, 16 - 2),
-                                  _mm_set1_epi8(2));
-    return _mm_add_epi8(sum, right2);
-  }
-
-  really_inline void check_continuations(__m128i initial_lengths, __m128i carries) {
-
-    // overlap || underlap
-    // carry > length && length > 0 || !(carry > length) && !(length > 0)
-    // (carries > length) == (lengths > 0)
-    __m128i overunder =
-        _mm_cmpeq_epi8(_mm_cmpgt_epi8(carries, initial_lengths),
-                      _mm_cmpgt_epi8(initial_lengths, _mm_setzero_si128()));
-
-    this->add_errors( overunder );
-  }
-
-  // when 0xED is found, next byte must be no larger than 0x9F
-  // when 0xF4 is found, next byte must be no larger than 0x8F
-  // next byte must be continuation, ie sign bit is set, so signed < is ok
-  really_inline void check_first_continuation_max(__m128i current_bytes, __m128i off1_current_bytes) {
-    __m128i maskED = _mm_cmpeq_epi8(off1_current_bytes, _mm_set1_epi8(0xEDu));
-    __m128i maskF4 = _mm_cmpeq_epi8(off1_current_bytes, _mm_set1_epi8(0xF4u));
-
-    __m128i badfollowED = _mm_and_si128(
-        _mm_cmpgt_epi8(current_bytes, _mm_set1_epi8(0x9Fu)), maskED);
-    __m128i badfollowF4 = _mm_and_si128(
-        _mm_cmpgt_epi8(current_bytes, _mm_set1_epi8(0x8Fu)), maskF4);
-
-    this->add_errors( _mm_or_si128(badfollowED, badfollowF4) );
-  }
-
-  // map off1_hibits => error condition
-  // hibits     off1    cur
-  // C       => < C2 && true
-  // E       => < E1 && < A0
-  // F       => < F1 && < 90
-  // else      false && false
-  really_inline void check_overlong(__m128i current_bytes,
-                                    __m128i off1_current_bytes, __m128i high_nibbles) {
-    __m128i off1_hibits = _mm_alignr_epi8(high_nibbles, this->previous.high_nibbles, 16 - 1);
-    __m128i initial_mins = _mm_shuffle_epi8(
-        _mm_setr_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
-                      -128, -128,  // 10xx => false
-                      0xC2u, -128, // 110x
-                      0xE1u,       // 1110
-                      0xF1u),
-        off1_hibits);
-
-    __m128i initial_under = _mm_cmpgt_epi8(initial_mins, off1_current_bytes);
-
-    __m128i second_mins = _mm_shuffle_epi8(
-        _mm_setr_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
-                      -128, -128, // 10xx => false
-                      127, 127,   // 110x => true
-                      0xA0u,      // 1110
-                      0x90u),
-        off1_hibits);
-    __m128i second_under = _mm_cmpgt_epi8(second_mins, current_bytes);
-    this->add_errors( _mm_and_si128(initial_under, second_under) );
-  }
-
-  really_inline void count_nibbles(__m128i bytes, struct processed_utf_bytes *answer) {
-    answer->raw_bytes = bytes;
-    answer->high_nibbles = _mm_and_si128(_mm_srli_epi16(bytes, 4), _mm_set1_epi8(0x0F));
-  }
-
-  // check whether the current bytes are valid UTF-8
-  // at the end of the function, previous gets updated
-  really_inline void check_utf8_bytes(__m128i current_bytes) {
-    struct processed_utf_bytes pb;
-    this->count_nibbles(current_bytes, &pb);
-
-    this->check_smaller_than_0xF4(current_bytes);
-
-    __m128i initial_lengths = this->continuation_lengths(pb.high_nibbles);
-
-    pb.carried_continuations = this->carry_continuations(initial_lengths);
-
-    this->check_continuations(initial_lengths, pb.carried_continuations);
-
-    __m128i off1_current_bytes =
-        _mm_alignr_epi8(pb.raw_bytes, this->previous.raw_bytes, 16 - 1);
-    this->check_first_continuation_max(current_bytes, off1_current_bytes);
-
-    this->check_overlong(current_bytes, off1_current_bytes, pb.high_nibbles);
-    this->previous = pb;
-  }
-
-  really_inline void check_carried_continuations() {
-      this->has_error = _mm_cmpgt_epi8(this->previous.carried_continuations,
-                                       _mm_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-                                                     9, 9, 9, 9, 9, 1));
-  }
-
-  really_inline void check_next_input(__m128i in) {
-    __m128i high_bit = _mm_set1_epi8(0x80u);
-    if (_mm_testz_si128( in, high_bit) == 1) {
-      // it is ascii, we just check continuations
-      this->check_carried_continuations();
-    } else {
-      // it is not ascii so we have to do heavy work
-      this->check_utf8_bytes(in);
-    }
-  }
-
-  really_inline void check_next_input(simd_input in) {
-    __m128i high_bit = _mm_set1_epi8(0x80u);
-    __m128i any_bits_on = in.reduce([&](auto a, auto b) {
-      return _mm_or_si128(a, b);
-    });
-    if (_mm_testz_si128(any_bits_on, high_bit) == 1) {
-      // it is ascii, we just check continuations
-      this->check_carried_continuations();
-    } else {
-      // it is not ascii so we have to do heavy work
-      in.each([&](auto _in) { this->check_utf8_bytes(_in); });
-    }
-  }
-
-  really_inline ErrorValues errors() {
-    return _mm_testz_si128(this->has_error, this->has_error) == 0 ? simdjson::UTF8_ERROR : simdjson::SUCCESS;
-  }
-
-}; // struct utf8_checker
-
-} // namespace simdjson::westmere
-UNTARGET_REGION // westmere
-
-#endif // IS_X86_64
-
-#endif
--- a/src/westmere/stage1_find_marks.h
+++ b/src/westmere/stage1_find_marks.h
@ -5,41 +5,29 @@

 #ifdef IS_X86_64

-#include "westmere/simd_input.h"
-#include "westmere/simdutf8check.h"
+#include "westmere/bitmask.h"
+#include "westmere/simd.h"
 #include "simdjson/stage1_find_marks.h"

 TARGET_WESTMERE
 namespace simdjson::westmere {

-really_inline uint64_t compute_quote_mask(const uint64_t quote_bits) {
-  return _mm_cvtsi128_si64(_mm_clmulepi64_si128(
-      _mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFFu), 0));
-}
+using namespace simd;

 really_inline void find_whitespace_and_operators(
-  const simd_input in,
+  const simd8x64<uint8_t> in,
  uint64_t &whitespace, uint64_t &op) {

-  const __m128i operator_table =
-      _mm_setr_epi8(44, 125, 0, 0, 0xc0u, 0, 0, 0, 0, 0, 0, 0, 0, 0, 58, 123);
-  const __m128i white_table = _mm_setr_epi8(32, 100, 100, 100,  17, 100, 113,   2,
-                                           100,   9,  10, 112, 100,  13, 100, 100);
-  const __m128i op_offset = _mm_set1_epi8(0xd4u);
-  const __m128i op_mask = _mm_set1_epi8(32);
-
-  whitespace = in.map([&](auto _in) {
-    return _mm_cmpeq_epi8(_in, _mm_shuffle_epi8(white_table, _in));
+  whitespace = in.map([&](simd8<uint8_t> _in) {
+    return _in == _in.lookup_lower_4_bits<uint8_t>(' ', 100, 100, 100, 17, 100, 113, 2, 100, '\t', '\n', 112, 100, '\r', 100, 100);
  }).to_bitmask();

-  op = in.map([&](auto _in) {
-    const __m128i r1 = _mm_add_epi8(op_offset, _in);
-    const __m128i r2 = _mm_or_si128(_in, op_mask);
-    const __m128i r3 = _mm_shuffle_epi8(operator_table, r1);
-    return _mm_cmpeq_epi8(r2, r3);
+  op = in.map([&](simd8<uint8_t> _in) {
+    return (_in | 32) == (_in+0xd4u).lookup_lower_4_bits<uint8_t>(',', '}', 0, 0, 0xc0u, 0, 0, 0, 0, 0, 0, 0, 0, 0, ':', '{');
  }).to_bitmask();
 }

+#include "generic/simdutf8check.h"
 #include "generic/stage1_find_marks.h"

 } // namespace westmere
--- a/src/westmere/stringparsing.h
+++ b/src/westmere/stringparsing.h
@ -1,40 +1,39 @@
 #ifndef SIMDJSON_WESTMERE_STRINGPARSING_H
 #define SIMDJSON_WESTMERE_STRINGPARSING_H

+#include "simdjson/portability.h"
+
 #ifdef IS_X86_64

+#include "westmere/simd.h"
 #include "simdjson/common_defs.h"
 #include "simdjson/parsedjson.h"
 #include "jsoncharutils.h"

-#ifdef JSON_TEST_STRINGS
-void found_string(const uint8_t *buf, const uint8_t *parsed_begin,
-                  const uint8_t *parsed_end);
-void found_bad_string(const uint8_t *buf);
-#endif
-
 TARGET_WESTMERE
 namespace simdjson::westmere {

+using namespace simd;
+
 // Holds backslashes and quotes locations.
 struct parse_string_helper {
  uint32_t bs_bits;
  uint32_t quote_bits;
-  really_inline uint32_t bytes_processed() const { return sizeof(__m128i); }
+  static const uint32_t BYTES_PROCESSED = 32;
 };

 really_inline parse_string_helper find_bs_bits_and_quote_bits(const uint8_t *src, uint8_t *dst) {
  // this can read up to 31 bytes beyond the buffer size, but we require
  // SIMDJSON_PADDING of padding
-  __m128i v = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src));
-  // store to dest unconditionally - we can overwrite the bits we don't like
-  // later
-  _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), v);
-  auto quote_mask = _mm_cmpeq_epi8(v, _mm_set1_epi8('"'));
+  static_assert(SIMDJSON_PADDING >= (parse_string_helper::BYTES_PROCESSED - 1));
+  simd8<uint8_t> v0(src);
+  simd8<uint8_t> v1(src + 16);
+  v0.store(dst);
+  v1.store(dst + 16);
+  uint64_t bs_and_quote = simd8x64<bool>(v0 == '\\', v1 == '\\', v0 == '"', v1 == '"').to_bitmask();
  return {
-      static_cast<uint32_t>(
-          _mm_movemask_epi8(_mm_cmpeq_epi8(v, _mm_set1_epi8('\\')))), // bs_bits
-      static_cast<uint32_t>(_mm_movemask_epi8(quote_mask)) // quote_bits
+    static_cast<uint32_t>(bs_and_quote),      // bs_bits
+    static_cast<uint32_t>(bs_and_quote >> 32) // quote_bits
  };
 }