Merge pull request #285 from lemire/methods

Use methods instead of functions for simd_input
2019-08-16 17:45:42 -07:00 · 2019-08-16 17:45:42 -07:00 · 08cf140811
parent 85fb37b6ea 94673bcdf2
commit 08cf140811
10 changed files with 212 additions and 261 deletions
--- a/include/simdjson/simd_input.h
+++ b/include/simdjson/simd_input.h
@ -8,19 +8,14 @@

 namespace simdjson {

-template <Architecture> struct simd_input;
-
-template <Architecture T>
-simd_input<T> fill_input(const uint8_t *ptr);
-
-// a straightforward comparison of a mask against input.
-template <Architecture T>
-uint64_t cmp_mask_against_input(simd_input<T> in, uint8_t m);
-
-// find all values less than or equal than the content of maxval (using unsigned
-// arithmetic)
-template <Architecture T>
-uint64_t unsigned_lteq_against_input(simd_input<T> in, uint8_t m);
+template <Architecture>
+struct simd_input {
+    simd_input(const uint8_t *ptr);
+    // a straightforward comparison of a mask against input.
+    uint64_t eq(uint8_t m);
+    // find all values less than or equal than the content of maxval (using unsigned arithmetic)
+    uint64_t lteq(uint8_t m);
+}; // struct simd_input

 } // namespace simdjson

--- a/include/simdjson/simd_input_arm64.h
+++ b/include/simdjson/simd_input_arm64.h
@ -6,28 +6,9 @@
 #ifdef IS_ARM64
 namespace simdjson {

-template <>
-struct simd_input<Architecture::ARM64> {
-  uint8x16_t i0;
-  uint8x16_t i1;
-  uint8x16_t i2;
-  uint8x16_t i3;
-};
-
-template <>
-really_inline simd_input<Architecture::ARM64>
-fill_input<Architecture::ARM64>(const uint8_t *ptr) {
-  struct simd_input<Architecture::ARM64> in;
-  in.i0 = vld1q_u8(ptr + 0);
-  in.i1 = vld1q_u8(ptr + 16);
-  in.i2 = vld1q_u8(ptr + 32);
-  in.i3 = vld1q_u8(ptr + 48);
-  return in;
-}
-
 really_inline uint16_t neon_movemask(uint8x16_t input) {
  const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
-                               0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
+                              0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
  uint8x16_t minput = vandq_u8(input, bit_mask);
  uint8x16_t tmp = vpaddq_u8(minput, minput);
  tmp = vpaddq_u8(tmp, tmp);
@ -38,7 +19,7 @@ really_inline uint16_t neon_movemask(uint8x16_t input) {
 really_inline uint64_t neon_movemask_bulk(uint8x16_t p0, uint8x16_t p1,
                                          uint8x16_t p2, uint8x16_t p3) {
  const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
-                               0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
+                              0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
  uint8x16_t t0 = vandq_u8(p0, bit_mask);
  uint8x16_t t1 = vandq_u8(p1, bit_mask);
  uint8x16_t t2 = vandq_u8(p2, bit_mask);
@ -51,26 +32,38 @@ really_inline uint64_t neon_movemask_bulk(uint8x16_t p0, uint8x16_t p1,
 }

 template <>
-really_inline uint64_t cmp_mask_against_input<Architecture::ARM64>(
-    simd_input<Architecture::ARM64> in, uint8_t m) {
-  const uint8x16_t mask = vmovq_n_u8(m);
-  uint8x16_t cmp_res_0 = vceqq_u8(in.i0, mask);
-  uint8x16_t cmp_res_1 = vceqq_u8(in.i1, mask);
-  uint8x16_t cmp_res_2 = vceqq_u8(in.i2, mask);
-  uint8x16_t cmp_res_3 = vceqq_u8(in.i3, mask);
-  return neon_movemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3);
-}
+struct simd_input<Architecture::ARM64> {
+  uint8x16_t i0;
+  uint8x16_t i1;
+  uint8x16_t i2;
+  uint8x16_t i3;

-template <>
-really_inline uint64_t unsigned_lteq_against_input<Architecture::ARM64>(
-    simd_input<Architecture::ARM64> in, uint8_t m) {
-  const uint8x16_t mask = vmovq_n_u8(m);
-  uint8x16_t cmp_res_0 = vcleq_u8(in.i0, mask);
-  uint8x16_t cmp_res_1 = vcleq_u8(in.i1, mask);
-  uint8x16_t cmp_res_2 = vcleq_u8(in.i2, mask);
-  uint8x16_t cmp_res_3 = vcleq_u8(in.i3, mask);
-  return neon_movemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3);
-}
+  really_inline simd_input(const uint8_t *ptr) {
+    this->i0 = vld1q_u8(ptr + 0);
+    this->i1 = vld1q_u8(ptr + 16);
+    this->i2 = vld1q_u8(ptr + 32);
+    this->i3 = vld1q_u8(ptr + 48);
+  }
+
+  really_inline uint64_t eq(uint8_t m) {
+    const uint8x16_t mask = vmovq_n_u8(m);
+    uint8x16_t cmp_res_0 = vceqq_u8(this->i0, mask);
+    uint8x16_t cmp_res_1 = vceqq_u8(this->i1, mask);
+    uint8x16_t cmp_res_2 = vceqq_u8(this->i2, mask);
+    uint8x16_t cmp_res_3 = vceqq_u8(this->i3, mask);
+    return neon_movemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3);
+  }
+
+  really_inline uint64_t lteq(uint8_t m) {
+    const uint8x16_t mask = vmovq_n_u8(m);
+    uint8x16_t cmp_res_0 = vcleq_u8(this->i0, mask);
+    uint8x16_t cmp_res_1 = vcleq_u8(this->i1, mask);
+    uint8x16_t cmp_res_2 = vcleq_u8(this->i2, mask);
+    uint8x16_t cmp_res_3 = vcleq_u8(this->i3, mask);
+    return neon_movemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3);
+  }
+
+}; // struct simd_input

 } // namespace simdjson

--- a/include/simdjson/simd_input_haswell.h
+++ b/include/simdjson/simd_input_haswell.h
@ -12,38 +12,31 @@ template <>
 struct simd_input<Architecture::HASWELL> {
  __m256i lo;
  __m256i hi;
-};

-template <>
-really_inline simd_input<Architecture::HASWELL>
-fill_input<Architecture::HASWELL>(const uint8_t *ptr) {
-  struct simd_input<Architecture::HASWELL> in;
-  in.lo = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr + 0));
-  in.hi = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr + 32));
-  return in;
-}
+  really_inline simd_input(const uint8_t *ptr) {
+    this->lo = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr + 0));
+    this->hi = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr + 32));
+  }

-template <>
-really_inline uint64_t cmp_mask_against_input<Architecture::HASWELL>(
-    simd_input<Architecture::HASWELL> in, uint8_t m) {
-  const __m256i mask = _mm256_set1_epi8(m);
-  __m256i cmp_res_0 = _mm256_cmpeq_epi8(in.lo, mask);
-  uint64_t res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(cmp_res_0));
-  __m256i cmp_res_1 = _mm256_cmpeq_epi8(in.hi, mask);
-  uint64_t res_1 = _mm256_movemask_epi8(cmp_res_1);
-  return res_0 | (res_1 << 32);
-}
+  really_inline uint64_t eq(uint8_t m) {
+    const __m256i mask = _mm256_set1_epi8(m);
+    __m256i cmp_res_0 = _mm256_cmpeq_epi8(this->lo, mask);
+    uint64_t res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(cmp_res_0));
+    __m256i cmp_res_1 = _mm256_cmpeq_epi8(this->hi, mask);
+    uint64_t res_1 = _mm256_movemask_epi8(cmp_res_1);
+    return res_0 | (res_1 << 32);
+  }

-template <>
-really_inline uint64_t unsigned_lteq_against_input<Architecture::HASWELL>(
-    simd_input<Architecture::HASWELL> in, uint8_t m) {
-  const __m256i maxval = _mm256_set1_epi8(m);
-  __m256i cmp_res_0 = _mm256_cmpeq_epi8(_mm256_max_epu8(maxval, in.lo), maxval);
-  uint64_t res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(cmp_res_0));
-  __m256i cmp_res_1 = _mm256_cmpeq_epi8(_mm256_max_epu8(maxval, in.hi), maxval);
-  uint64_t res_1 = _mm256_movemask_epi8(cmp_res_1);
-  return res_0 | (res_1 << 32);
-}
+  really_inline uint64_t lteq(uint8_t m) {
+    const __m256i maxval = _mm256_set1_epi8(m);
+    __m256i cmp_res_0 = _mm256_cmpeq_epi8(_mm256_max_epu8(maxval, this->lo), maxval);
+    uint64_t res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(cmp_res_0));
+    __m256i cmp_res_1 = _mm256_cmpeq_epi8(_mm256_max_epu8(maxval, this->hi), maxval);
+    uint64_t res_1 = _mm256_movemask_epi8(cmp_res_1);
+    return res_0 | (res_1 << 32);
+  }
+
+}; // struct simd_input

 } // namespace simdjson
 UNTARGET_REGION
--- a/include/simdjson/simd_input_westmere.h
+++ b/include/simdjson/simd_input_westmere.h
@ -14,48 +14,41 @@ struct simd_input<Architecture::WESTMERE> {
  __m128i v1;
  __m128i v2;
  __m128i v3;
-};

-template <>
-really_inline simd_input<Architecture::WESTMERE>
-fill_input<Architecture::WESTMERE>(const uint8_t *ptr) {
-  struct simd_input<Architecture::WESTMERE> in;
-  in.v0 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 0));
-  in.v1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 16));
-  in.v2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 32));
-  in.v3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 48));
-  return in;
-}
+  really_inline simd_input(const uint8_t *ptr) {
+    this->v0 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 0));
+    this->v1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 16));
+    this->v2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 32));
+    this->v3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 48));
+  }

-template <>
-really_inline uint64_t cmp_mask_against_input<Architecture::WESTMERE>(
-    simd_input<Architecture::WESTMERE> in, uint8_t m) {
-  const __m128i mask = _mm_set1_epi8(m);
-  __m128i cmp_res_0 = _mm_cmpeq_epi8(in.v0, mask);
-  uint64_t res_0 = _mm_movemask_epi8(cmp_res_0);
-  __m128i cmp_res_1 = _mm_cmpeq_epi8(in.v1, mask);
-  uint64_t res_1 = _mm_movemask_epi8(cmp_res_1);
-  __m128i cmp_res_2 = _mm_cmpeq_epi8(in.v2, mask);
-  uint64_t res_2 = _mm_movemask_epi8(cmp_res_2);
-  __m128i cmp_res_3 = _mm_cmpeq_epi8(in.v3, mask);
-  uint64_t res_3 = _mm_movemask_epi8(cmp_res_3);
-  return res_0 | (res_1 << 16) | (res_2 << 32) | (res_3 << 48);
-}
+  really_inline uint64_t eq(uint8_t m) {
+    const __m128i mask = _mm_set1_epi8(m);
+    __m128i cmp_res_0 = _mm_cmpeq_epi8(this->v0, mask);
+    uint64_t res_0 = _mm_movemask_epi8(cmp_res_0);
+    __m128i cmp_res_1 = _mm_cmpeq_epi8(this->v1, mask);
+    uint64_t res_1 = _mm_movemask_epi8(cmp_res_1);
+    __m128i cmp_res_2 = _mm_cmpeq_epi8(this->v2, mask);
+    uint64_t res_2 = _mm_movemask_epi8(cmp_res_2);
+    __m128i cmp_res_3 = _mm_cmpeq_epi8(this->v3, mask);
+    uint64_t res_3 = _mm_movemask_epi8(cmp_res_3);
+    return res_0 | (res_1 << 16) | (res_2 << 32) | (res_3 << 48);
+  }

-template <>
-really_inline uint64_t unsigned_lteq_against_input<Architecture::WESTMERE>(
-    simd_input<Architecture::WESTMERE> in, uint8_t m) {
-  const __m128i maxval = _mm_set1_epi8(m);
-  __m128i cmp_res_0 = _mm_cmpeq_epi8(_mm_max_epu8(maxval, in.v0), maxval);
-  uint64_t res_0 = _mm_movemask_epi8(cmp_res_0);
-  __m128i cmp_res_1 = _mm_cmpeq_epi8(_mm_max_epu8(maxval, in.v1), maxval);
-  uint64_t res_1 = _mm_movemask_epi8(cmp_res_1);
-  __m128i cmp_res_2 = _mm_cmpeq_epi8(_mm_max_epu8(maxval, in.v2), maxval);
-  uint64_t res_2 = _mm_movemask_epi8(cmp_res_2);
-  __m128i cmp_res_3 = _mm_cmpeq_epi8(_mm_max_epu8(maxval, in.v3), maxval);
-  uint64_t res_3 = _mm_movemask_epi8(cmp_res_3);
-  return res_0 | (res_1 << 16) | (res_2 << 32) | (res_3 << 48);
-}
+  really_inline uint64_t lteq(uint8_t m) {
+    const __m128i maxval = _mm_set1_epi8(m);
+    __m128i cmp_res_0 = _mm_cmpeq_epi8(_mm_max_epu8(maxval, this->v0), maxval);
+    uint64_t res_0 = _mm_movemask_epi8(cmp_res_0);
+    __m128i cmp_res_1 = _mm_cmpeq_epi8(_mm_max_epu8(maxval, this->v1), maxval);
+    uint64_t res_1 = _mm_movemask_epi8(cmp_res_1);
+    __m128i cmp_res_2 = _mm_cmpeq_epi8(_mm_max_epu8(maxval, this->v2), maxval);
+    uint64_t res_2 = _mm_movemask_epi8(cmp_res_2);
+    __m128i cmp_res_3 = _mm_cmpeq_epi8(_mm_max_epu8(maxval, this->v3), maxval);
+    uint64_t res_3 = _mm_movemask_epi8(cmp_res_3);
+    return res_0 | (res_1 << 16) | (res_2 << 32) | (res_3 << 48);
+  }
+
+}; // struct simd_input

 } // namespace simdjson
 UNTARGET_REGION
--- a/include/simdjson/simdutf8check.h
+++ b/include/simdjson/simdutf8check.h
@ -6,15 +6,14 @@

 namespace simdjson {

-// Holds the state required to perform check_utf8().
-template <Architecture> struct utf8_checking_state;
-
+// Checks UTF8, chunk by chunk.
 template <Architecture T>
-void check_utf8(simd_input<T> in, utf8_checking_state<T> &state);
-
-// Checks if the utf8 validation has found any error.
-template <Architecture T>
-ErrorValues check_utf8_errors(utf8_checking_state<T> &state);
+struct utf8_checker {
+    // Process the next chunk of input.
+    void check_next_input(simd_input<T> in);
+    // Find out what (if any) errors have occurred
+    ErrorValues errors();
+};

 } // namespace simdjson

--- a/include/simdjson/simdutf8check_arm64.h
+++ b/include/simdjson/simdutf8check_arm64.h
@ -177,12 +177,6 @@ check_utf8_bytes(int8x16_t current_bytes, struct processed_utf_bytes *previous,
  return pb;
 }

-template <>
-struct utf8_checking_state<Architecture::ARM64> {
-  int8x16_t has_error{};
-  processed_utf_bytes previous{};
-};
-
 // Checks that all bytes are ascii
 really_inline bool check_ascii_neon(simd_input<Architecture::ARM64> in) {
  // checking if the most significant bit is always equal to 0.
@ -198,41 +192,43 @@ really_inline bool check_ascii_neon(simd_input<Architecture::ARM64> in) {
 }

 template <>
-really_inline void check_utf8<Architecture::ARM64>(
-    simd_input<Architecture::ARM64> in,
-    utf8_checking_state<Architecture::ARM64> &state) {
-  if (check_ascii_neon(in)) {
-    // All bytes are ascii. Therefore the byte that was just before must be
-    // ascii too. We only check the byte that was just before simd_input. Nines
-    // are arbitrary values.
-    const int8x16_t verror =
-        (int8x16_t){9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1};
-    state.has_error =
-        vorrq_s8(vreinterpretq_s8_u8(
-                     vcgtq_s8(state.previous.carried_continuations, verror)),
-                 state.has_error);
-  } else {
-    // it is not ascii so we have to do heavy work
-    state.previous = check_utf8_bytes(vreinterpretq_s8_u8(in.i0),
-                                      &(state.previous), &(state.has_error));
-    state.previous = check_utf8_bytes(vreinterpretq_s8_u8(in.i1),
-                                      &(state.previous), &(state.has_error));
-    state.previous = check_utf8_bytes(vreinterpretq_s8_u8(in.i2),
-                                      &(state.previous), &(state.has_error));
-    state.previous = check_utf8_bytes(vreinterpretq_s8_u8(in.i3),
-                                      &(state.previous), &(state.has_error));
-  }
-}
+struct utf8_checker<Architecture::ARM64> {
+  int8x16_t has_error{};
+  processed_utf_bytes previous{};

-template <>
-really_inline ErrorValues check_utf8_errors<Architecture::ARM64>(
-    utf8_checking_state<Architecture::ARM64> &state) {
-  uint64x2_t v64 = vreinterpretq_u64_s8(state.has_error);
-  uint32x2_t v32 = vqmovn_u64(v64);
-  uint64x1_t result = vreinterpret_u64_u32(v32);
-  return vget_lane_u64(result, 0) != 0 ? simdjson::UTF8_ERROR
-                                       : simdjson::SUCCESS;
-}
+  really_inline void check_next_input(simd_input<Architecture::ARM64> in) {
+    if (check_ascii_neon(in)) {
+      // All bytes are ascii. Therefore the byte that was just before must be
+      // ascii too. We only check the byte that was just before simd_input. Nines
+      // are arbitrary values.
+      const int8x16_t verror =
+          (int8x16_t){9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1};
+      this->has_error =
+          vorrq_s8(vreinterpretq_s8_u8(
+                      vcgtq_s8(this->previous.carried_continuations, verror)),
+                  this->has_error);
+    } else {
+      // it is not ascii so we have to do heavy work
+      this->previous = check_utf8_bytes(vreinterpretq_s8_u8(in.i0),
+                                        &(this->previous), &(this->has_error));
+      this->previous = check_utf8_bytes(vreinterpretq_s8_u8(in.i1),
+                                        &(this->previous), &(this->has_error));
+      this->previous = check_utf8_bytes(vreinterpretq_s8_u8(in.i2),
+                                        &(this->previous), &(this->has_error));
+      this->previous = check_utf8_bytes(vreinterpretq_s8_u8(in.i3),
+                                        &(this->previous), &(this->has_error));
+    }
+  }
+
+  really_inline ErrorValues errors() {
+    uint64x2_t v64 = vreinterpretq_u64_s8(this->has_error);
+    uint32x2_t v32 = vqmovn_u64(v64);
+    uint64x1_t result = vreinterpret_u64_u32(v32);
+    return vget_lane_u64(result, 0) != 0 ? simdjson::UTF8_ERROR
+                                        : simdjson::SUCCESS;
+  }
+
+}; // struct utf8_checker

 } // namespace simdjson
 #endif
--- a/include/simdjson/simdutf8check_haswell.h
+++ b/include/simdjson/simdutf8check_haswell.h
@ -192,46 +192,43 @@ avx_check_utf8_bytes(__m256i current_bytes,
  return pb;
 }

-template <> struct utf8_checking_state<Architecture::HASWELL> {
+template <>
+struct utf8_checker<Architecture::HASWELL> {
  __m256i has_error;
  avx_processed_utf_bytes previous;
-  utf8_checking_state() {
+
+  utf8_checker() {
    has_error = _mm256_setzero_si256();
    previous.raw_bytes = _mm256_setzero_si256();
    previous.high_nibbles = _mm256_setzero_si256();
    previous.carried_continuations = _mm256_setzero_si256();
  }
-};

-template <>
-really_inline void check_utf8<Architecture::HASWELL>(
-    simd_input<Architecture::HASWELL> in,
-    utf8_checking_state<Architecture::HASWELL> &state) {
-  __m256i high_bit = _mm256_set1_epi8(0x80u);
-  if ((_mm256_testz_si256(_mm256_or_si256(in.lo, in.hi), high_bit)) == 1) {
-    // it is ascii, we just check continuation
-    state.has_error = _mm256_or_si256(
-        _mm256_cmpgt_epi8(state.previous.carried_continuations,
-                          _mm256_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-                                           9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-                                           9, 9, 9, 9, 9, 9, 9, 1)),
-        state.has_error);
-  } else {
-    // it is not ascii so we have to do heavy work
-    state.previous =
-        avx_check_utf8_bytes(in.lo, &(state.previous), &(state.has_error));
-    state.previous =
-        avx_check_utf8_bytes(in.hi, &(state.previous), &(state.has_error));
+  really_inline void check_next_input(simd_input<Architecture::HASWELL> in) {
+    __m256i high_bit = _mm256_set1_epi8(0x80u);
+    if ((_mm256_testz_si256(_mm256_or_si256(in.lo, in.hi), high_bit)) == 1) {
+      // it is ascii, we just check continuation
+      this->has_error = _mm256_or_si256(
+          _mm256_cmpgt_epi8(this->previous.carried_continuations,
+                            _mm256_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+                                            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+                                            9, 9, 9, 9, 9, 9, 9, 1)),
+          this->has_error);
+    } else {
+      // it is not ascii so we have to do heavy work
+      this->previous =
+          avx_check_utf8_bytes(in.lo, &(this->previous), &(this->has_error));
+      this->previous =
+          avx_check_utf8_bytes(in.hi, &(this->previous), &(this->has_error));
+    }
  }
-}

-template <>
-really_inline ErrorValues check_utf8_errors<Architecture::HASWELL>(
-    utf8_checking_state<Architecture::HASWELL> &state) {
-  return _mm256_testz_si256(state.has_error, state.has_error) == 0
-             ? simdjson::UTF8_ERROR
-             : simdjson::SUCCESS;
-}
+  really_inline ErrorValues errors() {
+    return _mm256_testz_si256(this->has_error, this->has_error) == 0
+              ? simdjson::UTF8_ERROR
+              : simdjson::SUCCESS;
+  }
+}; // struct utf8_checker

 } // namespace simdjson
 UNTARGET_REGION // haswell
--- a/include/simdjson/simdutf8check_westmere.h
+++ b/include/simdjson/simdutf8check_westmere.h
@ -31,6 +31,7 @@
 TARGET_WESTMERE

 namespace simdjson {
+
 // all byte values must be no larger than 0xF4
 static inline void check_smaller_than_0xF4(__m128i current_bytes,
                                           __m128i *has_error) {
@ -164,58 +165,54 @@ check_utf8_bytes(__m128i current_bytes, struct processed_utf_bytes *previous,
 }

 template <>
-struct utf8_checking_state<Architecture::WESTMERE> {
+struct utf8_checker<Architecture::WESTMERE> {
  __m128i has_error = _mm_setzero_si128();
  processed_utf_bytes previous{
      _mm_setzero_si128(), // raw_bytes
      _mm_setzero_si128(), // high_nibbles
      _mm_setzero_si128()  // carried_continuations
  };
-};

-template <>
-really_inline void check_utf8<Architecture::WESTMERE>(
-    simd_input<Architecture::WESTMERE> in,
-    utf8_checking_state<Architecture::WESTMERE> &state) {
-  __m128i high_bit = _mm_set1_epi8(0x80u);
-  if ((_mm_testz_si128(_mm_or_si128(in.v0, in.v1), high_bit)) == 1) {
-    // it is ascii, we just check continuation
-    state.has_error =
-        _mm_or_si128(_mm_cmpgt_epi8(state.previous.carried_continuations,
-                                    _mm_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-                                                  9, 9, 9, 9, 9, 1)),
-                     state.has_error);
-  } else {
-    // it is not ascii so we have to do heavy work
-    state.previous =
-        check_utf8_bytes(in.v0, &(state.previous), &(state.has_error));
-    state.previous =
-        check_utf8_bytes(in.v1, &(state.previous), &(state.has_error));
+  really_inline void check_next_input(simd_input<Architecture::WESTMERE> in) {
+    __m128i high_bit = _mm_set1_epi8(0x80u);
+    if ((_mm_testz_si128(_mm_or_si128(in.v0, in.v1), high_bit)) == 1) {
+      // it is ascii, we just check continuation
+      this->has_error =
+          _mm_or_si128(_mm_cmpgt_epi8(this->previous.carried_continuations,
+                                      _mm_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+                                                    9, 9, 9, 9, 9, 1)),
+                      this->has_error);
+    } else {
+      // it is not ascii so we have to do heavy work
+      this->previous =
+          check_utf8_bytes(in.v0, &(this->previous), &(this->has_error));
+      this->previous =
+          check_utf8_bytes(in.v1, &(this->previous), &(this->has_error));
+    }
+
+    if ((_mm_testz_si128(_mm_or_si128(in.v2, in.v3), high_bit)) == 1) {
+      // it is ascii, we just check continuation
+      this->has_error =
+          _mm_or_si128(_mm_cmpgt_epi8(this->previous.carried_continuations,
+                                      _mm_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+                                                    9, 9, 9, 9, 9, 1)),
+                      this->has_error);
+    } else {
+      // it is not ascii so we have to do heavy work
+      this->previous =
+          check_utf8_bytes(in.v2, &(this->previous), &(this->has_error));
+      this->previous =
+          check_utf8_bytes(in.v3, &(this->previous), &(this->has_error));
+    }
  }

-  if ((_mm_testz_si128(_mm_or_si128(in.v2, in.v3), high_bit)) == 1) {
-    // it is ascii, we just check continuation
-    state.has_error =
-        _mm_or_si128(_mm_cmpgt_epi8(state.previous.carried_continuations,
-                                    _mm_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-                                                  9, 9, 9, 9, 9, 1)),
-                     state.has_error);
-  } else {
-    // it is not ascii so we have to do heavy work
-    state.previous =
-        check_utf8_bytes(in.v2, &(state.previous), &(state.has_error));
-    state.previous =
-        check_utf8_bytes(in.v3, &(state.previous), &(state.has_error));
+  really_inline ErrorValues errors() {
+    return _mm_testz_si128(this->has_error, this->has_error) == 0
+              ? simdjson::UTF8_ERROR
+              : simdjson::SUCCESS;
  }
-}

-template <>
-really_inline ErrorValues check_utf8_errors<Architecture::WESTMERE>(
-    utf8_checking_state<Architecture::WESTMERE> &state) {
-  return _mm_testz_si128(state.has_error, state.has_error) == 0
-             ? simdjson::UTF8_ERROR
-             : simdjson::SUCCESS;
-}
+}; // struct utf8_checker

 } // namespace simdjson
 UNTARGET_REGION // westmere
--- a/include/simdjson/stage1_find_marks.h
+++ b/include/simdjson/stage1_find_marks.h
@ -25,16 +25,6 @@ namespace {
 }
 } // namespace

-// Holds the state required to perform check_utf8().
-template <Architecture> struct utf8_checking_state;
-
-template <Architecture T>
-void check_utf8(simd_input<T> in, utf8_checking_state<T> &state);
-
-// Checks if the utf8 validation has found any error.
-template <Architecture T>
-ErrorValues check_utf8_errors(utf8_checking_state<T> &state);
-
 template <Architecture T>
 really_inline uint64_t find_odd_backslash_sequences(
    simd_input<T> in, uint64_t &prev_iter_ends_odd_backslash);
--- a/include/simdjson/stage1_find_marks_common.h
+++ b/include/simdjson/stage1_find_marks_common.h
@ -24,7 +24,7 @@ really_inline uint64_t find_odd_backslash_sequences<TARGETED_ARCHITECTURE>(
    uint64_t &prev_iter_ends_odd_backslash) {
  const uint64_t even_bits = 0x5555555555555555ULL;
  const uint64_t odd_bits = ~even_bits;
-  uint64_t bs_bits = cmp_mask_against_input<TARGETED_ARCHITECTURE>(in, '\\');
+  uint64_t bs_bits = in.eq('\\');
  uint64_t start_edges = bs_bits & ~(bs_bits << 1);
  /* flip lowest if we have an odd-length run at the end of the prior
   * iteration */
@ -71,7 +71,7 @@ really_inline uint64_t find_quote_mask_and_bits<TARGETED_ARCHITECTURE>(
    simd_input<TARGETED_ARCHITECTURE> in, uint64_t odd_ends,
    uint64_t &prev_iter_inside_quote, uint64_t &quote_bits,
    uint64_t &error_mask) {
-  quote_bits = cmp_mask_against_input<TARGETED_ARCHITECTURE>(in, '"');
+  quote_bits = in.eq('"');
  quote_bits = quote_bits & ~odd_ends;
  uint64_t quote_mask = compute_quote_mask<TARGETED_ARCHITECTURE>(quote_bits);
  quote_mask ^= prev_iter_inside_quote;
@ -80,8 +80,7 @@ really_inline uint64_t find_quote_mask_and_bits<TARGETED_ARCHITECTURE>(
   * quotation mark, reverse solidus, and the control characters (U+0000
   * through U+001F).
   * https://tools.ietf.org/html/rfc8259 */
-  uint64_t unescaped =
-      unsigned_lteq_against_input<TARGETED_ARCHITECTURE>(in, 0x1F);
+  uint64_t unescaped = in.lteq(0x1F);
  error_mask |= quote_mask & unescaped;
  /* right shift of a signed value expected to be well-defined and standard
   * compliant as of C++20,
@ -97,9 +96,9 @@ really_inline void find_structural_bits_64(
    uint64_t &prev_iter_ends_odd_backslash, uint64_t &prev_iter_inside_quote,
    uint64_t &prev_iter_ends_pseudo_pred, uint64_t &structurals,
    uint64_t &error_mask,
-    utf8_checking_state<TARGETED_ARCHITECTURE> &utf8_state) {
-  simd_input<TARGETED_ARCHITECTURE> in = fill_input<TARGETED_ARCHITECTURE>(buf);
-  check_utf8<TARGETED_ARCHITECTURE>(in, utf8_state);
+    utf8_checker<TARGETED_ARCHITECTURE> &utf8_state) {
+  simd_input<TARGETED_ARCHITECTURE> in(buf);
+  utf8_state.check_next_input(in);
  /* detect odd sequences of backslashes */
  uint64_t odd_ends = find_odd_backslash_sequences<TARGETED_ARCHITECTURE>(
      in, prev_iter_ends_odd_backslash);
@ -136,7 +135,7 @@ int find_structural_bits<TARGETED_ARCHITECTURE>(const uint8_t *buf, size_t len,
  }
  uint32_t *base_ptr = pj.structural_indexes;
  uint32_t base = 0;
-  utf8_checking_state<TARGETED_ARCHITECTURE> utf8_state;
+  utf8_checker<TARGETED_ARCHITECTURE> utf8_state;

  /* we have padded the input out to 64 byte multiple with the remainder
   * being zeros persistent state across loop does the last iteration end
@ -208,8 +207,7 @@ int find_structural_bits<TARGETED_ARCHITECTURE>(const uint8_t *buf, size_t len,
  }
  if (len != base_ptr[pj.n_structural_indexes - 1]) {
    /* the string might not be NULL terminated, but we add a virtual NULL
-     * ending
-     * character. */
+     * ending character. */
    base_ptr[pj.n_structural_indexes++] = len;
  }
  /* make it safe to dereference one beyond this array */
@ -217,7 +215,7 @@ int find_structural_bits<TARGETED_ARCHITECTURE>(const uint8_t *buf, size_t len,
  if (error_mask) {
    return simdjson::UNESCAPED_CHARS;
  }
-  return check_utf8_errors<TARGETED_ARCHITECTURE>(utf8_state);
+  return utf8_state.errors();
 }

 } // namespace simdjson