Combined performance patch (5% overall, 15% stage 1) (#317)
* Allow -f (find marks only)
* Support parse -s (force SSE)
* Simplify flatten_bits
  - Add directly to base instead of storing variable
  - Don't modify base_ptr after beginning of function
  - Eliminate base variable and increment base_ptr instead
* De-unroll the flatten_bits loops
* Decrease dependencies in stage 1
  - Do all finalize_structurals work before computing the quote mask; mask out the quote mask later
  - Join find_whitespace_and_structurals and finalize_structurals into a single find_structurals call, to reduce variable leakage
  - Rework the pseudo_pred algorithm to refer to "primitive" for clarity and some dependency reduction
  - Rename quote_mask to in_string to describe what we're trying to achieve ("mask" could mean many things)
  - Break up find_quote_mask_and_bits into find_quote_mask and invalid_string_bytes to reduce data leakage (i.e. don't expose quote bits or odd_ends at all to find_structural_bits)
  - Genericize the overflow methods "follows" and "follows_odd_sequence" for descriptiveness and possible lifting into a generic SIMD parsing library (see the sketch after the commit metadata below)
* Mark branches as likely/unlikely
* Reorder and unroll+interleave the stage 1 loop
* Nest the cnt > 16 branch inside cnt > 8
This commit is contained in: parent 53b6deaeae, commit de8df0a05f.
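The genericized overflow helper is the heart of the dependency-reduction work. Here is a minimal, self-contained sketch of `follows` (the function body matches the patch; the `main` driver is ours) showing how `overflow` carries bit 63 across successive 64-byte blocks:

    #include <cassert>
    #include <cstdint>

    // The "follows" primitive, body as in the patch: bit i of the result is
    // set when bit i-1 of `match` was set, and `overflow` carries bit 63 into
    // bit 0 of the next 64-byte block.
    static uint64_t follows(const uint64_t match, uint64_t &overflow) {
      const uint64_t result = match << 1 | overflow;
      overflow = match >> 63;
      return result;
    }

    int main() {
      uint64_t overflow = 0;
      // Block 1: a match in the top bit produces no follower here...
      assert(follows(0x8000000000000000ULL, overflow) == 0);
      // ...but block 2's bit 0 now "follows" the match that ended block 1.
      assert(follows(0, overflow) == 1);
      return 0;
    }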
@@ -14,6 +14,7 @@
 /jsoncheck
 /jsonpointer
 /jsonstats
+/integer_tests
 /libsimdjson.so*
 /minify
 /numberparsingcheck

@@ -34,6 +34,18 @@
 #include "simdjson/parsedjson.h"
 #include "simdjson/stage1_find_marks.h"
 #include "simdjson/stage2_build_tape.h"
+
+// Global arguments
+bool find_marks_only = false;
+bool verbose = false;
+bool dump = false;
+bool json_output = false;
+bool force_one_iteration = false;
+bool just_data = false;
+bool force_sse = false;
+int32_t iterations = -1;
+int32_t warmup_iterations = -1;
+
 namespace simdjson {
 Architecture _find_best_supported_implementation() {
   constexpr uint32_t haswell_flags =
@@ -43,7 +55,7 @@ Architecture _find_best_supported_implementation() {
       instruction_set::SSE42 | instruction_set::PCLMULQDQ;
   uint32_t supports = detect_supported_architectures();
   // Order from best to worst (within architecture)
-  if ((haswell_flags & supports) == haswell_flags) {
+  if ((haswell_flags & supports) == haswell_flags && !force_sse) {
     return Architecture::HASWELL;
   }
   if ((westmere_flags & supports) == westmere_flags) {
@@ -63,6 +75,9 @@ extern unified_functype *unified_ptr;
 extern stage1_functype *stage1_ptr;

 int unified_machine_dispatch(const uint8_t *buf, size_t len, ParsedJson &pj) {
+  if (find_marks_only) {
+    return simdjson::SUCCESS;
+  }
   Architecture best_implementation = _find_best_supported_implementation();
   // Selecting the best implementation
   switch (best_implementation) {
@@ -118,18 +133,11 @@ unified_functype *unified_ptr = &unified_machine_dispatch;
 } // namespace simdjson

 int main(int argc, char *argv[]) {
-  bool verbose = false;
-  bool dump = false;
-  bool json_output = false;
-  bool force_one_iteration = false;
-  bool just_data = false;
-  int32_t iterations = -1;
-  int32_t warmup_iterations = -1;

 #ifndef _MSC_VER
   int c;

-  while ((c = getopt(argc, argv, "1vdtn:w:")) != -1) {
+  while ((c = getopt(argc, argv, "1vdtn:w:fs")) != -1) {
     switch (c) {
     case 'n':
       iterations = atoi(optarg);
@@ -137,6 +145,9 @@ int main(int argc, char *argv[]) {
     case 'w':
       warmup_iterations = atoi(optarg);
       break;
+    case 's':
+      force_sse = true;
+      break;
     case 't':
       just_data = true;
       break;
@@ -152,6 +163,9 @@ int main(int argc, char *argv[]) {
     case '1':
       force_one_iteration = true;
       break;
+    case 'f':
+      find_marks_only = true;
+      break;
     default:
       abort();
     }
@@ -326,7 +340,7 @@ int main(int argc, char *argv[]) {
     isok = (simdjson::stage1_ptr((const uint8_t *)p.data(), p.size(), pj) ==
             simdjson::SUCCESS);
     isok = isok &&
-           (simdjson::SUCCESS ==
+           (simdjson::SUCCESS ==
             simdjson::unified_ptr((const uint8_t *)p.data(), p.size(), pj));
     auto end = std::chrono::steady_clock::now();
     std::chrono::duration<double> secs = end - start;

@@ -17,6 +17,17 @@
 #define SIMDJSON_PADDING 32
 #endif

+#if defined(__GNUC__)
+// Marks a block with a name so that MCA analysis can see it.
+#define BEGIN_DEBUG_BLOCK(name) __asm volatile("# LLVM-MCA-BEGIN " #name);
+#define END_DEBUG_BLOCK(name) __asm volatile("# LLVM-MCA-END " #name);
+#define DEBUG_BLOCK(name, block) BEGIN_DEBUG_BLOCK(name); block; END_DEBUG_BLOCK(name);
+#else
+#define BEGIN_DEBUG_BLOCK(name)
+#define END_DEBUG_BLOCK(name)
+#define DEBUG_BLOCK(name, block)
+#endif
+
 #ifndef _MSC_VER
 // Implemented using Labels as Values which works in GCC and CLANG (and maybe
 // also in Intel's compiler), but won't work in MSVC.

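The new DEBUG_BLOCK macros exist so llvm-mca can measure one region in isolation: compile to assembly and llvm-mca reports throughput between the BEGIN/END markers. A hypothetical usage sketch (the macros are copied from the patch; the function below is ours, purely illustrative):

    #include <cstdint>

    // Macros copied from the patch; on GCC/Clang they emit LLVM-MCA markers,
    // elsewhere they compile away. Note the non-GCC fallback discards the
    // block entirely, so DEBUG_BLOCK is meant for analysis builds only.
    #if defined(__GNUC__)
    #define BEGIN_DEBUG_BLOCK(name) __asm volatile("# LLVM-MCA-BEGIN " #name);
    #define END_DEBUG_BLOCK(name) __asm volatile("# LLVM-MCA-END " #name);
    #define DEBUG_BLOCK(name, block) BEGIN_DEBUG_BLOCK(name); block; END_DEBUG_BLOCK(name);
    #else
    #define BEGIN_DEBUG_BLOCK(name)
    #define END_DEBUG_BLOCK(name)
    #define DEBUG_BLOCK(name, block)
    #endif

    // Hypothetical function (not from the patch): bracket the hot loop so
    //   clang++ -O2 -S example.cpp && llvm-mca example.s
    // reports throughput for just the marked region.
    uint64_t sum_bit_positions(uint64_t bits) {
      uint64_t sum = 0;
      DEBUG_BLOCK(flatten_loop, {
        while (bits != 0) {
          sum += __builtin_ctzll(bits); // index of lowest set bit
          bits &= bits - 1;             // clear lowest set bit
        }
      });
      return sum;
    }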
@@ -29,5 +29,5 @@ make parse
 make perfdiff

 echo "Running perfdiff:"
-echo ./perfdiff \"$current/parse -t $perftests\" \"$reference/parse -t $perftests\"
-./perfdiff "$current/parse -t $perftests" "$reference/parse -t $perftests"
+echo ./perfdiff \"$current/parse -t $perftests $CHECKPERF_ARGS\" \"$reference/parse -t $perftests $CHECKPERF_ARGS\"
+./perfdiff "$current/parse -t $perftests $CHECKPERF_ARGS" "$reference/parse -t $perftests $CHECKPERF_ARGS"

@@ -40,25 +40,24 @@ using namespace simdjson::arm64;

 template <>
 struct simd_input<Architecture::ARM64> {
-  uint8x16_t chunks[4];
+  const uint8x16_t chunks[4];

-  really_inline simd_input(const uint8_t *ptr) {
-    this->chunks[0] = vld1q_u8(ptr + 0*16);
-    this->chunks[1] = vld1q_u8(ptr + 1*16);
-    this->chunks[2] = vld1q_u8(ptr + 2*16);
-    this->chunks[3] = vld1q_u8(ptr + 3*16);
-  }
+  really_inline simd_input()
+      : chunks{uint8x16_t(), uint8x16_t(), uint8x16_t(), uint8x16_t()} {}

-  really_inline simd_input(uint8x16_t chunk0, uint8x16_t chunk1, uint8x16_t chunk2, uint8x16_t chunk3) {
-    this->chunks[0] = chunk0;
-    this->chunks[1] = chunk1;
-    this->chunks[2] = chunk2;
-    this->chunks[3] = chunk3;
-  }
+  really_inline simd_input(const uint8x16_t chunk0, const uint8x16_t chunk1, const uint8x16_t chunk2, const uint8x16_t chunk3)
+      : chunks{chunk0, chunk1, chunk2, chunk3} {}
+
+  really_inline simd_input(const uint8_t *ptr)
+      : chunks{
+        vld1q_u8(ptr + 0*16),
+        vld1q_u8(ptr + 1*16),
+        vld1q_u8(ptr + 2*16),
+        vld1q_u8(ptr + 3*16)
+      } {}

   template <typename F>
-  really_inline void each(F const& each_chunk)
-  {
+  really_inline void each(F const& each_chunk) const {
     each_chunk(this->chunks[0]);
     each_chunk(this->chunks[1]);
     each_chunk(this->chunks[2]);
@@ -66,7 +65,7 @@ struct simd_input<Architecture::ARM64> {
   }

   template <typename F>
-  really_inline simd_input<Architecture::ARM64> map(F const& map_chunk) {
+  really_inline simd_input<Architecture::ARM64> map(F const& map_chunk) const {
     return simd_input<Architecture::ARM64>(
       map_chunk(this->chunks[0]),
       map_chunk(this->chunks[1]),
@@ -76,7 +75,7 @@ struct simd_input<Architecture::ARM64> {
   }

   template <typename F>
-  really_inline simd_input<Architecture::ARM64> map(simd_input<Architecture::ARM64> b, F const& map_chunk) {
+  really_inline simd_input<Architecture::ARM64> map(simd_input<Architecture::ARM64> b, F const& map_chunk) const {
     return simd_input<Architecture::ARM64>(
       map_chunk(this->chunks[0], b.chunks[0]),
       map_chunk(this->chunks[1], b.chunks[1]),
@@ -86,24 +85,31 @@ struct simd_input<Architecture::ARM64> {
   }

   template <typename F>
-  really_inline uint8x16_t reduce(F const& reduce_pair) {
+  really_inline uint8x16_t reduce(F const& reduce_pair) const {
     uint8x16_t r01 = reduce_pair(this->chunks[0], this->chunks[1]);
     uint8x16_t r23 = reduce_pair(this->chunks[2], this->chunks[3]);
     return reduce_pair(r01, r23);
   }

-  really_inline uint64_t to_bitmask() {
+  really_inline uint64_t to_bitmask() const {
     return neon_movemask_bulk(this->chunks[0], this->chunks[1], this->chunks[2], this->chunks[3]);
   }

-  really_inline uint64_t eq(uint8_t m) {
+  really_inline simd_input<Architecture::ARM64> bit_or(const uint8_t m) const {
+    const uint8x16_t mask = vmovq_n_u8(m);
+    return this->map( [&](auto a) {
+      return vorrq_u8(a, mask);
+    });
+  }
+
+  really_inline uint64_t eq(const uint8_t m) const {
     const uint8x16_t mask = vmovq_n_u8(m);
     return this->map( [&](auto a) {
       return vceqq_u8(a, mask);
     }).to_bitmask();
   }

-  really_inline uint64_t lteq(uint8_t m) {
+  really_inline uint64_t lteq(const uint8_t m) const {
     const uint8x16_t mask = vmovq_n_u8(m);
     return this->map( [&](auto a) {
       return vcleq_u8(a, mask);

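The rewritten simd_input makes every helper a const method built from a generic `map` over the chunks. A toy, architecture-free model of the same pattern (every name here is an illustrative stand-in, not simdjson's API):

    #include <cstdint>

    // Toy 16-byte "vector" standing in for uint8x16_t/__m128i, to show the
    // map(...).to_bitmask() composition the patch uses for eq()/lteq()/bit_or().
    struct toy_simd {
      uint8_t lanes[16];
    };

    struct toy_input {
      const toy_simd chunks[4];

      // Apply a per-chunk function and collect the results into a new input.
      template <typename F>
      toy_input map(F const &map_chunk) const {
        return toy_input{{map_chunk(chunks[0]), map_chunk(chunks[1]),
                          map_chunk(chunks[2]), map_chunk(chunks[3])}};
      }

      // One bit per lane: set when the lane is nonzero.
      uint64_t to_bitmask() const {
        uint64_t r = 0;
        for (int c = 0; c < 4; c++)
          for (int i = 0; i < 16; i++)
            r |= uint64_t(chunks[c].lanes[i] != 0) << (c * 16 + i);
        return r;
      }

      // eq() is just a map (lane-wise compare) followed by to_bitmask().
      uint64_t eq(uint8_t m) const {
        return map([&](toy_simd a) {
          toy_simd out{};
          for (int i = 0; i < 16; i++) out.lanes[i] = (a.lanes[i] == m) ? 0xFF : 0;
          return out;
        }).to_bitmask();
      }
    };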
@@ -12,7 +12,7 @@

 namespace simdjson::arm64 {

-really_inline uint64_t compute_quote_mask(uint64_t quote_bits) {
+really_inline uint64_t compute_quote_mask(const uint64_t quote_bits) {

 #ifdef __ARM_FEATURE_CRYPTO // some ARM processors lack this extension
   return vmull_p64(-1ULL, quote_bits);
@@ -21,9 +21,9 @@ really_inline uint64_t compute_quote_mask(const uint64_t quote_bits) {
 #endif
 }

-really_inline void find_whitespace_and_structurals(
-    simd_input<ARCHITECTURE> in, uint64_t &whitespace,
-    uint64_t &structurals) {
+really_inline void find_whitespace_and_operators(
+    const simd_input<ARCHITECTURE> in,
+    uint64_t &whitespace, uint64_t &op) {
   const uint8x16_t low_nibble_mask =
       (uint8x16_t){16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0};
   const uint8x16_t high_nibble_mask =
@@ -38,9 +38,9 @@ really_inline void find_whitespace_and_operators(
     return vandq_u8(shuf_lo, shuf_hi);
   });

-  const uint8x16_t structural_shufti_mask = vmovq_n_u8(0x7);
-  structurals = v.map([&](auto _v) {
-    return vtstq_u8(_v, structural_shufti_mask);
+  const uint8x16_t operator_shufti_mask = vmovq_n_u8(0x7);
+  op = v.map([&](auto _v) {
+    return vtstq_u8(_v, operator_shufti_mask);
   }).to_bitmask();

   const uint8x16_t whitespace_shufti_mask = vmovq_n_u8(0x18);

@@ -12,230 +12,271 @@
 // indicate whether we end an iteration on an odd-length sequence of
 // backslashes, which modifies our subsequent search for odd-length
 // sequences of backslashes in an obvious way.
-really_inline uint64_t find_odd_backslash_sequences(
-    simd_input<ARCHITECTURE> in,
-    uint64_t &prev_iter_ends_odd_backslash) {
+really_inline uint64_t follows_odd_sequence_of(const uint64_t match, uint64_t &overflow) {
   const uint64_t even_bits = 0x5555555555555555ULL;
   const uint64_t odd_bits = ~even_bits;
-  uint64_t bs_bits = in.eq('\\');
-  uint64_t start_edges = bs_bits & ~(bs_bits << 1);
+  uint64_t start_edges = match & ~(match << 1);
   /* flip lowest if we have an odd-length run at the end of the prior
    * iteration */
-  uint64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash;
+  uint64_t even_start_mask = even_bits ^ overflow;
   uint64_t even_starts = start_edges & even_start_mask;
   uint64_t odd_starts = start_edges & ~even_start_mask;
-  uint64_t even_carries = bs_bits + even_starts;
+  uint64_t even_carries = match + even_starts;

   uint64_t odd_carries;
   /* must record the carry-out of our odd-carries out of bit 63; this
    * indicates whether the sense of any edge going to the next iteration
    * should be flipped */
-  bool iter_ends_odd_backslash =
-      add_overflow(bs_bits, odd_starts, &odd_carries);
+  bool new_overflow = add_overflow(match, odd_starts, &odd_carries);

-  odd_carries |= prev_iter_ends_odd_backslash; /* push in bit zero as a
-                                                * potential end if we had an
-                                                * odd-numbered run at the
-                                                * end of the previous
-                                                * iteration */
-  prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL;
-  uint64_t even_carry_ends = even_carries & ~bs_bits;
-  uint64_t odd_carry_ends = odd_carries & ~bs_bits;
+  odd_carries |= overflow; /* push in bit zero as a
+                            * potential end if we had an
+                            * odd-numbered run at the
+                            * end of the previous
+                            * iteration */
+  overflow = new_overflow ? 0x1ULL : 0x0ULL;
+  uint64_t even_carry_ends = even_carries & ~match;
+  uint64_t odd_carry_ends = odd_carries & ~match;
   uint64_t even_start_odd_end = even_carry_ends & odd_bits;
   uint64_t odd_start_even_end = odd_carry_ends & even_bits;
   uint64_t odd_ends = even_start_odd_end | odd_start_even_end;
   return odd_ends;
 }

-// return both the quote mask (which is a half-open mask that covers the first
-// quote in an unescaped quote pair and everything in the quote pair) and the
-// quote bits, which are the simple unescaped quoted bits. We also update the
-// prev_iter_inside_quote value to tell the next iteration whether we finished
-// the final iteration inside a quote pair; if so, this inverts our behavior of
-// whether we're inside quotes for the next iteration.
-// Note that we don't do any error checking to see if we have backslash
-// sequences outside quotes; these backslash sequences (of any length) will be
-// detected elsewhere.
-really_inline uint64_t find_quote_mask_and_bits(
-    simd_input<ARCHITECTURE> in, uint64_t odd_ends,
-    uint64_t &prev_iter_inside_quote, uint64_t &quote_bits,
-    uint64_t &error_mask) {
-  quote_bits = in.eq('"');
-  quote_bits = quote_bits & ~odd_ends;
-  uint64_t quote_mask = compute_quote_mask(quote_bits);
-  quote_mask ^= prev_iter_inside_quote;
+//
+// Check if the current character immediately follows a matching character.
+//
+// For example, this checks for quotes with backslashes in front of them:
+//
+//     const uint64_t backslashed_quote = in.eq('"') & follows(in.eq('\\'), prev_backslash);
+//
+really_inline uint64_t follows(const uint64_t match, uint64_t &overflow) {
+  const uint64_t result = match << 1 | overflow;
+  overflow = match >> 63;
+  return result;
+}
+
+//
+// Check if the current character follows a matching character, with possible "filler" between.
+// For example, this checks for empty curly braces, e.g.
+//
+//     in.eq('}') & follows(in.eq('{'), in.eq(' '), prev_empty_object) // { <whitespace>* }
+//
+really_inline uint64_t follows(const uint64_t match, const uint64_t filler, uint64_t &overflow) {
+  uint64_t follows_match = follows(match, overflow);
+  uint64_t result;
+  overflow |= add_overflow(follows_match, filler, &result);
+  return result;
+}
+
+really_inline ErrorValues detect_errors_on_eof(
+    uint64_t &unescaped_chars_error,
+    const uint64_t prev_in_string) {
+  if (prev_in_string) {
+    return UNCLOSED_STRING;
+  }
+  if (unescaped_chars_error) {
+    return UNESCAPED_CHARS;
+  }
+  return SUCCESS;
+}
+
+//
+// Return a mask of all string characters plus end quotes.
+//
+// prev_escaped is overflow saying whether the next character is escaped.
+// prev_in_string is overflow saying whether we're still in a string.
+//
+// Backslash sequences outside of quotes will be detected in stage 2.
+//
+really_inline uint64_t find_strings(const simd_input<ARCHITECTURE> in, uint64_t &prev_escaped, uint64_t &prev_in_string) {
+  const uint64_t backslash = in.eq('\\');
+  const uint64_t escaped = follows_odd_sequence_of(backslash, prev_escaped);
+  const uint64_t quote = in.eq('"') & ~escaped;
+  // compute_quote_mask returns start quote plus string contents.
+  const uint64_t in_string = compute_quote_mask(quote) ^ prev_in_string;
+  /* right shift of a signed value expected to be well-defined and standard
+   * compliant as of C++20; John Regehr from Utah U. says this is fine code */
+  prev_in_string = static_cast<uint64_t>(static_cast<int64_t>(in_string) >> 63);
+  // Use ^ to turn the beginning quote off, and the end quote on.
+  return in_string ^ quote;
+}
+
+really_inline uint64_t invalid_string_bytes(const uint64_t unescaped, const uint64_t quote_mask) {
   /* All Unicode characters may be placed within the
    * quotation marks, except for the characters that MUST be escaped:
    * quotation mark, reverse solidus, and the control characters (U+0000
    * through U+001F).
    * https://tools.ietf.org/html/rfc8259 */
-  uint64_t unescaped = in.lteq(0x1F);
-  error_mask |= quote_mask & unescaped;
-  /* right shift of a signed value expected to be well-defined and standard
-   * compliant as of C++20; John Regehr from Utah U. says this is fine code */
-  prev_iter_inside_quote =
-      static_cast<uint64_t>(static_cast<int64_t>(quote_mask) >> 63);
-  return quote_mask;
+  return quote_mask & unescaped;
 }

-really_inline uint64_t finalize_structurals(
-    uint64_t structurals, uint64_t whitespace, uint64_t quote_mask,
-    uint64_t quote_bits, uint64_t &prev_iter_ends_pseudo_pred) {
-  // mask off anything inside quotes
-  structurals &= ~quote_mask;
-  // add the real quote bits back into our bit_mask as well, so we can
-  // quickly traverse the strings we've spent all this trouble gathering
-  structurals |= quote_bits;
-  // Now, establish "pseudo-structural characters". These are non-whitespace
-  // characters that are (a) outside quotes and (b) have a predecessor that's
-  // either whitespace or a structural character. This means that subsequent
-  // passes will get a chance to encounter the first character of every string
-  // of non-whitespace and, if we're parsing an atom like true/false/null or a
-  // number we can stop at the first whitespace or structural character
-  // following it.
+//
+// Determine which characters are *structural*:
+// - braces: [] and {}
+// - the start of primitives (123, true, false, null)
+// - the start of invalid non-whitespace (+, &, ture, UTF-8)
+//
+// Also detects value sequence errors:
+// - two values with no separator between ("hello" "world")
+// - separators with no values ([1,] [1,,] and [,2])
+//
+// This method will find all of the above whether it is in a string or not.
+//
+// To reduce dependency on the expensive "what is in a string" computation, this method treats the
+// contents of a string the same as content outside. Errors and structurals inside the string or on
+// the trailing quote will need to be removed later when the correct string information is known.
+//
+really_inline uint64_t find_potential_structurals(const simd_input<ARCHITECTURE> in, uint64_t &prev_primitive) {
+  // These use SIMD so let's kick them off before running the regular 64-bit stuff ...
+  uint64_t whitespace, op;
+  find_whitespace_and_operators(in, whitespace, op);

-  // a qualified predecessor is something that can happen 1 position before a
-  // pseudo-structural character
-  uint64_t pseudo_pred = structurals | whitespace;
+  // Detect the start of a run of primitive characters. Includes numbers, booleans, and strings (").
+  // Everything except whitespace, braces, colon and comma.
+  const uint64_t primitive = ~(op | whitespace);
+  const uint64_t follows_primitive = follows(primitive, prev_primitive);
+  const uint64_t start_primitive = primitive & ~follows_primitive;

-  uint64_t shifted_pseudo_pred =
-      (pseudo_pred << 1) | prev_iter_ends_pseudo_pred;
-  prev_iter_ends_pseudo_pred = pseudo_pred >> 63;
-  uint64_t pseudo_structurals =
-      shifted_pseudo_pred & (~whitespace) & (~quote_mask);
-  structurals |= pseudo_structurals;
-
-  // now, we've used our close quotes all we need to. So let's switch them off
-  // they will be off in the quote mask and on in quote bits.
-  structurals &= ~(quote_bits & ~quote_mask);
-  return structurals;
+  // Return final structurals
+  return op | start_primitive;
 }

-// Find structural bits in a 64-byte chunk.
-really_inline void find_structural_bits_64(
-    const uint8_t *buf, size_t idx, uint32_t *base_ptr, uint32_t &base,
-    uint64_t &prev_iter_ends_odd_backslash, uint64_t &prev_iter_inside_quote,
-    uint64_t &prev_iter_ends_pseudo_pred, uint64_t &structurals,
-    uint64_t &error_mask,
-    utf8_checker<ARCHITECTURE> &utf8_state) {
-  simd_input<ARCHITECTURE> in(buf);
-  utf8_state.check_next_input(in);
-  /* detect odd sequences of backslashes */
-  uint64_t odd_ends = find_odd_backslash_sequences(
-      in, prev_iter_ends_odd_backslash);
+static const size_t STEP_SIZE = 128;
+
+//
+// Find the important bits of JSON in a 128-byte chunk, and add them to:
+//
+// PERF NOTES:
+// We pipe 2 inputs through these stages:
+// 1. Load JSON into registers. This takes a long time and is highly parallelizable, so we load
+//    2 inputs' worth at once so that by the time step 2 is looking for the input, it's available.
+// 2. Scan the JSON for critical data: strings, primitives and operators. This is the critical path.
+//    The output of step 1 depends entirely on this information. These functions don't quite use
+//    up enough CPU: the second half of the functions is highly serial, only using 1 execution core
+//    at a time. The second input's scans have some dependency on the first ones finishing, but
+//    they can make a lot of progress before they need that information.
+// 3. Step 1 doesn't use enough capacity, so we run some extra stuff while we're waiting for that
+//    to finish: utf-8 checks and generating the output from the last iteration.
+//
+// The reason we run 2 inputs at a time is that steps 2 and 3 are *still* not enough to soak up all
+// available capacity with just one input. Running 2 at a time seems to give the CPU a good enough
+// workout.
+//
+really_inline void find_structural_bits_128(
+    const uint8_t *buf, const size_t idx, uint32_t *&base_ptr,
+    uint64_t &prev_escaped, uint64_t &prev_in_string,
+    uint64_t &prev_primitive,
+    uint64_t &prev_structurals,
+    uint64_t &unescaped_chars_error,
+    utf8_checker<ARCHITECTURE> &utf8_state) {
+  //
+  // Load up all 128 bytes into SIMD registers
+  //
+  simd_input<ARCHITECTURE> in_1(buf);
+  simd_input<ARCHITECTURE> in_2(buf+64);

-  /* detect insides of quote pairs ("quote_mask") and also our quote_bits
-   * themselves */
-  uint64_t quote_bits;
-  uint64_t quote_mask = find_quote_mask_and_bits(
-      in, odd_ends, prev_iter_inside_quote, quote_bits, error_mask);
+  //
+  // Find the strings and potential structurals (operators / primitives).
+  //
+  // This will include false structurals that are *inside* strings--we'll filter strings out
+  // before we return.
+  //
+  uint64_t string_1 = find_strings(in_1, prev_escaped, prev_in_string);
+  uint64_t structurals_1 = find_potential_structurals(in_1, prev_primitive);
+  uint64_t string_2 = find_strings(in_2, prev_escaped, prev_in_string);
+  uint64_t structurals_2 = find_potential_structurals(in_2, prev_primitive);

-  /* take the previous iteration's structural bits, not our current
-   * iteration, and flatten */
-  flatten_bits(base_ptr, base, idx, structurals);
+  //
+  // Do miscellaneous work while the processor is busy calculating strings and structurals.
+  //
+  // After that, weed out structurals that are inside strings and find invalid string characters.
+  //
+  uint64_t unescaped_1 = in_1.lteq(0x1F);
+  utf8_state.check_next_input(in_1);
+  flatten_bits(base_ptr, idx, prev_structurals); // Output *last* iteration's structurals to ParsedJson
+  prev_structurals = structurals_1 & ~string_1;
+  unescaped_chars_error |= unescaped_1 & string_1;

-  uint64_t whitespace;
-  find_whitespace_and_structurals(in, whitespace, structurals);
-
-  /* fixup structurals to reflect quotes and add pseudo-structural
-   * characters */
-  structurals = finalize_structurals(structurals, whitespace, quote_mask,
-                                     quote_bits, prev_iter_ends_pseudo_pred);
+  uint64_t unescaped_2 = in_2.lteq(0x1F);
+  utf8_state.check_next_input(in_2);
+  flatten_bits(base_ptr, idx+64, prev_structurals); // Output *last* iteration's structurals to ParsedJson
+  prev_structurals = structurals_2 & ~string_2;
+  unescaped_chars_error |= unescaped_2 & string_2;
 }

 int find_structural_bits(const uint8_t *buf, size_t len, simdjson::ParsedJson &pj) {
-  if (len > pj.byte_capacity) {
+  if (unlikely(len > pj.byte_capacity)) {
     std::cerr << "Your ParsedJson object only supports documents up to "
               << pj.byte_capacity << " bytes but you are trying to process "
               << len << " bytes" << std::endl;
     return simdjson::CAPACITY;
   }
   uint32_t *base_ptr = pj.structural_indexes;
-  uint32_t base = 0;
   utf8_checker<ARCHITECTURE> utf8_state;

-  /* we have padded the input out to a 64 byte multiple with the remainder
-   * being zeros. persistent state across the loop: does the last iteration
-   * end with an odd-length sequence of backslashes? */
-
-  /* either 0 or 1, but a 64-bit value */
-  uint64_t prev_iter_ends_odd_backslash = 0ULL;
-  /* does the previous iteration end inside a double-quote pair? */
-  uint64_t prev_iter_inside_quote =
-      0ULL; /* either all zeros or all ones.
-             * does the previous iteration end on something that is a
-             * predecessor of a pseudo-structural character - i.e.
-             * whitespace or a structural character? effectively the very
-             * first char is considered to follow "whitespace" for the
-             * purposes of pseudo-structural character detection so we
-             * initialize to 1 */
-  uint64_t prev_iter_ends_pseudo_pred = 1ULL;
-
-  /* structurals are persistent state across the loop as we flatten them on
-   * the subsequent iteration into our array pointed to by base_ptr.
-   * This is harmless on the first iteration as structurals==0
-   * and is done for performance reasons; we can hide some of the latency of
-   * the expensive carryless multiply in the previous step with this work */
+  // Whether the first character of the next iteration is escaped.
+  uint64_t prev_escaped = 0ULL;
+  // Whether the last iteration was still inside a string (all 1's = true, all 0's = false).
+  uint64_t prev_in_string = 0ULL;
+  // Whether the last character of the previous iteration is a primitive value character
+  // (anything except whitespace, braces, comma or colon).
+  uint64_t prev_primitive = 0ULL;
+  // Mask of structural characters from the last iteration.
+  // Kept around for performance reasons, so we can call flatten_bits to soak up some unused
+  // CPU capacity while the next iteration is busy with an expensive clmul in compute_quote_mask.
   uint64_t structurals = 0;

-  size_t lenminus64 = len < 64 ? 0 : len - 64;
+  size_t lenminusstep = len < STEP_SIZE ? 0 : len - STEP_SIZE;
   size_t idx = 0;
-  uint64_t error_mask = 0; /* for unescaped characters within strings (ASCII
-                              code points < 0x20) */
+  // Errors with unescaped characters in strings (ASCII codepoints < 0x20)
+  uint64_t unescaped_chars_error = 0;

-  for (; idx < lenminus64; idx += 64) {
-    find_structural_bits_64(&buf[idx], idx, base_ptr, base,
-                            prev_iter_ends_odd_backslash,
-                            prev_iter_inside_quote, prev_iter_ends_pseudo_pred,
-                            structurals, error_mask, utf8_state);
+  for (; idx < lenminusstep; idx += STEP_SIZE) {
+    find_structural_bits_128(&buf[idx], idx, base_ptr,
+                             prev_escaped, prev_in_string, prev_primitive,
+                             structurals, unescaped_chars_error, utf8_state);
   }

   /* If we have a final chunk of less than 64 bytes, pad it to 64 with
    * spaces before processing it (otherwise, we risk invalidating the UTF-8
    * checks). */
-  if (idx < len) {
-    uint8_t tmp_buf[64];
-    memset(tmp_buf, 0x20, 64);
+  if (likely(idx < len)) {
+    uint8_t tmp_buf[STEP_SIZE];
+    memset(tmp_buf, 0x20, STEP_SIZE);
     memcpy(tmp_buf, buf + idx, len - idx);
-    find_structural_bits_64(&tmp_buf[0], idx, base_ptr, base,
-                            prev_iter_ends_odd_backslash,
-                            prev_iter_inside_quote, prev_iter_ends_pseudo_pred,
-                            structurals, error_mask, utf8_state);
-    idx += 64;
+    find_structural_bits_128(&tmp_buf[0], idx, base_ptr,
+                             prev_escaped, prev_in_string, prev_primitive,
+                             structurals, unescaped_chars_error, utf8_state);
+    idx += STEP_SIZE;
   }

-  /* is last string quote closed? */
-  if (prev_iter_inside_quote) {
-    return simdjson::UNCLOSED_STRING;
+  /* finally, flatten out the remaining structurals from the last iteration */
+  flatten_bits(base_ptr, idx, structurals);
+
+  simdjson::ErrorValues error = detect_errors_on_eof(unescaped_chars_error, prev_in_string);
+  if (unlikely(error != simdjson::SUCCESS)) {
+    return error;
   }

-  /* finally, flatten out the remaining structurals from the last iteration */
-  flatten_bits(base_ptr, base, idx, structurals);
-
-  pj.n_structural_indexes = base;
+  pj.n_structural_indexes = base_ptr - pj.structural_indexes;
   /* a valid JSON file cannot have zero structural indexes - we should have
    * found something */
-  if (pj.n_structural_indexes == 0u) {
+  if (unlikely(pj.n_structural_indexes == 0u)) {
     return simdjson::EMPTY;
   }
-  if (base_ptr[pj.n_structural_indexes - 1] > len) {
+  if (unlikely(pj.structural_indexes[pj.n_structural_indexes - 1] > len)) {
     return simdjson::UNEXPECTED_ERROR;
   }
-  if (len != base_ptr[pj.n_structural_indexes - 1]) {
+  if (len != pj.structural_indexes[pj.n_structural_indexes - 1]) {
     /* the string might not be NULL terminated, but we add a virtual NULL
      * ending character. */
-    base_ptr[pj.n_structural_indexes++] = len;
+    pj.structural_indexes[pj.n_structural_indexes++] = len;
   }
   /* make it safe to dereference one beyond this array */
-  base_ptr[pj.n_structural_indexes] = 0;
-  if (error_mask) {
-    return simdjson::UNESCAPED_CHARS;
-  }
+  pj.structural_indexes[pj.n_structural_indexes] = 0;
   return utf8_state.errors();
 }

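Two bit tricks in the new find_strings are worth spelling out: compute_quote_mask is a prefix XOR (done with one carryless multiply in the real code), and the in-string state crosses blocks by sign-extending bit 63. A scalar sketch of both (illustrative stand-ins, not the SIMD implementations):

    #include <cstdint>

    // Scalar stand-in for compute_quote_mask: a prefix XOR turns isolated
    // quote bits into a mask covering everything from each opening quote up
    // to (but not including) its closing quote. The real code gets this in
    // one clmul against an all-ones operand.
    static uint64_t prefix_xor(uint64_t quote) {
      for (int shift = 1; shift < 64; shift *= 2) {
        quote ^= quote << shift;
      }
      return quote;
    }

    // The carry trick from find_strings: if bit 63 of in_string is set, the
    // block ended inside a string; the arithmetic right shift broadcasts that
    // bit into an all-zeros or all-ones mask that flips the next block's
    // polarity.
    static uint64_t carry_in_string(uint64_t in_string) {
      return static_cast<uint64_t>(static_cast<int64_t>(in_string) >> 63);
    }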
@@ -26,64 +26,42 @@ really_inline void flatten_bits(uint32_t *base_ptr, uint32_t &base, uint32_t idx
 // base_ptr[base] incrementing base as we go
 // will potentially store extra values beyond end of valid bits, so base_ptr
 // needs to be large enough to handle this
-really_inline void flatten_bits(uint32_t *base_ptr, uint32_t &base, uint32_t idx, uint64_t bits) {
+really_inline void flatten_bits(uint32_t *&base_ptr, uint32_t idx, uint64_t bits) {
   // In some instances, the next branch is expensive because it is mispredicted.
   // Unfortunately, in other cases, it helps tremendously.
   if (bits == 0)
     return;
   uint32_t cnt = hamming(bits);
-  uint32_t next_base = base + cnt;
   idx -= 64;
-  base_ptr += base;
-  {
-    base_ptr[0] = idx + trailing_zeroes(bits);
-    bits = bits & (bits - 1);
-    base_ptr[1] = idx + trailing_zeroes(bits);
-    bits = bits & (bits - 1);
-    base_ptr[2] = idx + trailing_zeroes(bits);
-    bits = bits & (bits - 1);
-    base_ptr[3] = idx + trailing_zeroes(bits);
-    bits = bits & (bits - 1);
-    base_ptr[4] = idx + trailing_zeroes(bits);
-    bits = bits & (bits - 1);
-    base_ptr[5] = idx + trailing_zeroes(bits);
-    bits = bits & (bits - 1);
-    base_ptr[6] = idx + trailing_zeroes(bits);
-    bits = bits & (bits - 1);
-    base_ptr[7] = idx + trailing_zeroes(bits);
-    bits = bits & (bits - 1);
-    base_ptr += 8;
-  }
-  // We hope that the next branch is easily predicted.
-  if (cnt > 8) {
-    base_ptr[0] = idx + trailing_zeroes(bits);
-    bits = bits & (bits - 1);
-    base_ptr[1] = idx + trailing_zeroes(bits);
-    bits = bits & (bits - 1);
-    base_ptr[2] = idx + trailing_zeroes(bits);
-    bits = bits & (bits - 1);
-    base_ptr[3] = idx + trailing_zeroes(bits);
-    bits = bits & (bits - 1);
-    base_ptr[4] = idx + trailing_zeroes(bits);
-    bits = bits & (bits - 1);
-    base_ptr[5] = idx + trailing_zeroes(bits);
-    bits = bits & (bits - 1);
-    base_ptr[6] = idx + trailing_zeroes(bits);
-    bits = bits & (bits - 1);
-    base_ptr[7] = idx + trailing_zeroes(bits);
-    bits = bits & (bits - 1);
-    base_ptr += 8;
-  }
-  if (cnt > 16) { // unlucky: we rarely get here
-    // since it means having one structural or pseudo-structural element
-    // every 4 characters (possible with inputs like "","","",...).
-    do {
-      base_ptr[0] = idx + trailing_zeroes(bits);
-      bits = bits & (bits - 1);
-      base_ptr++;
-    } while (bits != 0);
-  }
-  base = next_base;
+
+  // Do the first 8 all together
+  for (int i=0; i<8; i++) {
+    base_ptr[i] = idx + trailing_zeroes(bits);
+    bits = bits & (bits - 1);
+  }
+
+  // Do the next 8 all together (we hope in most cases it won't happen at all
+  // and the branch is easily predicted).
+  if (unlikely(cnt > 8)) {
+    for (int i=8; i<16; i++) {
+      base_ptr[i] = idx + trailing_zeroes(bits);
+      bits = bits & (bits - 1);
+    }
+
+    // Most files don't have 16+ structurals per block, so we take several basically guaranteed
+    // branch mispredictions here. 16+ structurals per block means either punctuation ({} [] , :)
+    // or the start of a value ("abc" true 123) every 4 characters.
+    if (unlikely(cnt > 16)) {
+      uint32_t i = 16;
+      do {
+        base_ptr[i] = idx + trailing_zeroes(bits);
+        bits = bits & (bits - 1);
+        i++;
+      } while (i < cnt);
+    }
+  }
+
+  base_ptr += cnt;
 }
 #endif // SIMDJSON_NAIVE_FLATTEN

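flatten_bits turns each 64-bit structural mask into absolute byte indexes. A plain scalar reference of the same contract (one slot per set bit, base_ptr advanced by the popcount; the patched version writes 8 slots at a time and may overshoot, which is why the structural index buffer needs slack):

    #include <cstdint>

    // Reference behavior of flatten_bits: append the absolute position of
    // every set bit in `bits` (which describes the 64 bytes *before* idx) and
    // advance base_ptr. This sketch writes exactly one slot per set bit;
    // __builtin_ctzll stands in for simdjson's trailing_zeroes.
    static void flatten_bits_reference(uint32_t *&base_ptr, uint32_t idx, uint64_t bits) {
      idx -= 64;
      while (bits != 0) {
        *base_ptr++ = idx + static_cast<uint32_t>(__builtin_ctzll(bits)); // lowest set bit
        bits &= bits - 1;                                                 // clear it
      }
    }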
@@ -10,29 +10,28 @@ namespace simdjson {

 template <>
 struct simd_input<Architecture::HASWELL> {
-  __m256i chunks[2];
+  const __m256i chunks[2];

-  really_inline simd_input(const uint8_t *ptr)
-  {
-    this->chunks[0] = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr + 0*32));
-    this->chunks[1] = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr + 1*32));
-  }
+  really_inline simd_input() : chunks{__m256i(), __m256i()} {}

-  really_inline simd_input(__m256i chunk0, __m256i chunk1)
-  {
-    this->chunks[0] = chunk0;
-    this->chunks[1] = chunk1;
-  }
+  really_inline simd_input(const __m256i chunk0, const __m256i chunk1)
+      : chunks{chunk0, chunk1} {}
+
+  really_inline simd_input(const uint8_t *ptr)
+      : chunks{
+        _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr + 0*32)),
+        _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr + 1*32))
+      } {}

   template <typename F>
-  really_inline void each(F const& each_chunk)
+  really_inline void each(F const& each_chunk) const
   {
     each_chunk(this->chunks[0]);
     each_chunk(this->chunks[1]);
   }

   template <typename F>
-  really_inline simd_input<Architecture::HASWELL> map(F const& map_chunk) {
+  really_inline simd_input<Architecture::HASWELL> map(F const& map_chunk) const {
     return simd_input<Architecture::HASWELL>(
       map_chunk(this->chunks[0]),
       map_chunk(this->chunks[1])
@@ -40,7 +39,7 @@ struct simd_input<Architecture::HASWELL> {
   }

   template <typename F>
-  really_inline simd_input<Architecture::HASWELL> map(simd_input<Architecture::HASWELL> b, F const& map_chunk) {
+  really_inline simd_input<Architecture::HASWELL> map(const simd_input<Architecture::HASWELL> b, F const& map_chunk) const {
     return simd_input<Architecture::HASWELL>(
       map_chunk(this->chunks[0], b.chunks[0]),
       map_chunk(this->chunks[1], b.chunks[1])
@@ -48,24 +47,31 @@ struct simd_input<Architecture::HASWELL> {
   }

   template <typename F>
-  really_inline __m256i reduce(F const& reduce_pair) {
+  really_inline __m256i reduce(F const& reduce_pair) const {
     return reduce_pair(this->chunks[0], this->chunks[1]);
   }

-  really_inline uint64_t to_bitmask() {
+  really_inline uint64_t to_bitmask() const {
     uint64_t r_lo = static_cast<uint32_t>(_mm256_movemask_epi8(this->chunks[0]));
     uint64_t r_hi = _mm256_movemask_epi8(this->chunks[1]);
     return r_lo | (r_hi << 32);
   }

-  really_inline uint64_t eq(uint8_t m) {
+  really_inline simd_input<Architecture::HASWELL> bit_or(const uint8_t m) const {
+    const __m256i mask = _mm256_set1_epi8(m);
+    return this->map( [&](auto a) {
+      return _mm256_or_si256(a, mask);
+    });
+  }
+
+  really_inline uint64_t eq(const uint8_t m) const {
     const __m256i mask = _mm256_set1_epi8(m);
     return this->map( [&](auto a) {
       return _mm256_cmpeq_epi8(a, mask);
     }).to_bitmask();
   }

-  really_inline uint64_t lteq(uint8_t m) {
+  really_inline uint64_t lteq(const uint8_t m) const {
     const __m256i maxval = _mm256_set1_epi8(m);
     return this->map( [&](auto a) {
       return _mm256_cmpeq_epi8(_mm256_max_epu8(maxval, a), maxval);

@@ -218,7 +218,7 @@ struct utf8_checker<Architecture::HASWELL> {
   __m256i any_bits_on = in.reduce([&](auto a, auto b) {
     return _mm256_or_si256(a, b);
   });
-  if ((_mm256_testz_si256(any_bits_on, high_bit)) == 1) {
+  if (likely(_mm256_testz_si256(any_bits_on, high_bit) == 1)) {
     // it is ascii, we just check continuation
     this->has_error = _mm256_or_si256(
       _mm256_cmpgt_epi8(this->previous.carried_continuations,

@@ -13,7 +13,7 @@
 TARGET_HASWELL
 namespace simdjson::haswell {

-really_inline uint64_t compute_quote_mask(uint64_t quote_bits) {
+really_inline uint64_t compute_quote_mask(const uint64_t quote_bits) {
   // There should be no such thing as a processor supporting avx2
   // but not clmul.
   uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128(
@@ -21,8 +21,9 @@ really_inline uint64_t compute_quote_mask(const uint64_t quote_bits) {
   return quote_mask;
 }

-really_inline void find_whitespace_and_structurals(simd_input<ARCHITECTURE> in,
-    uint64_t &whitespace, uint64_t &structurals) {
+really_inline void find_whitespace_and_operators(
+    const simd_input<ARCHITECTURE> in,
+    uint64_t &whitespace, uint64_t &op) {

 #ifdef SIMDJSON_NAIVE_STRUCTURAL

@@ -34,14 +35,14 @@ really_inline void find_whitespace_and_operators(
   const __m256i mask_close_bracket = _mm256_set1_epi8(0x5d);
   const __m256i mask_column = _mm256_set1_epi8(0x3a);
   const __m256i mask_comma = _mm256_set1_epi8(0x2c);
-  structurals = in.map([&](auto in) {
-    __m256i structurals = _mm256_cmpeq_epi8(in, mask_open_brace);
-    structurals = _mm256_or_si256(structurals, _mm256_cmpeq_epi8(in, mask_close_brace));
-    structurals = _mm256_or_si256(structurals, _mm256_cmpeq_epi8(in, mask_open_bracket));
-    structurals = _mm256_or_si256(structurals, _mm256_cmpeq_epi8(in, mask_close_bracket));
-    structurals = _mm256_or_si256(structurals, _mm256_cmpeq_epi8(in, mask_column));
-    structurals = _mm256_or_si256(structurals, _mm256_cmpeq_epi8(in, mask_comma));
-    return structurals;
+  op = in.map([&](auto in) {
+    __m256i op = _mm256_cmpeq_epi8(in, mask_open_brace);
+    op = _mm256_or_si256(op, _mm256_cmpeq_epi8(in, mask_close_brace));
+    op = _mm256_or_si256(op, _mm256_cmpeq_epi8(in, mask_open_bracket));
+    op = _mm256_or_si256(op, _mm256_cmpeq_epi8(in, mask_close_bracket));
+    op = _mm256_or_si256(op, _mm256_cmpeq_epi8(in, mask_column));
+    op = _mm256_or_si256(op, _mm256_cmpeq_epi8(in, mask_comma));
+    return op;
   }).to_bitmask();

   const __m256i mask_space = _mm256_set1_epi8(0x20);

@@ -60,24 +61,24 @@ really_inline void find_whitespace_and_operators(
 #else // SIMDJSON_NAIVE_STRUCTURAL

 // clang-format off
-  const __m256i structural_table =
+  const __m256i operator_table =
       _mm256_setr_epi8(44, 125, 0, 0, 0xc0u, 0, 0, 0, 0, 0, 0, 0, 0, 0, 58, 123,
                        44, 125, 0, 0, 0xc0u, 0, 0, 0, 0, 0, 0, 0, 0, 0, 58, 123);
   const __m256i white_table = _mm256_setr_epi8(
       32, 100, 100, 100, 17, 100, 113, 2, 100, 9, 10, 112, 100, 13, 100, 100,
       32, 100, 100, 100, 17, 100, 113, 2, 100, 9, 10, 112, 100, 13, 100, 100);
 // clang-format on
-  const __m256i struct_offset = _mm256_set1_epi8(0xd4u);
-  const __m256i struct_mask = _mm256_set1_epi8(32);
+  const __m256i op_offset = _mm256_set1_epi8(0xd4u);
+  const __m256i op_mask = _mm256_set1_epi8(32);

   whitespace = in.map([&](auto _in) {
     return _mm256_cmpeq_epi8(_in, _mm256_shuffle_epi8(white_table, _in));
   }).to_bitmask();

-  structurals = in.map([&](auto _in) {
-    const __m256i r1 = _mm256_add_epi8(struct_offset, _in);
-    const __m256i r2 = _mm256_or_si256(_in, struct_mask);
-    const __m256i r3 = _mm256_shuffle_epi8(structural_table, r1);
+  op = in.map([&](auto _in) {
+    const __m256i r1 = _mm256_add_epi8(op_offset, _in);
+    const __m256i r2 = _mm256_or_si256(_in, op_mask);
+    const __m256i r3 = _mm256_shuffle_epi8(operator_table, r1);
     return _mm256_cmpeq_epi8(r2, r3);
   }).to_bitmask();

@@ -89,65 +90,43 @@ really_inline void find_whitespace_and_operators(
 // base_ptr[base] incrementing base as we go
 // will potentially store extra values beyond end of valid bits, so base_ptr
 // needs to be large enough to handle this
-really_inline void flatten_bits(uint32_t *base_ptr, uint32_t &base, uint32_t idx, uint64_t bits) {
+really_inline void flatten_bits(uint32_t *&base_ptr, uint32_t idx, uint64_t bits) {
   // In some instances, the next branch is expensive because it is mispredicted.
   // Unfortunately, in other cases, it helps tremendously.
   if (bits == 0)
     return;
   uint32_t cnt = _mm_popcnt_u64(bits);
-  uint32_t next_base = base + cnt;
   idx -= 64;
-  base_ptr += base;
-  {
-    base_ptr[0] = idx + trailing_zeroes(bits);
-    bits = _blsr_u64(bits);
-    base_ptr[1] = idx + trailing_zeroes(bits);
-    bits = _blsr_u64(bits);
-    base_ptr[2] = idx + trailing_zeroes(bits);
-    bits = _blsr_u64(bits);
-    base_ptr[3] = idx + trailing_zeroes(bits);
-    bits = _blsr_u64(bits);
-    base_ptr[4] = idx + trailing_zeroes(bits);
-    bits = _blsr_u64(bits);
-    base_ptr[5] = idx + trailing_zeroes(bits);
-    bits = _blsr_u64(bits);
-    base_ptr[6] = idx + trailing_zeroes(bits);
-    bits = _blsr_u64(bits);
-    base_ptr[7] = idx + trailing_zeroes(bits);
-    bits = _blsr_u64(bits);
-    base_ptr += 8;
-  }
-  // We hope that the next branch is easily predicted.
-  if (cnt > 8) {
-    base_ptr[0] = idx + trailing_zeroes(bits);
-    bits = _blsr_u64(bits);
-    base_ptr[1] = idx + trailing_zeroes(bits);
-    bits = _blsr_u64(bits);
-    base_ptr[2] = idx + trailing_zeroes(bits);
-    bits = _blsr_u64(bits);
-    base_ptr[3] = idx + trailing_zeroes(bits);
-    bits = _blsr_u64(bits);
-    base_ptr[4] = idx + trailing_zeroes(bits);
-    bits = _blsr_u64(bits);
-    base_ptr[5] = idx + trailing_zeroes(bits);
-    bits = _blsr_u64(bits);
-    base_ptr[6] = idx + trailing_zeroes(bits);
-    bits = _blsr_u64(bits);
-    base_ptr[7] = idx + trailing_zeroes(bits);
-    bits = _blsr_u64(bits);
-    base_ptr += 8;
-  }
-  if (cnt > 16) { // unlucky: we rarely get here
-    // since it means having one structural or pseudo-structural element
-    // every 4 characters (possible with inputs like "","","",...).
-    do {
-      base_ptr[0] = idx + trailing_zeroes(bits);
-      bits = _blsr_u64(bits);
-      base_ptr++;
-    } while (bits != 0);
-  }
-  base = next_base;
+
+  // Do the first 8 all together
+  for (int i=0; i<8; i++) {
+    base_ptr[i] = idx + trailing_zeroes(bits);
+    bits = _blsr_u64(bits);
+  }
+
+  // Do the next 8 all together (we hope in most cases it won't happen at all
+  // and the branch is easily predicted).
+  if (unlikely(cnt > 8)) {
+    for (int i=8; i<16; i++) {
+      base_ptr[i] = idx + trailing_zeroes(bits);
+      bits = _blsr_u64(bits);
+    }
+
+    // Most files don't have 16+ structurals per block, so we take several basically guaranteed
+    // branch mispredictions here. 16+ structurals per block means either punctuation ({} [] , :)
+    // or the start of a value ("abc" true 123) every four characters.
+    if (unlikely(cnt > 16)) {
+      uint32_t i = 16;
+      do {
+        base_ptr[i] = idx + trailing_zeroes(bits);
+        bits = _blsr_u64(bits);
+        i++;
+      } while (i < cnt);
+    }
+  }
+
+  base_ptr += cnt;
 }

 #include "generic/stage1_find_marks.h"

@@ -10,26 +10,24 @@ namespace simdjson {

 template <>
 struct simd_input<Architecture::WESTMERE> {
-  __m128i chunks[4];
+  const __m128i chunks[4];

-  really_inline simd_input(const uint8_t *ptr) {
-    this->chunks[0] = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 0));
-    this->chunks[1] = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 16));
-    this->chunks[2] = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 32));
-    this->chunks[3] = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 48));
-  }
+  really_inline simd_input()
+      : chunks { __m128i(), __m128i(), __m128i(), __m128i() } {}

-  really_inline simd_input(__m128i i0, __m128i i1, __m128i i2, __m128i i3)
-  {
-    this->chunks[0] = i0;
-    this->chunks[1] = i1;
-    this->chunks[2] = i2;
-    this->chunks[3] = i3;
-  }
+  really_inline simd_input(const __m128i chunk0, const __m128i chunk1, const __m128i chunk2, const __m128i chunk3)
+      : chunks{chunk0, chunk1, chunk2, chunk3} {}
+
+  really_inline simd_input(const uint8_t *ptr)
+      : simd_input(
+        _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 0)),
+        _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 16)),
+        _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 32)),
+        _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 48))
+      ) {}

   template <typename F>
-  really_inline void each(F const& each_chunk)
-  {
+  really_inline void each(F const& each_chunk) const {
     each_chunk(this->chunks[0]);
     each_chunk(this->chunks[1]);
     each_chunk(this->chunks[2]);
@@ -37,7 +35,7 @@ struct simd_input<Architecture::WESTMERE> {
   }

   template <typename F>
-  really_inline simd_input<Architecture::WESTMERE> map(F const& map_chunk) {
+  really_inline simd_input<Architecture::WESTMERE> map(F const& map_chunk) const {
     return simd_input<Architecture::WESTMERE>(
       map_chunk(this->chunks[0]),
       map_chunk(this->chunks[1]),
@@ -47,7 +45,7 @@ struct simd_input<Architecture::WESTMERE> {
   }

   template <typename F>
-  really_inline simd_input<Architecture::WESTMERE> map(simd_input<Architecture::WESTMERE> b, F const& map_chunk) {
+  really_inline simd_input<Architecture::WESTMERE> map(const simd_input<Architecture::WESTMERE> b, F const& map_chunk) const {
     return simd_input<Architecture::WESTMERE>(
       map_chunk(this->chunks[0], b.chunks[0]),
       map_chunk(this->chunks[1], b.chunks[1]),
@@ -57,13 +55,13 @@ struct simd_input<Architecture::WESTMERE> {
   }

   template <typename F>
-  really_inline __m128i reduce(F const& reduce_pair) {
+  really_inline __m128i reduce(F const& reduce_pair) const {
     __m128i r01 = reduce_pair(this->chunks[0], this->chunks[1]);
     __m128i r23 = reduce_pair(this->chunks[2], this->chunks[3]);
     return reduce_pair(r01, r23);
   }

-  really_inline uint64_t to_bitmask() {
+  really_inline uint64_t to_bitmask() const {
     uint64_t r0 = static_cast<uint32_t>(_mm_movemask_epi8(this->chunks[0]));
     uint64_t r1 = _mm_movemask_epi8(this->chunks[1]);
     uint64_t r2 = _mm_movemask_epi8(this->chunks[2]);
@@ -71,14 +69,21 @@ struct simd_input<Architecture::WESTMERE> {
     return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48);
   }

-  really_inline uint64_t eq(uint8_t m) {
+  really_inline simd_input<Architecture::WESTMERE> bit_or(const uint8_t m) const {
+    const __m128i mask = _mm_set1_epi8(m);
+    return this->map( [&](auto a) {
+      return _mm_or_si128(a, mask);
+    });
+  }
+
+  really_inline uint64_t eq(const uint8_t m) const {
     const __m128i mask = _mm_set1_epi8(m);
     return this->map( [&](auto a) {
       return _mm_cmpeq_epi8(a, mask);
     }).to_bitmask();
   }

-  really_inline uint64_t lteq(uint8_t m) {
+  really_inline uint64_t lteq(const uint8_t m) const {
     const __m128i maxval = _mm_set1_epi8(m);
     return this->map( [&](auto a) {
       return _mm_cmpeq_epi8(_mm_max_epu8(maxval, a), maxval);

@@ -13,29 +13,30 @@
 TARGET_WESTMERE
 namespace simdjson::westmere {

-really_inline uint64_t compute_quote_mask(uint64_t quote_bits) {
+really_inline uint64_t compute_quote_mask(const uint64_t quote_bits) {
   return _mm_cvtsi128_si64(_mm_clmulepi64_si128(
       _mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFFu), 0));
 }

-really_inline void find_whitespace_and_structurals(simd_input<ARCHITECTURE> in,
-    uint64_t &whitespace, uint64_t &structurals) {
+really_inline void find_whitespace_and_operators(
+    const simd_input<ARCHITECTURE> in,
+    uint64_t &whitespace, uint64_t &op) {

-  const __m128i structural_table =
+  const __m128i operator_table =
       _mm_setr_epi8(44, 125, 0, 0, 0xc0u, 0, 0, 0, 0, 0, 0, 0, 0, 0, 58, 123);
   const __m128i white_table = _mm_setr_epi8(32, 100, 100, 100, 17, 100, 113, 2,
                                             100, 9, 10, 112, 100, 13, 100, 100);
-  const __m128i struct_offset = _mm_set1_epi8(0xd4u);
-  const __m128i struct_mask = _mm_set1_epi8(32);
+  const __m128i op_offset = _mm_set1_epi8(0xd4u);
+  const __m128i op_mask = _mm_set1_epi8(32);

   whitespace = in.map([&](auto _in) {
     return _mm_cmpeq_epi8(_in, _mm_shuffle_epi8(white_table, _in));
   }).to_bitmask();

-  structurals = in.map([&](auto _in) {
-    const __m128i r1 = _mm_add_epi8(struct_offset, _in);
-    const __m128i r2 = _mm_or_si128(_in, struct_mask);
-    const __m128i r3 = _mm_shuffle_epi8(structural_table, r1);
+  op = in.map([&](auto _in) {
+    const __m128i r1 = _mm_add_epi8(op_offset, _in);
+    const __m128i r2 = _mm_or_si128(_in, op_mask);
+    const __m128i r3 = _mm_shuffle_epi8(operator_table, r1);
     return _mm_cmpeq_epi8(r2, r3);
   }).to_bitmask();
 }

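The table-driven classification above relies on pshufb semantics: each byte's low nibble indexes the 16-entry table, and a set high bit forces the lookup to zero. A scalar model of the whitespace half, using the exact table from the patch (the function name is ours, for illustration only):

    #include <cstdint>

    // Scalar model of the _mm_shuffle_epi8 whitespace test: look up the table
    // entry by the byte's low nibble (bytes with the high bit set yield 0);
    // the byte is JSON whitespace iff the entry equals the byte itself.
    // Table from the patch: entry[nibble] holds the one whitespace byte with
    // that low nibble (' '=0x20, '\t'=0x09, '\n'=0x0a, '\r'=0x0d); the values
    // 100/112/113/17/2 are fillers chosen so they can never match.
    static bool is_json_whitespace(uint8_t b) {
      static const uint8_t white_table[16] = {32,  100, 100, 100, 17,  100, 113, 2,
                                              100, 9,   10,  112, 100, 13,  100, 100};
      const uint8_t looked_up = (b & 0x80) ? 0 : white_table[b & 0x0F];
      return looked_up == b;
    }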