Merge pull request #197 from lemire/multiple_implementation_refactoring

Partial refactoring (stage1) to facilitate multiple implementations.
This commit is contained in:
ioioioio 2019-07-02 10:42:51 -04:00 committed by GitHub
commit 78406ba954
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 996 additions and 840 deletions

View File

@ -144,7 +144,8 @@ int main(int argc, char *argv[]) {
std::cout << "[verbose] allocated memory for parsed JSON " << std::endl;
}
unified.start();
isok = (find_structural_bits(p.data(), p.size(), pj) == simdjson::SUCCESS);
// The default template argument is simdjson::instruction_set::native.
isok = (find_structural_bits<>(p.data(), p.size(), pj) == simdjson::SUCCESS);
unified.end(results);
cy1 += results[0];
cl1 += results[1];
@ -185,7 +186,8 @@ int main(int argc, char *argv[]) {
}
auto start = std::chrono::steady_clock::now();
isok = (find_structural_bits(p.data(), p.size(), pj) == simdjson::SUCCESS);
// The default template argument is simdjson::instruction_set::native.
isok = (find_structural_bits<>(p.data(), p.size(), pj) == simdjson::SUCCESS);
isok = isok && (simdjson::SUCCESS == unified_machine(p.data(), p.size(), pj));
auto end = std::chrono::steady_clock::now();
std::chrono::duration<double> secs = end - start;

View File

@ -180,7 +180,8 @@ int main(int argc, char *argv[]) {
results.resize(evts.size());
for (uint32_t i = 0; i < iterations; i++) {
unified.start();
bool isok = (find_structural_bits(p.data(), p.size(), pj) == simdjson::SUCCESS);
// The default template argument is simdjson::instruction_set::native.
bool isok = (find_structural_bits<>(p.data(), p.size(), pj) == simdjson::SUCCESS);
unified.end(results);
cy1 += results[0];

View File

@ -8,6 +8,70 @@
#include "simdjson/stage1_find_marks.h"
#include "simdjson/stage2_build_tape.h"
#include "simdjson/simdjson.h"
#ifdef _MSC_VER
#include <windows.h>
#include <sysinfoapi.h>
#else
#include <unistd.h>
#endif
// The function that users are expected to call is json_parse.
// We have more than one such function because we want to support several
// instruction sets.
// function pointer type for json_parse
using json_parse_functype = int (const uint8_t *buf, size_t len, ParsedJson &pj, bool reallocifneeded);
// Pointer that holds the json_parse implementation corresponding to the available SIMD instruction set
extern json_parse_functype *json_parse_ptr;
// json_parse_implementation is the generic function, it is specialized for various
// SIMD instruction sets, e.g., as json_parse_implementation<simdjson::instruction_set::avx2>
// or json_parse_implementation<simdjson::instruction_set::neon>
template<simdjson::instruction_set T>
int json_parse_implementation(const uint8_t *buf, size_t len, ParsedJson &pj, bool reallocifneeded = true) {
if (pj.bytecapacity < len) {
return simdjson::CAPACITY;
}
bool reallocated = false;
if(reallocifneeded) {
#ifdef ALLOW_SAME_PAGE_BUFFER_OVERRUN
// realloc is needed if the end of the memory crosses a page
#ifdef _MSC_VER
SYSTEM_INFO sysInfo;
GetSystemInfo(&sysInfo);
long pagesize = sysInfo.dwPageSize;
#else
long pagesize = sysconf (_SC_PAGESIZE);
#endif
//////////////
// We want to check that buf + len - 1 and buf + len - 1 + SIMDJSON_PADDING
// are in the same page.
// That is, we want to check that
// (buf + len - 1) / pagesize == (buf + len - 1 + SIMDJSON_PADDING) / pagesize
// That's true if (buf + len - 1) % pagesize + SIMDJSON_PADDING < pagesize.
///////////
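// Numeric example (illustrative values only): with pagesize = 4096 and
// SIMDJSON_PADDING = 32, a buffer whose last byte sits at page offset 4000
// gives 4000 + 32 = 4032 < 4096, so the padded read stays within the page
// and no reallocation is needed.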
if ( (reinterpret_cast<uintptr_t>(buf + len - 1) % pagesize ) + SIMDJSON_PADDING < static_cast<uintptr_t>(pagesize) ) {
#else // ALLOW_SAME_PAGE_BUFFER_OVERRUN
if(true) { // if ALLOW_SAME_PAGE_BUFFER_OVERRUN is not defined, we always reallocate
#endif
const uint8_t *tmpbuf = buf;
buf = (uint8_t *) allocate_padded_buffer(len);
if(buf == NULL) return simdjson::MEMALLOC;
memcpy((void*)buf,tmpbuf,len);
reallocated = true;
} // end of the (possibly conditional) reallocation branch
} // if(reallocifneeded)
int stage1_is_ok = find_structural_bits<T>(buf, len, pj);
if(stage1_is_ok != simdjson::SUCCESS) {
pj.errorcode = stage1_is_ok;
return pj.errorcode;
}
int res = unified_machine(buf, len, pj);
if(reallocated) { aligned_free((void*)buf);}
return res;
}
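// For illustration (not part of this diff): a caller could, in principle,
// bypass the dispatch pointer and instantiate a specific implementation
// directly, e.g.:
//   int ret = json_parse_implementation<simdjson::instruction_set::avx2>(buf, len, pj);
// whereas json_parse() below goes through json_parse_ptr, which selects the
// best available specialization at runtime on the first call.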
// Parse a document found in buf.
// You need to preallocate ParsedJson with a capacity of len (e.g., pj.allocateCapacity(len)).
@ -24,8 +88,10 @@
// The input buf should be readable up to buf + len + SIMDJSON_PADDING if reallocifneeded is false,
// all bytes at and after buf + len are ignored (can be garbage).
// The ParsedJson object can be reused.
WARN_UNUSED
int json_parse(const uint8_t *buf, size_t len, ParsedJson &pj, bool reallocifneeded = true);
inline int json_parse(const uint8_t *buf, size_t len, ParsedJson &pj, bool reallocifneeded = true) {
return json_parse_ptr(buf, len, pj, reallocifneeded);
}
// Parse a document found in buf.
// You need to preallocate ParsedJson with a capacity of len (e.g., pj.allocateCapacity(len)).
@ -43,9 +109,8 @@ int json_parse(const uint8_t *buf, size_t len, ParsedJson &pj, bool reallocifnee
// The input buf should be readable up to buf + len + SIMDJSON_PADDING if reallocifneeded is false,
// all bytes at and after buf + len are ignored (can be garbage).
// The ParsedJson object can be reused.
WARN_UNUSED
inline int json_parse(const char * buf, size_t len, ParsedJson &pj, bool reallocifneeded = true) {
return json_parse(reinterpret_cast<const uint8_t *>(buf), len, pj, reallocifneeded);
return json_parse_ptr(reinterpret_cast<const uint8_t *>(buf), len, pj, reallocifneeded);
}
// We do not want to allow implicit conversion from C string to std::string.
@ -61,7 +126,6 @@ int json_parse(const char * buf, ParsedJson &pj) = delete;
//
// A temporary buffer is created when needed during processing
// (a copy of the input string is made).
WARN_UNUSED
inline int json_parse(const std::string &s, ParsedJson &pj) {
return json_parse(s.data(), s.length(), pj, true);
}
@ -76,7 +140,6 @@ inline int json_parse(const std::string &s, ParsedJson &pj) {
//
// You can also check validity
// by calling pj.isValid(). The same ParsedJson can be reused for other documents.
WARN_UNUSED
inline int json_parse(const padded_string &s, ParsedJson &pj) {
return json_parse(s.data(), s.length(), pj, false);
}
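// A minimal end-to-end usage sketch (illustrative; error handling elided). It
// assumes the simdjson 0.x API shown above, plus get_corpus() from simdjson's
// jsonioutil.h to load a file into a padded_string:
//
//   padded_string p = get_corpus("example.json");
//   ParsedJson pj;
//   if (!pj.allocateCapacity(p.size())) { /* allocation failure */ }
//   int res = json_parse(p, pj); // returns simdjson::SUCCESS on success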

View File

@ -4,6 +4,26 @@
#include <string>
struct simdjson {
enum class instruction_set {
avx2,
sse4_2,
neon,
none,
// the 'native' enum class value should point at a good default on the current machine
#ifdef __AVX2__
native = avx2
#elif defined(__ARM_NEON)
native = neon
#else
// Let us assume that we have an old x64 processor, but one that has SSE4.2
// (i.e., something that came out in the first decade of the XXIst century).
// It would be nicer to check explicitly, but there may not be a good way to do so
// that is cross-platform.
// Under Visual Studio, there is no way to check for SSE4.2 support at compile-time.
native = sse4_2
#endif
};
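// Illustrative note: the 'native' alias gives templated code a sensible
// compile-time default while still allowing an explicit choice. For example,
// with find_structural_bits from stage1_find_marks.h:
//   find_structural_bits<>(buf, len, pj);                                // instruction_set::native
//   find_structural_bits<simdjson::instruction_set::avx2>(buf, len, pj); // forced AVX2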
enum errorValues {
SUCCESS = 0,
CAPACITY, // This ParsedJson can't support a document that big

View File

@ -1,14 +1,857 @@
#ifndef SIMDJSON_STAGE1_FIND_MARKS_H
#define SIMDJSON_STAGE1_FIND_MARKS_H
#include <cassert>
#include "simdjson/common_defs.h"
#include "simdjson/parsedjson.h"
#include "simdjson/portability.h"
struct ParsedJson;
#ifdef __AVX2__
WARN_UNUSED
int find_structural_bits(const uint8_t *buf, size_t len, ParsedJson &pj);
WARN_UNUSED
int find_structural_bits(const char *buf, size_t len, ParsedJson &pj);
#ifndef SIMDJSON_SKIPUTF8VALIDATION
#define SIMDJSON_UTF8VALIDATE
#endif
#else
// Currently we don't do UTF-8 validation on ARM.
// Also, we assume that if you're not __AVX2__,
// you're ARM, which is a bit dumb. TODO: Fix...
#ifdef __ARM_NEON
#include <arm_neon.h>
#else
#warning It appears that neither ARM NEON nor AVX2 are detected.
#endif // __ARM_NEON
#endif // __AVX2__
// It seems that many parsers do UTF-8 validation.
// RapidJSON does not do it by default, but a flag
// allows it.
#ifdef SIMDJSON_UTF8VALIDATE
#include "simdjson/simdutf8check.h"
#endif
#define TRANSPOSE
template<simdjson::instruction_set>
struct simd_input;
#ifdef __AVX2__
template<>
struct simd_input<simdjson::instruction_set::avx2>
{
__m256i lo;
__m256i hi;
};
#endif
#ifdef __ARM_NEON
template<> struct simd_input<simdjson::instruction_set::neon>
{
#ifndef TRANSPOSE
uint8x16_t i0;
uint8x16_t i1;
uint8x16_t i2;
uint8x16_t i3;
#else
uint8x16x4_t i;
#endif
};
#endif
#ifdef __ARM_NEON
really_inline
uint16_t neonmovemask(uint8x16_t input) {
const uint8x16_t bitmask = { 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
uint8x16_t minput = vandq_u8(input, bitmask);
uint8x16_t tmp = vpaddq_u8(minput, minput);
tmp = vpaddq_u8(tmp, tmp);
tmp = vpaddq_u8(tmp, tmp);
return vgetq_lane_u16(vreinterpretq_u16_u8(tmp), 0);
}
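// For reference (illustrative, not part of this diff): a portable scalar
// equivalent of neonmovemask. Bit i of the result is bit (i % 8) of input
// byte i, which is exactly what the bitmask constant selects; for comparison
// results (bytes of 0x00 or 0xFF) this reduces to "byte i is nonzero".
static inline uint16_t scalar_movemask(const uint8_t in[16]) {
  uint16_t r = 0;
  for (int i = 0; i < 16; i++) {
    r |= (uint16_t)((in[i] >> (i % 8)) & 1) << i;
  }
  return r;
}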
really_inline
uint64_t neonmovemask_bulk(uint8x16_t p0, uint8x16_t p1, uint8x16_t p2, uint8x16_t p3) {
#ifndef TRANSPOSE
const uint8x16_t bitmask = { 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
uint8x16_t t0 = vandq_u8(p0, bitmask);
uint8x16_t t1 = vandq_u8(p1, bitmask);
uint8x16_t t2 = vandq_u8(p2, bitmask);
uint8x16_t t3 = vandq_u8(p3, bitmask);
uint8x16_t sum0 = vpaddq_u8(t0, t1);
uint8x16_t sum1 = vpaddq_u8(t2, t3);
sum0 = vpaddq_u8(sum0, sum1);
sum0 = vpaddq_u8(sum0, sum0);
return vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0);
#else
const uint8x16_t bitmask1 = { 0x01, 0x10, 0x01, 0x10, 0x01, 0x10, 0x01, 0x10,
0x01, 0x10, 0x01, 0x10, 0x01, 0x10, 0x01, 0x10};
const uint8x16_t bitmask2 = { 0x02, 0x20, 0x02, 0x20, 0x02, 0x20, 0x02, 0x20,
0x02, 0x20, 0x02, 0x20, 0x02, 0x20, 0x02, 0x20};
const uint8x16_t bitmask3 = { 0x04, 0x40, 0x04, 0x40, 0x04, 0x40, 0x04, 0x40,
0x04, 0x40, 0x04, 0x40, 0x04, 0x40, 0x04, 0x40};
const uint8x16_t bitmask4 = { 0x08, 0x80, 0x08, 0x80, 0x08, 0x80, 0x08, 0x80,
0x08, 0x80, 0x08, 0x80, 0x08, 0x80, 0x08, 0x80};
#if 0
uint8x16_t t0 = vandq_u8(p0, bitmask1);
uint8x16_t t1 = vandq_u8(p1, bitmask2);
uint8x16_t t2 = vandq_u8(p2, bitmask3);
uint8x16_t t3 = vandq_u8(p3, bitmask4);
uint8x16_t tmp = vorrq_u8(vorrq_u8(t0, t1), vorrq_u8(t2, t3));
#else
uint8x16_t t0 = vandq_u8(p0, bitmask1);
uint8x16_t t1 = vbslq_u8(bitmask2, p1, t0);
uint8x16_t t2 = vbslq_u8(bitmask3, p2, t1);
uint8x16_t tmp = vbslq_u8(bitmask4, p3, t2);
#endif
uint8x16_t sum = vpaddq_u8(tmp, tmp);
return vgetq_lane_u64(vreinterpretq_u64_u8(sum), 0);
#endif
}
#endif
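// Note on the TRANSPOSE path: vld4q_u8 de-interleaves its input, so
// in.i.val[j] holds bytes j, j+4, j+8, ..., j+60 of the 64-byte block. The
// interleaved bitmask1..bitmask4 constants above are arranged so that the
// pairwise adds undo this transposition and the 64-bit mask comes out in
// source-byte order.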
template<simdjson::instruction_set T>
uint64_t compute_quote_mask(uint64_t quote_bits);
// In practice, if you have NEON or __PCLMUL__, you would
// always want to use them, but it might be useful, for research
// purposes, to disable them deliberately; that is what SIMDJSON_AVOID_CLMUL
// does.
// Also: we don't know of an instance where AVX2 is supported but
// clmul is not, so we check for both, to be sure.
#ifdef SIMDJSON_AVOID_CLMUL
template<simdjson::instruction_set T> really_inline
uint64_t compute_quote_mask(uint64_t quote_bits)
{
uint64_t quote_mask = quote_bits ^ (quote_bits << 1);
quote_mask = quote_mask ^ (quote_mask << 2);
quote_mask = quote_mask ^ (quote_mask << 4);
quote_mask = quote_mask ^ (quote_mask << 8);
quote_mask = quote_mask ^ (quote_mask << 16);
quote_mask = quote_mask ^ (quote_mask << 32);
return quote_mask;
}
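// Worked example (bit 0 = first byte): the XOR ladder above computes a prefix
// XOR, so mask bit i is the XOR of quote bits 0..i. For
// quote_bits = 00100100 (quotes at positions 2 and 5),
// quote_mask = 00011100: everything from the opening quote up to, but not
// including, the closing quote. The carryless-multiply versions below compute
// the same prefix XOR in a single instruction.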
#else
template<simdjson::instruction_set>
uint64_t compute_quote_mask(uint64_t quote_bits);
#ifdef __AVX2__
template<> really_inline
uint64_t compute_quote_mask<simdjson::instruction_set::avx2>(uint64_t quote_bits) {
uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128(
_mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFF), 0));
return quote_mask;
}
#endif
#ifdef __ARM_NEON
template<> really_inline
uint64_t compute_quote_mask<simdjson::instruction_set::neon>(uint64_t quote_bits) {
#ifdef __PCLMUL__ // x86-only macro, so this branch is dead code on ARM; might cause problems on runtime dispatch
uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128(
_mm_set_epi64x(0ULL, quote_bits),
_mm_set1_epi8(0xFF), 0));
#else
uint64_t quote_mask = vmull_p64( -1ULL, quote_bits);
#endif
return quote_mask;
}
#endif
#endif
#ifdef SIMDJSON_UTF8VALIDATE
template<simdjson::instruction_set T> really_inline
void check_utf8(simd_input<T> in,
__m256i &has_error,
struct avx_processed_utf_bytes &previous) {
__m256i highbit = _mm256_set1_epi8(0x80);
if ((_mm256_testz_si256(_mm256_or_si256(in.lo, in.hi), highbit)) == 1) {
// it is ascii, we just check continuation
has_error = _mm256_or_si256(
_mm256_cmpgt_epi8(
previous.carried_continuations,
_mm256_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1)),
has_error);
} else {
// it is not ascii so we have to do heavy work
previous = avxcheckUTF8Bytes(in.lo, &previous, &has_error);
previous = avxcheckUTF8Bytes(in.hi, &previous, &has_error);
}
}
#endif
template<simdjson::instruction_set T>
simd_input<T> fill_input(const uint8_t * ptr);
#ifdef __AVX2__
template<> really_inline
simd_input<simdjson::instruction_set::avx2> fill_input<simdjson::instruction_set::avx2>(const uint8_t * ptr) {
struct simd_input<simdjson::instruction_set::avx2> in;
in.lo = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr + 0));
in.hi = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr + 32));
return in;
}
#endif
#ifdef __ARM_NEON
template<> really_inline
simd_input<simdjson::instruction_set::neon> fill_input<simdjson::instruction_set::neon>(const uint8_t * ptr) {
struct simd_input<simdjson::instruction_set::neon> in;
#ifndef TRANSPOSE
in.i0 = vld1q_u8(ptr + 0);
in.i1 = vld1q_u8(ptr + 16);
in.i2 = vld1q_u8(ptr + 32);
in.i3 = vld1q_u8(ptr + 48);
#else
in.i = vld4q_u8(ptr);
#endif
return in;
}
#endif
// a straightforward comparison of a mask against input. 5 uops; would be
// cheaper in AVX512.
template<simdjson::instruction_set T>
uint64_t cmp_mask_against_input(simd_input<T> in, uint8_t m);
#ifdef __AVX2__
template<> really_inline
uint64_t cmp_mask_against_input<simdjson::instruction_set::avx2>(simd_input<simdjson::instruction_set::avx2> in, uint8_t m) {
const __m256i mask = _mm256_set1_epi8(m);
__m256i cmp_res_0 = _mm256_cmpeq_epi8(in.lo, mask);
uint64_t res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(cmp_res_0));
__m256i cmp_res_1 = _mm256_cmpeq_epi8(in.hi, mask);
uint64_t res_1 = _mm256_movemask_epi8(cmp_res_1);
return res_0 | (res_1 << 32);
}
#endif
#ifdef __ARM_NEON
template<> really_inline
uint64_t cmp_mask_against_input<simdjson::instruction_set::neon>(simd_input<simdjson::instruction_set::neon> in, uint8_t m) {
const uint8x16_t mask = vmovq_n_u8(m);
uint8x16_t cmp_res_0 = vceqq_u8(in.i.val[0], mask);
uint8x16_t cmp_res_1 = vceqq_u8(in.i.val[1], mask);
uint8x16_t cmp_res_2 = vceqq_u8(in.i.val[2], mask);
uint8x16_t cmp_res_3 = vceqq_u8(in.i.val[3], mask);
return neonmovemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3);
}
#endif
// find all values less than or equal to the content of maxval (using unsigned arithmetic)
template<simdjson::instruction_set T>
uint64_t unsigned_lteq_against_input(simd_input<T> in, uint8_t m);
#ifdef __AVX2__
template<> really_inline
uint64_t unsigned_lteq_against_input<simdjson::instruction_set::avx2>(simd_input<simdjson::instruction_set::avx2> in, uint8_t m) {
const __m256i maxval = _mm256_set1_epi8(m);
__m256i cmp_res_0 = _mm256_cmpeq_epi8(_mm256_max_epu8(maxval,in.lo),maxval);
uint64_t res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(cmp_res_0));
__m256i cmp_res_1 = _mm256_cmpeq_epi8(_mm256_max_epu8(maxval,in.hi),maxval);
uint64_t res_1 = _mm256_movemask_epi8(cmp_res_1);
return res_0 | (res_1 << 32);
}
#endif
#ifdef __ARM_NEON
template<> really_inline
uint64_t unsigned_lteq_against_input<simdjson::instruction_set::neon>(simd_input<simdjson::instruction_set::neon> in, uint8_t m) {
const uint8x16_t mask = vmovq_n_u8(m);
uint8x16_t cmp_res_0 = vcleq_u8(in.i.val[0], mask);
uint8x16_t cmp_res_1 = vcleq_u8(in.i.val[1], mask);
uint8x16_t cmp_res_2 = vcleq_u8(in.i.val[2], mask);
uint8x16_t cmp_res_3 = vcleq_u8(in.i.val[3], mask);
return neonmovemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3);
}
#endif
// return a bitvector indicating where we have characters that end an odd-length
// sequence of backslashes (and thus change the behavior of the next character
// to follow). An even-length sequence of backslashes, and, for that matter, the
// largest even-length prefix of our odd-length sequence of backslashes, simply
// modify the behavior of the backslashes themselves.
// We also update the prev_iter_ends_odd_backslash reference parameter to
// indicate whether we end an iteration on an odd-length sequence of
// backslashes, which modifies our subsequent search for odd-length
// sequences of backslashes in an obvious way.
template<simdjson::instruction_set T> really_inline
uint64_t find_odd_backslash_sequences(simd_input<T> in, uint64_t &prev_iter_ends_odd_backslash) {
const uint64_t even_bits = 0x5555555555555555ULL;
const uint64_t odd_bits = ~even_bits;
uint64_t bs_bits = cmp_mask_against_input(in, '\\');
uint64_t start_edges = bs_bits & ~(bs_bits << 1);
// flip lowest if we have an odd-length run at the end of the prior
// iteration
uint64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash;
uint64_t even_starts = start_edges & even_start_mask;
uint64_t odd_starts = start_edges & ~even_start_mask;
uint64_t even_carries = bs_bits + even_starts;
uint64_t odd_carries;
// must record the carry-out of our odd-carries out of bit 63; this
// indicates whether the sense of any edge going to the next iteration
// should be flipped
bool iter_ends_odd_backslash =
add_overflow(bs_bits, odd_starts, &odd_carries);
odd_carries |=
prev_iter_ends_odd_backslash; // push in bit zero as a potential end
// if we had an odd-numbered run at the
// end of the previous iteration
prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL;
uint64_t even_carry_ends = even_carries & ~bs_bits;
uint64_t odd_carry_ends = odd_carries & ~bs_bits;
uint64_t even_start_odd_end = even_carry_ends & odd_bits;
uint64_t odd_start_even_end = odd_carry_ends & even_bits;
uint64_t odd_ends = even_start_odd_end | odd_start_even_end;
return odd_ends;
}
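// Worked example (bit 0 = first byte): for the four input bytes \\\" (three
// backslashes, then a quote), bs_bits = 0111 and start_edges = 0001. Adding
// the even-position start bit ripples through the run, and the carry lands on
// the quote at odd position 3, so odd_ends = 1000: the quote is escaped.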
// return both the quote mask (which is a half-open mask that covers the first
// quote in an unescaped quote pair and everything in the quote pair) and the
// quote bits, which are the simple unescaped quoted bits. We also update the
// prev_iter_inside_quote value to tell the next iteration whether we finished
// the final iteration inside a quote pair; if so, this inverts our behavior of
// whether we're inside quotes for the next iteration.
// Note that we don't do any error checking to see if we have backslash
// sequences outside quotes; these backslash sequences (of any length) will be
// detected elsewhere.
template<simdjson::instruction_set T> really_inline
uint64_t find_quote_mask_and_bits(simd_input<T> in, uint64_t odd_ends,
uint64_t &prev_iter_inside_quote, uint64_t &quote_bits, uint64_t &error_mask) {
quote_bits = cmp_mask_against_input<T>(in, '"');
quote_bits = quote_bits & ~odd_ends;
uint64_t quote_mask = compute_quote_mask<T>(quote_bits);
quote_mask ^= prev_iter_inside_quote;
// All Unicode characters may be placed within the
// quotation marks, except for the characters that MUST be escaped:
// quotation mark, reverse solidus, and the control characters (U+0000
//through U+001F).
// https://tools.ietf.org/html/rfc8259
uint64_t unescaped = unsigned_lteq_against_input<T>(in, 0x1F);
error_mask |= quote_mask & unescaped;
// right shift of a signed value is expected to be well-defined and
// standard-compliant as of C++20;
// John Regehr from Utah U. says this is fine code
prev_iter_inside_quote =
static_cast<uint64_t>(static_cast<int64_t>(quote_mask) >> 63);
return quote_mask;
}
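// Example: quotes at positions 2 and 5 yield quote_mask bits 2..4, a
// half-open interval that includes the opening quote but not the closing one.
// The signed shift above broadcasts bit 63, so a string left open at the end
// of this chunk carries an all-ones prev_iter_inside_quote into the next
// iteration, flipping its notion of inside/outside.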
// do a 'shufti' to detect structural JSON characters
// they are { 0x7b } 0x7d : 0x3a [ 0x5b ] 0x5d , 0x2c
// these go into the first 3 buckets of the comparison (1/2/4)
// we are also interested in the four whitespace characters
// space 0x20, linefeed 0x0a, horizontal tab 0x09 and carriage return 0x0d
// these go into the next 2 buckets of the comparison (8/16)
template<simdjson::instruction_set T>
void find_whitespace_and_structurals(simd_input<T> in,
uint64_t &whitespace,
uint64_t &structurals);
#ifdef __AVX2__
template<> really_inline
void find_whitespace_and_structurals<simdjson::instruction_set::avx2>(simd_input<simdjson::instruction_set::avx2> in,
uint64_t &whitespace,
uint64_t &structurals) {
#ifdef SIMDJSON_NAIVE_STRUCTURAL
// You should never need this naive approach, but it can be useful
// for research purposes
const __m256i mask_open_brace = _mm256_set1_epi8(0x7b);
__m256i struct_lo = _mm256_cmpeq_epi8(in.lo, mask_open_brace);
__m256i struct_hi = _mm256_cmpeq_epi8(in.hi, mask_open_brace);
const __m256i mask_close_brace = _mm256_set1_epi8(0x7d);
struct_lo = _mm256_or_si256(struct_lo,_mm256_cmpeq_epi8(in.lo, mask_close_brace));
struct_hi = _mm256_or_si256(struct_hi,_mm256_cmpeq_epi8(in.hi, mask_close_brace));
const __m256i mask_open_bracket = _mm256_set1_epi8(0x5b);
struct_lo = _mm256_or_si256(struct_lo,_mm256_cmpeq_epi8(in.lo, mask_open_bracket));
struct_hi = _mm256_or_si256(struct_hi,_mm256_cmpeq_epi8(in.hi, mask_open_bracket));
const __m256i mask_close_bracket = _mm256_set1_epi8(0x5d);
struct_lo = _mm256_or_si256(struct_lo,_mm256_cmpeq_epi8(in.lo, mask_close_bracket));
struct_hi = _mm256_or_si256(struct_hi,_mm256_cmpeq_epi8(in.hi, mask_close_bracket));
const __m256i mask_column = _mm256_set1_epi8(0x3a);
struct_lo = _mm256_or_si256(struct_lo,_mm256_cmpeq_epi8(in.lo, mask_column));
struct_hi = _mm256_or_si256(struct_hi,_mm256_cmpeq_epi8(in.hi, mask_column));
const __m256i mask_comma = _mm256_set1_epi8(0x2c);
struct_lo = _mm256_or_si256(struct_lo,_mm256_cmpeq_epi8(in.lo, mask_comma));
struct_hi = _mm256_or_si256(struct_hi,_mm256_cmpeq_epi8(in.hi, mask_comma));
uint64_t structural_res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(struct_lo));
uint64_t structural_res_1 = _mm256_movemask_epi8(struct_hi);
structurals = (structural_res_0 | (structural_res_1 << 32));
const __m256i mask_space = _mm256_set1_epi8(0x20);
__m256i space_lo = _mm256_cmpeq_epi8(in.lo, mask_space);
__m256i space_hi = _mm256_cmpeq_epi8(in.hi, mask_space);
const __m256i mask_linefeed = _mm256_set1_epi8(0x0a);
space_lo = _mm256_or_si256(space_lo,_mm256_cmpeq_epi8(in.lo, mask_linefeed));
space_hi = _mm256_or_si256(space_hi,_mm256_cmpeq_epi8(in.hi, mask_linefeed));
const __m256i mask_tab = _mm256_set1_epi8(0x09);
space_lo = _mm256_or_si256(space_lo,_mm256_cmpeq_epi8(in.lo, mask_tab));
space_hi = _mm256_or_si256(space_hi,_mm256_cmpeq_epi8(in.hi, mask_tab));
const __m256i mask_carriage = _mm256_set1_epi8(0x0d);
space_lo = _mm256_or_si256(space_lo,_mm256_cmpeq_epi8(in.lo, mask_carriage));
space_hi = _mm256_or_si256(space_hi,_mm256_cmpeq_epi8(in.hi, mask_carriage));
uint64_t ws_res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(space_lo));
uint64_t ws_res_1 = _mm256_movemask_epi8(space_hi);
whitespace = (ws_res_0 | (ws_res_1 << 32));
// end of naive approach
#else // SIMDJSON_NAIVE_STRUCTURAL
const __m256i low_nibble_mask = _mm256_setr_epi8(
16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0,
16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0);
const __m256i high_nibble_mask = _mm256_setr_epi8(
8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0,
8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0);
__m256i structural_shufti_mask = _mm256_set1_epi8(0x7);
__m256i whitespace_shufti_mask = _mm256_set1_epi8(0x18);
__m256i v_lo = _mm256_and_si256(
_mm256_shuffle_epi8(low_nibble_mask, in.lo),
_mm256_shuffle_epi8(high_nibble_mask,
_mm256_and_si256(_mm256_srli_epi32(in.lo, 4),
_mm256_set1_epi8(0x7f))));
__m256i v_hi = _mm256_and_si256(
_mm256_shuffle_epi8(low_nibble_mask, in.hi),
_mm256_shuffle_epi8(high_nibble_mask,
_mm256_and_si256(_mm256_srli_epi32(in.hi, 4),
_mm256_set1_epi8(0x7f))));
__m256i tmp_lo = _mm256_cmpeq_epi8(
_mm256_and_si256(v_lo, structural_shufti_mask), _mm256_set1_epi8(0));
__m256i tmp_hi = _mm256_cmpeq_epi8(
_mm256_and_si256(v_hi, structural_shufti_mask), _mm256_set1_epi8(0));
uint64_t structural_res_0 =
static_cast<uint32_t>(_mm256_movemask_epi8(tmp_lo));
uint64_t structural_res_1 = _mm256_movemask_epi8(tmp_hi);
structurals = ~(structural_res_0 | (structural_res_1 << 32));
__m256i tmp_ws_lo = _mm256_cmpeq_epi8(
_mm256_and_si256(v_lo, whitespace_shufti_mask), _mm256_set1_epi8(0));
__m256i tmp_ws_hi = _mm256_cmpeq_epi8(
_mm256_and_si256(v_hi, whitespace_shufti_mask), _mm256_set1_epi8(0));
uint64_t ws_res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(tmp_ws_lo));
uint64_t ws_res_1 = _mm256_movemask_epi8(tmp_ws_hi);
whitespace = ~(ws_res_0 | (ws_res_1 << 32));
#endif // SIMDJSON_NAIVE_STRUCTURAL
}
#endif
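// Worked example of the shufti classification above: ',' is 0x2c, so its low
// nibble (0xc) looks up 2 in low_nibble_mask and its high nibble (0x2) looks
// up 18 in high_nibble_mask; 2 & 18 = 2 intersects structural_shufti_mask
// (0x7), so ',' is structural. Likewise ' ' (0x20) yields 16 & 18 = 16, which
// intersects whitespace_shufti_mask (0x18).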
#ifdef __ARM_NEON
template<> really_inline
void find_whitespace_and_structurals<simdjson::instruction_set::neon>(
simd_input<simdjson::instruction_set::neon> in,
uint64_t &whitespace,
uint64_t &structurals) {
#ifndef FUNKY_BAD_TABLE
const uint8x16_t low_nibble_mask = (uint8x16_t){
16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0};
const uint8x16_t high_nibble_mask = (uint8x16_t){
8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0};
const uint8x16_t structural_shufti_mask = vmovq_n_u8(0x7);
const uint8x16_t whitespace_shufti_mask = vmovq_n_u8(0x18);
const uint8x16_t low_nib_and_mask = vmovq_n_u8(0xf);
uint8x16_t nib_0_lo = vandq_u8(in.i.val[0], low_nib_and_mask);
uint8x16_t nib_0_hi = vshrq_n_u8(in.i.val[0], 4);
uint8x16_t shuf_0_lo = vqtbl1q_u8(low_nibble_mask, nib_0_lo);
uint8x16_t shuf_0_hi = vqtbl1q_u8(high_nibble_mask, nib_0_hi);
uint8x16_t v_0 = vandq_u8(shuf_0_lo, shuf_0_hi);
uint8x16_t nib_1_lo = vandq_u8(in.i.val[1], low_nib_and_mask);
uint8x16_t nib_1_hi = vshrq_n_u8(in.i.val[1], 4);
uint8x16_t shuf_1_lo = vqtbl1q_u8(low_nibble_mask, nib_1_lo);
uint8x16_t shuf_1_hi = vqtbl1q_u8(high_nibble_mask, nib_1_hi);
uint8x16_t v_1 = vandq_u8(shuf_1_lo, shuf_1_hi);
uint8x16_t nib_2_lo = vandq_u8(in.i.val[2], low_nib_and_mask);
uint8x16_t nib_2_hi = vshrq_n_u8(in.i.val[2], 4);
uint8x16_t shuf_2_lo = vqtbl1q_u8(low_nibble_mask, nib_2_lo);
uint8x16_t shuf_2_hi = vqtbl1q_u8(high_nibble_mask, nib_2_hi);
uint8x16_t v_2 = vandq_u8(shuf_2_lo, shuf_2_hi);
uint8x16_t nib_3_lo = vandq_u8(in.i.val[3], low_nib_and_mask);
uint8x16_t nib_3_hi = vshrq_n_u8(in.i.val[3], 4);
uint8x16_t shuf_3_lo = vqtbl1q_u8(low_nibble_mask, nib_3_lo);
uint8x16_t shuf_3_hi = vqtbl1q_u8(high_nibble_mask, nib_3_hi);
uint8x16_t v_3 = vandq_u8(shuf_3_lo, shuf_3_hi);
uint8x16_t tmp_0 = vtstq_u8(v_0, structural_shufti_mask);
uint8x16_t tmp_1 = vtstq_u8(v_1, structural_shufti_mask);
uint8x16_t tmp_2 = vtstq_u8(v_2, structural_shufti_mask);
uint8x16_t tmp_3 = vtstq_u8(v_3, structural_shufti_mask);
structurals = neonmovemask_bulk(tmp_0, tmp_1, tmp_2, tmp_3);
uint8x16_t tmp_ws_0 = vtstq_u8(v_0, whitespace_shufti_mask);
uint8x16_t tmp_ws_1 = vtstq_u8(v_1, whitespace_shufti_mask);
uint8x16_t tmp_ws_2 = vtstq_u8(v_2, whitespace_shufti_mask);
uint8x16_t tmp_ws_3 = vtstq_u8(v_3, whitespace_shufti_mask);
whitespace = neonmovemask_bulk(tmp_ws_0, tmp_ws_1, tmp_ws_2, tmp_ws_3);
#else
// I think this one is garbage. In order to save the expense
// of another shuffle, I use an equally expensive shift, and
// this gets glued to the end of the dependency chain. Seems a bit
// slower for no good reason.
//
// need to use a weird arrangement. Bytes in this bitvector
// are in conventional order, but bits are reversed as we are
// using a signed left shift (that is a +ve value from 0..7) to
// shift upwards to 0x80 in the bit. So we need to reverse bits.
// note no structural/whitespace has the high bit on
// so it's OK to put the high 5 bits into our TBL shuffle
//
// structurals are { 0x7b } 0x7d : 0x3a [ 0x5b ] 0x5d , 0x2c
// or in 5 bit, 3 bit form that's
// (15,3) (15, 5) (7,2) (11,3) (11,5) (5,4)
// bit-reversing (subtract low 3 bits from 7) yields:
// (15,4) (15, 2) (7,5) (11,4) (11,2) (5,3)
const uint8x16_t structural_bitvec = (uint8x16_t){
0, 0, 0, 0,
0, 8, 0, 32,
0, 0, 0, 20,
0, 0, 0, 20};
// we are also interested in the four whitespace characters
// space 0x20, linefeed 0x0a, horizontal tab 0x09 and carriage return 0x0d
// (4,0) (1, 2) (1, 1) (1, 5)
// bit-reversing (subtract low 3 bits from 7) yields:
// (4,7) (1, 5) (1, 6) (1, 2)
const uint8x16_t whitespace_bitvec = (uint8x16_t){
0, 100, 0, 0,
128, 0, 0, 0,
0, 0, 0, 0,
0, 0, 0, 0};
const uint8x16_t low_3bits_and_mask = vmovq_n_u8(0x7);
const uint8x16_t high_1bit_tst_mask = vmovq_n_u8(0x80);
int8x16_t low_3bits_0 = vreinterpretq_s8_u8(vandq_u8(in.i.val[0], low_3bits_and_mask));
uint8x16_t high_5bits_0 = vshrq_n_u8(in.i.val[0], 3);
uint8x16_t shuffle_structural_0 = vshlq_u8(vqtbl1q_u8(structural_bitvec, high_5bits_0), low_3bits_0);
uint8x16_t shuffle_ws_0 = vshlq_u8(vqtbl1q_u8(whitespace_bitvec, high_5bits_0), low_3bits_0);
uint8x16_t tmp_0 = vtstq_u8(shuffle_structural_0, high_1bit_tst_mask);
uint8x16_t tmp_ws_0 = vtstq_u8(shuffle_ws_0, high_1bit_tst_mask);
int8x16_t low_3bits_1 = vreinterpretq_s8_u8(vandq_u8(in.i.val[1], low_3bits_and_mask));
uint8x16_t high_5bits_1 = vshrq_n_u8(in.i.val[1], 3);
uint8x16_t shuffle_structural_1 = vshlq_u8(vqtbl1q_u8(structural_bitvec, high_5bits_1), low_3bits_1);
uint8x16_t shuffle_ws_1 = vshlq_u8(vqtbl1q_u8(whitespace_bitvec, high_5bits_1), low_3bits_1);
uint8x16_t tmp_1 = vtstq_u8(shuffle_structural_1, high_1bit_tst_mask);
uint8x16_t tmp_ws_1 = vtstq_u8(shuffle_ws_1, high_1bit_tst_mask);
int8x16_t low_3bits_2 = vreinterpretq_s8_u8(vandq_u8(in.i.val[2], low_3bits_and_mask));
uint8x16_t high_5bits_2 = vshrq_n_u8(in.i.val[2], 3);
uint8x16_t shuffle_structural_2 = vshlq_u8(vqtbl1q_u8(structural_bitvec, high_5bits_2), low_3bits_2);
uint8x16_t shuffle_ws_2 = vshlq_u8(vqtbl1q_u8(whitespace_bitvec, high_5bits_2), low_3bits_2);
uint8x16_t tmp_2 = vtstq_u8(shuffle_structural_2, high_1bit_tst_mask);
uint8x16_t tmp_ws_2 = vtstq_u8(shuffle_ws_2, high_1bit_tst_mask);
int8x16_t low_3bits_3 = vreinterpretq_s8_u8(vandq_u8(in.i.val[3], low_3bits_and_mask));
uint8x16_t high_5bits_3 = vshrq_n_u8(in.i.val[3], 3);
uint8x16_t shuffle_structural_3 = vshlq_u8(vqtbl1q_u8(structural_bitvec, high_5bits_3), low_3bits_3);
uint8x16_t shuffle_ws_3 = vshlq_u8(vqtbl1q_u8(whitespace_bitvec, high_5bits_3), low_3bits_3);
uint8x16_t tmp_3 = vtstq_u8(shuffle_structural_3, high_1bit_tst_mask);
uint8x16_t tmp_ws_3 = vtstq_u8(shuffle_ws_3, high_1bit_tst_mask);
structurals = neonmovemask_bulk(tmp_0, tmp_1, tmp_2, tmp_3);
whitespace = neonmovemask_bulk(tmp_ws_0, tmp_ws_1, tmp_ws_2, tmp_ws_3);
#endif
}
#endif
#ifdef SIMDJSON_NAIVE_FLATTEN // useful for benchmarking
//
// This is just a naive implementation. It should normally be
// disabled, but it can be used for research purposes to compare
// against our optimized version.
really_inline void flatten_bits(uint32_t *base_ptr, uint32_t &base,
uint32_t idx, uint64_t bits) {
uint32_t * out_ptr = base_ptr + base;
idx -= 64;
while(bits != 0) {
out_ptr[0] = idx + trailingzeroes(bits);
bits = bits & (bits - 1);
out_ptr++;
}
base = (out_ptr - base_ptr);
}
#else
// flatten out values in 'bits' assuming that they are to have values of idx
// plus their position in the bitvector, and store these indexes at
// base_ptr[base] incrementing base as we go
// will potentially store extra values beyond end of valid bits, so base_ptr
// needs to be large enough to handle this
really_inline void flatten_bits(uint32_t *base_ptr, uint32_t &base,
uint32_t idx, uint64_t bits) {
// In some instances, the next branch is expensive because it is mispredicted;
// unfortunately, in other cases, it helps tremendously.
if(bits == 0) return;
uint32_t cnt = hamming(bits);
uint32_t next_base = base + cnt;
idx -= 64;
base_ptr += base;
{
base_ptr[0] = idx + trailingzeroes(bits);
bits = bits & (bits - 1);
base_ptr[1] = idx + trailingzeroes(bits);
bits = bits & (bits - 1);
base_ptr[2] = idx + trailingzeroes(bits);
bits = bits & (bits - 1);
base_ptr[3] = idx + trailingzeroes(bits);
bits = bits & (bits - 1);
base_ptr[4] = idx + trailingzeroes(bits);
bits = bits & (bits - 1);
base_ptr[5] = idx + trailingzeroes(bits);
bits = bits & (bits - 1);
base_ptr[6] = idx + trailingzeroes(bits);
bits = bits & (bits - 1);
base_ptr[7] = idx + trailingzeroes(bits);
bits = bits & (bits - 1);
base_ptr += 8;
}
// We hope that the next branch is easily predicted.
if (cnt > 8) {
base_ptr[0] = idx + trailingzeroes(bits);
bits = bits & (bits - 1);
base_ptr[1] = idx + trailingzeroes(bits);
bits = bits & (bits - 1);
base_ptr[2] = idx + trailingzeroes(bits);
bits = bits & (bits - 1);
base_ptr[3] = idx + trailingzeroes(bits);
bits = bits & (bits - 1);
base_ptr[4] = idx + trailingzeroes(bits);
bits = bits & (bits - 1);
base_ptr[5] = idx + trailingzeroes(bits);
bits = bits & (bits - 1);
base_ptr[6] = idx + trailingzeroes(bits);
bits = bits & (bits - 1);
base_ptr[7] = idx + trailingzeroes(bits);
bits = bits & (bits - 1);
base_ptr += 8;
}
if (cnt > 16) { // unlucky: we rarely get here,
// since it means having one structural or pseudo-structural element
// every 4 characters (possible with inputs like "","","",...).
do {
base_ptr[0] = idx + trailingzeroes(bits);
bits = bits & (bits - 1);
base_ptr++;
} while(bits != 0);
}
base = next_base;
}
#endif
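// A reference loop (illustrative) against which either flatten_bits variant
// can be checked: every set bit in 'bits' must produce exactly one index.
//
//   uint32_t ref[64]; uint32_t n = 0;
//   for (int i = 0; i < 64; i++)
//     if (bits & (1ULL << i)) ref[n++] = idx - 64 + i;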
// return an updated structural bit vector with quoted contents cleared out and
// pseudo-structural characters added to the mask.
// Updates prev_iter_ends_pseudo_pred, which tells us whether the previous
// iteration ended on a whitespace or a structural character (which means that
// the next iteration will have a pseudo-structural character at its start).
really_inline uint64_t finalize_structurals(
uint64_t structurals, uint64_t whitespace, uint64_t quote_mask,
uint64_t quote_bits, uint64_t &prev_iter_ends_pseudo_pred) {
// mask off anything inside quotes
structurals &= ~quote_mask;
// add the real quote bits back into our bitmask as well, so we can
// quickly traverse the strings we've spent all this trouble gathering
structurals |= quote_bits;
// Now, establish "pseudo-structural characters". These are non-whitespace
// characters that are (a) outside quotes and (b) have a predecessor that's
// either whitespace or a structural character. This means that subsequent
// passes will get a chance to encounter the first character of every string
// of non-whitespace and, if we're parsing an atom like true/false/null or a
// number we can stop at the first whitespace or structural character
// following it.
// a qualified predecessor is something that can happen 1 position before a
// pseudo-structural character
uint64_t pseudo_pred = structurals | whitespace;
uint64_t shifted_pseudo_pred =
(pseudo_pred << 1) | prev_iter_ends_pseudo_pred;
prev_iter_ends_pseudo_pred = pseudo_pred >> 63;
uint64_t pseudo_structurals =
shifted_pseudo_pred & (~whitespace) & (~quote_mask);
structurals |= pseudo_structurals;
// now, we've used our close quotes all we need to. So let's switch them off
// they will be off in the quote mask and on in quote bits.
structurals &= ~(quote_bits & ~quote_mask);
return structurals;
}
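// Example: in the input {"a":1}, the '1' is not itself structural, but its
// predecessor ':' is, so shifted_pseudo_pred marks '1' as pseudo-structural
// and stage 2 will see where the atom starts; bits inside "a" are cleared by
// ~quote_mask.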
template<simdjson::instruction_set T = simdjson::instruction_set::native>
WARN_UNUSED
/*never_inline*/ int find_structural_bits(const uint8_t *buf, size_t len,
ParsedJson &pj) {
if (len > pj.bytecapacity) {
std::cerr << "Your ParsedJson object only supports documents up to "
<< pj.bytecapacity << " bytes but you are trying to process " << len
<< " bytes" << std::endl;
return simdjson::CAPACITY;
}
uint32_t *base_ptr = pj.structural_indexes;
uint32_t base = 0;
#ifdef SIMDJSON_UTF8VALIDATE
__m256i has_error = _mm256_setzero_si256();
struct avx_processed_utf_bytes previous {};
previous.rawbytes = _mm256_setzero_si256();
previous.high_nibbles = _mm256_setzero_si256();
previous.carried_continuations = _mm256_setzero_si256();
#endif
// we have padded the input out to 64 byte multiple with the remainder being
// zeros
// persistent state across loop
// does the last iteration end with an odd-length sequence of backslashes?
// either 0 or 1, but a 64-bit value
uint64_t prev_iter_ends_odd_backslash = 0ULL;
// does the previous iteration end inside a double-quote pair?
uint64_t prev_iter_inside_quote = 0ULL; // either all zeros or all ones
// does the previous iteration end on something that is a predecessor of a
// pseudo-structural character, i.e., whitespace or a structural character?
// Effectively, the very first char is considered to follow "whitespace" for
// the purposes of pseudo-structural character detection, so we initialize to 1.
uint64_t prev_iter_ends_pseudo_pred = 1ULL;
// structurals are persistent state across the loop, as we flatten them on the
// subsequent iteration into our array pointed to by base_ptr.
// This is harmless on the first iteration as structurals==0
// and is done for performance reasons; we can hide some of the latency of the
// expensive carryless multiply in the previous step with this work
uint64_t structurals = 0;
size_t lenminus64 = len < 64 ? 0 : len - 64;
size_t idx = 0;
uint64_t error_mask = 0; // for unescaped characters within strings (ASCII code points < 0x20)
for (; idx < lenminus64; idx += 64) {
#ifndef _MSC_VER
__builtin_prefetch(buf + idx + 128);
#endif
simd_input<T> in = fill_input<T>(buf+idx);
#ifdef SIMDJSON_UTF8VALIDATE
check_utf8(in, has_error, previous);
#endif
// detect odd sequences of backslashes
uint64_t odd_ends = find_odd_backslash_sequences<T>(
in, prev_iter_ends_odd_backslash);
// detect insides of quote pairs ("quote_mask") and also our quote_bits
// themselves
uint64_t quote_bits;
uint64_t quote_mask = find_quote_mask_and_bits<T>(
in, odd_ends, prev_iter_inside_quote, quote_bits, error_mask);
// take the previous iteration's structural bits, not our current iteration's,
// and flatten
flatten_bits(base_ptr, base, idx, structurals);
uint64_t whitespace;
find_whitespace_and_structurals<T>(in, whitespace, structurals);
// fixup structurals to reflect quotes and add pseudo-structural characters
structurals = finalize_structurals(structurals, whitespace, quote_mask,
quote_bits, prev_iter_ends_pseudo_pred);
}
////////////////
/// We use a giant copy-paste, which is ugly,
/// but otherwise the string needs to be properly padded or else we
/// risk invalidating the UTF-8 checks.
////////////
if (idx < len) {
uint8_t tmpbuf[64];
memset(tmpbuf, 0x20, 64);
memcpy(tmpbuf, buf + idx, len - idx);
simd_input<T> in = fill_input<T>(tmpbuf);
#ifdef SIMDJSON_UTF8VALIDATE
check_utf8(in, has_error, previous);
#endif
// detect odd sequences of backslashes
uint64_t odd_ends = find_odd_backslash_sequences<T>(
in, prev_iter_ends_odd_backslash);
// detect insides of quote pairs ("quote_mask") and also our quote_bits
// themselves
uint64_t quote_bits;
uint64_t quote_mask = find_quote_mask_and_bits<T>(
in, odd_ends, prev_iter_inside_quote, quote_bits, error_mask);
// take the previous iteration's structural bits, not our current iteration's,
// and flatten
flatten_bits(base_ptr, base, idx, structurals);
uint64_t whitespace;
find_whitespace_and_structurals<T>(in, whitespace, structurals);
// fixup structurals to reflect quotes and add pseudo-structural characters
structurals = finalize_structurals(structurals, whitespace, quote_mask,
quote_bits, prev_iter_ends_pseudo_pred);
idx += 64;
}
// is last string quote closed?
if (prev_iter_inside_quote) {
return simdjson::UNCLOSED_STRING;
}
// finally, flatten out the remaining structurals from the last iteration
flatten_bits(base_ptr, base, idx, structurals);
pj.n_structural_indexes = base;
// a valid JSON file cannot have zero structural indexes - we should have
// found something
if (pj.n_structural_indexes == 0u) {
fprintf(stderr, "Empty document?\n");
return simdjson::EMPTY;
}
if (base_ptr[pj.n_structural_indexes - 1] > len) {
fprintf(stderr, "Internal bug\n");
return simdjson::UNEXPECTED_ERROR;
}
if (len != base_ptr[pj.n_structural_indexes - 1]) {
// the string might not be NULL terminated, but we add a virtual NULL ending
// character.
base_ptr[pj.n_structural_indexes++] = len;
}
// make it safe to dereference one beyond this array
base_ptr[pj.n_structural_indexes] = 0;
if (error_mask) {
fprintf(stderr, "Unescaped characters\n");
return simdjson::UNESCAPED_CHARS;
}
#ifdef SIMDJSON_UTF8VALIDATE
return _mm256_testz_si256(has_error, has_error) == 0 ? simdjson::UTF8_ERROR : simdjson::SUCCESS;
#else
return simdjson::SUCCESS;
#endif
}
template<simdjson::instruction_set T = simdjson::instruction_set::native>
WARN_UNUSED
int find_structural_bits(const char *buf, size_t len, ParsedJson &pj) {
return find_structural_bits<T>(reinterpret_cast<const uint8_t *>(buf), len, pj);
}
#endif
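// Migration note, visible in the benchmark diffs above: call sites change from
// find_structural_bits(buf, len, pj) to find_structural_bits<>(buf, len, pj);
// the empty angle brackets select the default template argument,
// simdjson::instruction_set::native.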

View File

@ -7,57 +7,63 @@
#endif
#include "simdjson/simdjson.h"
// Parse a document found in buf; the ParsedJson must be preallocated.
WARN_UNUSED
int json_parse(const uint8_t *buf, size_t len, ParsedJson &pj, bool reallocifneeded) {
if (pj.bytecapacity < len) {
return simdjson::CAPACITY;
}
bool reallocated = false;
if(reallocifneeded) {
#ifdef ALLOW_SAME_PAGE_BUFFER_OVERRUN
// realloc is needed if the end of the memory crosses a page
#ifdef _MSC_VER
SYSTEM_INFO sysInfo;
GetSystemInfo(&sysInfo);
long pagesize = sysInfo.dwPageSize;
// Responsible for selecting the best json_parse implementation
int json_parse_dispatch(const uint8_t *buf, size_t len, ParsedJson &pj, bool reallocifneeded) {
// Versions for each implementation
#ifdef __AVX2__
json_parse_functype* avx_implementation = &json_parse_implementation<simdjson::instruction_set::avx2>;
#endif
#ifdef __SSE4_2__
// json_parse_functype* sse4_2_implementation = &json_parse_implementation<simdjson::instruction_set::sse4_2>; // not implemented yet
#endif
#ifdef __ARM_NEON
json_parse_functype* neon_implementation = &json_parse_implementation<simdjson::instruction_set::neon>;
#endif
// Determine which implementation is the most suitable.
// This should eventually be done at runtime; doing it in the preprocessor makes little sense.
#ifdef __AVX2__
simdjson::instruction_set best_implementation = simdjson::instruction_set::avx2;
#elif defined (__SSE4_2__)
simdjson::instruction_set best_implementation = simdjson::instruction_set::sse4_2;
#elif defined (__ARM_NEON)
simdjson::instruction_set best_implementation = simdjson::instruction_set::neon;
#else
long pagesize = sysconf (_SC_PAGESIZE);
simdjson::instruction_set best_implementation = simdjson::instruction_set::none;
#endif
//////////////
// We want to check that buf + len - 1 and buf + len - 1 + SIMDJSON_PADDING
// are in the same page.
// That is, we want to check that
// (buf + len - 1) / pagesize == (buf + len - 1 + SIMDJSON_PADDING) / pagesize
// That's true if (buf + len - 1) % pagesize + SIMDJSON_PADDING < pagesize.
///////////
if ( (reinterpret_cast<uintptr_t>(buf + len - 1) % pagesize ) + SIMDJSON_PADDING < static_cast<uintptr_t>(pagesize) ) {
#else // ALLOW_SAME_PAGE_BUFFER_OVERRUN
if(true) { // if ALLOW_SAME_PAGE_BUFFER_OVERRUN is not defined, we always reallocate
// Selecting the best implementation
switch (best_implementation) {
#ifdef __AVX2__
case simdjson::instruction_set::avx2 :
json_parse_ptr = avx_implementation;
break;
#elif defined (__SSE4_2__)
/*case simdjson::instruction_set::sse4_2 :
json_parse_ptr = sse4_2_implementation;
break;*/
#elif defined (__ARM_NEON)
case simdjson::instruction_set::neon :
json_parse_ptr = neon_implementation;
break;
#endif
const uint8_t *tmpbuf = buf;
buf = (uint8_t *) allocate_padded_buffer(len);
if(buf == NULL) return simdjson::MEMALLOC;
memcpy((void*)buf,tmpbuf,len);
reallocated = true;
}
default :
std::cerr << "No implemented simd instruction set supported" << std::endl;
return simdjson::UNEXPECTED_ERROR;
}
int stage1_is_ok = find_structural_bits(buf, len, pj);
if(stage1_is_ok != simdjson::SUCCESS) {
pj.errorcode = stage1_is_ok;
return pj.errorcode;
}
int res = unified_machine(buf, len, pj);
if(reallocated) { aligned_free((void*)buf);}
return res;
return json_parse_ptr(buf, len, pj, reallocifneeded);
}
json_parse_functype *json_parse_ptr = &json_parse_dispatch;
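// json_parse_ptr starts out pointing at the dispatcher; the first call through
// json_parse() replaces it with the best concrete implementation, so every
// subsequent call pays only one indirect call. A minimal standalone sketch of
// this lazy-dispatch pattern (illustrative names, not simdjson code):
//
//   #include <cstdio>
//
//   using functype = int(int);
//   static int impl_fast(int x) { return 2 * x; } // stand-in "best" implementation
//   static int dispatch(int x);                   // forward declaration
//   static functype *fn_ptr = &dispatch;          // starts at the dispatcher
//
//   static int dispatch(int x) {
//     fn_ptr = &impl_fast; // select the implementation once...
//     return fn_ptr(x);    // ...and forward this first call to it
//   }
//
//   int main() {
//     std::printf("%d\n", fn_ptr(21)); // first call dispatches, prints 42
//     std::printf("%d\n", fn_ptr(21)); // later calls go straight to impl_fast
//   }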
WARN_UNUSED
ParsedJson build_parsed_json(const uint8_t *buf, size_t len, bool reallocifneeded) {
ParsedJson pj;
bool ok = pj.allocateCapacity(len);
if(ok) {
(void)json_parse(buf, len, pj, reallocifneeded);
json_parse(buf, len, pj, reallocifneeded);
} else {
std::cerr << "failure during memory allocation " << std::endl;
}

View File

@ -1,780 +1 @@
#include <cassert>
#include "simdjson/common_defs.h"
#include "simdjson/parsedjson.h"
#include "simdjson/portability.h"
#ifdef __AVX2__
#ifndef SIMDJSON_SKIPUTF8VALIDATION
#define SIMDJSON_UTF8VALIDATE
#endif
#else
// Currently we don't do UTF-8 validation on ARM.
// Also, we assume that if you're not __AVX2__,
// you're ARM, which is a bit dumb. TODO: Fix...
#ifdef __ARM_NEON
#include <arm_neon.h>
#else
#warning It appears that neither ARM NEON nor AVX2 are detected.
#endif // __ARM_NEON
#endif // __AVX2__
// It seems that many parsers do UTF-8 validation.
// RapidJSON does not do it by default, but a flag
// allows it.
#ifdef SIMDJSON_UTF8VALIDATE
#include "simdjson/simdutf8check.h"
#endif
#define TRANSPOSE
struct simd_input {
#ifdef __AVX2__
__m256i lo;
__m256i hi;
#elif defined(__ARM_NEON)
#ifndef TRANSPOSE
uint8x16_t i0;
uint8x16_t i1;
uint8x16_t i2;
uint8x16_t i3;
#else
uint8x16x4_t i;
#endif
#else
#warning It appears that neither ARM NEON nor AVX2 are detected.
#endif
};
really_inline uint64_t compute_quote_mask(uint64_t quote_bits) {
// In practice, if you have NEON or __PCLMUL__, you would
// always want to use them, but it might be useful, for research
// purposes, to disable them deliberately; that is what SIMDJSON_AVOID_CLMUL
// does.
// Also: we don't know of an instance where AVX2 is supported but
// clmul is not, so we check for both, to be sure.
#if (defined(__PCLMUL__) || defined(__AVX2__)) && !defined(SIMDJSON_AVOID_CLMUL)
uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128(
_mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFF), 0));
#elif defined(__ARM_NEON) && !defined(SIMDJSON_AVOID_CLMUL)
uint64_t quote_mask = vmull_p64( -1ULL, quote_bits);
#else
// this code should always be used if SIMDJSON_AVOID_CLMUL is defined.
uint64_t quote_mask = quote_bits ^ (quote_bits << 1);
quote_mask = quote_mask ^ (quote_mask << 2);
quote_mask = quote_mask ^ (quote_mask << 4);
quote_mask = quote_mask ^ (quote_mask << 8);
quote_mask = quote_mask ^ (quote_mask << 16);
quote_mask = quote_mask ^ (quote_mask << 32);
#endif
return quote_mask;
}
really_inline simd_input fill_input(const uint8_t * ptr) {
struct simd_input in;
#ifdef __AVX2__
in.lo = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr + 0));
in.hi = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr + 32));
#elif defined(__ARM_NEON)
#ifndef TRANSPOSE
in.i0 = vld1q_u8(ptr + 0);
in.i1 = vld1q_u8(ptr + 16);
in.i2 = vld1q_u8(ptr + 32);
in.i3 = vld1q_u8(ptr + 48);
#else
in.i = vld4q_u8(ptr);
#endif
#else
#warning It appears that neither ARM NEON nor AVX2 are detected.
#endif
return in;
}
#ifdef SIMDJSON_UTF8VALIDATE
really_inline void check_utf8(simd_input in,
__m256i &has_error,
struct avx_processed_utf_bytes &previous) {
__m256i highbit = _mm256_set1_epi8(0x80);
if ((_mm256_testz_si256(_mm256_or_si256(in.lo, in.hi), highbit)) == 1) {
// it is ascii, we just check continuation
has_error = _mm256_or_si256(
_mm256_cmpgt_epi8(
previous.carried_continuations,
_mm256_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1)),
has_error);
} else {
// it is not ascii so we have to do heavy work
previous = avxcheckUTF8Bytes(in.lo, &previous, &has_error);
previous = avxcheckUTF8Bytes(in.hi, &previous, &has_error);
}
}
#endif
#ifdef __ARM_NEON
uint16_t neonmovemask(uint8x16_t input) {
const uint8x16_t bitmask = { 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
uint8x16_t minput = vandq_u8(input, bitmask);
uint8x16_t tmp = vpaddq_u8(minput, minput);
tmp = vpaddq_u8(tmp, tmp);
tmp = vpaddq_u8(tmp, tmp);
return vgetq_lane_u16(vreinterpretq_u16_u8(tmp), 0);
}
really_inline
uint64_t neonmovemask_bulk(uint8x16_t p0, uint8x16_t p1, uint8x16_t p2, uint8x16_t p3) {
#ifndef TRANSPOSE
const uint8x16_t bitmask = { 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
uint8x16_t t0 = vandq_u8(p0, bitmask);
uint8x16_t t1 = vandq_u8(p1, bitmask);
uint8x16_t t2 = vandq_u8(p2, bitmask);
uint8x16_t t3 = vandq_u8(p3, bitmask);
uint8x16_t sum0 = vpaddq_u8(t0, t1);
uint8x16_t sum1 = vpaddq_u8(t2, t3);
sum0 = vpaddq_u8(sum0, sum1);
sum0 = vpaddq_u8(sum0, sum0);
return vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0);
#else
const uint8x16_t bitmask1 = { 0x01, 0x10, 0x01, 0x10, 0x01, 0x10, 0x01, 0x10,
0x01, 0x10, 0x01, 0x10, 0x01, 0x10, 0x01, 0x10};
const uint8x16_t bitmask2 = { 0x02, 0x20, 0x02, 0x20, 0x02, 0x20, 0x02, 0x20,
0x02, 0x20, 0x02, 0x20, 0x02, 0x20, 0x02, 0x20};
const uint8x16_t bitmask3 = { 0x04, 0x40, 0x04, 0x40, 0x04, 0x40, 0x04, 0x40,
0x04, 0x40, 0x04, 0x40, 0x04, 0x40, 0x04, 0x40};
const uint8x16_t bitmask4 = { 0x08, 0x80, 0x08, 0x80, 0x08, 0x80, 0x08, 0x80,
0x08, 0x80, 0x08, 0x80, 0x08, 0x80, 0x08, 0x80};
#if 0
uint8x16_t t0 = vandq_u8(p0, bitmask1);
uint8x16_t t1 = vandq_u8(p1, bitmask2);
uint8x16_t t2 = vandq_u8(p2, bitmask3);
uint8x16_t t3 = vandq_u8(p3, bitmask4);
uint8x16_t tmp = vorrq_u8(vorrq_u8(t0, t1), vorrq_u8(t2, t3));
#else
uint8x16_t t0 = vandq_u8(p0, bitmask1);
uint8x16_t t1 = vbslq_u8(bitmask2, p1, t0);
uint8x16_t t2 = vbslq_u8(bitmask3, p2, t1);
uint8x16_t tmp = vbslq_u8(bitmask4, p3, t2);
#endif
uint8x16_t sum = vpaddq_u8(tmp, tmp);
return vgetq_lane_u64(vreinterpretq_u64_u8(sum), 0);
#endif
}
#endif
// a straightforward comparison of a mask against input. 5 uops; would be
// cheaper in AVX512.
really_inline uint64_t cmp_mask_against_input(simd_input in, uint8_t m) {
#ifdef __AVX2__
const __m256i mask = _mm256_set1_epi8(m);
__m256i cmp_res_0 = _mm256_cmpeq_epi8(in.lo, mask);
uint64_t res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(cmp_res_0));
__m256i cmp_res_1 = _mm256_cmpeq_epi8(in.hi, mask);
uint64_t res_1 = _mm256_movemask_epi8(cmp_res_1);
return res_0 | (res_1 << 32);
#elif defined(__ARM_NEON)
const uint8x16_t mask = vmovq_n_u8(m);
uint8x16_t cmp_res_0 = vceqq_u8(in.i.val[0], mask);
uint8x16_t cmp_res_1 = vceqq_u8(in.i.val[1], mask);
uint8x16_t cmp_res_2 = vceqq_u8(in.i.val[2], mask);
uint8x16_t cmp_res_3 = vceqq_u8(in.i.val[3], mask);
return neonmovemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3);
#else
#warning It appears that neither ARM NEON nor AVX2 are detected.
#endif
}
// find all values less than or equal to the content of maxval (using unsigned arithmetic)
really_inline uint64_t unsigned_lteq_against_input(simd_input in, uint8_t m) {
#ifdef __AVX2__
const __m256i maxval = _mm256_set1_epi8(m);
__m256i cmp_res_0 = _mm256_cmpeq_epi8(_mm256_max_epu8(maxval,in.lo),maxval);
uint64_t res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(cmp_res_0));
__m256i cmp_res_1 = _mm256_cmpeq_epi8(_mm256_max_epu8(maxval,in.hi),maxval);
uint64_t res_1 = _mm256_movemask_epi8(cmp_res_1);
return res_0 | (res_1 << 32);
#elif defined(__ARM_NEON)
const uint8x16_t mask = vmovq_n_u8(m);
uint8x16_t cmp_res_0 = vcleq_u8(in.i.val[0], mask);
uint8x16_t cmp_res_1 = vcleq_u8(in.i.val[1], mask);
uint8x16_t cmp_res_2 = vcleq_u8(in.i.val[2], mask);
uint8x16_t cmp_res_3 = vcleq_u8(in.i.val[3], mask);
return neonmovemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3);
#else
#warning It appears that neither ARM NEON nor AVX2 are detected.
#endif
}
// return a bitvector indicating where we have characters that end an odd-length
// sequence of backslashes (and thus change the behavior of the next character
// to follow). An even-length sequence of backslashes, and, for that matter, the
// largest even-length prefix of our odd-length sequence of backslashes, simply
// modify the behavior of the backslashes themselves.
// We also update the prev_iter_ends_odd_backslash reference parameter to
// indicate whether we end an iteration on an odd-length sequence of
// backslashes, which modifies our subsequent search for odd-length
// sequences of backslashes in an obvious way.
really_inline uint64_t
find_odd_backslash_sequences(simd_input in,
uint64_t &prev_iter_ends_odd_backslash) {
const uint64_t even_bits = 0x5555555555555555ULL;
const uint64_t odd_bits = ~even_bits;
uint64_t bs_bits = cmp_mask_against_input(in, '\\');
uint64_t start_edges = bs_bits & ~(bs_bits << 1);
// flip lowest if we have an odd-length run at the end of the prior
// iteration
uint64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash;
uint64_t even_starts = start_edges & even_start_mask;
uint64_t odd_starts = start_edges & ~even_start_mask;
uint64_t even_carries = bs_bits + even_starts;
uint64_t odd_carries;
// must record the carry-out of our odd-carries out of bit 63; this
// indicates whether the sense of any edge going to the next iteration
// should be flipped
bool iter_ends_odd_backslash =
add_overflow(bs_bits, odd_starts, &odd_carries);
odd_carries |=
prev_iter_ends_odd_backslash; // push in bit zero as a potential end
// if we had an odd-numbered run at the
// end of the previous iteration
prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL;
uint64_t even_carry_ends = even_carries & ~bs_bits;
uint64_t odd_carry_ends = odd_carries & ~bs_bits;
uint64_t even_start_odd_end = even_carry_ends & odd_bits;
uint64_t odd_start_even_end = odd_carry_ends & even_bits;
uint64_t odd_ends = even_start_odd_end | odd_start_even_end;
return odd_ends;
}
// return both the quote mask (which is a half-open mask that covers the first
// quote in an unescaped quote pair and everything in the quote pair) and the
// quote bits, which are the simple unescaped quoted bits. We also update the
// prev_iter_inside_quote value to tell the next iteration whether we finished
// the final iteration inside a quote pair; if so, this inverts our behavior of
// whether we're inside quotes for the next iteration.
// Note that we don't do any error checking to see if we have backslash
// sequences outside quotes; these backslash sequences (of any length) will be
// detected elsewhere.
really_inline uint64_t find_quote_mask_and_bits(simd_input in, uint64_t odd_ends,
uint64_t &prev_iter_inside_quote, uint64_t &quote_bits, uint64_t &error_mask) {
quote_bits = cmp_mask_against_input(in, '"');
quote_bits = quote_bits & ~odd_ends;
uint64_t quote_mask = compute_quote_mask(quote_bits);
quote_mask ^= prev_iter_inside_quote;
// All Unicode characters may be placed within the
// quotation marks, except for the characters that MUST be escaped:
// quotation mark, reverse solidus, and the control characters (U+0000
//through U+001F).
// https://tools.ietf.org/html/rfc8259
uint64_t unescaped = unsigned_lteq_against_input(in, 0x1F);
error_mask |= quote_mask & unescaped;
// right shift of a signed value is expected to be well-defined and
// standard-compliant as of C++20;
// John Regehr from Utah U. says this is fine code
prev_iter_inside_quote =
static_cast<uint64_t>(static_cast<int64_t>(quote_mask) >> 63);
return quote_mask;
}
really_inline void find_whitespace_and_structurals(simd_input in,
uint64_t &whitespace,
uint64_t &structurals) {
// do a 'shufti' to detect structural JSON characters
// they are { 0x7b } 0x7d : 0x3a [ 0x5b ] 0x5d , 0x2c
// these go into the first 3 buckets of the comparison (1/2/4)
// we are also interested in the four whitespace characters
// space 0x20, linefeed 0x0a, horizontal tab 0x09 and carriage return 0x0d
// these go into the next 2 buckets of the comparison (8/16)
#ifdef __AVX2__
#ifdef SIMDJSON_NAIVE_STRUCTURAL
// You should never need this naive approach, but it can be useful
// for research purposes
const __m256i mask_open_brace = _mm256_set1_epi8(0x7b);
__m256i struct_lo = _mm256_cmpeq_epi8(in.lo, mask_open_brace);
__m256i struct_hi = _mm256_cmpeq_epi8(in.hi, mask_open_brace);
const __m256i mask_close_brace = _mm256_set1_epi8(0x7d);
struct_lo = _mm256_or_si256(struct_lo,_mm256_cmpeq_epi8(in.lo, mask_close_brace));
struct_hi = _mm256_or_si256(struct_hi,_mm256_cmpeq_epi8(in.hi, mask_close_brace));
const __m256i mask_open_bracket = _mm256_set1_epi8(0x5b);
struct_lo = _mm256_or_si256(struct_lo,_mm256_cmpeq_epi8(in.lo, mask_open_bracket));
struct_hi = _mm256_or_si256(struct_hi,_mm256_cmpeq_epi8(in.hi, mask_open_bracket));
const __m256i mask_close_bracket = _mm256_set1_epi8(0x5d);
struct_lo = _mm256_or_si256(struct_lo,_mm256_cmpeq_epi8(in.lo, mask_close_bracket));
struct_hi = _mm256_or_si256(struct_hi,_mm256_cmpeq_epi8(in.hi, mask_close_bracket));
const __m256i mask_column = _mm256_set1_epi8(0x3a);
struct_lo = _mm256_or_si256(struct_lo,_mm256_cmpeq_epi8(in.lo, mask_column));
struct_hi = _mm256_or_si256(struct_hi,_mm256_cmpeq_epi8(in.hi, mask_column));
const __m256i mask_comma = _mm256_set1_epi8(0x2c);
struct_lo = _mm256_or_si256(struct_lo,_mm256_cmpeq_epi8(in.lo, mask_comma));
struct_hi = _mm256_or_si256(struct_hi,_mm256_cmpeq_epi8(in.hi, mask_comma));
uint64_t structural_res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(struct_lo));
uint64_t structural_res_1 = _mm256_movemask_epi8(struct_hi);
structurals = (structural_res_0 | (structural_res_1 << 32));
const __m256i mask_space = _mm256_set1_epi8(0x20);
__m256i space_lo = _mm256_cmpeq_epi8(in.lo, mask_space);
__m256i space_hi = _mm256_cmpeq_epi8(in.hi, mask_space);
const __m256i mask_linefeed = _mm256_set1_epi8(0x0a);
space_lo = _mm256_or_si256(space_lo,_mm256_cmpeq_epi8(in.lo, mask_linefeed));
space_hi = _mm256_or_si256(space_hi,_mm256_cmpeq_epi8(in.hi, mask_linefeed));
const __m256i mask_tab = _mm256_set1_epi8(0x09);
space_lo = _mm256_or_si256(space_lo,_mm256_cmpeq_epi8(in.lo, mask_tab));
space_hi = _mm256_or_si256(space_hi,_mm256_cmpeq_epi8(in.hi, mask_tab));
const __m256i mask_carriage = _mm256_set1_epi8(0x0d);
space_lo = _mm256_or_si256(space_lo,_mm256_cmpeq_epi8(in.lo, mask_carriage));
space_hi = _mm256_or_si256(space_hi,_mm256_cmpeq_epi8(in.hi, mask_carriage));
uint64_t ws_res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(space_lo));
uint64_t ws_res_1 = _mm256_movemask_epi8(space_hi);
whitespace = (ws_res_0 | (ws_res_1 << 32));
// end of naive approach
#else // SIMDJSON_NAIVE_STRUCTURAL
const __m256i low_nibble_mask = _mm256_setr_epi8(
16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0,
16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0);
const __m256i high_nibble_mask = _mm256_setr_epi8(
8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0,
8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0);
__m256i structural_shufti_mask = _mm256_set1_epi8(0x7);
__m256i whitespace_shufti_mask = _mm256_set1_epi8(0x18);
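// Worked example of the lookups below: ':' is 0x3a, so its low nibble is 0xa
// and its high nibble is 0x3; low_nibble_mask[10] = 12 and
// high_nibble_mask[3] = 4, and 12 & 4 = 4, which lands in the structural
// buckets (structural_shufti_mask = 0x7). A space, 0x20, yields
// low_nibble_mask[0] & high_nibble_mask[2] = 16 & 18 = 16, a whitespace
// bucket (whitespace_shufti_mask = 0x18).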
__m256i v_lo = _mm256_and_si256(
_mm256_shuffle_epi8(low_nibble_mask, in.lo),
_mm256_shuffle_epi8(high_nibble_mask,
_mm256_and_si256(_mm256_srli_epi32(in.lo, 4),
_mm256_set1_epi8(0x7f))));
__m256i v_hi = _mm256_and_si256(
_mm256_shuffle_epi8(low_nibble_mask, in.hi),
_mm256_shuffle_epi8(high_nibble_mask,
_mm256_and_si256(_mm256_srli_epi32(in.hi, 4),
_mm256_set1_epi8(0x7f))));
__m256i tmp_lo = _mm256_cmpeq_epi8(
_mm256_and_si256(v_lo, structural_shufti_mask), _mm256_set1_epi8(0));
__m256i tmp_hi = _mm256_cmpeq_epi8(
_mm256_and_si256(v_hi, structural_shufti_mask), _mm256_set1_epi8(0));
uint64_t structural_res_0 =
static_cast<uint32_t>(_mm256_movemask_epi8(tmp_lo));
uint64_t structural_res_1 = _mm256_movemask_epi8(tmp_hi);
structurals = ~(structural_res_0 | (structural_res_1 << 32));
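// note the double negation above: the cmpeq against zero marks the bytes
// whose structural bucket bits are all off, so the movemask result must be
// complemented to turn the structural bytes back on.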
__m256i tmp_ws_lo = _mm256_cmpeq_epi8(
_mm256_and_si256(v_lo, whitespace_shufti_mask), _mm256_set1_epi8(0));
__m256i tmp_ws_hi = _mm256_cmpeq_epi8(
_mm256_and_si256(v_hi, whitespace_shufti_mask), _mm256_set1_epi8(0));
uint64_t ws_res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(tmp_ws_lo));
uint64_t ws_res_1 = _mm256_movemask_epi8(tmp_ws_hi);
whitespace = ~(ws_res_0 | (ws_res_1 << 32));
#endif // SIMDJSON_NAIVE_STRUCTURAL
#elif defined(__ARM_NEON)
#ifndef FUNKY_BAD_TABLE
const uint8x16_t low_nibble_mask = (uint8x16_t){
16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0};
const uint8x16_t high_nibble_mask = (uint8x16_t){
8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0};
const uint8x16_t structural_shufti_mask = vmovq_n_u8(0x7);
const uint8x16_t whitespace_shufti_mask = vmovq_n_u8(0x18);
const uint8x16_t low_nib_and_mask = vmovq_n_u8(0xf);
uint8x16_t nib_0_lo = vandq_u8(in.i.val[0], low_nib_and_mask);
uint8x16_t nib_0_hi = vshrq_n_u8(in.i.val[0], 4);
uint8x16_t shuf_0_lo = vqtbl1q_u8(low_nibble_mask, nib_0_lo);
uint8x16_t shuf_0_hi = vqtbl1q_u8(high_nibble_mask, nib_0_hi);
uint8x16_t v_0 = vandq_u8(shuf_0_lo, shuf_0_hi);
uint8x16_t nib_1_lo = vandq_u8(in.i.val[1], low_nib_and_mask);
uint8x16_t nib_1_hi = vshrq_n_u8(in.i.val[1], 4);
uint8x16_t shuf_1_lo = vqtbl1q_u8(low_nibble_mask, nib_1_lo);
uint8x16_t shuf_1_hi = vqtbl1q_u8(high_nibble_mask, nib_1_hi);
uint8x16_t v_1 = vandq_u8(shuf_1_lo, shuf_1_hi);
uint8x16_t nib_2_lo = vandq_u8(in.i.val[2], low_nib_and_mask);
uint8x16_t nib_2_hi = vshrq_n_u8(in.i.val[2], 4);
uint8x16_t shuf_2_lo = vqtbl1q_u8(low_nibble_mask, nib_2_lo);
uint8x16_t shuf_2_hi = vqtbl1q_u8(high_nibble_mask, nib_2_hi);
uint8x16_t v_2 = vandq_u8(shuf_2_lo, shuf_2_hi);
uint8x16_t nib_3_lo = vandq_u8(in.i.val[3], low_nib_and_mask);
uint8x16_t nib_3_hi = vshrq_n_u8(in.i.val[3], 4);
uint8x16_t shuf_3_lo = vqtbl1q_u8(low_nibble_mask, nib_3_lo);
uint8x16_t shuf_3_hi = vqtbl1q_u8(high_nibble_mask, nib_3_hi);
uint8x16_t v_3 = vandq_u8(shuf_3_lo, shuf_3_hi);
uint8x16_t tmp_0 = vtstq_u8(v_0, structural_shufti_mask);
uint8x16_t tmp_1 = vtstq_u8(v_1, structural_shufti_mask);
uint8x16_t tmp_2 = vtstq_u8(v_2, structural_shufti_mask);
uint8x16_t tmp_3 = vtstq_u8(v_3, structural_shufti_mask);
structurals = neonmovemask_bulk(tmp_0, tmp_1, tmp_2, tmp_3);
uint8x16_t tmp_ws_0 = vtstq_u8(v_0, whitespace_shufti_mask);
uint8x16_t tmp_ws_1 = vtstq_u8(v_1, whitespace_shufti_mask);
uint8x16_t tmp_ws_2 = vtstq_u8(v_2, whitespace_shufti_mask);
uint8x16_t tmp_ws_3 = vtstq_u8(v_3, whitespace_shufti_mask);
whitespace = neonmovemask_bulk(tmp_ws_0, tmp_ws_1, tmp_ws_2, tmp_ws_3);
#else
// I think this one is garbage. In order to save the expense
// of another shuffle, I use an equally expensive shift, and
// this gets glued to the end of the dependency chain. Seems a bit
// slower for no good reason.
//
// need to use a weird arrangement. Bytes in this bitvector
// are in conventional order, but bits are reversed as we are
// using a signed left shift (that is, a +ve value from 0..7) to
// shift upwards to 0x80 in the byte. So we need to reverse bits.
// note that no structural/whitespace character has the high bit on,
// so it's OK to put the high 5 bits into our TBL shuffle
//
// structurals are { 0x7b } 0x7d : 0x3a [ 0x5b ] 0x5d , 0x2c
// or in (5 bit, 3 bit) form that's
// (15,3) (15,5) (7,2) (11,3) (11,5) (5,4)
// bit-reversing (subtracting the low 3 bits from 7) yields:
// (15,4) (15,2) (7,5) (11,4) (11,2) (5,3)
const uint8x16_t structural_bitvec = (uint8x16_t){
0, 0, 0, 0,
0, 8, 0, 32,
0, 0, 0, 20,
0, 0, 0, 20};
// we are also interested in the four whitespace characters
// space 0x20, linefeed 0x0a, horizontal tab 0x09 and carriage return 0x0d
// (4,0) (1, 2) (1, 1) (1, 5)
// bit-reversing (subtract low 3 bits from 7) yields:
// (4,7) (1, 5) (1, 6) (1, 2)
const uint8x16_t whitespace_bitvec = (uint8x16_t){
0, 100, 0, 0,
128, 0, 0, 0,
0, 0, 0, 0,
0, 0, 0, 0};
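// Sanity check: ',' is 0x2c = 0b00101100, so (high 5 bits, low 3 bits) is
// (5,4), which bit-reverses to (5,3), and structural_bitvec[5] = 8 = 1 << 3.
// Likewise ' ' is 0x20, giving (4,0) -> (4,7), and whitespace_bitvec[4] = 128.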
const uint8x16_t low_3bits_and_mask = vmovq_n_u8(0x7);
const uint8x16_t high_1bit_tst_mask = vmovq_n_u8(0x80);
int8x16_t low_3bits_0 = vreinterpretq_s8_u8(vandq_u8(in.i.val[0], low_3bits_and_mask));
uint8x16_t high_5bits_0 = vshrq_n_u8(in.i.val[0], 3);
uint8x16_t shuffle_structural_0 = vshlq_u8(vqtbl1q_u8(structural_bitvec, high_5bits_0), low_3bits_0);
uint8x16_t shuffle_ws_0 = vshlq_u8(vqtbl1q_u8(whitespace_bitvec, high_5bits_0), low_3bits_0);
uint8x16_t tmp_0 = vtstq_u8(shuffle_structural_0, high_1bit_tst_mask);
uint8x16_t tmp_ws_0 = vtstq_u8(shuffle_ws_0, high_1bit_tst_mask);
int8x16_t low_3bits_1 = vreinterpretq_s8_u8(vandq_u8(in.i.val[1], low_3bits_and_mask));
uint8x16_t high_5bits_1 = vshrq_n_u8(in.i.val[1], 3);
uint8x16_t shuffle_structural_1 = vshlq_u8(vqtbl1q_u8(structural_bitvec, high_5bits_1), low_3bits_1);
uint8x16_t shuffle_ws_1 = vshlq_u8(vqtbl1q_u8(whitespace_bitvec, high_5bits_1), low_3bits_1);
uint8x16_t tmp_1 = vtstq_u8(shuffle_structural_1, high_1bit_tst_mask);
uint8x16_t tmp_ws_1 = vtstq_u8(shuffle_ws_1, high_1bit_tst_mask);
int8x16_t low_3bits_2 = vreinterpretq_s8_u8(vandq_u8(in.i.val[2], low_3bits_and_mask));
uint8x16_t high_5bits_2 = vshrq_n_u8(in.i.val[2], 3);
uint8x16_t shuffle_structural_2 = vshlq_u8(vqtbl1q_u8(structural_bitvec, high_5bits_2), low_3bits_2);
uint8x16_t shuffle_ws_2 = vshlq_u8(vqtbl1q_u8(whitespace_bitvec, high_5bits_2), low_3bits_2);
uint8x16_t tmp_2 = vtstq_u8(shuffle_structural_2, high_1bit_tst_mask);
uint8x16_t tmp_ws_2 = vtstq_u8(shuffle_ws_2, high_1bit_tst_mask);
int8x16_t low_3bits_3 = vreinterpretq_s8_u8(vandq_u8(in.i.val[3], low_3bits_and_mask));
uint8x16_t high_5bits_3 = vshrq_n_u8(in.i.val[3], 3);
uint8x16_t shuffle_structural_3 = vshlq_u8(vqtbl1q_u8(structural_bitvec, high_5bits_3), low_3bits_3);
uint8x16_t shuffle_ws_3 = vshlq_u8(vqtbl1q_u8(whitespace_bitvec, high_5bits_3), low_3bits_3);
uint8x16_t tmp_3 = vtstq_u8(shuffle_structural_3, high_1bit_tst_mask);
uint8x16_t tmp_ws_3 = vtstq_u8(shuffle_ws_3, high_1bit_tst_mask);
structurals = neonmovemask_bulk(tmp_0, tmp_1, tmp_2, tmp_3);
whitespace = neonmovemask_bulk(tmp_ws_0, tmp_ws_1, tmp_ws_2, tmp_ws_3);
#endif
#else
#warning It appears that neither ARM NEON nor AVX2 is detected.
#endif
}
#ifdef SIMDJSON_NAIVE_FLATTEN // useful for benchmarking
//
// This is just a naive implementation. It should normally be
// disabled, but it can be used for research purposes to compare
// against our optimized version.
really_inline void flatten_bits(uint32_t *base_ptr, uint32_t &base,
uint32_t idx, uint64_t bits) {
uint32_t * out_ptr = base_ptr + base;
idx -= 64;
while(bits != 0) {
out_ptr[0] = idx + trailingzeroes(bits);
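// bits & (bits - 1) clears the lowest set bit (Kernighan's trick)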
bits = bits & (bits - 1);
out_ptr++;
}
base = (out_ptr - base_ptr);
}
#else
// flatten out values in 'bits' assuming that they are to have values of idx
// plus their position in the bitvector, and store these indexes at
// base_ptr[base], incrementing base as we go
// will potentially store extra values beyond the end of the valid bits, so
// base_ptr needs to be large enough to handle this
really_inline void flatten_bits(uint32_t *base_ptr, uint32_t &base,
uint32_t idx, uint64_t bits) {
// In some instances, the next branch is expensive because it is mispredicted.
// Unfortunately, in other cases, it helps tremendously.
if(bits == 0) return;
uint32_t cnt = hamming(bits);
uint32_t next_base = base + cnt;
idx -= 64;
base_ptr += base;
{
base_ptr[0] = idx + trailingzeroes(bits);
bits = bits & (bits - 1);
base_ptr[1] = idx + trailingzeroes(bits);
bits = bits & (bits - 1);
base_ptr[2] = idx + trailingzeroes(bits);
bits = bits & (bits - 1);
base_ptr[3] = idx + trailingzeroes(bits);
bits = bits & (bits - 1);
base_ptr[4] = idx + trailingzeroes(bits);
bits = bits & (bits - 1);
base_ptr[5] = idx + trailingzeroes(bits);
bits = bits & (bits - 1);
base_ptr[6] = idx + trailingzeroes(bits);
bits = bits & (bits - 1);
base_ptr[7] = idx + trailingzeroes(bits);
bits = bits & (bits - 1);
base_ptr += 8;
}
// We hope that the next branch is easily predicted.
if (cnt > 8) {
base_ptr[0] = idx + trailingzeroes(bits);
bits = bits & (bits - 1);
base_ptr[1] = idx + trailingzeroes(bits);
bits = bits & (bits - 1);
base_ptr[2] = idx + trailingzeroes(bits);
bits = bits & (bits - 1);
base_ptr[3] = idx + trailingzeroes(bits);
bits = bits & (bits - 1);
base_ptr[4] = idx + trailingzeroes(bits);
bits = bits & (bits - 1);
base_ptr[5] = idx + trailingzeroes(bits);
bits = bits & (bits - 1);
base_ptr[6] = idx + trailingzeroes(bits);
bits = bits & (bits - 1);
base_ptr[7] = idx + trailingzeroes(bits);
bits = bits & (bits - 1);
base_ptr += 8;
}
if (cnt > 16) { // unlucky: we rarely get here
// since it means having one structural or pseudo-structural element
// every 4 characters (possible with inputs like "","","",...).
do {
base_ptr[0] = idx + trailingzeroes(bits);
bits = bits & (bits - 1);
base_ptr++;
} while(bits != 0);
}
base = next_base;
}
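// Worked example: with base = 0, idx = 64 and bits = 0b1001, cnt is 2; the
// first unrolled block writes base_ptr[0] = 0 and base_ptr[1] = 3 (the six
// remaining slots get scratch values that the updated base ignores), and
// base becomes 2.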
#endif
// return an updated structural bit vector with quoted contents cleared out
// and pseudo-structural characters added to the mask
// updates prev_iter_ends_pseudo_pred which tells us whether the previous
// iteration ended on a whitespace or a structural character (which means
// that the next iteration will have a pseudo-structural character at its
// start)
really_inline uint64_t finalize_structurals(
uint64_t structurals, uint64_t whitespace, uint64_t quote_mask,
uint64_t quote_bits, uint64_t &prev_iter_ends_pseudo_pred) {
// mask off anything inside quotes
structurals &= ~quote_mask;
// add the real quote bits back into our bitmask as well, so we can
// quickly traverse the strings we've spent all this trouble gathering
structurals |= quote_bits;
// Now, establish "pseudo-structural characters". These are non-whitespace
// characters that are (a) outside quotes and (b) have a predecessor that's
// either whitespace or a structural character. This means that subsequent
// passes will get a chance to encounter the first character of every string
// of non-whitespace and, if we're parsing an atom like true/false/null or a
// number we can stop at the first whitespace or structural character
// following it.
  // a qualified predecessor is something that can happen 1 position before a
  // pseudo-structural character
uint64_t pseudo_pred = structurals | whitespace;
uint64_t shifted_pseudo_pred =
(pseudo_pred << 1) | prev_iter_ends_pseudo_pred;
prev_iter_ends_pseudo_pred = pseudo_pred >> 63;
uint64_t pseudo_structurals =
shifted_pseudo_pred & (~whitespace) & (~quote_mask);
structurals |= pseudo_structurals;
// now, we've used our close quotes all we need to. So let's switch them off
// they will be off in the quote mask and on in quote bits.
structurals &= ~(quote_bits & ~quote_mask);
return structurals;
}
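// Example: in the input  [ 7, true ]  neither '7' nor 't' is structural on
// its own, but each follows whitespace or a structural character, so both
// are marked pseudo-structural; stage 2 can then find the start of every
// atom without scanning the characters in between.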
WARN_UNUSED
/*never_inline*/ int find_structural_bits(const uint8_t *buf, size_t len,
ParsedJson &pj) {
if (len > pj.bytecapacity) {
std::cerr << "Your ParsedJson object only supports documents up to "
<< pj.bytecapacity << " bytes but you are trying to process " << len
<< " bytes" << std::endl;
return simdjson::CAPACITY;
}
uint32_t *base_ptr = pj.structural_indexes;
uint32_t base = 0;
#ifdef SIMDJSON_UTF8VALIDATE
__m256i has_error = _mm256_setzero_si256();
struct avx_processed_utf_bytes previous {};
previous.rawbytes = _mm256_setzero_si256();
previous.high_nibbles = _mm256_setzero_si256();
previous.carried_continuations = _mm256_setzero_si256();
#endif
// we have padded the input out to a 64-byte multiple with the remainder
// being zeros
// persistent state across loop
// does the last iteration end with an odd-length sequence of backslashes?
// either 0 or 1, but a 64-bit value
uint64_t prev_iter_ends_odd_backslash = 0ULL;
// does the previous iteration end inside a double-quote pair?
uint64_t prev_iter_inside_quote = 0ULL; // either all zeros or all ones
// does the previous iteration end on something that is a predecessor of a
// pseudo-structural character - i.e. whitespace or a structural character
// effectively the very first char is considered to follow "whitespace" for the
// purposes of pseudo-structural character detection, so we initialize to 1
uint64_t prev_iter_ends_pseudo_pred = 1ULL;
// structurals are persistent state across the loop as we flatten them on the
// subsequent iteration into our array pointed to by base_ptr.
// This is harmless on the first iteration as structurals == 0
// and is done for performance reasons; we can hide some of the latency of the
// expensive carryless multiply in the previous step with this work
uint64_t structurals = 0;
size_t lenminus64 = len < 64 ? 0 : len - 64;
size_t idx = 0;
uint64_t error_mask = 0; // for unescaped characters within strings (ASCII code points < 0x20)
for (; idx < lenminus64; idx += 64) {
#ifndef _MSC_VER
__builtin_prefetch(buf + idx + 128);
#endif
simd_input in = fill_input(buf+idx);
#ifdef SIMDJSON_UTF8VALIDATE
check_utf8(in, has_error, previous);
#endif
// detect odd sequences of backslashes
uint64_t odd_ends = find_odd_backslash_sequences(
in, prev_iter_ends_odd_backslash);
// detect insides of quote pairs ("quote_mask") and also our quote_bits
// themselves
uint64_t quote_bits;
uint64_t quote_mask = find_quote_mask_and_bits(
in, odd_ends, prev_iter_inside_quote, quote_bits, error_mask);
// take the previous iteration's structural bits, not our current iteration's,
// and flatten
flatten_bits(base_ptr, base, idx, structurals);
uint64_t whitespace;
find_whitespace_and_structurals(in, whitespace, structurals);
// fixup structurals to reflect quotes and add pseudo-structural characters
structurals = finalize_structurals(structurals, whitespace, quote_mask,
quote_bits, prev_iter_ends_pseudo_pred);
}
////////////////
/// We use a giant copy-paste, which is ugly.
/// But otherwise the string needs to be properly padded or else we
/// risk invalidating the UTF-8 checks.
/// Padding the temporary buffer with spaces (0x20) is safe: spaces are
/// whitespace and valid UTF-8, so neither the masks nor the validation
/// are affected.
////////////
if (idx < len) {
uint8_t tmpbuf[64];
memset(tmpbuf, 0x20, 64);
memcpy(tmpbuf, buf + idx, len - idx);
simd_input in = fill_input(tmpbuf);
#ifdef SIMDJSON_UTF8VALIDATE
check_utf8(in, has_error, previous);
#endif
// detect odd sequences of backslashes
uint64_t odd_ends = find_odd_backslash_sequences(
in, prev_iter_ends_odd_backslash);
// detect insides of quote pairs ("quote_mask") and also our quote_bits
// themselves
uint64_t quote_bits;
uint64_t quote_mask = find_quote_mask_and_bits(
in, odd_ends, prev_iter_inside_quote, quote_bits, error_mask);
// take the previous iteration's structural bits, not our current iteration's,
// and flatten
flatten_bits(base_ptr, base, idx, structurals);
uint64_t whitespace;
find_whitespace_and_structurals(in, whitespace, structurals);
// fixup structurals to reflect quotes and add pseudo-structural characters
structurals = finalize_structurals(structurals, whitespace, quote_mask,
quote_bits, prev_iter_ends_pseudo_pred);
idx += 64;
}
// is last string quote closed?
if (prev_iter_inside_quote) {
return simdjson::UNCLOSED_STRING;
}
// finally, flatten out the remaining structurals from the last iteration
flatten_bits(base_ptr, base, idx, structurals);
pj.n_structural_indexes = base;
// a valid JSON file cannot have zero structural indexes - we should have
// found something
if (pj.n_structural_indexes == 0u) {
fprintf(stderr, "Empty document?\n");
return simdjson::EMPTY;
}
if (base_ptr[pj.n_structural_indexes - 1] > len) {
fprintf(stderr, "Internal bug\n");
return simdjson::UNEXPECTED_ERROR;
}
if (len != base_ptr[pj.n_structural_indexes - 1]) {
// the string might not be NULL terminated, but we add a virtual NULL ending
// character.
base_ptr[pj.n_structural_indexes++] = len;
}
// make it safe to dereference one beyond this array
base_ptr[pj.n_structural_indexes] = 0;
if (error_mask) {
fprintf(stderr, "Unescaped characters\n");
return simdjson::UNESCAPED_CHARS;
}
#ifdef SIMDJSON_UTF8VALIDATE
return _mm256_testz_si256(has_error, has_error) == 0 ? simdjson::UTF8_ERROR : simdjson::SUCCESS;
#else
return simdjson::SUCCESS;
#endif
}
int find_structural_bits(const char *buf, size_t len, ParsedJson &pj) {
return find_structural_bits(reinterpret_cast<const uint8_t *>(buf), len, pj);
}
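// A minimal usage sketch (illustrative only; it assumes the ParsedJson was
// sized beforehand, e.g. with allocateCapacity, as done elsewhere in this
// project):
//
//   ParsedJson pj;
//   if (!pj.allocateCapacity(len)) { /* allocation failed */ }
//   if (find_structural_bits(buf, len, pj) == simdjson::SUCCESS) {
//     // stage 1 succeeded: pj.structural_indexes is ready for stage 2
//   }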
// File kept in case we want to reuse it soon. (many configuration files to edit)

View File

@@ -115,15 +115,15 @@ stat_t simdjson_computestats(const padded_string &p) {
int main(int argc, char *argv[]) {
int optind = 1;
if (optind >= argc) {
int myoptind = 1;
if (myoptind >= argc) {
std::cerr << "Reads json, prints stats. " << std::endl;
std::cerr << "Usage: " << argv[0] << " <jsonfile>" << std::endl;
exit(1);
}
const char *filename = argv[optind];
if (optind + 1 < argc) {
std::cerr << "warning: ignoring everything after " << argv[optind + 1] << std::endl;
const char *filename = argv[myoptind];
if (myoptind + 1 < argc) {
std::cerr << "warning: ignoring everything after " << argv[myoptind + 1] << std::endl;
}
padded_string p;
try {