From 8f01cece3ac42dc42e2cd27b5a27f2c9132e9782 Mon Sep 17 00:00:00 2001 From: John Keiser Date: Tue, 13 Aug 2019 17:44:06 -0700 Subject: [PATCH 1/3] Move simd_input and associated functions to their own header --- include/simdjson/simd_input.h | 26 +++++++ include/simdjson/simd_input_arm64.h | 78 +++++++++++++++++++ include/simdjson/simd_input_haswell.h | 52 +++++++++++++ include/simdjson/simd_input_westmere.h | 64 +++++++++++++++ include/simdjson/stage1_find_marks.h | 14 +--- include/simdjson/stage1_find_marks_arm64.h | 68 +--------------- include/simdjson/stage1_find_marks_haswell.h | 36 +-------- include/simdjson/stage1_find_marks_westmere.h | 48 +----------- 8 files changed, 226 insertions(+), 160 deletions(-) create mode 100644 include/simdjson/simd_input.h create mode 100644 include/simdjson/simd_input_arm64.h create mode 100644 include/simdjson/simd_input_haswell.h create mode 100644 include/simdjson/simd_input_westmere.h diff --git a/include/simdjson/simd_input.h b/include/simdjson/simd_input.h new file mode 100644 index 00000000..085d89b4 --- /dev/null +++ b/include/simdjson/simd_input.h @@ -0,0 +1,26 @@ +#ifndef SIMDJSON_SIMD_INPUT_H +#define SIMDJSON_SIMD_INPUT_H + +#include "simdjson/common_defs.h" +#include "simdjson/portability.h" +#include "simdjson/simdjson.h" +#include + +namespace simdjson { + +template struct simd_input; + +// a straightforward comparison of a mask against input. 
+template +uint64_t cmp_mask_against_input(simd_input in, uint8_t m); + +template simd_input fill_input(const uint8_t *ptr); + +// find all values less than or equal than the content of maxval (using unsigned +// arithmetic) +template +uint64_t unsigned_lteq_against_input(simd_input in, uint8_t m); + +} // namespace simdjson + +#endif diff --git a/include/simdjson/simd_input_arm64.h b/include/simdjson/simd_input_arm64.h new file mode 100644 index 00000000..658194a2 --- /dev/null +++ b/include/simdjson/simd_input_arm64.h @@ -0,0 +1,78 @@ +#ifndef SIMDJSON_SIMD_INPUT_ARM64_H +#define SIMDJSON_SIMD_INPUT_ARM64_H + +#include "simdjson/simd_input.h" + +#ifdef IS_ARM64 +namespace simdjson { + +template <> +struct simd_input { + uint8x16_t i0; + uint8x16_t i1; + uint8x16_t i2; + uint8x16_t i3; +}; + +template <> +really_inline simd_input +fill_input(const uint8_t *ptr) { + struct simd_input in; + in.i0 = vld1q_u8(ptr + 0); + in.i1 = vld1q_u8(ptr + 16); + in.i2 = vld1q_u8(ptr + 32); + in.i3 = vld1q_u8(ptr + 48); + return in; +} + +really_inline uint16_t neon_movemask(uint8x16_t input) { + const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80}; + uint8x16_t minput = vandq_u8(input, bit_mask); + uint8x16_t tmp = vpaddq_u8(minput, minput); + tmp = vpaddq_u8(tmp, tmp); + tmp = vpaddq_u8(tmp, tmp); + return vgetq_lane_u16(vreinterpretq_u16_u8(tmp), 0); +} + +really_inline uint64_t neon_movemask_bulk(uint8x16_t p0, uint8x16_t p1, + uint8x16_t p2, uint8x16_t p3) { + const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80}; + uint8x16_t t0 = vandq_u8(p0, bit_mask); + uint8x16_t t1 = vandq_u8(p1, bit_mask); + uint8x16_t t2 = vandq_u8(p2, bit_mask); + uint8x16_t t3 = vandq_u8(p3, bit_mask); + uint8x16_t sum0 = vpaddq_u8(t0, t1); + uint8x16_t sum1 = vpaddq_u8(t2, t3); + sum0 = vpaddq_u8(sum0, sum1); + sum0 = vpaddq_u8(sum0, sum0); + return 
vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0); +} + +template <> +really_inline uint64_t cmp_mask_against_input( + simd_input in, uint8_t m) { + const uint8x16_t mask = vmovq_n_u8(m); + uint8x16_t cmp_res_0 = vceqq_u8(in.i0, mask); + uint8x16_t cmp_res_1 = vceqq_u8(in.i1, mask); + uint8x16_t cmp_res_2 = vceqq_u8(in.i2, mask); + uint8x16_t cmp_res_3 = vceqq_u8(in.i3, mask); + return neon_movemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3); +} + +template <> +really_inline uint64_t unsigned_lteq_against_input( + simd_input in, uint8_t m) { + const uint8x16_t mask = vmovq_n_u8(m); + uint8x16_t cmp_res_0 = vcleq_u8(in.i0, mask); + uint8x16_t cmp_res_1 = vcleq_u8(in.i1, mask); + uint8x16_t cmp_res_2 = vcleq_u8(in.i2, mask); + uint8x16_t cmp_res_3 = vcleq_u8(in.i3, mask); + return neon_movemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3); +} + +} // namespace simdjson + +#endif // IS_ARM64 +#endif // SIMDJSON_SIMD_INPUT_ARM64_H diff --git a/include/simdjson/simd_input_haswell.h b/include/simdjson/simd_input_haswell.h new file mode 100644 index 00000000..8f5b9973 --- /dev/null +++ b/include/simdjson/simd_input_haswell.h @@ -0,0 +1,52 @@ +#ifndef SIMDJSON_SIMD_INPUT_HASWELL_H +#define SIMDJSON_SIMD_INPUT_HASWELL_H + +#include "simdjson/simd_input.h" + +#ifdef IS_X86_64 + +TARGET_HASWELL +namespace simdjson { + +template <> +struct simd_input { + __m256i lo; + __m256i hi; +}; + +template <> +really_inline simd_input +fill_input(const uint8_t *ptr) { + struct simd_input in; + in.lo = _mm256_loadu_si256(reinterpret_cast(ptr + 0)); + in.hi = _mm256_loadu_si256(reinterpret_cast(ptr + 32)); + return in; +} + +template <> +really_inline uint64_t cmp_mask_against_input( + simd_input in, uint8_t m) { + const __m256i mask = _mm256_set1_epi8(m); + __m256i cmp_res_0 = _mm256_cmpeq_epi8(in.lo, mask); + uint64_t res_0 = static_cast(_mm256_movemask_epi8(cmp_res_0)); + __m256i cmp_res_1 = _mm256_cmpeq_epi8(in.hi, mask); + uint64_t res_1 = _mm256_movemask_epi8(cmp_res_1); + 
return res_0 | (res_1 << 32); +} + +template <> +really_inline uint64_t unsigned_lteq_against_input( + simd_input in, uint8_t m) { + const __m256i maxval = _mm256_set1_epi8(m); + __m256i cmp_res_0 = _mm256_cmpeq_epi8(_mm256_max_epu8(maxval, in.lo), maxval); + uint64_t res_0 = static_cast(_mm256_movemask_epi8(cmp_res_0)); + __m256i cmp_res_1 = _mm256_cmpeq_epi8(_mm256_max_epu8(maxval, in.hi), maxval); + uint64_t res_1 = _mm256_movemask_epi8(cmp_res_1); + return res_0 | (res_1 << 32); +} + +} // namespace simdjson +UNTARGET_REGION + +#endif // IS_X86_64 +#endif // SIMDJSON_SIMD_INPUT_HASWELL_H diff --git a/include/simdjson/simd_input_westmere.h b/include/simdjson/simd_input_westmere.h new file mode 100644 index 00000000..3082946c --- /dev/null +++ b/include/simdjson/simd_input_westmere.h @@ -0,0 +1,64 @@ +#ifndef SIMDJSON_SIMD_INPUT_WESTMERE_H +#define SIMDJSON_SIMD_INPUT_WESTMERE_H + +#include "simdjson/simd_input.h" + +#ifdef IS_X86_64 + +TARGET_WESTMERE +namespace simdjson { + +template <> +struct simd_input { + __m128i v0; + __m128i v1; + __m128i v2; + __m128i v3; +}; + +template <> +really_inline simd_input +fill_input(const uint8_t *ptr) { + struct simd_input in; + in.v0 = _mm_loadu_si128(reinterpret_cast(ptr + 0)); + in.v1 = _mm_loadu_si128(reinterpret_cast(ptr + 16)); + in.v2 = _mm_loadu_si128(reinterpret_cast(ptr + 32)); + in.v3 = _mm_loadu_si128(reinterpret_cast(ptr + 48)); + return in; +} + +template <> +really_inline uint64_t cmp_mask_against_input( + simd_input in, uint8_t m) { + const __m128i mask = _mm_set1_epi8(m); + __m128i cmp_res_0 = _mm_cmpeq_epi8(in.v0, mask); + uint64_t res_0 = _mm_movemask_epi8(cmp_res_0); + __m128i cmp_res_1 = _mm_cmpeq_epi8(in.v1, mask); + uint64_t res_1 = _mm_movemask_epi8(cmp_res_1); + __m128i cmp_res_2 = _mm_cmpeq_epi8(in.v2, mask); + uint64_t res_2 = _mm_movemask_epi8(cmp_res_2); + __m128i cmp_res_3 = _mm_cmpeq_epi8(in.v3, mask); + uint64_t res_3 = _mm_movemask_epi8(cmp_res_3); + return res_0 | (res_1 << 16) | (res_2 << 
32) | (res_3 << 48); +} + +template <> +really_inline uint64_t unsigned_lteq_against_input( + simd_input in, uint8_t m) { + const __m128i maxval = _mm_set1_epi8(m); + __m128i cmp_res_0 = _mm_cmpeq_epi8(_mm_max_epu8(maxval, in.v0), maxval); + uint64_t res_0 = _mm_movemask_epi8(cmp_res_0); + __m128i cmp_res_1 = _mm_cmpeq_epi8(_mm_max_epu8(maxval, in.v1), maxval); + uint64_t res_1 = _mm_movemask_epi8(cmp_res_1); + __m128i cmp_res_2 = _mm_cmpeq_epi8(_mm_max_epu8(maxval, in.v2), maxval); + uint64_t res_2 = _mm_movemask_epi8(cmp_res_2); + __m128i cmp_res_3 = _mm_cmpeq_epi8(_mm_max_epu8(maxval, in.v3), maxval); + uint64_t res_3 = _mm_movemask_epi8(cmp_res_3); + return res_0 | (res_1 << 16) | (res_2 << 32) | (res_3 << 48); +} + +} // namespace simdjson +UNTARGET_REGION + +#endif // IS_X86_64 +#endif // SIMDJSON_SIMD_INPUT_WESTMERE_H diff --git a/include/simdjson/stage1_find_marks.h b/include/simdjson/stage1_find_marks.h index 24a0bd15..f0a644e7 100644 --- a/include/simdjson/stage1_find_marks.h +++ b/include/simdjson/stage1_find_marks.h @@ -5,12 +5,11 @@ #include "simdjson/parsedjson.h" #include "simdjson/portability.h" #include "simdjson/simdjson.h" +#include "simdjson/simd_input.h" #include namespace simdjson { -template struct simd_input; - template uint64_t compute_quote_mask(uint64_t quote_bits); namespace { @@ -36,17 +35,6 @@ void check_utf8(simd_input in, utf8_checking_state &state); template ErrorValues check_utf8_errors(utf8_checking_state &state); -// a straightforward comparison of a mask against input. 
-template -uint64_t cmp_mask_against_input(simd_input in, uint8_t m); - -template simd_input fill_input(const uint8_t *ptr); - -// find all values less than or equal than the content of maxval (using unsigned -// arithmetic) -template -uint64_t unsigned_lteq_against_input(simd_input in, uint8_t m); - template really_inline uint64_t find_odd_backslash_sequences( simd_input in, uint64_t &prev_iter_ends_odd_backslash); diff --git a/include/simdjson/stage1_find_marks_arm64.h b/include/simdjson/stage1_find_marks_arm64.h index 51a77879..412ef849 100644 --- a/include/simdjson/stage1_find_marks_arm64.h +++ b/include/simdjson/stage1_find_marks_arm64.h @@ -1,53 +1,12 @@ #ifndef SIMDJSON_STAGE1_FIND_MARKS_ARM64_H #define SIMDJSON_STAGE1_FIND_MARKS_ARM64_H +#include "simdjson/simd_input_arm64.h" #include "simdjson/simdutf8check_arm64.h" #include "simdjson/stage1_find_marks.h" #ifdef IS_ARM64 namespace simdjson { -template <> struct simd_input { - uint8x16_t i0; - uint8x16_t i1; - uint8x16_t i2; - uint8x16_t i3; -}; - -template <> -really_inline simd_input -fill_input(const uint8_t *ptr) { - struct simd_input in; - in.i0 = vld1q_u8(ptr + 0); - in.i1 = vld1q_u8(ptr + 16); - in.i2 = vld1q_u8(ptr + 32); - in.i3 = vld1q_u8(ptr + 48); - return in; -} - -really_inline uint16_t neon_movemask(uint8x16_t input) { - const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, - 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80}; - uint8x16_t minput = vandq_u8(input, bit_mask); - uint8x16_t tmp = vpaddq_u8(minput, minput); - tmp = vpaddq_u8(tmp, tmp); - tmp = vpaddq_u8(tmp, tmp); - return vgetq_lane_u16(vreinterpretq_u16_u8(tmp), 0); -} - -really_inline uint64_t neon_movemask_bulk(uint8x16_t p0, uint8x16_t p1, - uint8x16_t p2, uint8x16_t p3) { - const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, - 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80}; - uint8x16_t t0 = vandq_u8(p0, bit_mask); - uint8x16_t t1 = vandq_u8(p1, bit_mask); - uint8x16_t t2 = vandq_u8(p2, 
bit_mask); - uint8x16_t t3 = vandq_u8(p3, bit_mask); - uint8x16_t sum0 = vpaddq_u8(t0, t1); - uint8x16_t sum1 = vpaddq_u8(t2, t3); - sum0 = vpaddq_u8(sum0, sum1); - sum0 = vpaddq_u8(sum0, sum0); - return vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0); -} template <> really_inline uint64_t @@ -59,7 +18,8 @@ compute_quote_mask(uint64_t quote_bits) { #endif } -template <> struct utf8_checking_state { +template <> +struct utf8_checking_state { int8x16_t has_error{}; processed_utf_bytes previous{}; }; @@ -115,28 +75,6 @@ really_inline ErrorValues check_utf8_errors( : simdjson::SUCCESS; } -template <> -really_inline uint64_t cmp_mask_against_input( - simd_input in, uint8_t m) { - const uint8x16_t mask = vmovq_n_u8(m); - uint8x16_t cmp_res_0 = vceqq_u8(in.i0, mask); - uint8x16_t cmp_res_1 = vceqq_u8(in.i1, mask); - uint8x16_t cmp_res_2 = vceqq_u8(in.i2, mask); - uint8x16_t cmp_res_3 = vceqq_u8(in.i3, mask); - return neon_movemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3); -} - -template <> -really_inline uint64_t unsigned_lteq_against_input( - simd_input in, uint8_t m) { - const uint8x16_t mask = vmovq_n_u8(m); - uint8x16_t cmp_res_0 = vcleq_u8(in.i0, mask); - uint8x16_t cmp_res_1 = vcleq_u8(in.i1, mask); - uint8x16_t cmp_res_2 = vcleq_u8(in.i2, mask); - uint8x16_t cmp_res_3 = vcleq_u8(in.i3, mask); - return neon_movemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3); -} - template <> really_inline void find_whitespace_and_structurals( simd_input in, uint64_t &whitespace, diff --git a/include/simdjson/stage1_find_marks_haswell.h b/include/simdjson/stage1_find_marks_haswell.h index d6ff7113..ae589e66 100644 --- a/include/simdjson/stage1_find_marks_haswell.h +++ b/include/simdjson/stage1_find_marks_haswell.h @@ -1,6 +1,7 @@ #ifndef SIMDJSON_STAGE1_FIND_MARKS_HASWELL_H #define SIMDJSON_STAGE1_FIND_MARKS_HASWELL_H +#include "simdjson/simd_input_haswell.h" #include "simdjson/simdutf8check_haswell.h" #include "simdjson/stage1_find_marks.h" @@ -8,19 +9,6 @@ 
TARGET_HASWELL namespace simdjson { -template <> struct simd_input { - __m256i lo; - __m256i hi; -}; - -template <> -really_inline simd_input -fill_input(const uint8_t *ptr) { - struct simd_input in; - in.lo = _mm256_loadu_si256(reinterpret_cast(ptr + 0)); - in.hi = _mm256_loadu_si256(reinterpret_cast(ptr + 32)); - return in; -} template <> really_inline uint64_t @@ -73,28 +61,6 @@ really_inline ErrorValues check_utf8_errors( : simdjson::SUCCESS; } -template <> -really_inline uint64_t cmp_mask_against_input( - simd_input in, uint8_t m) { - const __m256i mask = _mm256_set1_epi8(m); - __m256i cmp_res_0 = _mm256_cmpeq_epi8(in.lo, mask); - uint64_t res_0 = static_cast(_mm256_movemask_epi8(cmp_res_0)); - __m256i cmp_res_1 = _mm256_cmpeq_epi8(in.hi, mask); - uint64_t res_1 = _mm256_movemask_epi8(cmp_res_1); - return res_0 | (res_1 << 32); -} - -template <> -really_inline uint64_t unsigned_lteq_against_input( - simd_input in, uint8_t m) { - const __m256i maxval = _mm256_set1_epi8(m); - __m256i cmp_res_0 = _mm256_cmpeq_epi8(_mm256_max_epu8(maxval, in.lo), maxval); - uint64_t res_0 = static_cast(_mm256_movemask_epi8(cmp_res_0)); - __m256i cmp_res_1 = _mm256_cmpeq_epi8(_mm256_max_epu8(maxval, in.hi), maxval); - uint64_t res_1 = _mm256_movemask_epi8(cmp_res_1); - return res_0 | (res_1 << 32); -} - template <> really_inline void find_whitespace_and_structurals( simd_input in, uint64_t &whitespace, diff --git a/include/simdjson/stage1_find_marks_westmere.h b/include/simdjson/stage1_find_marks_westmere.h index f39b8a96..7336a2a7 100644 --- a/include/simdjson/stage1_find_marks_westmere.h +++ b/include/simdjson/stage1_find_marks_westmere.h @@ -1,6 +1,7 @@ #ifndef SIMDJSON_STAGE1_FIND_MARKS_WESTMERE_H #define SIMDJSON_STAGE1_FIND_MARKS_WESTMERE_H +#include "simdjson/simd_input_westmere.h" #include "simdjson/simdutf8check_westmere.h" #include "simdjson/stage1_find_marks.h" @@ -8,23 +9,6 @@ TARGET_WESTMERE namespace simdjson { -template <> struct simd_input { - __m128i v0; - __m128i 
v1; - __m128i v2; - __m128i v3; -}; - -template <> -really_inline simd_input -fill_input(const uint8_t *ptr) { - struct simd_input in; - in.v0 = _mm_loadu_si128(reinterpret_cast(ptr + 0)); - in.v1 = _mm_loadu_si128(reinterpret_cast(ptr + 16)); - in.v2 = _mm_loadu_si128(reinterpret_cast(ptr + 32)); - in.v3 = _mm_loadu_si128(reinterpret_cast(ptr + 48)); - return in; -} template <> really_inline uint64_t @@ -86,36 +70,6 @@ really_inline ErrorValues check_utf8_errors( : simdjson::SUCCESS; } -template <> -really_inline uint64_t cmp_mask_against_input( - simd_input in, uint8_t m) { - const __m128i mask = _mm_set1_epi8(m); - __m128i cmp_res_0 = _mm_cmpeq_epi8(in.v0, mask); - uint64_t res_0 = _mm_movemask_epi8(cmp_res_0); - __m128i cmp_res_1 = _mm_cmpeq_epi8(in.v1, mask); - uint64_t res_1 = _mm_movemask_epi8(cmp_res_1); - __m128i cmp_res_2 = _mm_cmpeq_epi8(in.v2, mask); - uint64_t res_2 = _mm_movemask_epi8(cmp_res_2); - __m128i cmp_res_3 = _mm_cmpeq_epi8(in.v3, mask); - uint64_t res_3 = _mm_movemask_epi8(cmp_res_3); - return res_0 | (res_1 << 16) | (res_2 << 32) | (res_3 << 48); -} - -template <> -really_inline uint64_t unsigned_lteq_against_input( - simd_input in, uint8_t m) { - const __m128i maxval = _mm_set1_epi8(m); - __m128i cmp_res_0 = _mm_cmpeq_epi8(_mm_max_epu8(maxval, in.v0), maxval); - uint64_t res_0 = _mm_movemask_epi8(cmp_res_0); - __m128i cmp_res_1 = _mm_cmpeq_epi8(_mm_max_epu8(maxval, in.v1), maxval); - uint64_t res_1 = _mm_movemask_epi8(cmp_res_1); - __m128i cmp_res_2 = _mm_cmpeq_epi8(_mm_max_epu8(maxval, in.v2), maxval); - uint64_t res_2 = _mm_movemask_epi8(cmp_res_2); - __m128i cmp_res_3 = _mm_cmpeq_epi8(_mm_max_epu8(maxval, in.v3), maxval); - uint64_t res_3 = _mm_movemask_epi8(cmp_res_3); - return res_0 | (res_1 << 16) | (res_2 << 32) | (res_3 << 48); -} - template <> really_inline void find_whitespace_and_structurals( simd_input in, uint64_t &whitespace, From 237b8865f533ea70904053d2ddcb35d091366602 Mon Sep 17 00:00:00 2001 From: John Keiser Date: Tue, 
13 Aug 2019 17:44:26 -0700 Subject: [PATCH 2/3] Correct header #define --- include/simdjson/simdjson.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/simdjson/simdjson.h b/include/simdjson/simdjson.h index ffb0c717..4f08f89e 100644 --- a/include/simdjson/simdjson.h +++ b/include/simdjson/simdjson.h @@ -1,5 +1,5 @@ -#ifndef SIMDJSON_ERR_H -#define SIMDJSON_ERR_H +#ifndef SIMDJSON_SIMDJSON_H +#define SIMDJSON_SIMDJSON_H #include @@ -41,4 +41,4 @@ enum ErrorValues { }; const std::string &error_message(const int); } // namespace simdjson -#endif +#endif // SIMDJSON_SIMDJSON_H From 0042d9b406fc9dc006e455bcb95f21d888cee528 Mon Sep 17 00:00:00 2001 From: John Keiser Date: Wed, 14 Aug 2019 09:45:33 -0700 Subject: [PATCH 3/3] Move UTF8 checking functions into their own file --- amalgamation.sh | 5 + include/simdjson/simd_input.h | 5 +- include/simdjson/simdutf8check.h | 21 + include/simdjson/simdutf8check_arm64.h | 59 + include/simdjson/simdutf8check_haswell.h | 43 + include/simdjson/simdutf8check_westmere.h | 56 + include/simdjson/stage1_find_marks_arm64.h | 57 - include/simdjson/stage1_find_marks_haswell.h | 41 - include/simdjson/stage1_find_marks_westmere.h | 53 - singleheader/amalgamation_demo.cpp | 2 +- singleheader/simdjson.cpp | 3142 ++++++++++++++--- singleheader/simdjson.h | 1749 ++++----- 12 files changed, 3547 insertions(+), 1686 deletions(-) create mode 100644 include/simdjson/simdutf8check.h diff --git a/amalgamation.sh b/amalgamation.sh index 3ae78910..34aa48f9 100755 --- a/amalgamation.sh +++ b/amalgamation.sh @@ -36,6 +36,11 @@ $SCRIPTPATH/include/simdjson/jsoncharutils.h $SCRIPTPATH/include/simdjson/jsonformatutils.h $SCRIPTPATH/include/simdjson/jsonioutil.h $SCRIPTPATH/include/simdjson/simdprune_tables.h +$SCRIPTPATH/include/simdjson/simd_input.h +$SCRIPTPATH/include/simdjson/simd_input_haswell.h +$SCRIPTPATH/include/simdjson/simd_input_westmere.h +$SCRIPTPATH/include/simdjson/simd_input_arm64.h 
+$SCRIPTPATH/include/simdjson/simdutf8check.h $SCRIPTPATH/include/simdjson/simdutf8check_haswell.h $SCRIPTPATH/include/simdjson/simdutf8check_westmere.h $SCRIPTPATH/include/simdjson/simdutf8check_arm64.h diff --git a/include/simdjson/simd_input.h b/include/simdjson/simd_input.h index 085d89b4..f834c442 100644 --- a/include/simdjson/simd_input.h +++ b/include/simdjson/simd_input.h @@ -10,12 +10,13 @@ namespace simdjson { template struct simd_input; +template +simd_input fill_input(const uint8_t *ptr); + // a straightforward comparison of a mask against input. template uint64_t cmp_mask_against_input(simd_input in, uint8_t m); -template simd_input fill_input(const uint8_t *ptr); - // find all values less than or equal than the content of maxval (using unsigned // arithmetic) template diff --git a/include/simdjson/simdutf8check.h b/include/simdjson/simdutf8check.h new file mode 100644 index 00000000..6097e28e --- /dev/null +++ b/include/simdjson/simdutf8check.h @@ -0,0 +1,21 @@ +#ifndef SIMDJSON_SIMDUTF8CHECK_H +#define SIMDJSON_SIMDUTF8CHECK_H + +#include "simdjson/simdjson.h" +#include "simdjson/simd_input.h" + +namespace simdjson { + +// Holds the state required to perform check_utf8(). +template struct utf8_checking_state; + +template +void check_utf8(simd_input in, utf8_checking_state &state); + +// Checks if the utf8 validation has found any error. 
+template +ErrorValues check_utf8_errors(utf8_checking_state &state); + +} // namespace simdjson + +#endif // SIMDJSON_SIMDUTF8CHECK_H diff --git a/include/simdjson/simdutf8check_arm64.h b/include/simdjson/simdutf8check_arm64.h index 6360b012..4b0baa30 100644 --- a/include/simdjson/simdutf8check_arm64.h +++ b/include/simdjson/simdutf8check_arm64.h @@ -7,6 +7,7 @@ #if defined(_ARM_NEON) || defined(__aarch64__) || \ (defined(_MSC_VER) && defined(_M_ARM64)) +#include "simdjson/simdutf8check.h" #include #include #include @@ -175,6 +176,64 @@ check_utf8_bytes(int8x16_t current_bytes, struct processed_utf_bytes *previous, previous->high_nibbles, has_error); return pb; } + +template <> +struct utf8_checking_state { + int8x16_t has_error{}; + processed_utf_bytes previous{}; +}; + +// Checks that all bytes are ascii +really_inline bool check_ascii_neon(simd_input in) { + // checking if the most significant bit is always equal to 0. + uint8x16_t high_bit = vdupq_n_u8(0x80); + uint8x16_t t0 = vorrq_u8(in.i0, in.i1); + uint8x16_t t1 = vorrq_u8(in.i2, in.i3); + uint8x16_t t3 = vorrq_u8(t0, t1); + uint8x16_t t4 = vandq_u8(t3, high_bit); + uint64x2_t v64 = vreinterpretq_u64_u8(t4); + uint32x2_t v32 = vqmovn_u64(v64); + uint64x1_t result = vreinterpret_u64_u32(v32); + return vget_lane_u64(result, 0) == 0; +} + +template <> +really_inline void check_utf8( + simd_input in, + utf8_checking_state &state) { + if (check_ascii_neon(in)) { + // All bytes are ascii. Therefore the byte that was just before must be + // ascii too. We only check the byte that was just before simd_input. Nines + // are arbitrary values. 
+ const int8x16_t verror = + (int8x16_t){9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1}; + state.has_error = + vorrq_s8(vreinterpretq_s8_u8( + vcgtq_s8(state.previous.carried_continuations, verror)), + state.has_error); + } else { + // it is not ascii so we have to do heavy work + state.previous = check_utf8_bytes(vreinterpretq_s8_u8(in.i0), + &(state.previous), &(state.has_error)); + state.previous = check_utf8_bytes(vreinterpretq_s8_u8(in.i1), + &(state.previous), &(state.has_error)); + state.previous = check_utf8_bytes(vreinterpretq_s8_u8(in.i2), + &(state.previous), &(state.has_error)); + state.previous = check_utf8_bytes(vreinterpretq_s8_u8(in.i3), + &(state.previous), &(state.has_error)); + } +} + +template <> +really_inline ErrorValues check_utf8_errors( + utf8_checking_state &state) { + uint64x2_t v64 = vreinterpretq_u64_s8(state.has_error); + uint32x2_t v32 = vqmovn_u64(v64); + uint64x1_t result = vreinterpret_u64_u32(v32); + return vget_lane_u64(result, 0) != 0 ? simdjson::UTF8_ERROR + : simdjson::SUCCESS; +} + } // namespace simdjson #endif #endif diff --git a/include/simdjson/simdutf8check_haswell.h b/include/simdjson/simdutf8check_haswell.h index 6097af0c..355d6247 100644 --- a/include/simdjson/simdutf8check_haswell.h +++ b/include/simdjson/simdutf8check_haswell.h @@ -2,6 +2,7 @@ #define SIMDJSON_SIMDUTF8CHECK_HASWELL_H #include "simdjson/portability.h" +#include "simdjson/simdutf8check.h" #include #include #include @@ -190,6 +191,48 @@ avx_check_utf8_bytes(__m256i current_bytes, previous->high_nibbles, has_error); return pb; } + +template <> struct utf8_checking_state { + __m256i has_error; + avx_processed_utf_bytes previous; + utf8_checking_state() { + has_error = _mm256_setzero_si256(); + previous.raw_bytes = _mm256_setzero_si256(); + previous.high_nibbles = _mm256_setzero_si256(); + previous.carried_continuations = _mm256_setzero_si256(); + } +}; + +template <> +really_inline void check_utf8( + simd_input in, + utf8_checking_state &state) { + 
__m256i high_bit = _mm256_set1_epi8(0x80u); + if ((_mm256_testz_si256(_mm256_or_si256(in.lo, in.hi), high_bit)) == 1) { + // it is ascii, we just check continuation + state.has_error = _mm256_or_si256( + _mm256_cmpgt_epi8(state.previous.carried_continuations, + _mm256_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 1)), + state.has_error); + } else { + // it is not ascii so we have to do heavy work + state.previous = + avx_check_utf8_bytes(in.lo, &(state.previous), &(state.has_error)); + state.previous = + avx_check_utf8_bytes(in.hi, &(state.previous), &(state.has_error)); + } +} + +template <> +really_inline ErrorValues check_utf8_errors( + utf8_checking_state &state) { + return _mm256_testz_si256(state.has_error, state.has_error) == 0 + ? simdjson::UTF8_ERROR + : simdjson::SUCCESS; +} + } // namespace simdjson UNTARGET_REGION // haswell diff --git a/include/simdjson/simdutf8check_westmere.h b/include/simdjson/simdutf8check_westmere.h index cf57fec9..46361cdb 100644 --- a/include/simdjson/simdutf8check_westmere.h +++ b/include/simdjson/simdutf8check_westmere.h @@ -2,6 +2,7 @@ #define SIMDJSON_SIMDUTF8CHECK_WESTMERE_H #include "simdjson/portability.h" +#include "simdjson/simdutf8check.h" #include #include #include @@ -161,6 +162,61 @@ check_utf8_bytes(__m128i current_bytes, struct processed_utf_bytes *previous, previous->high_nibbles, has_error); return pb; } + +template <> +struct utf8_checking_state { + __m128i has_error = _mm_setzero_si128(); + processed_utf_bytes previous{ + _mm_setzero_si128(), // raw_bytes + _mm_setzero_si128(), // high_nibbles + _mm_setzero_si128() // carried_continuations + }; +}; + +template <> +really_inline void check_utf8( + simd_input in, + utf8_checking_state &state) { + __m128i high_bit = _mm_set1_epi8(0x80u); + if ((_mm_testz_si128(_mm_or_si128(in.v0, in.v1), high_bit)) == 1) { + // it is ascii, we just check continuation + state.has_error = + 
_mm_or_si128(_mm_cmpgt_epi8(state.previous.carried_continuations, + _mm_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 1)), + state.has_error); + } else { + // it is not ascii so we have to do heavy work + state.previous = + check_utf8_bytes(in.v0, &(state.previous), &(state.has_error)); + state.previous = + check_utf8_bytes(in.v1, &(state.previous), &(state.has_error)); + } + + if ((_mm_testz_si128(_mm_or_si128(in.v2, in.v3), high_bit)) == 1) { + // it is ascii, we just check continuation + state.has_error = + _mm_or_si128(_mm_cmpgt_epi8(state.previous.carried_continuations, + _mm_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 1)), + state.has_error); + } else { + // it is not ascii so we have to do heavy work + state.previous = + check_utf8_bytes(in.v2, &(state.previous), &(state.has_error)); + state.previous = + check_utf8_bytes(in.v3, &(state.previous), &(state.has_error)); + } +} + +template <> +really_inline ErrorValues check_utf8_errors( + utf8_checking_state &state) { + return _mm_testz_si128(state.has_error, state.has_error) == 0 + ? simdjson::UTF8_ERROR + : simdjson::SUCCESS; +} + } // namespace simdjson UNTARGET_REGION // westmere diff --git a/include/simdjson/stage1_find_marks_arm64.h b/include/simdjson/stage1_find_marks_arm64.h index 412ef849..3edeaaa2 100644 --- a/include/simdjson/stage1_find_marks_arm64.h +++ b/include/simdjson/stage1_find_marks_arm64.h @@ -18,63 +18,6 @@ compute_quote_mask(uint64_t quote_bits) { #endif } -template <> -struct utf8_checking_state { - int8x16_t has_error{}; - processed_utf_bytes previous{}; -}; - -// Checks that all bytes are ascii -really_inline bool check_ascii_neon(simd_input in) { - // checking if the most significant bit is always equal to 0. 
- uint8x16_t high_bit = vdupq_n_u8(0x80); - uint8x16_t t0 = vorrq_u8(in.i0, in.i1); - uint8x16_t t1 = vorrq_u8(in.i2, in.i3); - uint8x16_t t3 = vorrq_u8(t0, t1); - uint8x16_t t4 = vandq_u8(t3, high_bit); - uint64x2_t v64 = vreinterpretq_u64_u8(t4); - uint32x2_t v32 = vqmovn_u64(v64); - uint64x1_t result = vreinterpret_u64_u32(v32); - return vget_lane_u64(result, 0) == 0; -} - -template <> -really_inline void check_utf8( - simd_input in, - utf8_checking_state &state) { - if (check_ascii_neon(in)) { - // All bytes are ascii. Therefore the byte that was just before must be - // ascii too. We only check the byte that was just before simd_input. Nines - // are arbitrary values. - const int8x16_t verror = - (int8x16_t){9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1}; - state.has_error = - vorrq_s8(vreinterpretq_s8_u8( - vcgtq_s8(state.previous.carried_continuations, verror)), - state.has_error); - } else { - // it is not ascii so we have to do heavy work - state.previous = check_utf8_bytes(vreinterpretq_s8_u8(in.i0), - &(state.previous), &(state.has_error)); - state.previous = check_utf8_bytes(vreinterpretq_s8_u8(in.i1), - &(state.previous), &(state.has_error)); - state.previous = check_utf8_bytes(vreinterpretq_s8_u8(in.i2), - &(state.previous), &(state.has_error)); - state.previous = check_utf8_bytes(vreinterpretq_s8_u8(in.i3), - &(state.previous), &(state.has_error)); - } -} - -template <> -really_inline ErrorValues check_utf8_errors( - utf8_checking_state &state) { - uint64x2_t v64 = vreinterpretq_u64_s8(state.has_error); - uint32x2_t v32 = vqmovn_u64(v64); - uint64x1_t result = vreinterpret_u64_u32(v32); - return vget_lane_u64(result, 0) != 0 ? 
simdjson::UTF8_ERROR - : simdjson::SUCCESS; -} - template <> really_inline void find_whitespace_and_structurals( simd_input in, uint64_t &whitespace, diff --git a/include/simdjson/stage1_find_marks_haswell.h b/include/simdjson/stage1_find_marks_haswell.h index ae589e66..c43f33c3 100644 --- a/include/simdjson/stage1_find_marks_haswell.h +++ b/include/simdjson/stage1_find_marks_haswell.h @@ -20,47 +20,6 @@ compute_quote_mask(uint64_t quote_bits) { return quote_mask; } -template <> struct utf8_checking_state { - __m256i has_error; - avx_processed_utf_bytes previous; - utf8_checking_state() { - has_error = _mm256_setzero_si256(); - previous.raw_bytes = _mm256_setzero_si256(); - previous.high_nibbles = _mm256_setzero_si256(); - previous.carried_continuations = _mm256_setzero_si256(); - } -}; - -template <> -really_inline void check_utf8( - simd_input in, - utf8_checking_state &state) { - __m256i high_bit = _mm256_set1_epi8(0x80u); - if ((_mm256_testz_si256(_mm256_or_si256(in.lo, in.hi), high_bit)) == 1) { - // it is ascii, we just check continuation - state.has_error = _mm256_or_si256( - _mm256_cmpgt_epi8(state.previous.carried_continuations, - _mm256_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 1)), - state.has_error); - } else { - // it is not ascii so we have to do heavy work - state.previous = - avx_check_utf8_bytes(in.lo, &(state.previous), &(state.has_error)); - state.previous = - avx_check_utf8_bytes(in.hi, &(state.previous), &(state.has_error)); - } -} - -template <> -really_inline ErrorValues check_utf8_errors( - utf8_checking_state &state) { - return _mm256_testz_si256(state.has_error, state.has_error) == 0 - ? 
simdjson::UTF8_ERROR - : simdjson::SUCCESS; -} - template <> really_inline void find_whitespace_and_structurals( simd_input in, uint64_t &whitespace, diff --git a/include/simdjson/stage1_find_marks_westmere.h b/include/simdjson/stage1_find_marks_westmere.h index 7336a2a7..082c8db2 100644 --- a/include/simdjson/stage1_find_marks_westmere.h +++ b/include/simdjson/stage1_find_marks_westmere.h @@ -17,59 +17,6 @@ compute_quote_mask(uint64_t quote_bits) { _mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFFu), 0)); } -template <> struct utf8_checking_state { - __m128i has_error = _mm_setzero_si128(); - processed_utf_bytes previous{ - _mm_setzero_si128(), // raw_bytes - _mm_setzero_si128(), // high_nibbles - _mm_setzero_si128() // carried_continuations - }; -}; - -template <> -really_inline void check_utf8( - simd_input in, - utf8_checking_state &state) { - __m128i high_bit = _mm_set1_epi8(0x80u); - if ((_mm_testz_si128(_mm_or_si128(in.v0, in.v1), high_bit)) == 1) { - // it is ascii, we just check continuation - state.has_error = - _mm_or_si128(_mm_cmpgt_epi8(state.previous.carried_continuations, - _mm_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 1)), - state.has_error); - } else { - // it is not ascii so we have to do heavy work - state.previous = - check_utf8_bytes(in.v0, &(state.previous), &(state.has_error)); - state.previous = - check_utf8_bytes(in.v1, &(state.previous), &(state.has_error)); - } - - if ((_mm_testz_si128(_mm_or_si128(in.v2, in.v3), high_bit)) == 1) { - // it is ascii, we just check continuation - state.has_error = - _mm_or_si128(_mm_cmpgt_epi8(state.previous.carried_continuations, - _mm_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 1)), - state.has_error); - } else { - // it is not ascii so we have to do heavy work - state.previous = - check_utf8_bytes(in.v2, &(state.previous), &(state.has_error)); - state.previous = - check_utf8_bytes(in.v3, &(state.previous), &(state.has_error)); - } -} - -template <> -really_inline 
ErrorValues check_utf8_errors( - utf8_checking_state &state) { - return _mm_testz_si128(state.has_error, state.has_error) == 0 - ? simdjson::UTF8_ERROR - : simdjson::SUCCESS; -} - template <> really_inline void find_whitespace_and_structurals( simd_input in, uint64_t &whitespace, diff --git a/singleheader/amalgamation_demo.cpp b/singleheader/amalgamation_demo.cpp index 789e0b52..71b1e9c9 100644 --- a/singleheader/amalgamation_demo.cpp +++ b/singleheader/amalgamation_demo.cpp @@ -1,4 +1,4 @@ -/* auto-generated on Sun Aug 4 15:43:41 EDT 2019. Do not edit! */ +/* auto-generated on Wed Aug 14 10:31:26 DST 2019. Do not edit! */ #include #include "simdjson.h" diff --git a/singleheader/simdjson.cpp b/singleheader/simdjson.cpp index 63a7349a..ed000f63 100644 --- a/singleheader/simdjson.cpp +++ b/singleheader/simdjson.cpp @@ -1,4 +1,4 @@ -/* auto-generated on Sun Aug 4 15:43:41 EDT 2019. Do not edit! */ +/* auto-generated on Wed Aug 14 10:31:26 DST 2019. Do not edit! */ #include "simdjson.h" /* used for http://dmalloc.com/ Dmalloc - Debug Malloc Library */ @@ -359,6 +359,7 @@ size_t json_minify(const uint8_t *buf, size_t len, uint8_t *out) { #endif /* end file src/jsonminifier.cpp */ /* begin file src/jsonparser.cpp */ +#include namespace simdjson { @@ -368,21 +369,21 @@ namespace simdjson { // function pointer type for json_parse using json_parse_functype = int(const uint8_t *buf, size_t len, ParsedJson &pj, - bool realloc_if_needed); + bool realloc); // Pointer that holds the json_parse implementation corresponding to the // available SIMD instruction set -extern json_parse_functype *json_parse_ptr; +extern std::atomic json_parse_ptr; int json_parse(const uint8_t *buf, size_t len, ParsedJson &pj, - bool realloc_if_needed) { - return json_parse_ptr(buf, len, pj, realloc_if_needed); + bool realloc) { + return json_parse_ptr.load(std::memory_order_relaxed)(buf, len, pj, realloc); } int json_parse(const char *buf, size_t len, ParsedJson &pj, - bool realloc_if_needed) { - 
return json_parse_ptr(reinterpret_cast(buf), len, pj, - realloc_if_needed); + bool realloc) { + return json_parse_ptr.load(std::memory_order_relaxed)(reinterpret_cast(buf), len, pj, + realloc); } Architecture find_best_supported_implementation() { @@ -406,21 +407,21 @@ Architecture find_best_supported_implementation() { // Responsible to select the best json_parse implementation int json_parse_dispatch(const uint8_t *buf, size_t len, ParsedJson &pj, - bool realloc_if_needed) { + bool realloc) { Architecture best_implementation = find_best_supported_implementation(); // Selecting the best implementation switch (best_implementation) { #ifdef IS_X86_64 case Architecture::HASWELL: - json_parse_ptr = &json_parse_implementation; + json_parse_ptr.store(&json_parse_implementation, std::memory_order_relaxed); break; case Architecture::WESTMERE: - json_parse_ptr = &json_parse_implementation; + json_parse_ptr.store(&json_parse_implementation, std::memory_order_relaxed); break; #endif #ifdef IS_ARM64 case Architecture::ARM64: - json_parse_ptr = &json_parse_implementation; + json_parse_ptr.store(&json_parse_implementation, std::memory_order_relaxed); break; #endif default: @@ -428,18 +429,18 @@ int json_parse_dispatch(const uint8_t *buf, size_t len, ParsedJson &pj, return simdjson::UNEXPECTED_ERROR; } - return json_parse_ptr(buf, len, pj, realloc_if_needed); + return json_parse_ptr.load(std::memory_order_relaxed)(buf, len, pj, realloc); } -json_parse_functype *json_parse_ptr = &json_parse_dispatch; +std::atomic json_parse_ptr = &json_parse_dispatch; WARN_UNUSED ParsedJson build_parsed_json(const uint8_t *buf, size_t len, - bool realloc_if_needed) { + bool realloc) { ParsedJson pj; bool ok = pj.allocate_capacity(len); if (ok) { - json_parse(buf, len, pj, realloc_if_needed); + json_parse(buf, len, pj, realloc); } else { std::cerr << "failure during memory allocation " << std::endl; } @@ -447,47 +448,1044 @@ ParsedJson build_parsed_json(const uint8_t *buf, size_t len, } } // 
namespace simdjson /* end file src/jsonparser.cpp */ -/* begin file src/stage1_find_marks.cpp */ +/* begin file include/simdjson/stage1_find_marks_flatten_haswell.h */ +// This file provides the same function as +// stage1_find_marks_flatten_common.h, but uses Intel intrinsics. +// This should provide better performance on Visual Studio +// and other compilers that do a conservative optimization. + +// Specifically, on x64 processors with BMI, +// x & (x - 1) should be mapped to +// the blsr instruction. By using the +// _blsr_u64 intrinsic, we +// ensure that this will happen. +///////// + #ifdef IS_X86_64 TARGET_HASWELL namespace simdjson { -template <> -int find_structural_bits(const uint8_t *buf, size_t len, - ParsedJson &pj) { - FIND_STRUCTURAL_BITS(Architecture::HASWELL, buf, len, pj, - simdjson::haswell::flatten_bits); + +// flatten out values in 'bits' assuming that they are are to have values of idx +// plus their position in the bitvector, and store these indexes at +// base_ptr[base] incrementing base as we go +// will potentially store extra values beyond end of valid bits, so base_ptr +// needs to be large enough to handle this +template<> +really_inline void flatten_bits(uint32_t *base_ptr, uint32_t &base, + uint32_t idx, uint64_t bits) { + // In some instances, the next branch is expensive because it is mispredicted. + // Unfortunately, in other cases, + // it helps tremendously. 
+ if (bits == 0) + return; + uint32_t cnt = _mm_popcnt_u64(bits); + uint32_t next_base = base + cnt; + idx -= 64; + base_ptr += base; + { + base_ptr[0] = idx + trailing_zeroes(bits); + bits = _blsr_u64(bits); + base_ptr[1] = idx + trailing_zeroes(bits); + bits = _blsr_u64(bits); + base_ptr[2] = idx + trailing_zeroes(bits); + bits = _blsr_u64(bits); + base_ptr[3] = idx + trailing_zeroes(bits); + bits = _blsr_u64(bits); + base_ptr[4] = idx + trailing_zeroes(bits); + bits = _blsr_u64(bits); + base_ptr[5] = idx + trailing_zeroes(bits); + bits = _blsr_u64(bits); + base_ptr[6] = idx + trailing_zeroes(bits); + bits = _blsr_u64(bits); + base_ptr[7] = idx + trailing_zeroes(bits); + bits = _blsr_u64(bits); + base_ptr += 8; + } + // We hope that the next branch is easily predicted. + if (cnt > 8) { + base_ptr[0] = idx + trailing_zeroes(bits); + bits = _blsr_u64(bits); + base_ptr[1] = idx + trailing_zeroes(bits); + bits = _blsr_u64(bits); + base_ptr[2] = idx + trailing_zeroes(bits); + bits = _blsr_u64(bits); + base_ptr[3] = idx + trailing_zeroes(bits); + bits = _blsr_u64(bits); + base_ptr[4] = idx + trailing_zeroes(bits); + bits = _blsr_u64(bits); + base_ptr[5] = idx + trailing_zeroes(bits); + bits = _blsr_u64(bits); + base_ptr[6] = idx + trailing_zeroes(bits); + bits = _blsr_u64(bits); + base_ptr[7] = idx + trailing_zeroes(bits); + bits = _blsr_u64(bits); + base_ptr += 8; + } + if (cnt > 16) { // unluckly: we rarely get here + // since it means having one structural or pseudo-structral element + // every 4 characters (possible with inputs like "","","",...). 
+ do { + base_ptr[0] = idx + trailing_zeroes(bits); + bits = _blsr_u64(bits); + base_ptr++; + } while (bits != 0); + } + base = next_base; } } // namespace simdjson UNTARGET_REGION +#endif // IS_X86_64 +/* end file include/simdjson/stage1_find_marks_flatten_haswell.h */ +/* begin file src/stage1_find_marks.cpp */ -TARGET_WESTMERE +#ifdef IS_X86_64 + +#define TARGETED_ARCHITECTURE Architecture::HASWELL +#define TARGETED_REGION TARGET_HASWELL +// This file contains the common code every implementation uses in stage1 +// It is intended to be included multiple times and compiled multiple times +// We assume the file in which it is include already includes +// "simdjson/stage1_find_marks.h" (this simplifies amalgation) + +#ifdef TARGETED_ARCHITECTURE +#ifdef TARGETED_REGION + +TARGETED_REGION namespace simdjson { + +// return a bitvector indicating where we have characters that end an odd-length +// sequence of backslashes (and thus change the behavior of the next character +// to follow). A even-length sequence of backslashes, and, for that matter, the +// largest even-length prefix of our odd-length sequence of backslashes, simply +// modify the behavior of the backslashes themselves. +// We also update the prev_iter_ends_odd_backslash reference parameter to +// indicate whether we end an iteration on an odd-length sequence of +// backslashes, which modifies our subsequent search for odd-length +// sequences of backslashes in an obvious way. 
template <> -int find_structural_bits(const uint8_t *buf, size_t len, - ParsedJson &pj) { - FIND_STRUCTURAL_BITS(Architecture::WESTMERE, buf, len, pj, - simdjson::flatten_bits); +really_inline uint64_t find_odd_backslash_sequences( + simd_input in, + uint64_t &prev_iter_ends_odd_backslash) { + const uint64_t even_bits = 0x5555555555555555ULL; + const uint64_t odd_bits = ~even_bits; + uint64_t bs_bits = cmp_mask_against_input(in, '\\'); + uint64_t start_edges = bs_bits & ~(bs_bits << 1); + /* flip lowest if we have an odd-length run at the end of the prior + * iteration */ + uint64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash; + uint64_t even_starts = start_edges & even_start_mask; + uint64_t odd_starts = start_edges & ~even_start_mask; + uint64_t even_carries = bs_bits + even_starts; + + uint64_t odd_carries; + /* must record the carry-out of our odd-carries out of bit 63; this + * indicates whether the sense of any edge going to the next iteration + * should be flipped */ + bool iter_ends_odd_backslash = + add_overflow(bs_bits, odd_starts, &odd_carries); + + odd_carries |= prev_iter_ends_odd_backslash; /* push in bit zero as a + * potential end if we had an + * odd-numbered run at the + * end of the previous + * iteration */ + prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL; + uint64_t even_carry_ends = even_carries & ~bs_bits; + uint64_t odd_carry_ends = odd_carries & ~bs_bits; + uint64_t even_start_odd_end = even_carry_ends & odd_bits; + uint64_t odd_start_even_end = odd_carry_ends & even_bits; + uint64_t odd_ends = even_start_odd_end | odd_start_even_end; + return odd_ends; } + +// return both the quote mask (which is a half-open mask that covers the first +// quote +// in an unescaped quote pair and everything in the quote pair) and the quote +// bits, which are the simple +// unescaped quoted bits. 
We also update the prev_iter_inside_quote value to +// tell the next iteration +// whether we finished the final iteration inside a quote pair; if so, this +// inverts our behavior of +// whether we're inside quotes for the next iteration. +// Note that we don't do any error checking to see if we have backslash +// sequences outside quotes; these +// backslash sequences (of any length) will be detected elsewhere. +template <> +really_inline uint64_t find_quote_mask_and_bits( + simd_input in, uint64_t odd_ends, + uint64_t &prev_iter_inside_quote, uint64_t "e_bits, + uint64_t &error_mask) { + quote_bits = cmp_mask_against_input(in, '"'); + quote_bits = quote_bits & ~odd_ends; + uint64_t quote_mask = compute_quote_mask(quote_bits); + quote_mask ^= prev_iter_inside_quote; + /* All Unicode characters may be placed within the + * quotation marks, except for the characters that MUST be escaped: + * quotation mark, reverse solidus, and the control characters (U+0000 + * through U+001F). + * https://tools.ietf.org/html/rfc8259 */ + uint64_t unescaped = + unsigned_lteq_against_input(in, 0x1F); + error_mask |= quote_mask & unescaped; + /* right shift of a signed value expected to be well-defined and standard + * compliant as of C++20, + * John Regher from Utah U. says this is fine code */ + prev_iter_inside_quote = + static_cast(static_cast(quote_mask) >> 63); + return quote_mask; +} + +// Find structural bits in a 64-byte chunk. 
+really_inline void find_structural_bits_64( + const uint8_t *buf, size_t idx, uint32_t *base_ptr, uint32_t &base, + uint64_t &prev_iter_ends_odd_backslash, uint64_t &prev_iter_inside_quote, + uint64_t &prev_iter_ends_pseudo_pred, uint64_t &structurals, + uint64_t &error_mask, + utf8_checking_state &utf8_state) { + simd_input in = fill_input(buf); + check_utf8(in, utf8_state); + /* detect odd sequences of backslashes */ + uint64_t odd_ends = find_odd_backslash_sequences( + in, prev_iter_ends_odd_backslash); + + /* detect insides of quote pairs ("quote_mask") and also our quote_bits + * themselves */ + uint64_t quote_bits; + uint64_t quote_mask = find_quote_mask_and_bits( + in, odd_ends, prev_iter_inside_quote, quote_bits, error_mask); + + /* take the previous iterations structural bits, not our current + * iteration, + * and flatten */ + flatten_bits(base_ptr, base, idx, structurals); + + uint64_t whitespace; + find_whitespace_and_structurals(in, whitespace, + structurals); + + /* fixup structurals to reflect quotes and add pseudo-structural + * characters */ + structurals = finalize_structurals(structurals, whitespace, quote_mask, + quote_bits, prev_iter_ends_pseudo_pred); +} + +template <> +int find_structural_bits(const uint8_t *buf, size_t len, + ParsedJson &pj) { + if (len > pj.byte_capacity) { + std::cerr << "Your ParsedJson object only supports documents up to " + << pj.byte_capacity << " bytes but you are trying to process " + << len << " bytes" << std::endl; + return simdjson::CAPACITY; + } + uint32_t *base_ptr = pj.structural_indexes; + uint32_t base = 0; + utf8_checking_state utf8_state; + + /* we have padded the input out to 64 byte multiple with the remainder + * being zeros persistent state across loop does the last iteration end + * with an odd-length sequence of backslashes? */ + + /* either 0 or 1, but a 64-bit value */ + uint64_t prev_iter_ends_odd_backslash = 0ULL; + /* does the previous iteration end inside a double-quote pair? 
*/ + uint64_t prev_iter_inside_quote = + 0ULL; /* either all zeros or all ones + * does the previous iteration end on something that is a + * predecessor of a pseudo-structural character - i.e. + * whitespace or a structural character effectively the very + * first char is considered to follow "whitespace" for the + * purposes of pseudo-structural character detection so we + * initialize to 1 */ + uint64_t prev_iter_ends_pseudo_pred = 1ULL; + + /* structurals are persistent state across loop as we flatten them on the + * subsequent iteration into our array pointed to be base_ptr. + * This is harmless on the first iteration as structurals==0 + * and is done for performance reasons; we can hide some of the latency of + * the + * expensive carryless multiply in the previous step with this work */ + uint64_t structurals = 0; + + size_t lenminus64 = len < 64 ? 0 : len - 64; + size_t idx = 0; + uint64_t error_mask = 0; /* for unescaped characters within strings (ASCII + code points < 0x20) */ + + for (; idx < lenminus64; idx += 64) { + find_structural_bits_64(&buf[idx], idx, base_ptr, base, + prev_iter_ends_odd_backslash, + prev_iter_inside_quote, prev_iter_ends_pseudo_pred, + structurals, error_mask, utf8_state); + } + /* If we have a final chunk of less than 64 bytes, pad it to 64 with + * spaces before processing it (otherwise, we risk invalidating the UTF-8 + * checks). */ + if (idx < len) { + uint8_t tmp_buf[64]; + memset(tmp_buf, 0x20, 64); + memcpy(tmp_buf, buf + idx, len - idx); + find_structural_bits_64(&tmp_buf[0], idx, base_ptr, base, + prev_iter_ends_odd_backslash, + prev_iter_inside_quote, prev_iter_ends_pseudo_pred, + structurals, error_mask, utf8_state); + idx += 64; + } + + /* is last string quote closed? 
*/ + if (prev_iter_inside_quote) { + return simdjson::UNCLOSED_STRING; + } + + /* finally, flatten out the remaining structurals from the last iteration + */ + flatten_bits(base_ptr, base, idx, structurals); + + pj.n_structural_indexes = base; + /* a valid JSON file cannot have zero structural indexes - we should have + * found something */ + if (pj.n_structural_indexes == 0u) { + return simdjson::EMPTY; + } + if (base_ptr[pj.n_structural_indexes - 1] > len) { + return simdjson::UNEXPECTED_ERROR; + } + if (len != base_ptr[pj.n_structural_indexes - 1]) { + /* the string might not be NULL terminated, but we add a virtual NULL + * ending + * character. */ + base_ptr[pj.n_structural_indexes++] = len; + } + /* make it safe to dereference one beyond this array */ + base_ptr[pj.n_structural_indexes] = 0; + if (error_mask) { + return simdjson::UNESCAPED_CHARS; + } + return check_utf8_errors(utf8_state); +} + } // namespace simdjson UNTARGET_REGION -#endif +#else +#error TARGETED_REGION must be specified before including. +#endif // TARGETED_REGION +#else +#error TARGETED_ARCHITECTURE must be specified before including. +#endif // TARGETED_ARCHITECTURE +#undef TARGETED_ARCHITECTURE +#undef TARGETED_REGION + +#define TARGETED_ARCHITECTURE Architecture::WESTMERE +#define TARGETED_REGION TARGET_WESTMERE +// This file contains a non-architecture-specific version of "flatten" used in stage1. +// It is intended to be included multiple times and compiled multiple times +// We assume the file in which it is include already includes +// "simdjson/stage1_find_marks.h" (this simplifies amalgation) + +#ifdef TARGETED_ARCHITECTURE +#ifdef TARGETED_REGION + +TARGETED_REGION +namespace simdjson { + +#ifdef SIMDJSON_NAIVE_FLATTEN // useful for benchmarking +// +// This is just a naive implementation. It should be normally +// disable, but can be used for research purposes to compare +// again our optimized version. 
+template <> +really_inline void flatten_bits(uint32_t *base_ptr, uint32_t &base, + uint32_t idx, uint64_t bits) { + uint32_t *out_ptr = base_ptr + base; + idx -= 64; + while (bits != 0) { + out_ptr[0] = idx + trailing_zeroes(bits); + bits = bits & (bits - 1); + out_ptr++; + } + base = (out_ptr - base_ptr); +} + +#else +// flatten out values in 'bits' assuming that they are are to have values of idx +// plus their position in the bitvector, and store these indexes at +// base_ptr[base] incrementing base as we go +// will potentially store extra values beyond end of valid bits, so base_ptr +// needs to be large enough to handle this +template<> +really_inline void flatten_bits(uint32_t *base_ptr, uint32_t &base, + uint32_t idx, uint64_t bits) { + // In some instances, the next branch is expensive because it is mispredicted. + // Unfortunately, in other cases, + // it helps tremendously. + if (bits == 0) + return; + uint32_t cnt = hamming(bits); + uint32_t next_base = base + cnt; + idx -= 64; + base_ptr += base; + { + base_ptr[0] = idx + trailing_zeroes(bits); + bits = bits & (bits - 1); + base_ptr[1] = idx + trailing_zeroes(bits); + bits = bits & (bits - 1); + base_ptr[2] = idx + trailing_zeroes(bits); + bits = bits & (bits - 1); + base_ptr[3] = idx + trailing_zeroes(bits); + bits = bits & (bits - 1); + base_ptr[4] = idx + trailing_zeroes(bits); + bits = bits & (bits - 1); + base_ptr[5] = idx + trailing_zeroes(bits); + bits = bits & (bits - 1); + base_ptr[6] = idx + trailing_zeroes(bits); + bits = bits & (bits - 1); + base_ptr[7] = idx + trailing_zeroes(bits); + bits = bits & (bits - 1); + base_ptr += 8; + } + // We hope that the next branch is easily predicted. 
+ if (cnt > 8) { + base_ptr[0] = idx + trailing_zeroes(bits); + bits = bits & (bits - 1); + base_ptr[1] = idx + trailing_zeroes(bits); + bits = bits & (bits - 1); + base_ptr[2] = idx + trailing_zeroes(bits); + bits = bits & (bits - 1); + base_ptr[3] = idx + trailing_zeroes(bits); + bits = bits & (bits - 1); + base_ptr[4] = idx + trailing_zeroes(bits); + bits = bits & (bits - 1); + base_ptr[5] = idx + trailing_zeroes(bits); + bits = bits & (bits - 1); + base_ptr[6] = idx + trailing_zeroes(bits); + bits = bits & (bits - 1); + base_ptr[7] = idx + trailing_zeroes(bits); + bits = bits & (bits - 1); + base_ptr += 8; + } + if (cnt > 16) { // unluckly: we rarely get here + // since it means having one structural or pseudo-structral element + // every 4 characters (possible with inputs like "","","",...). + do { + base_ptr[0] = idx + trailing_zeroes(bits); + bits = bits & (bits - 1); + base_ptr++; + } while (bits != 0); + } + base = next_base; +} +#endif // SIMDJSON_NAIVE_FLATTEN + +} // namespace simdjson +UNTARGET_REGION + +#else +#error TARGETED_REGION must be specified before including. +#endif // TARGETED_REGION +#else +#error TARGETED_ARCHITECTURE must be specified before including. +#endif // TARGETED_ARCHITECTURE +// This file contains the common code every implementation uses in stage1 +// It is intended to be included multiple times and compiled multiple times +// We assume the file in which it is include already includes +// "simdjson/stage1_find_marks.h" (this simplifies amalgation) + +#ifdef TARGETED_ARCHITECTURE +#ifdef TARGETED_REGION + +TARGETED_REGION +namespace simdjson { + +// return a bitvector indicating where we have characters that end an odd-length +// sequence of backslashes (and thus change the behavior of the next character +// to follow). A even-length sequence of backslashes, and, for that matter, the +// largest even-length prefix of our odd-length sequence of backslashes, simply +// modify the behavior of the backslashes themselves. 
+// We also update the prev_iter_ends_odd_backslash reference parameter to +// indicate whether we end an iteration on an odd-length sequence of +// backslashes, which modifies our subsequent search for odd-length +// sequences of backslashes in an obvious way. +template <> +really_inline uint64_t find_odd_backslash_sequences( + simd_input in, + uint64_t &prev_iter_ends_odd_backslash) { + const uint64_t even_bits = 0x5555555555555555ULL; + const uint64_t odd_bits = ~even_bits; + uint64_t bs_bits = cmp_mask_against_input(in, '\\'); + uint64_t start_edges = bs_bits & ~(bs_bits << 1); + /* flip lowest if we have an odd-length run at the end of the prior + * iteration */ + uint64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash; + uint64_t even_starts = start_edges & even_start_mask; + uint64_t odd_starts = start_edges & ~even_start_mask; + uint64_t even_carries = bs_bits + even_starts; + + uint64_t odd_carries; + /* must record the carry-out of our odd-carries out of bit 63; this + * indicates whether the sense of any edge going to the next iteration + * should be flipped */ + bool iter_ends_odd_backslash = + add_overflow(bs_bits, odd_starts, &odd_carries); + + odd_carries |= prev_iter_ends_odd_backslash; /* push in bit zero as a + * potential end if we had an + * odd-numbered run at the + * end of the previous + * iteration */ + prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL; + uint64_t even_carry_ends = even_carries & ~bs_bits; + uint64_t odd_carry_ends = odd_carries & ~bs_bits; + uint64_t even_start_odd_end = even_carry_ends & odd_bits; + uint64_t odd_start_even_end = odd_carry_ends & even_bits; + uint64_t odd_ends = even_start_odd_end | odd_start_even_end; + return odd_ends; +} + +// return both the quote mask (which is a half-open mask that covers the first +// quote +// in an unescaped quote pair and everything in the quote pair) and the quote +// bits, which are the simple +// unescaped quoted bits. 
We also update the prev_iter_inside_quote value to +// tell the next iteration +// whether we finished the final iteration inside a quote pair; if so, this +// inverts our behavior of +// whether we're inside quotes for the next iteration. +// Note that we don't do any error checking to see if we have backslash +// sequences outside quotes; these +// backslash sequences (of any length) will be detected elsewhere. +template <> +really_inline uint64_t find_quote_mask_and_bits( + simd_input in, uint64_t odd_ends, + uint64_t &prev_iter_inside_quote, uint64_t "e_bits, + uint64_t &error_mask) { + quote_bits = cmp_mask_against_input(in, '"'); + quote_bits = quote_bits & ~odd_ends; + uint64_t quote_mask = compute_quote_mask(quote_bits); + quote_mask ^= prev_iter_inside_quote; + /* All Unicode characters may be placed within the + * quotation marks, except for the characters that MUST be escaped: + * quotation mark, reverse solidus, and the control characters (U+0000 + * through U+001F). + * https://tools.ietf.org/html/rfc8259 */ + uint64_t unescaped = + unsigned_lteq_against_input(in, 0x1F); + error_mask |= quote_mask & unescaped; + /* right shift of a signed value expected to be well-defined and standard + * compliant as of C++20, + * John Regher from Utah U. says this is fine code */ + prev_iter_inside_quote = + static_cast(static_cast(quote_mask) >> 63); + return quote_mask; +} + +// Find structural bits in a 64-byte chunk. 
+really_inline void find_structural_bits_64( + const uint8_t *buf, size_t idx, uint32_t *base_ptr, uint32_t &base, + uint64_t &prev_iter_ends_odd_backslash, uint64_t &prev_iter_inside_quote, + uint64_t &prev_iter_ends_pseudo_pred, uint64_t &structurals, + uint64_t &error_mask, + utf8_checking_state &utf8_state) { + simd_input in = fill_input(buf); + check_utf8(in, utf8_state); + /* detect odd sequences of backslashes */ + uint64_t odd_ends = find_odd_backslash_sequences( + in, prev_iter_ends_odd_backslash); + + /* detect insides of quote pairs ("quote_mask") and also our quote_bits + * themselves */ + uint64_t quote_bits; + uint64_t quote_mask = find_quote_mask_and_bits( + in, odd_ends, prev_iter_inside_quote, quote_bits, error_mask); + + /* take the previous iterations structural bits, not our current + * iteration, + * and flatten */ + flatten_bits(base_ptr, base, idx, structurals); + + uint64_t whitespace; + find_whitespace_and_structurals(in, whitespace, + structurals); + + /* fixup structurals to reflect quotes and add pseudo-structural + * characters */ + structurals = finalize_structurals(structurals, whitespace, quote_mask, + quote_bits, prev_iter_ends_pseudo_pred); +} + +template <> +int find_structural_bits(const uint8_t *buf, size_t len, + ParsedJson &pj) { + if (len > pj.byte_capacity) { + std::cerr << "Your ParsedJson object only supports documents up to " + << pj.byte_capacity << " bytes but you are trying to process " + << len << " bytes" << std::endl; + return simdjson::CAPACITY; + } + uint32_t *base_ptr = pj.structural_indexes; + uint32_t base = 0; + utf8_checking_state utf8_state; + + /* we have padded the input out to 64 byte multiple with the remainder + * being zeros persistent state across loop does the last iteration end + * with an odd-length sequence of backslashes? */ + + /* either 0 or 1, but a 64-bit value */ + uint64_t prev_iter_ends_odd_backslash = 0ULL; + /* does the previous iteration end inside a double-quote pair? 
*/ + uint64_t prev_iter_inside_quote = + 0ULL; /* either all zeros or all ones + * does the previous iteration end on something that is a + * predecessor of a pseudo-structural character - i.e. + * whitespace or a structural character effectively the very + * first char is considered to follow "whitespace" for the + * purposes of pseudo-structural character detection so we + * initialize to 1 */ + uint64_t prev_iter_ends_pseudo_pred = 1ULL; + + /* structurals are persistent state across loop as we flatten them on the + * subsequent iteration into our array pointed to be base_ptr. + * This is harmless on the first iteration as structurals==0 + * and is done for performance reasons; we can hide some of the latency of + * the + * expensive carryless multiply in the previous step with this work */ + uint64_t structurals = 0; + + size_t lenminus64 = len < 64 ? 0 : len - 64; + size_t idx = 0; + uint64_t error_mask = 0; /* for unescaped characters within strings (ASCII + code points < 0x20) */ + + for (; idx < lenminus64; idx += 64) { + find_structural_bits_64(&buf[idx], idx, base_ptr, base, + prev_iter_ends_odd_backslash, + prev_iter_inside_quote, prev_iter_ends_pseudo_pred, + structurals, error_mask, utf8_state); + } + /* If we have a final chunk of less than 64 bytes, pad it to 64 with + * spaces before processing it (otherwise, we risk invalidating the UTF-8 + * checks). */ + if (idx < len) { + uint8_t tmp_buf[64]; + memset(tmp_buf, 0x20, 64); + memcpy(tmp_buf, buf + idx, len - idx); + find_structural_bits_64(&tmp_buf[0], idx, base_ptr, base, + prev_iter_ends_odd_backslash, + prev_iter_inside_quote, prev_iter_ends_pseudo_pred, + structurals, error_mask, utf8_state); + idx += 64; + } + + /* is last string quote closed? 
*/ + if (prev_iter_inside_quote) { + return simdjson::UNCLOSED_STRING; + } + + /* finally, flatten out the remaining structurals from the last iteration + */ + flatten_bits(base_ptr, base, idx, structurals); + + pj.n_structural_indexes = base; + /* a valid JSON file cannot have zero structural indexes - we should have + * found something */ + if (pj.n_structural_indexes == 0u) { + return simdjson::EMPTY; + } + if (base_ptr[pj.n_structural_indexes - 1] > len) { + return simdjson::UNEXPECTED_ERROR; + } + if (len != base_ptr[pj.n_structural_indexes - 1]) { + /* the string might not be NULL terminated, but we add a virtual NULL + * ending + * character. */ + base_ptr[pj.n_structural_indexes++] = len; + } + /* make it safe to dereference one beyond this array */ + base_ptr[pj.n_structural_indexes] = 0; + if (error_mask) { + return simdjson::UNESCAPED_CHARS; + } + return check_utf8_errors(utf8_state); +} + +} // namespace simdjson +UNTARGET_REGION + +#else +#error TARGETED_REGION must be specified before including. +#endif // TARGETED_REGION +#else +#error TARGETED_ARCHITECTURE must be specified before including. +#endif // TARGETED_ARCHITECTURE +#undef TARGETED_ARCHITECTURE +#undef TARGETED_REGION + +#endif // IS_X86_64 #ifdef IS_ARM64 + +#define TARGETED_ARCHITECTURE Architecture::ARM64 +#define TARGETED_REGION TARGET_ARM64 +// This file contains a non-architecture-specific version of "flatten" used in stage1. +// It is intended to be included multiple times and compiled multiple times +// We assume the file in which it is include already includes +// "simdjson/stage1_find_marks.h" (this simplifies amalgation) + +#ifdef TARGETED_ARCHITECTURE +#ifdef TARGETED_REGION + +TARGETED_REGION namespace simdjson { + +#ifdef SIMDJSON_NAIVE_FLATTEN // useful for benchmarking +// +// This is just a naive implementation. It should be normally +// disable, but can be used for research purposes to compare +// again our optimized version. 
template <> -int find_structural_bits(const uint8_t *buf, size_t len, - ParsedJson &pj) { - FIND_STRUCTURAL_BITS(Architecture::ARM64, buf, len, pj, - simdjson::flatten_bits); +really_inline void flatten_bits(uint32_t *base_ptr, uint32_t &base, + uint32_t idx, uint64_t bits) { + uint32_t *out_ptr = base_ptr + base; + idx -= 64; + while (bits != 0) { + out_ptr[0] = idx + trailing_zeroes(bits); + bits = bits & (bits - 1); + out_ptr++; + } + base = (out_ptr - base_ptr); } + +#else +// flatten out values in 'bits' assuming that they are are to have values of idx +// plus their position in the bitvector, and store these indexes at +// base_ptr[base] incrementing base as we go +// will potentially store extra values beyond end of valid bits, so base_ptr +// needs to be large enough to handle this +template<> +really_inline void flatten_bits(uint32_t *base_ptr, uint32_t &base, + uint32_t idx, uint64_t bits) { + // In some instances, the next branch is expensive because it is mispredicted. + // Unfortunately, in other cases, + // it helps tremendously. + if (bits == 0) + return; + uint32_t cnt = hamming(bits); + uint32_t next_base = base + cnt; + idx -= 64; + base_ptr += base; + { + base_ptr[0] = idx + trailing_zeroes(bits); + bits = bits & (bits - 1); + base_ptr[1] = idx + trailing_zeroes(bits); + bits = bits & (bits - 1); + base_ptr[2] = idx + trailing_zeroes(bits); + bits = bits & (bits - 1); + base_ptr[3] = idx + trailing_zeroes(bits); + bits = bits & (bits - 1); + base_ptr[4] = idx + trailing_zeroes(bits); + bits = bits & (bits - 1); + base_ptr[5] = idx + trailing_zeroes(bits); + bits = bits & (bits - 1); + base_ptr[6] = idx + trailing_zeroes(bits); + bits = bits & (bits - 1); + base_ptr[7] = idx + trailing_zeroes(bits); + bits = bits & (bits - 1); + base_ptr += 8; + } + // We hope that the next branch is easily predicted. 
+ if (cnt > 8) { + base_ptr[0] = idx + trailing_zeroes(bits); + bits = bits & (bits - 1); + base_ptr[1] = idx + trailing_zeroes(bits); + bits = bits & (bits - 1); + base_ptr[2] = idx + trailing_zeroes(bits); + bits = bits & (bits - 1); + base_ptr[3] = idx + trailing_zeroes(bits); + bits = bits & (bits - 1); + base_ptr[4] = idx + trailing_zeroes(bits); + bits = bits & (bits - 1); + base_ptr[5] = idx + trailing_zeroes(bits); + bits = bits & (bits - 1); + base_ptr[6] = idx + trailing_zeroes(bits); + bits = bits & (bits - 1); + base_ptr[7] = idx + trailing_zeroes(bits); + bits = bits & (bits - 1); + base_ptr += 8; + } + if (cnt > 16) { // unluckly: we rarely get here + // since it means having one structural or pseudo-structral element + // every 4 characters (possible with inputs like "","","",...). + do { + base_ptr[0] = idx + trailing_zeroes(bits); + bits = bits & (bits - 1); + base_ptr++; + } while (bits != 0); + } + base = next_base; +} +#endif // SIMDJSON_NAIVE_FLATTEN + } // namespace simdjson -#endif +UNTARGET_REGION + +#else +#error TARGETED_REGION must be specified before including. +#endif // TARGETED_REGION +#else +#error TARGETED_ARCHITECTURE must be specified before including. +#endif // TARGETED_ARCHITECTURE +// This file contains the common code every implementation uses in stage1 +// It is intended to be included multiple times and compiled multiple times +// We assume the file in which it is include already includes +// "simdjson/stage1_find_marks.h" (this simplifies amalgation) + +#ifdef TARGETED_ARCHITECTURE +#ifdef TARGETED_REGION + +TARGETED_REGION +namespace simdjson { + +// return a bitvector indicating where we have characters that end an odd-length +// sequence of backslashes (and thus change the behavior of the next character +// to follow). A even-length sequence of backslashes, and, for that matter, the +// largest even-length prefix of our odd-length sequence of backslashes, simply +// modify the behavior of the backslashes themselves. 
+// We also update the prev_iter_ends_odd_backslash reference parameter to +// indicate whether we end an iteration on an odd-length sequence of +// backslashes, which modifies our subsequent search for odd-length +// sequences of backslashes in an obvious way. +template <> +really_inline uint64_t find_odd_backslash_sequences( + simd_input in, + uint64_t &prev_iter_ends_odd_backslash) { + const uint64_t even_bits = 0x5555555555555555ULL; + const uint64_t odd_bits = ~even_bits; + uint64_t bs_bits = cmp_mask_against_input(in, '\\'); + uint64_t start_edges = bs_bits & ~(bs_bits << 1); + /* flip lowest if we have an odd-length run at the end of the prior + * iteration */ + uint64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash; + uint64_t even_starts = start_edges & even_start_mask; + uint64_t odd_starts = start_edges & ~even_start_mask; + uint64_t even_carries = bs_bits + even_starts; + + uint64_t odd_carries; + /* must record the carry-out of our odd-carries out of bit 63; this + * indicates whether the sense of any edge going to the next iteration + * should be flipped */ + bool iter_ends_odd_backslash = + add_overflow(bs_bits, odd_starts, &odd_carries); + + odd_carries |= prev_iter_ends_odd_backslash; /* push in bit zero as a + * potential end if we had an + * odd-numbered run at the + * end of the previous + * iteration */ + prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL; + uint64_t even_carry_ends = even_carries & ~bs_bits; + uint64_t odd_carry_ends = odd_carries & ~bs_bits; + uint64_t even_start_odd_end = even_carry_ends & odd_bits; + uint64_t odd_start_even_end = odd_carry_ends & even_bits; + uint64_t odd_ends = even_start_odd_end | odd_start_even_end; + return odd_ends; +} + +// return both the quote mask (which is a half-open mask that covers the first +// quote +// in an unescaped quote pair and everything in the quote pair) and the quote +// bits, which are the simple +// unescaped quoted bits. 
We also update the prev_iter_inside_quote value to +tell the next iteration +whether we finished the final iteration inside a quote pair; if so, this +inverts our behavior of +whether we're inside quotes for the next iteration. +Note that we don't do any error checking to see if we have backslash +sequences outside quotes; these +backslash sequences (of any length) will be detected elsewhere. +template <> +really_inline uint64_t find_quote_mask_and_bits( + simd_input in, uint64_t odd_ends, + uint64_t &prev_iter_inside_quote, uint64_t &quote_bits, + uint64_t &error_mask) { + quote_bits = cmp_mask_against_input(in, '"'); + quote_bits = quote_bits & ~odd_ends; + uint64_t quote_mask = compute_quote_mask(quote_bits); + quote_mask ^= prev_iter_inside_quote; + /* All Unicode characters may be placed within the + * quotation marks, except for the characters that MUST be escaped: + * quotation mark, reverse solidus, and the control characters (U+0000 + * through U+001F). + * https://tools.ietf.org/html/rfc8259 */ + uint64_t unescaped = + unsigned_lteq_against_input(in, 0x1F); + error_mask |= quote_mask & unescaped; + /* right shift of a signed value expected to be well-defined and standard + * compliant as of C++20, + * John Regher from Utah U. says this is fine code */ + prev_iter_inside_quote = + static_cast<uint64_t>(static_cast<int64_t>(quote_mask) >> 63); + return quote_mask; +} + +// Find structural bits in a 64-byte chunk. 
+really_inline void find_structural_bits_64( + const uint8_t *buf, size_t idx, uint32_t *base_ptr, uint32_t &base, + uint64_t &prev_iter_ends_odd_backslash, uint64_t &prev_iter_inside_quote, + uint64_t &prev_iter_ends_pseudo_pred, uint64_t &structurals, + uint64_t &error_mask, + utf8_checking_state &utf8_state) { + simd_input in = fill_input(buf); + check_utf8(in, utf8_state); + /* detect odd sequences of backslashes */ + uint64_t odd_ends = find_odd_backslash_sequences( + in, prev_iter_ends_odd_backslash); + + /* detect insides of quote pairs ("quote_mask") and also our quote_bits + * themselves */ + uint64_t quote_bits; + uint64_t quote_mask = find_quote_mask_and_bits( + in, odd_ends, prev_iter_inside_quote, quote_bits, error_mask); + + /* take the previous iterations structural bits, not our current + * iteration, + * and flatten */ + flatten_bits(base_ptr, base, idx, structurals); + + uint64_t whitespace; + find_whitespace_and_structurals(in, whitespace, + structurals); + + /* fixup structurals to reflect quotes and add pseudo-structural + * characters */ + structurals = finalize_structurals(structurals, whitespace, quote_mask, + quote_bits, prev_iter_ends_pseudo_pred); +} + +template <> +int find_structural_bits(const uint8_t *buf, size_t len, + ParsedJson &pj) { + if (len > pj.byte_capacity) { + std::cerr << "Your ParsedJson object only supports documents up to " + << pj.byte_capacity << " bytes but you are trying to process " + << len << " bytes" << std::endl; + return simdjson::CAPACITY; + } + uint32_t *base_ptr = pj.structural_indexes; + uint32_t base = 0; + utf8_checking_state utf8_state; + + /* we have padded the input out to 64 byte multiple with the remainder + * being zeros persistent state across loop does the last iteration end + * with an odd-length sequence of backslashes? */ + + /* either 0 or 1, but a 64-bit value */ + uint64_t prev_iter_ends_odd_backslash = 0ULL; + /* does the previous iteration end inside a double-quote pair? 
*/ + uint64_t prev_iter_inside_quote = + 0ULL; /* either all zeros or all ones + * does the previous iteration end on something that is a + * predecessor of a pseudo-structural character - i.e. + * whitespace or a structural character effectively the very + * first char is considered to follow "whitespace" for the + * purposes of pseudo-structural character detection so we + * initialize to 1 */ + uint64_t prev_iter_ends_pseudo_pred = 1ULL; + + /* structurals are persistent state across loop as we flatten them on the + * subsequent iteration into our array pointed to be base_ptr. + * This is harmless on the first iteration as structurals==0 + * and is done for performance reasons; we can hide some of the latency of + * the + * expensive carryless multiply in the previous step with this work */ + uint64_t structurals = 0; + + size_t lenminus64 = len < 64 ? 0 : len - 64; + size_t idx = 0; + uint64_t error_mask = 0; /* for unescaped characters within strings (ASCII + code points < 0x20) */ + + for (; idx < lenminus64; idx += 64) { + find_structural_bits_64(&buf[idx], idx, base_ptr, base, + prev_iter_ends_odd_backslash, + prev_iter_inside_quote, prev_iter_ends_pseudo_pred, + structurals, error_mask, utf8_state); + } + /* If we have a final chunk of less than 64 bytes, pad it to 64 with + * spaces before processing it (otherwise, we risk invalidating the UTF-8 + * checks). */ + if (idx < len) { + uint8_t tmp_buf[64]; + memset(tmp_buf, 0x20, 64); + memcpy(tmp_buf, buf + idx, len - idx); + find_structural_bits_64(&tmp_buf[0], idx, base_ptr, base, + prev_iter_ends_odd_backslash, + prev_iter_inside_quote, prev_iter_ends_pseudo_pred, + structurals, error_mask, utf8_state); + idx += 64; + } + + /* is last string quote closed? 
*/ + if (prev_iter_inside_quote) { + return simdjson::UNCLOSED_STRING; + } + + /* finally, flatten out the remaining structurals from the last iteration + */ + flatten_bits(base_ptr, base, idx, structurals); + + pj.n_structural_indexes = base; + /* a valid JSON file cannot have zero structural indexes - we should have + * found something */ + if (pj.n_structural_indexes == 0u) { + return simdjson::EMPTY; + } + if (base_ptr[pj.n_structural_indexes - 1] > len) { + return simdjson::UNEXPECTED_ERROR; + } + if (len != base_ptr[pj.n_structural_indexes - 1]) { + /* the string might not be NULL terminated, but we add a virtual NULL + * ending + * character. */ + base_ptr[pj.n_structural_indexes++] = len; + } + /* make it safe to dereference one beyond this array */ + base_ptr[pj.n_structural_indexes] = 0; + if (error_mask) { + return simdjson::UNESCAPED_CHARS; + } + return check_utf8_errors(utf8_state); +} + +} // namespace simdjson +UNTARGET_REGION + +#else +#error TARGETED_REGION must be specified before including. +#endif // TARGETED_REGION +#else +#error TARGETED_ARCHITECTURE must be specified before including. +#endif // TARGETED_ARCHITECTURE +#undef TARGETED_ARCHITECTURE +#undef TARGETED_REGION + +#endif // IS_ARM64 /* end file src/stage1_find_marks.cpp */ /* begin file src/stage2_build_tape.cpp */ +#ifdef IS_X86_64 +#define TARGETED_ARCHITECTURE Architecture::HASWELL +#define TARGETED_REGION TARGET_HASWELL +// This file contains the common code every implementation uses for stage2 +// It is intended to be included multiple times and compiled multiple times +// We assume the file in which it is include already includes +// "simdjson/stage2_build_tape.h" (this simplifies amalgation) + +#ifdef TARGETED_ARCHITECTURE +#ifdef TARGETED_REGION + +TARGETED_REGION namespace simdjson { // this macro reads the next structural character, updating idx, i and c. 
@@ -522,537 +1520,1583 @@ namespace simdjson { * The JSON is parsed to a tape, see the accompanying tape.md file * for documentation. ***********/ -// We need to compile that code for multiple architectures. However, target -// attributes can be used only once by function definition. Huge macro seemed -// better than huge code duplication. int UNIFIED_MACHINE(const uint8_t *buf, -// size_t len, ParsedJson &pj) -#define UNIFIED_MACHINE(T, buf, len, pj) \ - { \ - if (ALLOW_SAME_PAGE_BUFFER_OVERRUN) { \ - memset((uint8_t *)buf + len, 0, \ - SIMDJSON_PADDING); /* to please valgrind */ \ - } \ - uint32_t i = 0; /* index of the structural character (0,1,2,3...) */ \ - uint32_t \ - idx; /* location of the structural character in the input (buf) */ \ - uint8_t c; /* used to track the (structural) character we are looking at, \ - updated */ \ - /* by UPDATE_CHAR macro */ \ - uint32_t depth = 0; /* could have an arbitrary starting depth */ \ - pj.init(); /* sets is_valid to false */ \ - if (pj.byte_capacity < len) { \ - pj.error_code = simdjson::CAPACITY; \ - return pj.error_code; \ - } \ - \ - /*//////////////////////////// START STATE ///////////////////////////// \ - */ \ - SET_GOTO_START_CONTINUE() \ - pj.containing_scope_offset[depth] = pj.get_current_loc(); \ - pj.write_tape(0, 'r'); /* r for root, 0 is going to get overwritten */ \ - /* the root is used, if nothing else, to capture the size of the tape */ \ - depth++; /* everything starts at depth = 1, depth = 0 is just for the \ - root, the root may contain an object, an array or something \ - else. 
*/ \ - if (depth >= pj.depth_capacity) { \ - goto fail; \ - } \ - \ - UPDATE_CHAR(); \ - switch (c) { \ - case '{': \ - pj.containing_scope_offset[depth] = pj.get_current_loc(); \ - SET_GOTO_START_CONTINUE(); \ - depth++; \ - if (depth >= pj.depth_capacity) { \ - goto fail; \ - } \ - pj.write_tape( \ - 0, \ - c); /* strangely, moving this to object_begin slows things down */ \ - goto object_begin; \ - case '[': \ - pj.containing_scope_offset[depth] = pj.get_current_loc(); \ - SET_GOTO_START_CONTINUE(); \ - depth++; \ - if (depth >= pj.depth_capacity) { \ - goto fail; \ - } \ - pj.write_tape(0, c); \ - goto array_begin; \ - /* #define SIMDJSON_ALLOWANYTHINGINROOT \ - * A JSON text is a serialized value. Note that certain previous \ - * specifications of JSON constrained a JSON text to be an object or an \ - * array. Implementations that generate only objects or arrays where a \ - * JSON text is called for will be interoperable in the sense that all \ - * implementations will accept these as conforming JSON texts. \ - * https://tools.ietf.org/html/rfc8259 \ - * #ifdef SIMDJSON_ALLOWANYTHINGINROOT */ \ - case '"': { \ - if (!parse_string(buf, len, pj, depth, idx)) { \ - goto fail; \ - } \ - break; \ - } \ - case 't': { \ - /* we need to make a copy to make sure that the string is space \ - * terminated. \ - * this only applies to the JSON document made solely of the true value. \ - * this will almost never be called in practice */ \ - char *copy = static_cast(malloc(len + SIMDJSON_PADDING)); \ - if (copy == nullptr) { \ - goto fail; \ - } \ - memcpy(copy, buf, len); \ - copy[len] = ' '; \ - if (!is_valid_true_atom(reinterpret_cast(copy) + \ - idx)) { \ - free(copy); \ - goto fail; \ - } \ - free(copy); \ - pj.write_tape(0, c); \ - break; \ - } \ - case 'f': { \ - /* we need to make a copy to make sure that the string is space \ - * terminated. \ - * this only applies to the JSON document made solely of the false \ - * value. 
\ - * this will almost never be called in practice */ \ - char *copy = static_cast(malloc(len + SIMDJSON_PADDING)); \ - if (copy == nullptr) { \ - goto fail; \ - } \ - memcpy(copy, buf, len); \ - copy[len] = ' '; \ - if (!is_valid_false_atom(reinterpret_cast(copy) + \ - idx)) { \ - free(copy); \ - goto fail; \ - } \ - free(copy); \ - pj.write_tape(0, c); \ - break; \ - } \ - case 'n': { \ - /* we need to make a copy to make sure that the string is space \ - * terminated. \ - * this only applies to the JSON document made solely of the null value. \ - * this will almost never be called in practice */ \ - char *copy = static_cast(malloc(len + SIMDJSON_PADDING)); \ - if (copy == nullptr) { \ - goto fail; \ - } \ - memcpy(copy, buf, len); \ - copy[len] = ' '; \ - if (!is_valid_null_atom(reinterpret_cast(copy) + \ - idx)) { \ - free(copy); \ - goto fail; \ - } \ - free(copy); \ - pj.write_tape(0, c); \ - break; \ - } \ - case '0': \ - case '1': \ - case '2': \ - case '3': \ - case '4': \ - case '5': \ - case '6': \ - case '7': \ - case '8': \ - case '9': { \ - /* we need to make a copy to make sure that the string is space \ - * terminated. \ - * this is done only for JSON documents made of a sole number \ - * this will almost never be called in practice. We terminate with a \ - * space \ - * because we do not want to allow NULLs in the middle of a number \ - * (whereas a \ - * space in the middle of a number would be identified in stage 1). */ \ - char *copy = static_cast(malloc(len + SIMDJSON_PADDING)); \ - if (copy == nullptr) { \ - goto fail; \ - } \ - memcpy(copy, buf, len); \ - copy[len] = ' '; \ - if (!parse_number(reinterpret_cast(copy), pj, idx, \ - false)) { \ - free(copy); \ - goto fail; \ - } \ - free(copy); \ - break; \ - } \ - case '-': { \ - /* we need to make a copy to make sure that the string is NULL \ - * terminated. 
\ - * this is done only for JSON documents made of a sole number \ - * this will almost never be called in practice */ \ - char *copy = static_cast(malloc(len + SIMDJSON_PADDING)); \ - if (copy == nullptr) { \ - goto fail; \ - } \ - memcpy(copy, buf, len); \ - copy[len] = ' '; \ - if (!parse_number(reinterpret_cast(copy), pj, idx, \ - true)) { \ - free(copy); \ - goto fail; \ - } \ - free(copy); \ - break; \ - } \ - default: \ - goto fail; \ - } \ - start_continue: \ - /* the string might not be NULL terminated. */ \ - if (i + 1 == pj.n_structural_indexes) { \ - goto succeed; \ - } else { \ - goto fail; \ - } \ - /*//////////////////////////// OBJECT STATES ///////////////////////////*/ \ - \ - object_begin: \ - UPDATE_CHAR(); \ - switch (c) { \ - case '"': { \ - if (!parse_string(buf, len, pj, depth, idx)) { \ - goto fail; \ - } \ - goto object_key_state; \ - } \ - case '}': \ - goto scope_end; /* could also go to object_continue */ \ - default: \ - goto fail; \ - } \ - \ - object_key_state: \ - UPDATE_CHAR(); \ - if (c != ':') { \ - goto fail; \ - } \ - UPDATE_CHAR(); \ - switch (c) { \ - case '"': { \ - if (!parse_string(buf, len, pj, depth, idx)) { \ - goto fail; \ - } \ - break; \ - } \ - case 't': \ - if (!is_valid_true_atom(buf + idx)) { \ - goto fail; \ - } \ - pj.write_tape(0, c); \ - break; \ - case 'f': \ - if (!is_valid_false_atom(buf + idx)) { \ - goto fail; \ - } \ - pj.write_tape(0, c); \ - break; \ - case 'n': \ - if (!is_valid_null_atom(buf + idx)) { \ - goto fail; \ - } \ - pj.write_tape(0, c); \ - break; \ - case '0': \ - case '1': \ - case '2': \ - case '3': \ - case '4': \ - case '5': \ - case '6': \ - case '7': \ - case '8': \ - case '9': { \ - if (!parse_number(buf, pj, idx, false)) { \ - goto fail; \ - } \ - break; \ - } \ - case '-': { \ - if (!parse_number(buf, pj, idx, true)) { \ - goto fail; \ - } \ - break; \ - } \ - case '{': { \ - pj.containing_scope_offset[depth] = pj.get_current_loc(); \ - pj.write_tape(0, c); /* here the compilers 
knows what c is so this gets \ - optimized */ \ - /* we have not yet encountered } so we need to come back for it */ \ - SET_GOTO_OBJECT_CONTINUE() \ - /* we found an object inside an object, so we need to increment the \ - * depth */ \ - depth++; \ - if (depth >= pj.depth_capacity) { \ - goto fail; \ - } \ - \ - goto object_begin; \ - } \ - case '[': { \ - pj.containing_scope_offset[depth] = pj.get_current_loc(); \ - pj.write_tape(0, c); /* here the compilers knows what c is so this gets \ - optimized */ \ - /* we have not yet encountered } so we need to come back for it */ \ - SET_GOTO_OBJECT_CONTINUE() \ - /* we found an array inside an object, so we need to increment the depth \ - */ \ - depth++; \ - if (depth >= pj.depth_capacity) { \ - goto fail; \ - } \ - goto array_begin; \ - } \ - default: \ - goto fail; \ - } \ - \ - object_continue: \ - UPDATE_CHAR(); \ - switch (c) { \ - case ',': \ - UPDATE_CHAR(); \ - if (c != '"') { \ - goto fail; \ - } else { \ - if (!parse_string(buf, len, pj, depth, idx)) { \ - goto fail; \ - } \ - goto object_key_state; \ - } \ - case '}': \ - goto scope_end; \ - default: \ - goto fail; \ - } \ - \ - /*//////////////////////////// COMMON STATE ///////////////////////////*/ \ - \ - scope_end: \ - /* write our tape location to the header scope */ \ - depth--; \ - pj.write_tape(pj.containing_scope_offset[depth], c); \ - pj.annotate_previous_loc(pj.containing_scope_offset[depth], \ - pj.get_current_loc()); \ - /* goto saved_state */ \ - GOTO_CONTINUE() \ - \ - /*//////////////////////////// ARRAY STATES ///////////////////////////*/ \ - array_begin: \ - UPDATE_CHAR(); \ - if (c == ']') { \ - goto scope_end; /* could also go to array_continue */ \ - } \ - \ - main_array_switch: \ - /* we call update char on all paths in, so we can peek at c on the \ - * on paths that can accept a close square brace (post-, and at start) */ \ - switch (c) { \ - case '"': { \ - if (!parse_string(buf, len, pj, depth, idx)) { \ - goto fail; \ - } \ - 
break; \ - } \ - case 't': \ - if (!is_valid_true_atom(buf + idx)) { \ - goto fail; \ - } \ - pj.write_tape(0, c); \ - break; \ - case 'f': \ - if (!is_valid_false_atom(buf + idx)) { \ - goto fail; \ - } \ - pj.write_tape(0, c); \ - break; \ - case 'n': \ - if (!is_valid_null_atom(buf + idx)) { \ - goto fail; \ - } \ - pj.write_tape(0, c); \ - break; /* goto array_continue; */ \ - \ - case '0': \ - case '1': \ - case '2': \ - case '3': \ - case '4': \ - case '5': \ - case '6': \ - case '7': \ - case '8': \ - case '9': { \ - if (!parse_number(buf, pj, idx, false)) { \ - goto fail; \ - } \ - break; /* goto array_continue; */ \ - } \ - case '-': { \ - if (!parse_number(buf, pj, idx, true)) { \ - goto fail; \ - } \ - break; /* goto array_continue; */ \ - } \ - case '{': { \ - /* we have not yet encountered ] so we need to come back for it */ \ - pj.containing_scope_offset[depth] = pj.get_current_loc(); \ - pj.write_tape(0, c); /* here the compilers knows what c is so this gets \ - optimized */ \ - SET_GOTO_ARRAY_CONTINUE() \ - /* we found an object inside an array, so we need to increment the depth \ - */ \ - depth++; \ - if (depth >= pj.depth_capacity) { \ - goto fail; \ - } \ - \ - goto object_begin; \ - } \ - case '[': { \ - /* we have not yet encountered ] so we need to come back for it */ \ - pj.containing_scope_offset[depth] = pj.get_current_loc(); \ - pj.write_tape(0, c); /* here the compilers knows what c is so this gets \ - optimized */ \ - SET_GOTO_ARRAY_CONTINUE() \ - /* we found an array inside an array, so we need to increment the depth \ - */ \ - depth++; \ - if (depth >= pj.depth_capacity) { \ - goto fail; \ - } \ - goto array_begin; \ - } \ - default: \ - goto fail; \ - } \ - \ - array_continue: \ - UPDATE_CHAR(); \ - switch (c) { \ - case ',': \ - UPDATE_CHAR(); \ - goto main_array_switch; \ - case ']': \ - goto scope_end; \ - default: \ - goto fail; \ - } \ - \ - /*//////////////////////////// FINAL STATES ///////////////////////////*/ \ - \ - 
succeed: \ - depth--; \ - if (depth != 0) { \ - fprintf(stderr, "internal bug\n"); \ - abort(); \ - } \ - if (pj.containing_scope_offset[depth] != 0) { \ - fprintf(stderr, "internal bug\n"); \ - abort(); \ - } \ - pj.annotate_previous_loc(pj.containing_scope_offset[depth], \ - pj.get_current_loc()); \ - pj.write_tape(pj.containing_scope_offset[depth], 'r'); /* r is root */ \ - \ - pj.valid = true; \ - pj.error_code = simdjson::SUCCESS; \ - return pj.error_code; \ - fail: \ - /* we do not need the next line because this is done by pj.init(), \ - * pessimistically. \ - * pj.is_valid = false; \ - * At this point in the code, we have all the time in the world. \ - * Note that we know exactly where we are in the document so we could, \ - * without any overhead on the processing code, report a specific \ - * location. \ - * We could even trigger special code paths to assess what happened \ - * carefully, \ - * all without any added cost. */ \ - if (depth >= pj.depth_capacity) { \ - pj.error_code = simdjson::DEPTH_ERROR; \ - return pj.error_code; \ - } \ - switch (c) { \ - case '"': \ - pj.error_code = simdjson::STRING_ERROR; \ - return pj.error_code; \ - case '0': \ - case '1': \ - case '2': \ - case '3': \ - case '4': \ - case '5': \ - case '6': \ - case '7': \ - case '8': \ - case '9': \ - case '-': \ - pj.error_code = simdjson::NUMBER_ERROR; \ - return pj.error_code; \ - case 't': \ - pj.error_code = simdjson::T_ATOM_ERROR; \ - return pj.error_code; \ - case 'n': \ - pj.error_code = simdjson::N_ATOM_ERROR; \ - return pj.error_code; \ - case 'f': \ - pj.error_code = simdjson::F_ATOM_ERROR; \ - return pj.error_code; \ - default: \ - break; \ - } \ - pj.error_code = simdjson::TAPE_ERROR; \ - return pj.error_code; \ +template <> +WARN_UNUSED int +unified_machine(const uint8_t *buf, size_t len, + ParsedJson &pj) { + uint32_t i = 0; /* index of the structural character (0,1,2,3...) 
*/ + uint32_t idx; /* location of the structural character in the input (buf) */ + uint8_t c; /* used to track the (structural) character we are looking at, + updated */ + /* by UPDATE_CHAR macro */ + uint32_t depth = 0; /* could have an arbitrary starting depth */ + pj.init(); /* sets is_valid to false */ + if (pj.byte_capacity < len) { + pj.error_code = simdjson::CAPACITY; + return pj.error_code; } -} // namespace simdjson + /*//////////////////////////// START STATE ///////////////////////////// + */ + SET_GOTO_START_CONTINUE() + pj.containing_scope_offset[depth] = pj.get_current_loc(); + pj.write_tape(0, 'r'); /* r for root, 0 is going to get overwritten */ + /* the root is used, if nothing else, to capture the size of the tape */ + depth++; /* everything starts at depth = 1, depth = 0 is just for the + root, the root may contain an object, an array or something + else. */ + if (depth >= pj.depth_capacity) { + goto fail; + } -#ifdef IS_X86_64 -TARGET_HASWELL + UPDATE_CHAR(); + switch (c) { + case '{': + pj.containing_scope_offset[depth] = pj.get_current_loc(); + SET_GOTO_START_CONTINUE(); + depth++; + if (depth >= pj.depth_capacity) { + goto fail; + } + pj.write_tape( + 0, c); /* strangely, moving this to object_begin slows things down */ + goto object_begin; + case '[': + pj.containing_scope_offset[depth] = pj.get_current_loc(); + SET_GOTO_START_CONTINUE(); + depth++; + if (depth >= pj.depth_capacity) { + goto fail; + } + pj.write_tape(0, c); + goto array_begin; + /* #define SIMDJSON_ALLOWANYTHINGINROOT + * A JSON text is a serialized value. Note that certain previous + * specifications of JSON constrained a JSON text to be an object or an + * array. Implementations that generate only objects or arrays where a + * JSON text is called for will be interoperable in the sense that all + * implementations will accept these as conforming JSON texts. 
+ * https://tools.ietf.org/html/rfc8259 + * #ifdef SIMDJSON_ALLOWANYTHINGINROOT */ + case '"': { + if (!parse_string(buf, len, pj, depth, idx)) { + goto fail; + } + break; + } + case 't': { + /* we need to make a copy to make sure that the string is space + * terminated. + * this only applies to the JSON document made solely of the true value. + * this will almost never be called in practice */ + char *copy = static_cast(malloc(len + SIMDJSON_PADDING)); + if (copy == nullptr) { + goto fail; + } + memcpy(copy, buf, len); + copy[len] = ' '; + if (!is_valid_true_atom(reinterpret_cast(copy) + idx)) { + free(copy); + goto fail; + } + free(copy); + pj.write_tape(0, c); + break; + } + case 'f': { + /* we need to make a copy to make sure that the string is space + * terminated. + * this only applies to the JSON document made solely of the false + * value. + * this will almost never be called in practice */ + char *copy = static_cast(malloc(len + SIMDJSON_PADDING)); + if (copy == nullptr) { + goto fail; + } + memcpy(copy, buf, len); + copy[len] = ' '; + if (!is_valid_false_atom(reinterpret_cast(copy) + idx)) { + free(copy); + goto fail; + } + free(copy); + pj.write_tape(0, c); + break; + } + case 'n': { + /* we need to make a copy to make sure that the string is space + * terminated. + * this only applies to the JSON document made solely of the null value. + * this will almost never be called in practice */ + char *copy = static_cast(malloc(len + SIMDJSON_PADDING)); + if (copy == nullptr) { + goto fail; + } + memcpy(copy, buf, len); + copy[len] = ' '; + if (!is_valid_null_atom(reinterpret_cast(copy) + idx)) { + free(copy); + goto fail; + } + free(copy); + pj.write_tape(0, c); + break; + } + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': { + /* we need to make a copy to make sure that the string is space + * terminated. 
+ * this is done only for JSON documents made of a sole number + * this will almost never be called in practice. We terminate with a + * space + * because we do not want to allow NULLs in the middle of a number + * (whereas a + * space in the middle of a number would be identified in stage 1). */ + char *copy = static_cast(malloc(len + SIMDJSON_PADDING)); + if (copy == nullptr) { + goto fail; + } + memcpy(copy, buf, len); + copy[len] = ' '; + if (!parse_number(reinterpret_cast(copy), pj, idx, + false)) { + free(copy); + goto fail; + } + free(copy); + break; + } + case '-': { + /* we need to make a copy to make sure that the string is NULL + * terminated. + * this is done only for JSON documents made of a sole number + * this will almost never be called in practice */ + char *copy = static_cast(malloc(len + SIMDJSON_PADDING)); + if (copy == nullptr) { + goto fail; + } + memcpy(copy, buf, len); + copy[len] = ' '; + if (!parse_number(reinterpret_cast(copy), pj, idx, true)) { + free(copy); + goto fail; + } + free(copy); + break; + } + default: + goto fail; + } +start_continue: + /* the string might not be NULL terminated. 
*/ + if (i + 1 == pj.n_structural_indexes) { + goto succeed; + } else { + goto fail; + } + /*//////////////////////////// OBJECT STATES ///////////////////////////*/ + +object_begin: + UPDATE_CHAR(); + switch (c) { + case '"': { + if (!parse_string(buf, len, pj, depth, idx)) { + goto fail; + } + goto object_key_state; + } + case '}': + goto scope_end; /* could also go to object_continue */ + default: + goto fail; + } + +object_key_state: + UPDATE_CHAR(); + if (c != ':') { + goto fail; + } + UPDATE_CHAR(); + switch (c) { + case '"': { + if (!parse_string(buf, len, pj, depth, idx)) { + goto fail; + } + break; + } + case 't': + if (!is_valid_true_atom(buf + idx)) { + goto fail; + } + pj.write_tape(0, c); + break; + case 'f': + if (!is_valid_false_atom(buf + idx)) { + goto fail; + } + pj.write_tape(0, c); + break; + case 'n': + if (!is_valid_null_atom(buf + idx)) { + goto fail; + } + pj.write_tape(0, c); + break; + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': { + if (!parse_number(buf, pj, idx, false)) { + goto fail; + } + break; + } + case '-': { + if (!parse_number(buf, pj, idx, true)) { + goto fail; + } + break; + } + case '{': { + pj.containing_scope_offset[depth] = pj.get_current_loc(); + pj.write_tape(0, c); /* here the compilers knows what c is so this gets + optimized */ + /* we have not yet encountered } so we need to come back for it */ + SET_GOTO_OBJECT_CONTINUE() + /* we found an object inside an object, so we need to increment the + * depth */ + depth++; + if (depth >= pj.depth_capacity) { + goto fail; + } + + goto object_begin; + } + case '[': { + pj.containing_scope_offset[depth] = pj.get_current_loc(); + pj.write_tape(0, c); /* here the compilers knows what c is so this gets + optimized */ + /* we have not yet encountered } so we need to come back for it */ + SET_GOTO_OBJECT_CONTINUE() + /* we found an array inside an object, so we need to increment the depth + */ + depth++; + if 
(depth >= pj.depth_capacity) { + goto fail; + } + goto array_begin; + } + default: + goto fail; + } + +object_continue: + UPDATE_CHAR(); + switch (c) { + case ',': + UPDATE_CHAR(); + if (c != '"') { + goto fail; + } else { + if (!parse_string(buf, len, pj, depth, idx)) { + goto fail; + } + goto object_key_state; + } + case '}': + goto scope_end; + default: + goto fail; + } + + /*//////////////////////////// COMMON STATE ///////////////////////////*/ + +scope_end: + /* write our tape location to the header scope */ + depth--; + pj.write_tape(pj.containing_scope_offset[depth], c); + pj.annotate_previous_loc(pj.containing_scope_offset[depth], + pj.get_current_loc()); + /* goto saved_state */ + GOTO_CONTINUE() + + /*//////////////////////////// ARRAY STATES ///////////////////////////*/ +array_begin: + UPDATE_CHAR(); + if (c == ']') { + goto scope_end; /* could also go to array_continue */ + } + +main_array_switch: + /* we call update char on all paths in, so we can peek at c on the + * on paths that can accept a close square brace (post-, and at start) */ + switch (c) { + case '"': { + if (!parse_string(buf, len, pj, depth, idx)) { + goto fail; + } + break; + } + case 't': + if (!is_valid_true_atom(buf + idx)) { + goto fail; + } + pj.write_tape(0, c); + break; + case 'f': + if (!is_valid_false_atom(buf + idx)) { + goto fail; + } + pj.write_tape(0, c); + break; + case 'n': + if (!is_valid_null_atom(buf + idx)) { + goto fail; + } + pj.write_tape(0, c); + break; /* goto array_continue; */ + + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': { + if (!parse_number(buf, pj, idx, false)) { + goto fail; + } + break; /* goto array_continue; */ + } + case '-': { + if (!parse_number(buf, pj, idx, true)) { + goto fail; + } + break; /* goto array_continue; */ + } + case '{': { + /* we have not yet encountered ] so we need to come back for it */ + pj.containing_scope_offset[depth] = pj.get_current_loc(); + 
pj.write_tape(0, c); /* here the compilers knows what c is so this gets + optimized */ + SET_GOTO_ARRAY_CONTINUE() + /* we found an object inside an array, so we need to increment the depth + */ + depth++; + if (depth >= pj.depth_capacity) { + goto fail; + } + + goto object_begin; + } + case '[': { + /* we have not yet encountered ] so we need to come back for it */ + pj.containing_scope_offset[depth] = pj.get_current_loc(); + pj.write_tape(0, c); /* here the compilers knows what c is so this gets + optimized */ + SET_GOTO_ARRAY_CONTINUE() + /* we found an array inside an array, so we need to increment the depth + */ + depth++; + if (depth >= pj.depth_capacity) { + goto fail; + } + goto array_begin; + } + default: + goto fail; + } + +array_continue: + UPDATE_CHAR(); + switch (c) { + case ',': + UPDATE_CHAR(); + goto main_array_switch; + case ']': + goto scope_end; + default: + goto fail; + } + + /*//////////////////////////// FINAL STATES ///////////////////////////*/ + +succeed: + depth--; + if (depth != 0) { + fprintf(stderr, "internal bug\n"); + abort(); + } + if (pj.containing_scope_offset[depth] != 0) { + fprintf(stderr, "internal bug\n"); + abort(); + } + pj.annotate_previous_loc(pj.containing_scope_offset[depth], + pj.get_current_loc()); + pj.write_tape(pj.containing_scope_offset[depth], 'r'); /* r is root */ + + pj.valid = true; + pj.error_code = simdjson::SUCCESS; + return pj.error_code; +fail: + /* we do not need the next line because this is done by pj.init(), + * pessimistically. + * pj.is_valid = false; + * At this point in the code, we have all the time in the world. + * Note that we know exactly where we are in the document so we could, + * without any overhead on the processing code, report a specific + * location. + * We could even trigger special code paths to assess what happened + * carefully, + * all without any added cost. 
*/ + if (depth >= pj.depth_capacity) { + pj.error_code = simdjson::DEPTH_ERROR; + return pj.error_code; + } + switch (c) { + case '"': + pj.error_code = simdjson::STRING_ERROR; + return pj.error_code; + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + case '-': + pj.error_code = simdjson::NUMBER_ERROR; + return pj.error_code; + case 't': + pj.error_code = simdjson::T_ATOM_ERROR; + return pj.error_code; + case 'n': + pj.error_code = simdjson::N_ATOM_ERROR; + return pj.error_code; + case 'f': + pj.error_code = simdjson::F_ATOM_ERROR; + return pj.error_code; + default: + break; + } + pj.error_code = simdjson::TAPE_ERROR; + return pj.error_code; +} + +} // namespace simdjson +UNTARGET_REGION + +#else +#error TARGETED_REGION must be specified before including. +#endif // TARGETED_REGION +#else +#error TARGETED_ARCHITECTURE must be specified before including. +#endif // TARGETED_ARCHITECTURE +#undef TARGETED_ARCHITECTURE +#undef TARGETED_REGION + +#define TARGETED_ARCHITECTURE Architecture::WESTMERE +#define TARGETED_REGION TARGET_WESTMERE +// This file contains the common code every implementation uses for stage2 +// It is intended to be included multiple times and compiled multiple times +// We assume the file in which it is include already includes +// "simdjson/stage2_build_tape.h" (this simplifies amalgation) + +#ifdef TARGETED_ARCHITECTURE +#ifdef TARGETED_REGION + +TARGETED_REGION namespace simdjson { + +// this macro reads the next structural character, updating idx, i and c. 
+#define UPDATE_CHAR() \ + { \ + idx = pj.structural_indexes[i++]; \ + c = buf[idx]; \ + } + +#ifdef SIMDJSON_USE_COMPUTED_GOTO +#define SET_GOTO_ARRAY_CONTINUE() pj.ret_address[depth] = &&array_continue; +#define SET_GOTO_OBJECT_CONTINUE() pj.ret_address[depth] = &&object_continue; +#define SET_GOTO_START_CONTINUE() pj.ret_address[depth] = &&start_continue; +#define GOTO_CONTINUE() goto *pj.ret_address[depth]; +#else +#define SET_GOTO_ARRAY_CONTINUE() pj.ret_address[depth] = 'a'; +#define SET_GOTO_OBJECT_CONTINUE() pj.ret_address[depth] = 'o'; +#define SET_GOTO_START_CONTINUE() pj.ret_address[depth] = 's'; +#define GOTO_CONTINUE() \ + { \ + if (pj.ret_address[depth] == 'a') { \ + goto array_continue; \ + } else if (pj.ret_address[depth] == 'o') { \ + goto object_continue; \ + } else { \ + goto start_continue; \ + } \ + } +#endif + +/************ + * The JSON is parsed to a tape, see the accompanying tape.md file + * for documentation. + ***********/ template <> -WARN_UNUSED ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER LENIENT_MEM_SANITIZER int -unified_machine(const uint8_t *buf, size_t len, +WARN_UNUSED int +unified_machine(const uint8_t *buf, size_t len, ParsedJson &pj) { - UNIFIED_MACHINE(Architecture::HASWELL, buf, len, pj); + uint32_t i = 0; /* index of the structural character (0,1,2,3...) 
*/ + uint32_t idx; /* location of the structural character in the input (buf) */ + uint8_t c; /* used to track the (structural) character we are looking at, + updated */ + /* by UPDATE_CHAR macro */ + uint32_t depth = 0; /* could have an arbitrary starting depth */ + pj.init(); /* sets is_valid to false */ + if (pj.byte_capacity < len) { + pj.error_code = simdjson::CAPACITY; + return pj.error_code; + } + + /*//////////////////////////// START STATE ///////////////////////////// + */ + SET_GOTO_START_CONTINUE() + pj.containing_scope_offset[depth] = pj.get_current_loc(); + pj.write_tape(0, 'r'); /* r for root, 0 is going to get overwritten */ + /* the root is used, if nothing else, to capture the size of the tape */ + depth++; /* everything starts at depth = 1, depth = 0 is just for the + root, the root may contain an object, an array or something + else. */ + if (depth >= pj.depth_capacity) { + goto fail; + } + + UPDATE_CHAR(); + switch (c) { + case '{': + pj.containing_scope_offset[depth] = pj.get_current_loc(); + SET_GOTO_START_CONTINUE(); + depth++; + if (depth >= pj.depth_capacity) { + goto fail; + } + pj.write_tape( + 0, c); /* strangely, moving this to object_begin slows things down */ + goto object_begin; + case '[': + pj.containing_scope_offset[depth] = pj.get_current_loc(); + SET_GOTO_START_CONTINUE(); + depth++; + if (depth >= pj.depth_capacity) { + goto fail; + } + pj.write_tape(0, c); + goto array_begin; + /* #define SIMDJSON_ALLOWANYTHINGINROOT + * A JSON text is a serialized value. Note that certain previous + * specifications of JSON constrained a JSON text to be an object or an + * array. Implementations that generate only objects or arrays where a + * JSON text is called for will be interoperable in the sense that all + * implementations will accept these as conforming JSON texts. 
+ * https://tools.ietf.org/html/rfc8259 + * #ifdef SIMDJSON_ALLOWANYTHINGINROOT */ + case '"': { + if (!parse_string(buf, len, pj, depth, idx)) { + goto fail; + } + break; + } + case 't': { + /* we need to make a copy to make sure that the string is space + * terminated. + * this only applies to the JSON document made solely of the true value. + * this will almost never be called in practice */ + char *copy = static_cast(malloc(len + SIMDJSON_PADDING)); + if (copy == nullptr) { + goto fail; + } + memcpy(copy, buf, len); + copy[len] = ' '; + if (!is_valid_true_atom(reinterpret_cast(copy) + idx)) { + free(copy); + goto fail; + } + free(copy); + pj.write_tape(0, c); + break; + } + case 'f': { + /* we need to make a copy to make sure that the string is space + * terminated. + * this only applies to the JSON document made solely of the false + * value. + * this will almost never be called in practice */ + char *copy = static_cast(malloc(len + SIMDJSON_PADDING)); + if (copy == nullptr) { + goto fail; + } + memcpy(copy, buf, len); + copy[len] = ' '; + if (!is_valid_false_atom(reinterpret_cast(copy) + idx)) { + free(copy); + goto fail; + } + free(copy); + pj.write_tape(0, c); + break; + } + case 'n': { + /* we need to make a copy to make sure that the string is space + * terminated. + * this only applies to the JSON document made solely of the null value. + * this will almost never be called in practice */ + char *copy = static_cast(malloc(len + SIMDJSON_PADDING)); + if (copy == nullptr) { + goto fail; + } + memcpy(copy, buf, len); + copy[len] = ' '; + if (!is_valid_null_atom(reinterpret_cast(copy) + idx)) { + free(copy); + goto fail; + } + free(copy); + pj.write_tape(0, c); + break; + } + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': { + /* we need to make a copy to make sure that the string is space + * terminated. 
+ * this is done only for JSON documents made of a sole number + * this will almost never be called in practice. We terminate with a + * space + * because we do not want to allow NULLs in the middle of a number + * (whereas a + * space in the middle of a number would be identified in stage 1). */ + char *copy = static_cast(malloc(len + SIMDJSON_PADDING)); + if (copy == nullptr) { + goto fail; + } + memcpy(copy, buf, len); + copy[len] = ' '; + if (!parse_number(reinterpret_cast(copy), pj, idx, + false)) { + free(copy); + goto fail; + } + free(copy); + break; + } + case '-': { + /* we need to make a copy to make sure that the string is NULL + * terminated. + * this is done only for JSON documents made of a sole number + * this will almost never be called in practice */ + char *copy = static_cast(malloc(len + SIMDJSON_PADDING)); + if (copy == nullptr) { + goto fail; + } + memcpy(copy, buf, len); + copy[len] = ' '; + if (!parse_number(reinterpret_cast(copy), pj, idx, true)) { + free(copy); + goto fail; + } + free(copy); + break; + } + default: + goto fail; + } +start_continue: + /* the string might not be NULL terminated. 
*/ + if (i + 1 == pj.n_structural_indexes) { + goto succeed; + } else { + goto fail; + } + /*//////////////////////////// OBJECT STATES ///////////////////////////*/ + +object_begin: + UPDATE_CHAR(); + switch (c) { + case '"': { + if (!parse_string(buf, len, pj, depth, idx)) { + goto fail; + } + goto object_key_state; + } + case '}': + goto scope_end; /* could also go to object_continue */ + default: + goto fail; + } + +object_key_state: + UPDATE_CHAR(); + if (c != ':') { + goto fail; + } + UPDATE_CHAR(); + switch (c) { + case '"': { + if (!parse_string(buf, len, pj, depth, idx)) { + goto fail; + } + break; + } + case 't': + if (!is_valid_true_atom(buf + idx)) { + goto fail; + } + pj.write_tape(0, c); + break; + case 'f': + if (!is_valid_false_atom(buf + idx)) { + goto fail; + } + pj.write_tape(0, c); + break; + case 'n': + if (!is_valid_null_atom(buf + idx)) { + goto fail; + } + pj.write_tape(0, c); + break; + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': { + if (!parse_number(buf, pj, idx, false)) { + goto fail; + } + break; + } + case '-': { + if (!parse_number(buf, pj, idx, true)) { + goto fail; + } + break; + } + case '{': { + pj.containing_scope_offset[depth] = pj.get_current_loc(); + pj.write_tape(0, c); /* here the compilers knows what c is so this gets + optimized */ + /* we have not yet encountered } so we need to come back for it */ + SET_GOTO_OBJECT_CONTINUE() + /* we found an object inside an object, so we need to increment the + * depth */ + depth++; + if (depth >= pj.depth_capacity) { + goto fail; + } + + goto object_begin; + } + case '[': { + pj.containing_scope_offset[depth] = pj.get_current_loc(); + pj.write_tape(0, c); /* here the compilers knows what c is so this gets + optimized */ + /* we have not yet encountered } so we need to come back for it */ + SET_GOTO_OBJECT_CONTINUE() + /* we found an array inside an object, so we need to increment the depth + */ + depth++; + if 
(depth >= pj.depth_capacity) { + goto fail; + } + goto array_begin; + } + default: + goto fail; + } + +object_continue: + UPDATE_CHAR(); + switch (c) { + case ',': + UPDATE_CHAR(); + if (c != '"') { + goto fail; + } else { + if (!parse_string(buf, len, pj, depth, idx)) { + goto fail; + } + goto object_key_state; + } + case '}': + goto scope_end; + default: + goto fail; + } + + /*//////////////////////////// COMMON STATE ///////////////////////////*/ + +scope_end: + /* write our tape location to the header scope */ + depth--; + pj.write_tape(pj.containing_scope_offset[depth], c); + pj.annotate_previous_loc(pj.containing_scope_offset[depth], + pj.get_current_loc()); + /* goto saved_state */ + GOTO_CONTINUE() + + /*//////////////////////////// ARRAY STATES ///////////////////////////*/ +array_begin: + UPDATE_CHAR(); + if (c == ']') { + goto scope_end; /* could also go to array_continue */ + } + +main_array_switch: + /* we call update char on all paths in, so we can peek at c on the + * on paths that can accept a close square brace (post-, and at start) */ + switch (c) { + case '"': { + if (!parse_string(buf, len, pj, depth, idx)) { + goto fail; + } + break; + } + case 't': + if (!is_valid_true_atom(buf + idx)) { + goto fail; + } + pj.write_tape(0, c); + break; + case 'f': + if (!is_valid_false_atom(buf + idx)) { + goto fail; + } + pj.write_tape(0, c); + break; + case 'n': + if (!is_valid_null_atom(buf + idx)) { + goto fail; + } + pj.write_tape(0, c); + break; /* goto array_continue; */ + + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': { + if (!parse_number(buf, pj, idx, false)) { + goto fail; + } + break; /* goto array_continue; */ + } + case '-': { + if (!parse_number(buf, pj, idx, true)) { + goto fail; + } + break; /* goto array_continue; */ + } + case '{': { + /* we have not yet encountered ] so we need to come back for it */ + pj.containing_scope_offset[depth] = pj.get_current_loc(); + 
pj.write_tape(0, c); /* here the compilers knows what c is so this gets + optimized */ + SET_GOTO_ARRAY_CONTINUE() + /* we found an object inside an array, so we need to increment the depth + */ + depth++; + if (depth >= pj.depth_capacity) { + goto fail; + } + + goto object_begin; + } + case '[': { + /* we have not yet encountered ] so we need to come back for it */ + pj.containing_scope_offset[depth] = pj.get_current_loc(); + pj.write_tape(0, c); /* here the compilers knows what c is so this gets + optimized */ + SET_GOTO_ARRAY_CONTINUE() + /* we found an array inside an array, so we need to increment the depth + */ + depth++; + if (depth >= pj.depth_capacity) { + goto fail; + } + goto array_begin; + } + default: + goto fail; + } + +array_continue: + UPDATE_CHAR(); + switch (c) { + case ',': + UPDATE_CHAR(); + goto main_array_switch; + case ']': + goto scope_end; + default: + goto fail; + } + + /*//////////////////////////// FINAL STATES ///////////////////////////*/ + +succeed: + depth--; + if (depth != 0) { + fprintf(stderr, "internal bug\n"); + abort(); + } + if (pj.containing_scope_offset[depth] != 0) { + fprintf(stderr, "internal bug\n"); + abort(); + } + pj.annotate_previous_loc(pj.containing_scope_offset[depth], + pj.get_current_loc()); + pj.write_tape(pj.containing_scope_offset[depth], 'r'); /* r is root */ + + pj.valid = true; + pj.error_code = simdjson::SUCCESS; + return pj.error_code; +fail: + /* we do not need the next line because this is done by pj.init(), + * pessimistically. + * pj.is_valid = false; + * At this point in the code, we have all the time in the world. + * Note that we know exactly where we are in the document so we could, + * without any overhead on the processing code, report a specific + * location. + * We could even trigger special code paths to assess what happened + * carefully, + * all without any added cost. 
*/ + if (depth >= pj.depth_capacity) { + pj.error_code = simdjson::DEPTH_ERROR; + return pj.error_code; + } + switch (c) { + case '"': + pj.error_code = simdjson::STRING_ERROR; + return pj.error_code; + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + case '-': + pj.error_code = simdjson::NUMBER_ERROR; + return pj.error_code; + case 't': + pj.error_code = simdjson::T_ATOM_ERROR; + return pj.error_code; + case 'n': + pj.error_code = simdjson::N_ATOM_ERROR; + return pj.error_code; + case 'f': + pj.error_code = simdjson::F_ATOM_ERROR; + return pj.error_code; + default: + break; + } + pj.error_code = simdjson::TAPE_ERROR; + return pj.error_code; } + } // namespace simdjson UNTARGET_REGION -TARGET_WESTMERE -namespace simdjson { -template <> -WARN_UNUSED ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER LENIENT_MEM_SANITIZER int -unified_machine(const uint8_t *buf, size_t len, - ParsedJson &pj) { - UNIFIED_MACHINE(Architecture::WESTMERE, buf, len, pj); -} -} // namespace simdjson -UNTARGET_REGION +#else +#error TARGETED_REGION must be specified before including. +#endif // TARGETED_REGION +#else +#error TARGETED_ARCHITECTURE must be specified before including. 
+#endif // TARGETED_ARCHITECTURE +#undef TARGETED_ARCHITECTURE +#undef TARGETED_REGION #endif // IS_X86_64 #ifdef IS_ARM64 +#define TARGETED_ARCHITECTURE Architecture::ARM64 +#define TARGETED_REGION TARGET_ARM64 +// This file contains the common code every implementation uses for stage2 +// It is intended to be included multiple times and compiled multiple times +// We assume the file in which it is include already includes +// "simdjson/stage2_build_tape.h" (this simplifies amalgation) + +#ifdef TARGETED_ARCHITECTURE +#ifdef TARGETED_REGION + +TARGETED_REGION namespace simdjson { -template <> -WARN_UNUSED ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER LENIENT_MEM_SANITIZER int -unified_machine(const uint8_t *buf, size_t len, - ParsedJson &pj) { - UNIFIED_MACHINE(Architecture::ARM64, buf, len, pj); -} -} // namespace simdjson + +// this macro reads the next structural character, updating idx, i and c. +#define UPDATE_CHAR() \ + { \ + idx = pj.structural_indexes[i++]; \ + c = buf[idx]; \ + } + +#ifdef SIMDJSON_USE_COMPUTED_GOTO +#define SET_GOTO_ARRAY_CONTINUE() pj.ret_address[depth] = &&array_continue; +#define SET_GOTO_OBJECT_CONTINUE() pj.ret_address[depth] = &&object_continue; +#define SET_GOTO_START_CONTINUE() pj.ret_address[depth] = &&start_continue; +#define GOTO_CONTINUE() goto *pj.ret_address[depth]; +#else +#define SET_GOTO_ARRAY_CONTINUE() pj.ret_address[depth] = 'a'; +#define SET_GOTO_OBJECT_CONTINUE() pj.ret_address[depth] = 'o'; +#define SET_GOTO_START_CONTINUE() pj.ret_address[depth] = 's'; +#define GOTO_CONTINUE() \ + { \ + if (pj.ret_address[depth] == 'a') { \ + goto array_continue; \ + } else if (pj.ret_address[depth] == 'o') { \ + goto object_continue; \ + } else { \ + goto start_continue; \ + } \ + } #endif + +/************ + * The JSON is parsed to a tape, see the accompanying tape.md file + * for documentation. 
+ ***********/ +template <> +WARN_UNUSED int +unified_machine(const uint8_t *buf, size_t len, + ParsedJson &pj) { + uint32_t i = 0; /* index of the structural character (0,1,2,3...) */ + uint32_t idx; /* location of the structural character in the input (buf) */ + uint8_t c; /* used to track the (structural) character we are looking at, + updated */ + /* by UPDATE_CHAR macro */ + uint32_t depth = 0; /* could have an arbitrary starting depth */ + pj.init(); /* sets is_valid to false */ + if (pj.byte_capacity < len) { + pj.error_code = simdjson::CAPACITY; + return pj.error_code; + } + + /*//////////////////////////// START STATE ///////////////////////////// + */ + SET_GOTO_START_CONTINUE() + pj.containing_scope_offset[depth] = pj.get_current_loc(); + pj.write_tape(0, 'r'); /* r for root, 0 is going to get overwritten */ + /* the root is used, if nothing else, to capture the size of the tape */ + depth++; /* everything starts at depth = 1, depth = 0 is just for the + root, the root may contain an object, an array or something + else. */ + if (depth >= pj.depth_capacity) { + goto fail; + } + + UPDATE_CHAR(); + switch (c) { + case '{': + pj.containing_scope_offset[depth] = pj.get_current_loc(); + SET_GOTO_START_CONTINUE(); + depth++; + if (depth >= pj.depth_capacity) { + goto fail; + } + pj.write_tape( + 0, c); /* strangely, moving this to object_begin slows things down */ + goto object_begin; + case '[': + pj.containing_scope_offset[depth] = pj.get_current_loc(); + SET_GOTO_START_CONTINUE(); + depth++; + if (depth >= pj.depth_capacity) { + goto fail; + } + pj.write_tape(0, c); + goto array_begin; + /* #define SIMDJSON_ALLOWANYTHINGINROOT + * A JSON text is a serialized value. Note that certain previous + * specifications of JSON constrained a JSON text to be an object or an + * array. 
Implementations that generate only objects or arrays where a + * JSON text is called for will be interoperable in the sense that all + * implementations will accept these as conforming JSON texts. + * https://tools.ietf.org/html/rfc8259 + * #ifdef SIMDJSON_ALLOWANYTHINGINROOT */ + case '"': { + if (!parse_string(buf, len, pj, depth, idx)) { + goto fail; + } + break; + } + case 't': { + /* we need to make a copy to make sure that the string is space + * terminated. + * this only applies to the JSON document made solely of the true value. + * this will almost never be called in practice */ + char *copy = static_cast(malloc(len + SIMDJSON_PADDING)); + if (copy == nullptr) { + goto fail; + } + memcpy(copy, buf, len); + copy[len] = ' '; + if (!is_valid_true_atom(reinterpret_cast(copy) + idx)) { + free(copy); + goto fail; + } + free(copy); + pj.write_tape(0, c); + break; + } + case 'f': { + /* we need to make a copy to make sure that the string is space + * terminated. + * this only applies to the JSON document made solely of the false + * value. + * this will almost never be called in practice */ + char *copy = static_cast(malloc(len + SIMDJSON_PADDING)); + if (copy == nullptr) { + goto fail; + } + memcpy(copy, buf, len); + copy[len] = ' '; + if (!is_valid_false_atom(reinterpret_cast(copy) + idx)) { + free(copy); + goto fail; + } + free(copy); + pj.write_tape(0, c); + break; + } + case 'n': { + /* we need to make a copy to make sure that the string is space + * terminated. + * this only applies to the JSON document made solely of the null value. 
+ * this will almost never be called in practice */ + char *copy = static_cast(malloc(len + SIMDJSON_PADDING)); + if (copy == nullptr) { + goto fail; + } + memcpy(copy, buf, len); + copy[len] = ' '; + if (!is_valid_null_atom(reinterpret_cast(copy) + idx)) { + free(copy); + goto fail; + } + free(copy); + pj.write_tape(0, c); + break; + } + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': { + /* we need to make a copy to make sure that the string is space + * terminated. + * this is done only for JSON documents made of a sole number + * this will almost never be called in practice. We terminate with a + * space + * because we do not want to allow NULLs in the middle of a number + * (whereas a + * space in the middle of a number would be identified in stage 1). */ + char *copy = static_cast(malloc(len + SIMDJSON_PADDING)); + if (copy == nullptr) { + goto fail; + } + memcpy(copy, buf, len); + copy[len] = ' '; + if (!parse_number(reinterpret_cast(copy), pj, idx, + false)) { + free(copy); + goto fail; + } + free(copy); + break; + } + case '-': { + /* we need to make a copy to make sure that the string is NULL + * terminated. + * this is done only for JSON documents made of a sole number + * this will almost never be called in practice */ + char *copy = static_cast(malloc(len + SIMDJSON_PADDING)); + if (copy == nullptr) { + goto fail; + } + memcpy(copy, buf, len); + copy[len] = ' '; + if (!parse_number(reinterpret_cast(copy), pj, idx, true)) { + free(copy); + goto fail; + } + free(copy); + break; + } + default: + goto fail; + } +start_continue: + /* the string might not be NULL terminated. 
*/ + if (i + 1 == pj.n_structural_indexes) { + goto succeed; + } else { + goto fail; + } + /*//////////////////////////// OBJECT STATES ///////////////////////////*/ + +object_begin: + UPDATE_CHAR(); + switch (c) { + case '"': { + if (!parse_string(buf, len, pj, depth, idx)) { + goto fail; + } + goto object_key_state; + } + case '}': + goto scope_end; /* could also go to object_continue */ + default: + goto fail; + } + +object_key_state: + UPDATE_CHAR(); + if (c != ':') { + goto fail; + } + UPDATE_CHAR(); + switch (c) { + case '"': { + if (!parse_string(buf, len, pj, depth, idx)) { + goto fail; + } + break; + } + case 't': + if (!is_valid_true_atom(buf + idx)) { + goto fail; + } + pj.write_tape(0, c); + break; + case 'f': + if (!is_valid_false_atom(buf + idx)) { + goto fail; + } + pj.write_tape(0, c); + break; + case 'n': + if (!is_valid_null_atom(buf + idx)) { + goto fail; + } + pj.write_tape(0, c); + break; + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': { + if (!parse_number(buf, pj, idx, false)) { + goto fail; + } + break; + } + case '-': { + if (!parse_number(buf, pj, idx, true)) { + goto fail; + } + break; + } + case '{': { + pj.containing_scope_offset[depth] = pj.get_current_loc(); + pj.write_tape(0, c); /* here the compilers knows what c is so this gets + optimized */ + /* we have not yet encountered } so we need to come back for it */ + SET_GOTO_OBJECT_CONTINUE() + /* we found an object inside an object, so we need to increment the + * depth */ + depth++; + if (depth >= pj.depth_capacity) { + goto fail; + } + + goto object_begin; + } + case '[': { + pj.containing_scope_offset[depth] = pj.get_current_loc(); + pj.write_tape(0, c); /* here the compilers knows what c is so this gets + optimized */ + /* we have not yet encountered } so we need to come back for it */ + SET_GOTO_OBJECT_CONTINUE() + /* we found an array inside an object, so we need to increment the depth + */ + depth++; + if 
(depth >= pj.depth_capacity) { + goto fail; + } + goto array_begin; + } + default: + goto fail; + } + +object_continue: + UPDATE_CHAR(); + switch (c) { + case ',': + UPDATE_CHAR(); + if (c != '"') { + goto fail; + } else { + if (!parse_string(buf, len, pj, depth, idx)) { + goto fail; + } + goto object_key_state; + } + case '}': + goto scope_end; + default: + goto fail; + } + + /*//////////////////////////// COMMON STATE ///////////////////////////*/ + +scope_end: + /* write our tape location to the header scope */ + depth--; + pj.write_tape(pj.containing_scope_offset[depth], c); + pj.annotate_previous_loc(pj.containing_scope_offset[depth], + pj.get_current_loc()); + /* goto saved_state */ + GOTO_CONTINUE() + + /*//////////////////////////// ARRAY STATES ///////////////////////////*/ +array_begin: + UPDATE_CHAR(); + if (c == ']') { + goto scope_end; /* could also go to array_continue */ + } + +main_array_switch: + /* we call update char on all paths in, so we can peek at c on the + * on paths that can accept a close square brace (post-, and at start) */ + switch (c) { + case '"': { + if (!parse_string(buf, len, pj, depth, idx)) { + goto fail; + } + break; + } + case 't': + if (!is_valid_true_atom(buf + idx)) { + goto fail; + } + pj.write_tape(0, c); + break; + case 'f': + if (!is_valid_false_atom(buf + idx)) { + goto fail; + } + pj.write_tape(0, c); + break; + case 'n': + if (!is_valid_null_atom(buf + idx)) { + goto fail; + } + pj.write_tape(0, c); + break; /* goto array_continue; */ + + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': { + if (!parse_number(buf, pj, idx, false)) { + goto fail; + } + break; /* goto array_continue; */ + } + case '-': { + if (!parse_number(buf, pj, idx, true)) { + goto fail; + } + break; /* goto array_continue; */ + } + case '{': { + /* we have not yet encountered ] so we need to come back for it */ + pj.containing_scope_offset[depth] = pj.get_current_loc(); + 
pj.write_tape(0, c); /* here the compilers knows what c is so this gets + optimized */ + SET_GOTO_ARRAY_CONTINUE() + /* we found an object inside an array, so we need to increment the depth + */ + depth++; + if (depth >= pj.depth_capacity) { + goto fail; + } + + goto object_begin; + } + case '[': { + /* we have not yet encountered ] so we need to come back for it */ + pj.containing_scope_offset[depth] = pj.get_current_loc(); + pj.write_tape(0, c); /* here the compilers knows what c is so this gets + optimized */ + SET_GOTO_ARRAY_CONTINUE() + /* we found an array inside an array, so we need to increment the depth + */ + depth++; + if (depth >= pj.depth_capacity) { + goto fail; + } + goto array_begin; + } + default: + goto fail; + } + +array_continue: + UPDATE_CHAR(); + switch (c) { + case ',': + UPDATE_CHAR(); + goto main_array_switch; + case ']': + goto scope_end; + default: + goto fail; + } + + /*//////////////////////////// FINAL STATES ///////////////////////////*/ + +succeed: + depth--; + if (depth != 0) { + fprintf(stderr, "internal bug\n"); + abort(); + } + if (pj.containing_scope_offset[depth] != 0) { + fprintf(stderr, "internal bug\n"); + abort(); + } + pj.annotate_previous_loc(pj.containing_scope_offset[depth], + pj.get_current_loc()); + pj.write_tape(pj.containing_scope_offset[depth], 'r'); /* r is root */ + + pj.valid = true; + pj.error_code = simdjson::SUCCESS; + return pj.error_code; +fail: + /* we do not need the next line because this is done by pj.init(), + * pessimistically. + * pj.is_valid = false; + * At this point in the code, we have all the time in the world. + * Note that we know exactly where we are in the document so we could, + * without any overhead on the processing code, report a specific + * location. + * We could even trigger special code paths to assess what happened + * carefully, + * all without any added cost. 
*/ + if (depth >= pj.depth_capacity) { + pj.error_code = simdjson::DEPTH_ERROR; + return pj.error_code; + } + switch (c) { + case '"': + pj.error_code = simdjson::STRING_ERROR; + return pj.error_code; + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + case '-': + pj.error_code = simdjson::NUMBER_ERROR; + return pj.error_code; + case 't': + pj.error_code = simdjson::T_ATOM_ERROR; + return pj.error_code; + case 'n': + pj.error_code = simdjson::N_ATOM_ERROR; + return pj.error_code; + case 'f': + pj.error_code = simdjson::F_ATOM_ERROR; + return pj.error_code; + default: + break; + } + pj.error_code = simdjson::TAPE_ERROR; + return pj.error_code; +} + +} // namespace simdjson +UNTARGET_REGION + +#else +#error TARGETED_REGION must be specified before including. +#endif // TARGETED_REGION +#else +#error TARGETED_ARCHITECTURE must be specified before including. +#endif // TARGETED_ARCHITECTURE +#undef TARGETED_ARCHITECTURE +#undef TARGETED_REGION +#endif // IS_ARM64 /* end file src/stage2_build_tape.cpp */ /* begin file src/parsedjson.cpp */ diff --git a/singleheader/simdjson.h b/singleheader/simdjson.h index 2d5ca358..a16fe67b 100644 --- a/singleheader/simdjson.h +++ b/singleheader/simdjson.h @@ -1,4 +1,4 @@ -/* auto-generated on Sun Aug 4 15:43:41 EDT 2019. Do not edit! */ +/* auto-generated on Wed Aug 14 10:31:26 DST 2019. Do not edit! */ /* begin file include/simdjson/simdjson_version.h */ // /include/simdjson/simdjson_version.h automatically generated by release.py, // do not change by hand @@ -44,16 +44,20 @@ enum { #define TARGET_REGION(T) \ _Pragma("GCC push_options") _Pragma(STRINGIFY(GCC target(T))) #define UNTARGET_REGION _Pragma("GCC pop_options") -#else +#endif // clang then gcc + +#endif // x86 + +// Default target region macros don't do anything. 
+#ifndef TARGET_REGION #define TARGET_REGION(T) #define UNTARGET_REGION -#endif // clang then gcc +#endif // under GCC and CLANG, we use these two macros #define TARGET_HASWELL TARGET_REGION("avx2,bmi,pclmul") #define TARGET_WESTMERE TARGET_REGION("sse4.2,pclmul") - -#endif // x86 +#define TARGET_ARM64 #ifdef _MSC_VER #include @@ -65,6 +69,14 @@ enum { #endif #endif +#if defined(__clang__) +#define NO_SANITIZE_UNDEFINED __attribute__((no_sanitize("undefined"))) +#elif defined(__GNUC__) +#define NO_SANITIZE_UNDEFINED __attribute__((no_sanitize_undefined)) +#else +#define NO_SANITIZE_UNDEFINED +#endif + #ifdef _MSC_VER /* Microsoft C/C++-compatible compiler */ #include @@ -119,7 +131,7 @@ static inline bool mul_overflow(uint64_t value1, uint64_t value2, } /* result might be undefined when input_num is zero */ -static inline int trailing_zeroes(uint64_t input_num) { +static inline NO_SANITIZE_UNDEFINED int trailing_zeroes(uint64_t input_num) { #ifdef __BMI__ // tzcnt is BMI1 return _tzcnt_u64(input_num); #else @@ -343,8 +355,8 @@ static inline uint32_t detect_supported_architectures() { #endif /* end file include/simdjson/isadetection.h */ /* begin file include/simdjson/simdjson.h */ -#ifndef SIMDJSON_ERR_H -#define SIMDJSON_ERR_H +#ifndef SIMDJSON_SIMDJSON_H +#define SIMDJSON_SIMDJSON_H #include @@ -386,7 +398,7 @@ enum ErrorValues { }; const std::string &error_message(const int); } // namespace simdjson -#endif +#endif // SIMDJSON_SIMDJSON_H /* end file include/simdjson/simdjson.h */ /* begin file include/simdjson/common_defs.h */ #ifndef SIMDJSON_COMMON_DEFS_H @@ -433,38 +445,9 @@ const std::string &error_message(const int); #define unlikely(x) x #endif -// For Visual Studio compilers, same-page buffer overrun is not fine. -#define ALLOW_SAME_PAGE_BUFFER_OVERRUN false #else -// For non-Visual Studio compilers, we may assume that same-page buffer overrun -// is fine. However, it will make it difficult to be "valgrind clean". 
-//#ifndef ALLOW_SAME_PAGE_BUFFER_OVERRUN -//#define ALLOW_SAME_PAGE_BUFFER_OVERRUN true -//#else -#define ALLOW_SAME_PAGE_BUFFER_OVERRUN false -//#endif - -// The following is likely unnecessarily complex. -#ifdef __SANITIZE_ADDRESS__ -// we have GCC, stuck with https://gcc.gnu.org/bugzilla/show_bug.cgi?id=67368 -#define ALLOW_SAME_PAGE_BUFFER_OVERRUN false -#elif defined(__has_feature) -// we have CLANG? -// todo: if we're setting ALLOW_SAME_PAGE_BUFFER_OVERRUN to false, why do we -// have a non-empty qualifier? -#if (__has_feature(address_sanitizer)) -#define ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER \ - __attribute__((no_sanitize("address"))) -#endif -#endif - -#if defined(__has_feature) -#if (__has_feature(memory_sanitizer)) -#define LENIENT_MEM_SANITIZER __attribute__((no_sanitize("memory"))) -#endif -#endif #define really_inline inline __attribute__((always_inline, unused)) #define never_inline inline __attribute__((noinline, unused)) @@ -481,14 +464,6 @@ const std::string &error_message(const int); #endif // MSC_VER -// if it does not apply, make it an empty macro -#ifndef ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER -#define ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER -#endif -#ifndef LENIENT_MEM_SANITIZER -#define LENIENT_MEM_SANITIZER -#endif - #endif // SIMDJSON_COMMON_DEFS_H /* end file include/simdjson/common_defs.h */ /* begin file include/simdjson/padded_string.h */ @@ -1064,7 +1039,6 @@ static inline void print_with_escapes(const char *src, std::ostream &os, } } // namespace simdjson -# #endif /* end file include/simdjson/jsonformatutils.h */ /* begin file include/simdjson/jsonioutil.h */ @@ -36174,6 +36148,250 @@ static const uint32_t mask256_epi32[] = { } #endif /* end file include/simdjson/simdprune_tables.h */ +/* begin file include/simdjson/simd_input.h */ +#ifndef SIMDJSON_SIMD_INPUT_H +#define SIMDJSON_SIMD_INPUT_H + +#include + +namespace simdjson { + +template struct simd_input; + +template +simd_input fill_input(const uint8_t *ptr); + +// a 
straightforward comparison of a mask against input. +template +uint64_t cmp_mask_against_input(simd_input in, uint8_t m); + +// find all values less than or equal than the content of maxval (using unsigned +// arithmetic) +template +uint64_t unsigned_lteq_against_input(simd_input in, uint8_t m); + +} // namespace simdjson + +#endif +/* end file include/simdjson/simd_input.h */ +/* begin file include/simdjson/simd_input_haswell.h */ +#ifndef SIMDJSON_SIMD_INPUT_HASWELL_H +#define SIMDJSON_SIMD_INPUT_HASWELL_H + + +#ifdef IS_X86_64 + +TARGET_HASWELL +namespace simdjson { + +template <> +struct simd_input { + __m256i lo; + __m256i hi; +}; + +template <> +really_inline simd_input +fill_input(const uint8_t *ptr) { + struct simd_input in; + in.lo = _mm256_loadu_si256(reinterpret_cast(ptr + 0)); + in.hi = _mm256_loadu_si256(reinterpret_cast(ptr + 32)); + return in; +} + +template <> +really_inline uint64_t cmp_mask_against_input( + simd_input in, uint8_t m) { + const __m256i mask = _mm256_set1_epi8(m); + __m256i cmp_res_0 = _mm256_cmpeq_epi8(in.lo, mask); + uint64_t res_0 = static_cast(_mm256_movemask_epi8(cmp_res_0)); + __m256i cmp_res_1 = _mm256_cmpeq_epi8(in.hi, mask); + uint64_t res_1 = _mm256_movemask_epi8(cmp_res_1); + return res_0 | (res_1 << 32); +} + +template <> +really_inline uint64_t unsigned_lteq_against_input( + simd_input in, uint8_t m) { + const __m256i maxval = _mm256_set1_epi8(m); + __m256i cmp_res_0 = _mm256_cmpeq_epi8(_mm256_max_epu8(maxval, in.lo), maxval); + uint64_t res_0 = static_cast(_mm256_movemask_epi8(cmp_res_0)); + __m256i cmp_res_1 = _mm256_cmpeq_epi8(_mm256_max_epu8(maxval, in.hi), maxval); + uint64_t res_1 = _mm256_movemask_epi8(cmp_res_1); + return res_0 | (res_1 << 32); +} + +} // namespace simdjson +UNTARGET_REGION + +#endif // IS_X86_64 +#endif // SIMDJSON_SIMD_INPUT_HASWELL_H +/* end file include/simdjson/simd_input_haswell.h */ +/* begin file include/simdjson/simd_input_westmere.h */ +#ifndef SIMDJSON_SIMD_INPUT_WESTMERE_H +#define 
SIMDJSON_SIMD_INPUT_WESTMERE_H + + +#ifdef IS_X86_64 + +TARGET_WESTMERE +namespace simdjson { + +template <> +struct simd_input { + __m128i v0; + __m128i v1; + __m128i v2; + __m128i v3; +}; + +template <> +really_inline simd_input +fill_input(const uint8_t *ptr) { + struct simd_input in; + in.v0 = _mm_loadu_si128(reinterpret_cast(ptr + 0)); + in.v1 = _mm_loadu_si128(reinterpret_cast(ptr + 16)); + in.v2 = _mm_loadu_si128(reinterpret_cast(ptr + 32)); + in.v3 = _mm_loadu_si128(reinterpret_cast(ptr + 48)); + return in; +} + +template <> +really_inline uint64_t cmp_mask_against_input( + simd_input in, uint8_t m) { + const __m128i mask = _mm_set1_epi8(m); + __m128i cmp_res_0 = _mm_cmpeq_epi8(in.v0, mask); + uint64_t res_0 = _mm_movemask_epi8(cmp_res_0); + __m128i cmp_res_1 = _mm_cmpeq_epi8(in.v1, mask); + uint64_t res_1 = _mm_movemask_epi8(cmp_res_1); + __m128i cmp_res_2 = _mm_cmpeq_epi8(in.v2, mask); + uint64_t res_2 = _mm_movemask_epi8(cmp_res_2); + __m128i cmp_res_3 = _mm_cmpeq_epi8(in.v3, mask); + uint64_t res_3 = _mm_movemask_epi8(cmp_res_3); + return res_0 | (res_1 << 16) | (res_2 << 32) | (res_3 << 48); +} + +template <> +really_inline uint64_t unsigned_lteq_against_input( + simd_input in, uint8_t m) { + const __m128i maxval = _mm_set1_epi8(m); + __m128i cmp_res_0 = _mm_cmpeq_epi8(_mm_max_epu8(maxval, in.v0), maxval); + uint64_t res_0 = _mm_movemask_epi8(cmp_res_0); + __m128i cmp_res_1 = _mm_cmpeq_epi8(_mm_max_epu8(maxval, in.v1), maxval); + uint64_t res_1 = _mm_movemask_epi8(cmp_res_1); + __m128i cmp_res_2 = _mm_cmpeq_epi8(_mm_max_epu8(maxval, in.v2), maxval); + uint64_t res_2 = _mm_movemask_epi8(cmp_res_2); + __m128i cmp_res_3 = _mm_cmpeq_epi8(_mm_max_epu8(maxval, in.v3), maxval); + uint64_t res_3 = _mm_movemask_epi8(cmp_res_3); + return res_0 | (res_1 << 16) | (res_2 << 32) | (res_3 << 48); +} + +} // namespace simdjson +UNTARGET_REGION + +#endif // IS_X86_64 +#endif // SIMDJSON_SIMD_INPUT_WESTMERE_H +/* end file include/simdjson/simd_input_westmere.h */ +/* 
begin file include/simdjson/simd_input_arm64.h */ +#ifndef SIMDJSON_SIMD_INPUT_ARM64_H +#define SIMDJSON_SIMD_INPUT_ARM64_H + + +#ifdef IS_ARM64 +namespace simdjson { + +template <> +struct simd_input { + uint8x16_t i0; + uint8x16_t i1; + uint8x16_t i2; + uint8x16_t i3; +}; + +template <> +really_inline simd_input +fill_input(const uint8_t *ptr) { + struct simd_input in; + in.i0 = vld1q_u8(ptr + 0); + in.i1 = vld1q_u8(ptr + 16); + in.i2 = vld1q_u8(ptr + 32); + in.i3 = vld1q_u8(ptr + 48); + return in; +} + +really_inline uint16_t neon_movemask(uint8x16_t input) { + const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80}; + uint8x16_t minput = vandq_u8(input, bit_mask); + uint8x16_t tmp = vpaddq_u8(minput, minput); + tmp = vpaddq_u8(tmp, tmp); + tmp = vpaddq_u8(tmp, tmp); + return vgetq_lane_u16(vreinterpretq_u16_u8(tmp), 0); +} + +really_inline uint64_t neon_movemask_bulk(uint8x16_t p0, uint8x16_t p1, + uint8x16_t p2, uint8x16_t p3) { + const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80}; + uint8x16_t t0 = vandq_u8(p0, bit_mask); + uint8x16_t t1 = vandq_u8(p1, bit_mask); + uint8x16_t t2 = vandq_u8(p2, bit_mask); + uint8x16_t t3 = vandq_u8(p3, bit_mask); + uint8x16_t sum0 = vpaddq_u8(t0, t1); + uint8x16_t sum1 = vpaddq_u8(t2, t3); + sum0 = vpaddq_u8(sum0, sum1); + sum0 = vpaddq_u8(sum0, sum0); + return vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0); +} + +template <> +really_inline uint64_t cmp_mask_against_input( + simd_input in, uint8_t m) { + const uint8x16_t mask = vmovq_n_u8(m); + uint8x16_t cmp_res_0 = vceqq_u8(in.i0, mask); + uint8x16_t cmp_res_1 = vceqq_u8(in.i1, mask); + uint8x16_t cmp_res_2 = vceqq_u8(in.i2, mask); + uint8x16_t cmp_res_3 = vceqq_u8(in.i3, mask); + return neon_movemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3); +} + +template <> +really_inline uint64_t unsigned_lteq_against_input( + simd_input in, 
uint8_t m) { + const uint8x16_t mask = vmovq_n_u8(m); + uint8x16_t cmp_res_0 = vcleq_u8(in.i0, mask); + uint8x16_t cmp_res_1 = vcleq_u8(in.i1, mask); + uint8x16_t cmp_res_2 = vcleq_u8(in.i2, mask); + uint8x16_t cmp_res_3 = vcleq_u8(in.i3, mask); + return neon_movemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3); +} + +} // namespace simdjson + +#endif // IS_ARM64 +#endif // SIMDJSON_SIMD_INPUT_ARM64_H +/* end file include/simdjson/simd_input_arm64.h */ +/* begin file include/simdjson/simdutf8check.h */ +#ifndef SIMDJSON_SIMDUTF8CHECK_H +#define SIMDJSON_SIMDUTF8CHECK_H + + +namespace simdjson { + +// Holds the state required to perform check_utf8(). +template struct utf8_checking_state; + +template +void check_utf8(simd_input in, utf8_checking_state &state); + +// Checks if the utf8 validation has found any error. +template +ErrorValues check_utf8_errors(utf8_checking_state &state); + +} // namespace simdjson + +#endif // SIMDJSON_SIMDUTF8CHECK_H +/* end file include/simdjson/simdutf8check.h */ /* begin file include/simdjson/simdutf8check_haswell.h */ #ifndef SIMDJSON_SIMDUTF8CHECK_HASWELL_H #define SIMDJSON_SIMDUTF8CHECK_HASWELL_H @@ -36366,6 +36584,48 @@ avx_check_utf8_bytes(__m256i current_bytes, previous->high_nibbles, has_error); return pb; } + +template <> struct utf8_checking_state { + __m256i has_error; + avx_processed_utf_bytes previous; + utf8_checking_state() { + has_error = _mm256_setzero_si256(); + previous.raw_bytes = _mm256_setzero_si256(); + previous.high_nibbles = _mm256_setzero_si256(); + previous.carried_continuations = _mm256_setzero_si256(); + } +}; + +template <> +really_inline void check_utf8( + simd_input in, + utf8_checking_state &state) { + __m256i high_bit = _mm256_set1_epi8(0x80u); + if ((_mm256_testz_si256(_mm256_or_si256(in.lo, in.hi), high_bit)) == 1) { + // it is ascii, we just check continuation + state.has_error = _mm256_or_si256( + _mm256_cmpgt_epi8(state.previous.carried_continuations, + _mm256_setr_epi8(9, 9, 9, 9, 9, 9, 9, 
9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 1)), + state.has_error); + } else { + // it is not ascii so we have to do heavy work + state.previous = + avx_check_utf8_bytes(in.lo, &(state.previous), &(state.has_error)); + state.previous = + avx_check_utf8_bytes(in.hi, &(state.previous), &(state.has_error)); + } +} + +template <> +really_inline ErrorValues check_utf8_errors( + utf8_checking_state &state) { + return _mm256_testz_si256(state.has_error, state.has_error) == 0 + ? simdjson::UTF8_ERROR + : simdjson::SUCCESS; +} + } // namespace simdjson UNTARGET_REGION // haswell @@ -36536,6 +36796,61 @@ check_utf8_bytes(__m128i current_bytes, struct processed_utf_bytes *previous, previous->high_nibbles, has_error); return pb; } + +template <> +struct utf8_checking_state { + __m128i has_error = _mm_setzero_si128(); + processed_utf_bytes previous{ + _mm_setzero_si128(), // raw_bytes + _mm_setzero_si128(), // high_nibbles + _mm_setzero_si128() // carried_continuations + }; +}; + +template <> +really_inline void check_utf8( + simd_input in, + utf8_checking_state &state) { + __m128i high_bit = _mm_set1_epi8(0x80u); + if ((_mm_testz_si128(_mm_or_si128(in.v0, in.v1), high_bit)) == 1) { + // it is ascii, we just check continuation + state.has_error = + _mm_or_si128(_mm_cmpgt_epi8(state.previous.carried_continuations, + _mm_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 1)), + state.has_error); + } else { + // it is not ascii so we have to do heavy work + state.previous = + check_utf8_bytes(in.v0, &(state.previous), &(state.has_error)); + state.previous = + check_utf8_bytes(in.v1, &(state.previous), &(state.has_error)); + } + + if ((_mm_testz_si128(_mm_or_si128(in.v2, in.v3), high_bit)) == 1) { + // it is ascii, we just check continuation + state.has_error = + _mm_or_si128(_mm_cmpgt_epi8(state.previous.carried_continuations, + _mm_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 1)), + state.has_error); + } else { + // it is not 
ascii so we have to do heavy work + state.previous = + check_utf8_bytes(in.v2, &(state.previous), &(state.has_error)); + state.previous = + check_utf8_bytes(in.v3, &(state.previous), &(state.has_error)); + } +} + +template <> +really_inline ErrorValues check_utf8_errors( + utf8_checking_state &state) { + return _mm_testz_si128(state.has_error, state.has_error) == 0 + ? simdjson::UTF8_ERROR + : simdjson::SUCCESS; +} + } // namespace simdjson UNTARGET_REGION // westmere @@ -36721,6 +37036,64 @@ check_utf8_bytes(int8x16_t current_bytes, struct processed_utf_bytes *previous, previous->high_nibbles, has_error); return pb; } + +template <> +struct utf8_checking_state { + int8x16_t has_error{}; + processed_utf_bytes previous{}; +}; + +// Checks that all bytes are ascii +really_inline bool check_ascii_neon(simd_input in) { + // checking if the most significant bit is always equal to 0. + uint8x16_t high_bit = vdupq_n_u8(0x80); + uint8x16_t t0 = vorrq_u8(in.i0, in.i1); + uint8x16_t t1 = vorrq_u8(in.i2, in.i3); + uint8x16_t t3 = vorrq_u8(t0, t1); + uint8x16_t t4 = vandq_u8(t3, high_bit); + uint64x2_t v64 = vreinterpretq_u64_u8(t4); + uint32x2_t v32 = vqmovn_u64(v64); + uint64x1_t result = vreinterpret_u64_u32(v32); + return vget_lane_u64(result, 0) == 0; +} + +template <> +really_inline void check_utf8( + simd_input in, + utf8_checking_state &state) { + if (check_ascii_neon(in)) { + // All bytes are ascii. Therefore the byte that was just before must be + // ascii too. We only check the byte that was just before simd_input. Nines + // are arbitrary values. 
+ const int8x16_t verror = + (int8x16_t){9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1}; + state.has_error = + vorrq_s8(vreinterpretq_s8_u8( + vcgtq_s8(state.previous.carried_continuations, verror)), + state.has_error); + } else { + // it is not ascii so we have to do heavy work + state.previous = check_utf8_bytes(vreinterpretq_s8_u8(in.i0), + &(state.previous), &(state.has_error)); + state.previous = check_utf8_bytes(vreinterpretq_s8_u8(in.i1), + &(state.previous), &(state.has_error)); + state.previous = check_utf8_bytes(vreinterpretq_s8_u8(in.i2), + &(state.previous), &(state.has_error)); + state.previous = check_utf8_bytes(vreinterpretq_s8_u8(in.i3), + &(state.previous), &(state.has_error)); + } +} + +template <> +really_inline ErrorValues check_utf8_errors( + utf8_checking_state &state) { + uint64x2_t v64 = vreinterpretq_u64_s8(state.has_error); + uint32x2_t v32 = vqmovn_u64(v64); + uint64x1_t result = vreinterpret_u64_u32(v32); + return vget_lane_u64(result, 0) != 0 ? simdjson::UTF8_ERROR + : simdjson::SUCCESS; +} + } // namespace simdjson #endif #endif @@ -37306,8 +37679,6 @@ bool ParsedJson::Iterator::next() { namespace simdjson { -template struct simd_input; - template uint64_t compute_quote_mask(uint64_t quote_bits); namespace { @@ -37333,17 +37704,6 @@ void check_utf8(simd_input in, utf8_checking_state &state); template ErrorValues check_utf8_errors(utf8_checking_state &state); -// a straightforward comparison of a mask against input. 
-template -uint64_t cmp_mask_against_input(simd_input in, uint8_t m); - -template simd_input fill_input(const uint8_t *ptr); - -// find all values less than or equal than the content of maxval (using unsigned -// arithmetic) -template -uint64_t unsigned_lteq_against_input(simd_input in, uint8_t m); - template really_inline uint64_t find_odd_backslash_sequences( simd_input in, uint64_t &prev_iter_ends_odd_backslash); @@ -37413,439 +37773,19 @@ int find_structural_bits(const char *buf, size_t len, return find_structural_bits((const uint8_t *)buf, len, pj); } +// flatten out values in 'bits' assuming that they are are to have values of idx +// plus their position in the bitvector, and store these indexes at +// base_ptr[base] incrementing base as we go +// will potentially store extra values beyond end of valid bits, so base_ptr +// needs to be large enough to handle this +template +really_inline void flatten_bits(uint32_t *base_ptr, uint32_t &base, + uint32_t idx, uint64_t bits); + } // namespace simdjson #endif /* end file include/simdjson/stage1_find_marks.h */ -/* begin file include/simdjson/stage1_find_marks_flatten.h */ -#ifndef SIMDJSON_STAGE1_FIND_MARKS_FLATTEN_H -#define SIMDJSON_STAGE1_FIND_MARKS_FLATTEN_H - -namespace simdjson { - -#ifdef SIMDJSON_NAIVE_FLATTEN // useful for benchmarking -// -// This is just a naive implementation. It should be normally -// disable, but can be used for research purposes to compare -// again our optimized version. 
-really_inline void flatten_bits(uint32_t *base_ptr, uint32_t &base, - uint32_t idx, uint64_t bits) { - uint32_t *out_ptr = base_ptr + base; - idx -= 64; - while (bits != 0) { - out_ptr[0] = idx + trailing_zeroes(bits); - bits = bits & (bits - 1); - out_ptr++; - } - base = (out_ptr - base_ptr); -} - -#else -// flatten out values in 'bits' assuming that they are are to have values of idx -// plus their position in the bitvector, and store these indexes at -// base_ptr[base] incrementing base as we go -// will potentially store extra values beyond end of valid bits, so base_ptr -// needs to be large enough to handle this -really_inline void flatten_bits(uint32_t *base_ptr, uint32_t &base, - uint32_t idx, uint64_t bits) { - // In some instances, the next branch is expensive because it is mispredicted. - // Unfortunately, in other cases, - // it helps tremendously. - if (bits == 0) - return; - uint32_t cnt = hamming(bits); - uint32_t next_base = base + cnt; - idx -= 64; - base_ptr += base; - { - base_ptr[0] = idx + trailing_zeroes(bits); - bits = bits & (bits - 1); - base_ptr[1] = idx + trailing_zeroes(bits); - bits = bits & (bits - 1); - base_ptr[2] = idx + trailing_zeroes(bits); - bits = bits & (bits - 1); - base_ptr[3] = idx + trailing_zeroes(bits); - bits = bits & (bits - 1); - base_ptr[4] = idx + trailing_zeroes(bits); - bits = bits & (bits - 1); - base_ptr[5] = idx + trailing_zeroes(bits); - bits = bits & (bits - 1); - base_ptr[6] = idx + trailing_zeroes(bits); - bits = bits & (bits - 1); - base_ptr[7] = idx + trailing_zeroes(bits); - bits = bits & (bits - 1); - base_ptr += 8; - } - // We hope that the next branch is easily predicted. 
- if (cnt > 8) { - base_ptr[0] = idx + trailing_zeroes(bits); - bits = bits & (bits - 1); - base_ptr[1] = idx + trailing_zeroes(bits); - bits = bits & (bits - 1); - base_ptr[2] = idx + trailing_zeroes(bits); - bits = bits & (bits - 1); - base_ptr[3] = idx + trailing_zeroes(bits); - bits = bits & (bits - 1); - base_ptr[4] = idx + trailing_zeroes(bits); - bits = bits & (bits - 1); - base_ptr[5] = idx + trailing_zeroes(bits); - bits = bits & (bits - 1); - base_ptr[6] = idx + trailing_zeroes(bits); - bits = bits & (bits - 1); - base_ptr[7] = idx + trailing_zeroes(bits); - bits = bits & (bits - 1); - base_ptr += 8; - } - if (cnt > 16) { // unluckly: we rarely get here - // since it means having one structural or pseudo-structral element - // every 4 characters (possible with inputs like "","","",...). - do { - base_ptr[0] = idx + trailing_zeroes(bits); - bits = bits & (bits - 1); - base_ptr++; - } while (bits != 0); - } - base = next_base; -} -#endif // SIMDJSON_NAIVE_FLATTEN -} // namespace simdjson - -#endif // SIMDJSON_STAGE1_FIND_MARKS_FLATTEN_H/* end file include/simdjson/stage1_find_marks_flatten.h */ -/* begin file include/simdjson/stage1_find_marks_flatten_haswell.h */ -#ifndef SIMDJSON_STAGE1_FIND_MARKS_FLATTEN_HASWELL_H -#define SIMDJSON_STAGE1_FIND_MARKS_FLATTEN_HASWELL_H - -// This file provides the same function as -// stage1_find_marks_flatten.h, but uses Intel intrinsics. -// This should provide better performance on Visual Studio -// and other compilers that do a conservative optimization. - -// Specifically, on x64 processors with BMI, -// x & (x - 1) should be mapped to -// the blsr instruction. By using the -// _blsr_u64 intrinsic, we -// ensure that this will happen. 
-///////// - - -#ifdef IS_X86_64 - -TARGET_HASWELL -namespace simdjson { -namespace haswell { - -// flatten out values in 'bits' assuming that they are are to have values of idx -// plus their position in the bitvector, and store these indexes at -// base_ptr[base] incrementing base as we go -// will potentially store extra values beyond end of valid bits, so base_ptr -// needs to be large enough to handle this -really_inline void flatten_bits(uint32_t *base_ptr, uint32_t &base, - uint32_t idx, uint64_t bits) { - // In some instances, the next branch is expensive because it is mispredicted. - // Unfortunately, in other cases, - // it helps tremendously. - if (bits == 0) - return; - uint32_t cnt = _mm_popcnt_u64(bits); - uint32_t next_base = base + cnt; - idx -= 64; - base_ptr += base; - { - base_ptr[0] = idx + trailing_zeroes(bits); - bits = _blsr_u64(bits); - base_ptr[1] = idx + trailing_zeroes(bits); - bits = _blsr_u64(bits); - base_ptr[2] = idx + trailing_zeroes(bits); - bits = _blsr_u64(bits); - base_ptr[3] = idx + trailing_zeroes(bits); - bits = _blsr_u64(bits); - base_ptr[4] = idx + trailing_zeroes(bits); - bits = _blsr_u64(bits); - base_ptr[5] = idx + trailing_zeroes(bits); - bits = _blsr_u64(bits); - base_ptr[6] = idx + trailing_zeroes(bits); - bits = _blsr_u64(bits); - base_ptr[7] = idx + trailing_zeroes(bits); - bits = _blsr_u64(bits); - base_ptr += 8; - } - // We hope that the next branch is easily predicted. 
- if (cnt > 8) { - base_ptr[0] = idx + trailing_zeroes(bits); - bits = _blsr_u64(bits); - base_ptr[1] = idx + trailing_zeroes(bits); - bits = _blsr_u64(bits); - base_ptr[2] = idx + trailing_zeroes(bits); - bits = _blsr_u64(bits); - base_ptr[3] = idx + trailing_zeroes(bits); - bits = _blsr_u64(bits); - base_ptr[4] = idx + trailing_zeroes(bits); - bits = _blsr_u64(bits); - base_ptr[5] = idx + trailing_zeroes(bits); - bits = _blsr_u64(bits); - base_ptr[6] = idx + trailing_zeroes(bits); - bits = _blsr_u64(bits); - base_ptr[7] = idx + trailing_zeroes(bits); - bits = _blsr_u64(bits); - base_ptr += 8; - } - if (cnt > 16) { // unluckly: we rarely get here - // since it means having one structural or pseudo-structral element - // every 4 characters (possible with inputs like "","","",...). - do { - base_ptr[0] = idx + trailing_zeroes(bits); - bits = _blsr_u64(bits); - base_ptr++; - } while (bits != 0); - } - base = next_base; -} -} // namespace haswell -} // namespace simdjson -UNTARGET_REGION -#endif // IS_X86_64 -#endif // SIMDJSON_STAGE1_FIND_MARKS_FLATTEN_H -/* end file include/simdjson/stage1_find_marks_flatten_haswell.h */ -/* begin file include/simdjson/stage1_find_marks_macros.h */ -#ifndef SIMDJSON_STAGE1_FIND_MARKS_MACROS_H -#define SIMDJSON_STAGE1_FIND_MARKS_MACROS_H - -// return a bitvector indicating where we have characters that end an odd-length -// sequence of backslashes (and thus change the behavior of the next character -// to follow). A even-length sequence of backslashes, and, for that matter, the -// largest even-length prefix of our odd-length sequence of backslashes, simply -// modify the behavior of the backslashes themselves. -// We also update the prev_iter_ends_odd_backslash reference parameter to -// indicate whether we end an iteration on an odd-length sequence of -// backslashes, which modifies our subsequent search for odd-length -// sequences of backslashes in an obvious way. -// We need to compile that code for multiple architectures. 
However, target -// attributes can be used only once by function definition. Huge macro seemed -// better than huge code duplication. uint64_t -// FIND_ODD_BACKSLASH_SEQUENCES(Architecture T, simd_input in, uint64_t -// &prev_iter_ends_odd_backslash) -#define FIND_ODD_BACKSLASH_SEQUENCES(T, in, prev_iter_ends_odd_backslash) \ - { \ - const uint64_t even_bits = 0x5555555555555555ULL; \ - const uint64_t odd_bits = ~even_bits; \ - uint64_t bs_bits = cmp_mask_against_input(in, '\\'); \ - uint64_t start_edges = bs_bits & ~(bs_bits << 1); \ - /* flip lowest if we have an odd-length run at the end of the prior \ - * iteration */ \ - uint64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash; \ - uint64_t even_starts = start_edges & even_start_mask; \ - uint64_t odd_starts = start_edges & ~even_start_mask; \ - uint64_t even_carries = bs_bits + even_starts; \ - \ - uint64_t odd_carries; \ - /* must record the carry-out of our odd-carries out of bit 63; this \ - * indicates whether the sense of any edge going to the next iteration \ - * should be flipped */ \ - bool iter_ends_odd_backslash = \ - add_overflow(bs_bits, odd_starts, &odd_carries); \ - \ - odd_carries |= prev_iter_ends_odd_backslash; /* push in bit zero as a \ - * potential end if we had an \ - * odd-numbered run at the \ - * end of the previous \ - * iteration */ \ - prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL; \ - uint64_t even_carry_ends = even_carries & ~bs_bits; \ - uint64_t odd_carry_ends = odd_carries & ~bs_bits; \ - uint64_t even_start_odd_end = even_carry_ends & odd_bits; \ - uint64_t odd_start_even_end = odd_carry_ends & even_bits; \ - uint64_t odd_ends = even_start_odd_end | odd_start_even_end; \ - return odd_ends; \ - } - -// return both the quote mask (which is a half-open mask that covers the first -// quote -// in an unescaped quote pair and everything in the quote pair) and the quote -// bits, which are the simple -// unescaped quoted bits. 
We also update the prev_iter_inside_quote value to -// tell the next iteration -// whether we finished the final iteration inside a quote pair; if so, this -// inverts our behavior of -// whether we're inside quotes for the next iteration. -// Note that we don't do any error checking to see if we have backslash -// sequences outside quotes; these -// backslash sequences (of any length) will be detected elsewhere. -// We need to compile that code for multiple architectures. However, target -// attributes can be used only once by function definition. Huge macro seemed -// better than huge code duplication. uint64_t -// FIND_QUOTE_MASK_AND_BITS(Architecture T, simd_input in, uint64_t odd_ends, -// uint64_t &prev_iter_inside_quote, uint64_t "e_bits, uint64_t -// &error_mask) -#define FIND_QUOTE_MASK_AND_BITS(T, in, odd_ends, prev_iter_inside_quote, \ - quote_bits, error_mask) \ - { \ - quote_bits = cmp_mask_against_input(in, '"'); \ - quote_bits = quote_bits & ~odd_ends; \ - uint64_t quote_mask = compute_quote_mask(quote_bits); \ - quote_mask ^= prev_iter_inside_quote; \ - /* All Unicode characters may be placed within the \ - * quotation marks, except for the characters that MUST be escaped: \ - * quotation mark, reverse solidus, and the control characters (U+0000 \ - * through U+001F). \ - * https://tools.ietf.org/html/rfc8259 */ \ - uint64_t unescaped = unsigned_lteq_against_input(in, 0x1F); \ - error_mask |= quote_mask & unescaped; \ - /* right shift of a signed value expected to be well-defined and standard \ - * compliant as of C++20, \ - * John Regher from Utah U. says this is fine code */ \ - prev_iter_inside_quote = \ - static_cast(static_cast(quote_mask) >> 63); \ - return quote_mask; \ - } - -// Find structural bits in a 64-byte chunk. -// We need to compile that code for multiple architectures. However, target -// attributes can be used only once by function definition. Huge macro seemed -// better than huge code duplication. 
void FIND_STRUCTURAL_BITS_64( -// Architecture T, -// const uint8_t *buf, -// size_t idx, -// uint32_t *base_ptr, -// uint32_t &base, -// uint64_t &prev_iter_ends_odd_backslash, -// uint64_t &prev_iter_inside_quote, -// uint64_t &prev_iter_ends_pseudo_pred, -// uint64_t &structurals, -// uint64_t &error_mask, -// utf8_checking_state &utf8_state, flatten -// function) -#define FIND_STRUCTURAL_BITS_64( \ - T, buf, idx, base_ptr, base, prev_iter_ends_odd_backslash, \ - prev_iter_inside_quote, prev_iter_ends_pseudo_pred, structurals, \ - error_mask, utf8_state, flat) \ - { \ - simd_input in = fill_input(buf); \ - check_utf8(in, utf8_state); \ - /* detect odd sequences of backslashes */ \ - uint64_t odd_ends = \ - find_odd_backslash_sequences(in, prev_iter_ends_odd_backslash); \ - \ - /* detect insides of quote pairs ("quote_mask") and also our quote_bits \ - * themselves */ \ - uint64_t quote_bits; \ - uint64_t quote_mask = find_quote_mask_and_bits( \ - in, odd_ends, prev_iter_inside_quote, quote_bits, error_mask); \ - \ - /* take the previous iterations structural bits, not our current \ - * iteration, \ - * and flatten */ \ - flat(base_ptr, base, idx, structurals); \ - \ - uint64_t whitespace; \ - find_whitespace_and_structurals(in, whitespace, structurals); \ - \ - /* fixup structurals to reflect quotes and add pseudo-structural \ - * characters */ \ - structurals = \ - finalize_structurals(structurals, whitespace, quote_mask, quote_bits, \ - prev_iter_ends_pseudo_pred); \ - } - -// We need to compile that code for multiple architectures. However, target -// attributes can be used only once by function definition. Huge macro seemed -// better than huge code duplication. 
ErrorValues -// FIND_STRUCTURAL_BITS(Architecture T, const uint8_t *buf, size_t len, -// ParsedJson &pj, flatten function) -#define FIND_STRUCTURAL_BITS(T, buf, len, pj, flat) \ - { \ - if (len > pj.byte_capacity) { \ - std::cerr << "Your ParsedJson object only supports documents up to " \ - << pj.byte_capacity << " bytes but you are trying to process " \ - << len << " bytes" << std::endl; \ - return simdjson::CAPACITY; \ - } \ - uint32_t *base_ptr = pj.structural_indexes; \ - uint32_t base = 0; \ - utf8_checking_state utf8_state; \ - \ - /* we have padded the input out to 64 byte multiple with the remainder \ - * being zeros persistent state across loop does the last iteration end \ - * with an odd-length sequence of backslashes? */ \ - \ - /* either 0 or 1, but a 64-bit value */ \ - uint64_t prev_iter_ends_odd_backslash = 0ULL; \ - /* does the previous iteration end inside a double-quote pair? */ \ - uint64_t prev_iter_inside_quote = \ - 0ULL; /* either all zeros or all ones \ - * does the previous iteration end on something that is a \ - * predecessor of a pseudo-structural character - i.e. \ - * whitespace or a structural character effectively the very \ - * first char is considered to follow "whitespace" for the \ - * purposes of pseudo-structural character detection so we \ - * initialize to 1 */ \ - uint64_t prev_iter_ends_pseudo_pred = 1ULL; \ - \ - /* structurals are persistent state across loop as we flatten them on the \ - * subsequent iteration into our array pointed to be base_ptr. \ - * This is harmless on the first iteration as structurals==0 \ - * and is done for performance reasons; we can hide some of the latency of \ - * the \ - * expensive carryless multiply in the previous step with this work */ \ - uint64_t structurals = 0; \ - \ - size_t lenminus64 = len < 64 ? 
0 : len - 64; \ - size_t idx = 0; \ - uint64_t error_mask = 0; /* for unescaped characters within strings (ASCII \ - code points < 0x20) */ \ - \ - for (; idx < lenminus64; idx += 64) { \ - FIND_STRUCTURAL_BITS_64( \ - T, &buf[idx], idx, base_ptr, base, prev_iter_ends_odd_backslash, \ - prev_iter_inside_quote, prev_iter_ends_pseudo_pred, structurals, \ - error_mask, utf8_state, flat); \ - } \ - /* If we have a final chunk of less than 64 bytes, pad it to 64 with \ - * spaces before processing it (otherwise, we risk invalidating the UTF-8 \ - * checks). */ \ - if (idx < len) { \ - uint8_t tmp_buf[64]; \ - memset(tmp_buf, 0x20, 64); \ - memcpy(tmp_buf, buf + idx, len - idx); \ - FIND_STRUCTURAL_BITS_64( \ - T, &tmp_buf[0], idx, base_ptr, base, prev_iter_ends_odd_backslash, \ - prev_iter_inside_quote, prev_iter_ends_pseudo_pred, structurals, \ - error_mask, utf8_state, flat); \ - idx += 64; \ - } \ - \ - /* is last string quote closed? */ \ - if (prev_iter_inside_quote) { \ - return simdjson::UNCLOSED_STRING; \ - } \ - \ - /* finally, flatten out the remaining structurals from the last iteration \ - */ \ - flat(base_ptr, base, idx, structurals); \ - \ - pj.n_structural_indexes = base; \ - /* a valid JSON file cannot have zero structural indexes - we should have \ - * found something */ \ - if (pj.n_structural_indexes == 0u) { \ - return simdjson::EMPTY; \ - } \ - if (base_ptr[pj.n_structural_indexes - 1] > len) { \ - return simdjson::UNEXPECTED_ERROR; \ - } \ - if (len != base_ptr[pj.n_structural_indexes - 1]) { \ - /* the string might not be NULL terminated, but we add a virtual NULL \ - * ending \ - * character. 
*/ \ - base_ptr[pj.n_structural_indexes++] = len; \ - } \ - /* make it safe to dereference one beyond this array */ \ - base_ptr[pj.n_structural_indexes] = 0; \ - if (error_mask) { \ - return simdjson::UNESCAPED_CHARS; \ - } \ - return check_utf8_errors(utf8_state); \ - } - -#endif // SIMDJSON_STAGE1_FIND_MARKS_MACROS_H/* end file include/simdjson/stage1_find_marks_macros.h */ /* begin file include/simdjson/stage1_find_marks_westmere.h */ #ifndef SIMDJSON_STAGE1_FIND_MARKS_WESTMERE_H #define SIMDJSON_STAGE1_FIND_MARKS_WESTMERE_H @@ -37855,23 +37795,6 @@ UNTARGET_REGION TARGET_WESTMERE namespace simdjson { -template <> struct simd_input { - __m128i v0; - __m128i v1; - __m128i v2; - __m128i v3; -}; - -template <> -really_inline simd_input -fill_input(const uint8_t *ptr) { - struct simd_input in; - in.v0 = _mm_loadu_si128(reinterpret_cast(ptr + 0)); - in.v1 = _mm_loadu_si128(reinterpret_cast(ptr + 16)); - in.v2 = _mm_loadu_si128(reinterpret_cast(ptr + 32)); - in.v3 = _mm_loadu_si128(reinterpret_cast(ptr + 48)); - return in; -} template <> really_inline uint64_t @@ -37880,106 +37803,6 @@ compute_quote_mask(uint64_t quote_bits) { _mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFFu), 0)); } -template <> struct utf8_checking_state { - __m128i has_error = _mm_setzero_si128(); - processed_utf_bytes previous{ - _mm_setzero_si128(), // raw_bytes - _mm_setzero_si128(), // high_nibbles - _mm_setzero_si128() // carried_continuations - }; -}; - -template <> -really_inline void check_utf8( - simd_input in, - utf8_checking_state &state) { - __m128i high_bit = _mm_set1_epi8(0x80u); - if ((_mm_testz_si128(_mm_or_si128(in.v0, in.v1), high_bit)) == 1) { - // it is ascii, we just check continuation - state.has_error = - _mm_or_si128(_mm_cmpgt_epi8(state.previous.carried_continuations, - _mm_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 1)), - state.has_error); - } else { - // it is not ascii so we have to do heavy work - state.previous = - check_utf8_bytes(in.v0, 
&(state.previous), &(state.has_error)); - state.previous = - check_utf8_bytes(in.v1, &(state.previous), &(state.has_error)); - } - - if ((_mm_testz_si128(_mm_or_si128(in.v2, in.v3), high_bit)) == 1) { - // it is ascii, we just check continuation - state.has_error = - _mm_or_si128(_mm_cmpgt_epi8(state.previous.carried_continuations, - _mm_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 1)), - state.has_error); - } else { - // it is not ascii so we have to do heavy work - state.previous = - check_utf8_bytes(in.v2, &(state.previous), &(state.has_error)); - state.previous = - check_utf8_bytes(in.v3, &(state.previous), &(state.has_error)); - } -} - -template <> -really_inline ErrorValues check_utf8_errors( - utf8_checking_state &state) { - return _mm_testz_si128(state.has_error, state.has_error) == 0 - ? simdjson::UTF8_ERROR - : simdjson::SUCCESS; -} - -template <> -really_inline uint64_t cmp_mask_against_input( - simd_input in, uint8_t m) { - const __m128i mask = _mm_set1_epi8(m); - __m128i cmp_res_0 = _mm_cmpeq_epi8(in.v0, mask); - uint64_t res_0 = _mm_movemask_epi8(cmp_res_0); - __m128i cmp_res_1 = _mm_cmpeq_epi8(in.v1, mask); - uint64_t res_1 = _mm_movemask_epi8(cmp_res_1); - __m128i cmp_res_2 = _mm_cmpeq_epi8(in.v2, mask); - uint64_t res_2 = _mm_movemask_epi8(cmp_res_2); - __m128i cmp_res_3 = _mm_cmpeq_epi8(in.v3, mask); - uint64_t res_3 = _mm_movemask_epi8(cmp_res_3); - return res_0 | (res_1 << 16) | (res_2 << 32) | (res_3 << 48); -} - -template <> -really_inline uint64_t unsigned_lteq_against_input( - simd_input in, uint8_t m) { - const __m128i maxval = _mm_set1_epi8(m); - __m128i cmp_res_0 = _mm_cmpeq_epi8(_mm_max_epu8(maxval, in.v0), maxval); - uint64_t res_0 = _mm_movemask_epi8(cmp_res_0); - __m128i cmp_res_1 = _mm_cmpeq_epi8(_mm_max_epu8(maxval, in.v1), maxval); - uint64_t res_1 = _mm_movemask_epi8(cmp_res_1); - __m128i cmp_res_2 = _mm_cmpeq_epi8(_mm_max_epu8(maxval, in.v2), maxval); - uint64_t res_2 = _mm_movemask_epi8(cmp_res_2); - __m128i 
cmp_res_3 = _mm_cmpeq_epi8(_mm_max_epu8(maxval, in.v3), maxval); - uint64_t res_3 = _mm_movemask_epi8(cmp_res_3); - return res_0 | (res_1 << 16) | (res_2 << 32) | (res_3 << 48); -} - -template <> -really_inline uint64_t find_odd_backslash_sequences( - simd_input in, - uint64_t &prev_iter_ends_odd_backslash) { - FIND_ODD_BACKSLASH_SEQUENCES(Architecture::WESTMERE, in, - prev_iter_ends_odd_backslash); -} - -template <> -really_inline uint64_t find_quote_mask_and_bits( - simd_input in, uint64_t odd_ends, - uint64_t &prev_iter_inside_quote, uint64_t "e_bits, - uint64_t &error_mask) { - FIND_QUOTE_MASK_AND_BITS(Architecture::WESTMERE, in, odd_ends, - prev_iter_inside_quote, quote_bits, error_mask) -} - template <> really_inline void find_whitespace_and_structurals( simd_input in, uint64_t &whitespace, @@ -38036,7 +37859,8 @@ really_inline void find_whitespace_and_structurals( UNTARGET_REGION #endif // IS_X86_64 -#endif // SIMDJSON_STAGE1_FIND_MARKS_WESTMERE_H/* end file include/simdjson/stage1_find_marks_westmere.h */ +#endif // SIMDJSON_STAGE1_FIND_MARKS_WESTMERE_H +/* end file include/simdjson/stage1_find_marks_westmere.h */ /* begin file include/simdjson/stage1_find_marks_haswell.h */ #ifndef SIMDJSON_STAGE1_FIND_MARKS_HASWELL_H #define SIMDJSON_STAGE1_FIND_MARKS_HASWELL_H @@ -38046,19 +37870,6 @@ UNTARGET_REGION TARGET_HASWELL namespace simdjson { -template <> struct simd_input { - __m256i lo; - __m256i hi; -}; - -template <> -really_inline simd_input -fill_input(const uint8_t *ptr) { - struct simd_input in; - in.lo = _mm256_loadu_si256(reinterpret_cast(ptr + 0)); - in.hi = _mm256_loadu_si256(reinterpret_cast(ptr + 32)); - return in; -} template <> really_inline uint64_t @@ -38070,86 +37881,6 @@ compute_quote_mask(uint64_t quote_bits) { return quote_mask; } -template <> struct utf8_checking_state { - __m256i has_error; - avx_processed_utf_bytes previous; - utf8_checking_state() { - has_error = _mm256_setzero_si256(); - previous.raw_bytes = _mm256_setzero_si256(); - 
previous.high_nibbles = _mm256_setzero_si256(); - previous.carried_continuations = _mm256_setzero_si256(); - } -}; - -template <> -really_inline void check_utf8( - simd_input in, - utf8_checking_state &state) { - __m256i high_bit = _mm256_set1_epi8(0x80u); - if ((_mm256_testz_si256(_mm256_or_si256(in.lo, in.hi), high_bit)) == 1) { - // it is ascii, we just check continuation - state.has_error = _mm256_or_si256( - _mm256_cmpgt_epi8(state.previous.carried_continuations, - _mm256_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 1)), - state.has_error); - } else { - // it is not ascii so we have to do heavy work - state.previous = - avx_check_utf8_bytes(in.lo, &(state.previous), &(state.has_error)); - state.previous = - avx_check_utf8_bytes(in.hi, &(state.previous), &(state.has_error)); - } -} - -template <> -really_inline ErrorValues check_utf8_errors( - utf8_checking_state &state) { - return _mm256_testz_si256(state.has_error, state.has_error) == 0 - ? 
simdjson::UTF8_ERROR - : simdjson::SUCCESS; -} - -template <> -really_inline uint64_t cmp_mask_against_input( - simd_input in, uint8_t m) { - const __m256i mask = _mm256_set1_epi8(m); - __m256i cmp_res_0 = _mm256_cmpeq_epi8(in.lo, mask); - uint64_t res_0 = static_cast(_mm256_movemask_epi8(cmp_res_0)); - __m256i cmp_res_1 = _mm256_cmpeq_epi8(in.hi, mask); - uint64_t res_1 = _mm256_movemask_epi8(cmp_res_1); - return res_0 | (res_1 << 32); -} - -template <> -really_inline uint64_t unsigned_lteq_against_input( - simd_input in, uint8_t m) { - const __m256i maxval = _mm256_set1_epi8(m); - __m256i cmp_res_0 = _mm256_cmpeq_epi8(_mm256_max_epu8(maxval, in.lo), maxval); - uint64_t res_0 = static_cast(_mm256_movemask_epi8(cmp_res_0)); - __m256i cmp_res_1 = _mm256_cmpeq_epi8(_mm256_max_epu8(maxval, in.hi), maxval); - uint64_t res_1 = _mm256_movemask_epi8(cmp_res_1); - return res_0 | (res_1 << 32); -} - -template <> -really_inline uint64_t find_odd_backslash_sequences( - simd_input in, - uint64_t &prev_iter_ends_odd_backslash) { - FIND_ODD_BACKSLASH_SEQUENCES(Architecture::HASWELL, in, - prev_iter_ends_odd_backslash); -} - -template <> -really_inline uint64_t find_quote_mask_and_bits( - simd_input in, uint64_t odd_ends, - uint64_t &prev_iter_inside_quote, uint64_t "e_bits, - uint64_t &error_mask) { - FIND_QUOTE_MASK_AND_BITS(Architecture::HASWELL, in, odd_ends, - prev_iter_inside_quote, quote_bits, error_mask) -} - template <> really_inline void find_whitespace_and_structurals( simd_input in, uint64_t &whitespace, @@ -38243,7 +37974,8 @@ really_inline void find_whitespace_and_structurals( UNTARGET_REGION #endif // IS_X86_64 -#endif // SIMDJSON_STAGE1_FIND_MARKS_HASWELL_H/* end file include/simdjson/stage1_find_marks_haswell.h */ +#endif // SIMDJSON_STAGE1_FIND_MARKS_HASWELL_H +/* end file include/simdjson/stage1_find_marks_haswell.h */ /* begin file include/simdjson/stage1_find_marks_arm64.h */ #ifndef SIMDJSON_STAGE1_FIND_MARKS_ARM64_H #define 
SIMDJSON_STAGE1_FIND_MARKS_ARM64_H @@ -38251,48 +37983,6 @@ UNTARGET_REGION #ifdef IS_ARM64 namespace simdjson { -template <> struct simd_input { - uint8x16_t i0; - uint8x16_t i1; - uint8x16_t i2; - uint8x16_t i3; -}; - -template <> -really_inline simd_input -fill_input(const uint8_t *ptr) { - struct simd_input in; - in.i0 = vld1q_u8(ptr + 0); - in.i1 = vld1q_u8(ptr + 16); - in.i2 = vld1q_u8(ptr + 32); - in.i3 = vld1q_u8(ptr + 48); - return in; -} - -really_inline uint16_t neon_movemask(uint8x16_t input) { - const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, - 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80}; - uint8x16_t minput = vandq_u8(input, bit_mask); - uint8x16_t tmp = vpaddq_u8(minput, minput); - tmp = vpaddq_u8(tmp, tmp); - tmp = vpaddq_u8(tmp, tmp); - return vgetq_lane_u16(vreinterpretq_u16_u8(tmp), 0); -} - -really_inline uint64_t neon_movemask_bulk(uint8x16_t p0, uint8x16_t p1, - uint8x16_t p2, uint8x16_t p3) { - const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, - 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80}; - uint8x16_t t0 = vandq_u8(p0, bit_mask); - uint8x16_t t1 = vandq_u8(p1, bit_mask); - uint8x16_t t2 = vandq_u8(p2, bit_mask); - uint8x16_t t3 = vandq_u8(p3, bit_mask); - uint8x16_t sum0 = vpaddq_u8(t0, t1); - uint8x16_t sum1 = vpaddq_u8(t2, t3); - sum0 = vpaddq_u8(sum0, sum1); - sum0 = vpaddq_u8(sum0, sum0); - return vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0); -} template <> really_inline uint64_t @@ -38304,101 +37994,6 @@ compute_quote_mask(uint64_t quote_bits) { #endif } -template <> struct utf8_checking_state { - int8x16_t has_error{}; - processed_utf_bytes previous{}; -}; - -// Checks that all bytes are ascii -really_inline bool check_ascii_neon(simd_input in) { - // checking if the most significant bit is always equal to 0. 
- uint8x16_t high_bit = vdupq_n_u8(0x80); - uint8x16_t t0 = vorrq_u8(in.i0, in.i1); - uint8x16_t t1 = vorrq_u8(in.i2, in.i3); - uint8x16_t t3 = vorrq_u8(t0, t1); - uint8x16_t t4 = vandq_u8(t3, high_bit); - uint64x2_t v64 = vreinterpretq_u64_u8(t4); - uint32x2_t v32 = vqmovn_u64(v64); - uint64x1_t result = vreinterpret_u64_u32(v32); - return vget_lane_u64(result, 0) == 0; -} - -template <> -really_inline void check_utf8( - simd_input in, - utf8_checking_state &state) { - if (check_ascii_neon(in)) { - // All bytes are ascii. Therefore the byte that was just before must be - // ascii too. We only check the byte that was just before simd_input. Nines - // are arbitrary values. - const int8x16_t verror = - (int8x16_t){9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1}; - state.has_error = - vorrq_s8(vreinterpretq_s8_u8( - vcgtq_s8(state.previous.carried_continuations, verror)), - state.has_error); - } else { - // it is not ascii so we have to do heavy work - state.previous = check_utf8_bytes(vreinterpretq_s8_u8(in.i0), - &(state.previous), &(state.has_error)); - state.previous = check_utf8_bytes(vreinterpretq_s8_u8(in.i1), - &(state.previous), &(state.has_error)); - state.previous = check_utf8_bytes(vreinterpretq_s8_u8(in.i2), - &(state.previous), &(state.has_error)); - state.previous = check_utf8_bytes(vreinterpretq_s8_u8(in.i3), - &(state.previous), &(state.has_error)); - } -} - -template <> -really_inline ErrorValues check_utf8_errors( - utf8_checking_state &state) { - uint64x2_t v64 = vreinterpretq_u64_s8(state.has_error); - uint32x2_t v32 = vqmovn_u64(v64); - uint64x1_t result = vreinterpret_u64_u32(v32); - return vget_lane_u64(result, 0) != 0 ? 
simdjson::UTF8_ERROR - : simdjson::SUCCESS; -} - -template <> -really_inline uint64_t cmp_mask_against_input( - simd_input in, uint8_t m) { - const uint8x16_t mask = vmovq_n_u8(m); - uint8x16_t cmp_res_0 = vceqq_u8(in.i0, mask); - uint8x16_t cmp_res_1 = vceqq_u8(in.i1, mask); - uint8x16_t cmp_res_2 = vceqq_u8(in.i2, mask); - uint8x16_t cmp_res_3 = vceqq_u8(in.i3, mask); - return neon_movemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3); -} - -template <> -really_inline uint64_t unsigned_lteq_against_input( - simd_input in, uint8_t m) { - const uint8x16_t mask = vmovq_n_u8(m); - uint8x16_t cmp_res_0 = vcleq_u8(in.i0, mask); - uint8x16_t cmp_res_1 = vcleq_u8(in.i1, mask); - uint8x16_t cmp_res_2 = vcleq_u8(in.i2, mask); - uint8x16_t cmp_res_3 = vcleq_u8(in.i3, mask); - return neon_movemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3); -} - -template <> -really_inline uint64_t find_odd_backslash_sequences( - simd_input in, - uint64_t &prev_iter_ends_odd_backslash) { - FIND_ODD_BACKSLASH_SEQUENCES(Architecture::ARM64, in, - prev_iter_ends_odd_backslash); -} - -template <> -really_inline uint64_t find_quote_mask_and_bits( - simd_input in, uint64_t odd_ends, - uint64_t &prev_iter_inside_quote, uint64_t "e_bits, - uint64_t &error_mask) { - FIND_QUOTE_MASK_AND_BITS(Architecture::ARM64, in, odd_ends, - prev_iter_inside_quote, quote_bits, error_mask) -} - template <> really_inline void find_whitespace_and_structurals( simd_input in, uint64_t &whitespace, @@ -38450,7 +38045,8 @@ really_inline void find_whitespace_and_structurals( } // namespace simdjson #endif // IS_ARM64 -#endif // SIMDJSON_STAGE1_FIND_MARKS_ARM64_H/* end file include/simdjson/stage1_find_marks_arm64.h */ +#endif // SIMDJSON_STAGE1_FIND_MARKS_ARM64_H +/* end file include/simdjson/stage1_find_marks_arm64.h */ /* begin file include/simdjson/stringparsing.h */ #ifndef SIMDJSON_STRINGPARSING_H #define SIMDJSON_STRINGPARSING_H @@ -38541,7 +38137,7 @@ parse_string_helper find_bs_bits_and_quote_bits(const 
uint8_t *src, uint8_t *dst); template -WARN_UNUSED ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER LENIENT_MEM_SANITIZER +WARN_UNUSED really_inline bool parse_string(UNUSED const uint8_t *buf, UNUSED size_t len, ParsedJson &pj, UNUSED const uint32_t depth, UNUSED uint32_t offset); @@ -38552,95 +38148,6 @@ WARN_UNUSED ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER LENIENT_MEM_SANITIZER #endif /* end file include/simdjson/stringparsing.h */ -/* begin file include/simdjson/stringparsing_macros.h */ -#ifndef SIMDJSON_STRINGPARSING_MACROS_H -#define SIMDJSON_STRINGPARSING_MACROS_H - -// We need to compile that code for multiple architectures. However, target -// attributes can be used only once by function definition. Huge macro seemed -// better than huge code duplication.รง -// bool PARSE_STRING(Architecture T, const uint8_t *buf, size_t len, ParsedJson -// &pj,const uint32_t depth, uint32_t offset) -#define PARSE_STRING(T, buf, len, pj, depth, offset) \ - { \ - pj.write_tape(pj.current_string_buf_loc - pj.string_buf, '"'); \ - const uint8_t *src = \ - &buf[offset + 1]; /* we know that buf at offset is a " */ \ - uint8_t *dst = pj.current_string_buf_loc + sizeof(uint32_t); \ - const uint8_t *const start_of_string = dst; \ - while (1) { \ - parse_string_helper helper = find_bs_bits_and_quote_bits(src, dst); \ - if (((helper.bs_bits - 1) & helper.quote_bits) != 0) { \ - /* we encountered quotes first. Move dst to point to quotes and exit \ - */ \ - \ - /* find out where the quote is... */ \ - uint32_t quote_dist = trailing_zeroes(helper.quote_bits); \ - \ - /* NULL termination is still handy if you expect all your strings to \ - * be NULL terminated? */ \ - /* It comes at a small cost */ \ - dst[quote_dist] = 0; \ - \ - uint32_t str_length = (dst - start_of_string) + quote_dist; \ - memcpy(pj.current_string_buf_loc, &str_length, sizeof(uint32_t)); \ - /***************************** \ - * Above, check for overflow in case someone has a crazy string \ - * (>=4GB?) 
_ \ - * But only add the overflow check when the document itself exceeds \ - * 4GB \ - * Currently unneeded because we refuse to parse docs larger or equal \ - * to 4GB. \ - ****************************/ \ - \ - /* we advance the point, accounting for the fact that we have a NULL \ - * termination */ \ - pj.current_string_buf_loc = dst + quote_dist + 1; \ - return true; \ - } \ - if (((helper.quote_bits - 1) & helper.bs_bits) != 0) { \ - /* find out where the backspace is */ \ - uint32_t bs_dist = trailing_zeroes(helper.bs_bits); \ - uint8_t escape_char = src[bs_dist + 1]; \ - /* we encountered backslash first. Handle backslash */ \ - if (escape_char == 'u') { \ - /* move src/dst up to the start; they will be further adjusted \ - within the unicode codepoint handling code. */ \ - src += bs_dist; \ - dst += bs_dist; \ - if (!handle_unicode_codepoint(&src, &dst)) { \ - return false; \ - } \ - } else { \ - /* simple 1:1 conversion. Will eat bs_dist+2 characters in input and \ - * write bs_dist+1 characters to output \ - * note this may reach beyond the part of the buffer we've actually \ - * seen. I think this is ok */ \ - uint8_t escape_result = escape_map[escape_char]; \ - if (escape_result == 0u) { \ - return false; /* bogus escape value is an error */ \ - } \ - dst[bs_dist] = escape_result; \ - src += bs_dist + 2; \ - dst += bs_dist + 1; \ - } \ - } else { \ - /* they are the same. Since they can't co-occur, it means we \ - * encountered neither. 
*/ \ - if constexpr (T == Architecture::WESTMERE) { \ - src += 16; \ - dst += 16; \ - } else { \ - src += 32; \ - dst += 32; \ - } \ - } \ - } \ - /* can't be reached */ \ - return true; \ - } - -#endif/* end file include/simdjson/stringparsing_macros.h */ /* begin file include/simdjson/stringparsing_westmere.h */ #ifndef SIMDJSON_STRINGPARSING_WESTMERE_H #define SIMDJSON_STRINGPARSING_WESTMERE_H @@ -38666,21 +38173,122 @@ find_bs_bits_and_quote_bits(const uint8_t *src, static_cast(_mm_movemask_epi8(quote_mask)) // quote_bits }; } - -template <> -WARN_UNUSED ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER LENIENT_MEM_SANITIZER - really_inline bool - parse_string(UNUSED const uint8_t *buf, - UNUSED size_t len, ParsedJson &pj, - UNUSED const uint32_t depth, - UNUSED uint32_t offset) { - PARSE_STRING(Architecture::WESTMERE, buf, len, pj, depth, offset); -} } // namespace simdjson UNTARGET_REGION -#endif -#endif/* end file include/simdjson/stringparsing_westmere.h */ +#define TARGETED_ARCHITECTURE Architecture::WESTMERE +#define TARGETED_REGION TARGET_WESTMERE +// This file contains the common code every implementation uses +// It is intended to be included multiple times and compiled multiple times +// We assume the file in which it is include already includes +// "simdjson/stringparsing.h" (this simplifies amalgation) + +#ifdef TARGETED_ARCHITECTURE +#ifdef TARGETED_REGION + +TARGETED_REGION +namespace simdjson { + +template <> +WARN_UNUSED + really_inline bool + parse_string(UNUSED const uint8_t *buf, + UNUSED size_t len, ParsedJson &pj, + UNUSED const uint32_t depth, + UNUSED uint32_t offset) { + pj.write_tape(pj.current_string_buf_loc - pj.string_buf, '"'); + const uint8_t *src = &buf[offset + 1]; /* we know that buf at offset is a " */ + uint8_t *dst = pj.current_string_buf_loc + sizeof(uint32_t); + const uint8_t *const start_of_string = dst; + while (1) { + parse_string_helper helper = + find_bs_bits_and_quote_bits(src, dst); + if (((helper.bs_bits - 1) & 
helper.quote_bits) != 0) { + /* we encountered quotes first. Move dst to point to quotes and exit + */ + + /* find out where the quote is... */ + uint32_t quote_dist = trailing_zeroes(helper.quote_bits); + + /* NULL termination is still handy if you expect all your strings to + * be NULL terminated? */ + /* It comes at a small cost */ + dst[quote_dist] = 0; + + uint32_t str_length = (dst - start_of_string) + quote_dist; + memcpy(pj.current_string_buf_loc, &str_length, sizeof(uint32_t)); + /***************************** + * Above, check for overflow in case someone has a crazy string + * (>=4GB?) _ + * But only add the overflow check when the document itself exceeds + * 4GB + * Currently unneeded because we refuse to parse docs larger or equal + * to 4GB. + ****************************/ + + /* we advance the point, accounting for the fact that we have a NULL + * termination */ + pj.current_string_buf_loc = dst + quote_dist + 1; + return true; + } + if (((helper.quote_bits - 1) & helper.bs_bits) != 0) { + /* find out where the backspace is */ + uint32_t bs_dist = trailing_zeroes(helper.bs_bits); + uint8_t escape_char = src[bs_dist + 1]; + /* we encountered backslash first. Handle backslash */ + if (escape_char == 'u') { + /* move src/dst up to the start; they will be further adjusted + within the unicode codepoint handling code. */ + src += bs_dist; + dst += bs_dist; + if (!handle_unicode_codepoint(&src, &dst)) { + return false; + } + } else { + /* simple 1:1 conversion. Will eat bs_dist+2 characters in input and + * write bs_dist+1 characters to output + * note this may reach beyond the part of the buffer we've actually + * seen. I think this is ok */ + uint8_t escape_result = escape_map[escape_char]; + if (escape_result == 0u) { + return false; /* bogus escape value is an error */ + } + dst[bs_dist] = escape_result; + src += bs_dist + 2; + dst += bs_dist + 1; + } + } else { + /* they are the same. Since they can't co-occur, it means we + * encountered neither. 
*/ + if constexpr (TARGETED_ARCHITECTURE == Architecture::WESTMERE) { + src += 16; + dst += 16; + } else { + src += 32; + dst += 32; + } + } + } + /* can't be reached */ + return true; +} + +} // namespace simdjson +UNTARGET_REGION + +#else +#error TARGETED_REGION must be specified before including. +#endif // TARGETED_REGION +#else +#error TARGETED_ARCHITECTURE must be specified before including. +#endif // TARGETED_ARCHITECTURE +#undef TARGETED_ARCHITECTURE +#undef TARGETED_REGION + +#endif // IS_X86_64 + +#endif +/* end file include/simdjson/stringparsing_westmere.h */ /* begin file include/simdjson/stringparsing_haswell.h */ #ifndef SIMDJSON_STRINGPARSING_HASWELL_H #define SIMDJSON_STRINGPARSING_HASWELL_H @@ -38707,22 +38315,122 @@ find_bs_bits_and_quote_bits(const uint8_t *src, static_cast(_mm256_movemask_epi8(quote_mask)) // quote_bits }; } +} // namespace simdjson +UNTARGET_REGION + +#define TARGETED_ARCHITECTURE Architecture::HASWELL +#define TARGETED_REGION TARGET_HASWELL +// This file contains the common code every implementation uses +// It is intended to be included multiple times and compiled multiple times +// We assume the file in which it is include already includes +// "simdjson/stringparsing.h" (this simplifies amalgation) + +#ifdef TARGETED_ARCHITECTURE +#ifdef TARGETED_REGION + +TARGETED_REGION +namespace simdjson { template <> -WARN_UNUSED ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER LENIENT_MEM_SANITIZER +WARN_UNUSED really_inline bool - parse_string(UNUSED const uint8_t *buf, + parse_string(UNUSED const uint8_t *buf, UNUSED size_t len, ParsedJson &pj, UNUSED const uint32_t depth, UNUSED uint32_t offset) { - PARSE_STRING(Architecture::HASWELL, buf, len, pj, depth, offset); + pj.write_tape(pj.current_string_buf_loc - pj.string_buf, '"'); + const uint8_t *src = &buf[offset + 1]; /* we know that buf at offset is a " */ + uint8_t *dst = pj.current_string_buf_loc + sizeof(uint32_t); + const uint8_t *const start_of_string = dst; + while (1) { + 
parse_string_helper helper = + find_bs_bits_and_quote_bits(src, dst); + if (((helper.bs_bits - 1) & helper.quote_bits) != 0) { + /* we encountered quotes first. Move dst to point to quotes and exit + */ + + /* find out where the quote is... */ + uint32_t quote_dist = trailing_zeroes(helper.quote_bits); + + /* NULL termination is still handy if you expect all your strings to + * be NULL terminated? */ + /* It comes at a small cost */ + dst[quote_dist] = 0; + + uint32_t str_length = (dst - start_of_string) + quote_dist; + memcpy(pj.current_string_buf_loc, &str_length, sizeof(uint32_t)); + /***************************** + * Above, check for overflow in case someone has a crazy string + * (>=4GB?) _ + * But only add the overflow check when the document itself exceeds + * 4GB + * Currently unneeded because we refuse to parse docs larger or equal + * to 4GB. + ****************************/ + + /* we advance the point, accounting for the fact that we have a NULL + * termination */ + pj.current_string_buf_loc = dst + quote_dist + 1; + return true; + } + if (((helper.quote_bits - 1) & helper.bs_bits) != 0) { + /* find out where the backspace is */ + uint32_t bs_dist = trailing_zeroes(helper.bs_bits); + uint8_t escape_char = src[bs_dist + 1]; + /* we encountered backslash first. Handle backslash */ + if (escape_char == 'u') { + /* move src/dst up to the start; they will be further adjusted + within the unicode codepoint handling code. */ + src += bs_dist; + dst += bs_dist; + if (!handle_unicode_codepoint(&src, &dst)) { + return false; + } + } else { + /* simple 1:1 conversion. Will eat bs_dist+2 characters in input and + * write bs_dist+1 characters to output + * note this may reach beyond the part of the buffer we've actually + * seen. 
I think this is ok */ + uint8_t escape_result = escape_map[escape_char]; + if (escape_result == 0u) { + return false; /* bogus escape value is an error */ + } + dst[bs_dist] = escape_result; + src += bs_dist + 2; + dst += bs_dist + 1; + } + } else { + /* they are the same. Since they can't co-occur, it means we + * encountered neither. */ + if constexpr (TARGETED_ARCHITECTURE == Architecture::WESTMERE) { + src += 16; + dst += 16; + } else { + src += 32; + dst += 32; + } + } + } + /* can't be reached */ + return true; } } // namespace simdjson UNTARGET_REGION -#endif -#endif/* end file include/simdjson/stringparsing_haswell.h */ +#else +#error TARGETED_REGION must be specified before including. +#endif // TARGETED_REGION +#else +#error TARGETED_ARCHITECTURE must be specified before including. +#endif // TARGETED_ARCHITECTURE +#undef TARGETED_ARCHITECTURE +#undef TARGETED_REGION + +#endif // IS_X86_64 + +#endif +/* end file include/simdjson/stringparsing_haswell.h */ /* begin file include/simdjson/stringparsing_arm64.h */ #ifndef SIMDJSON_STRINGPARSING_ARM64_H #define SIMDJSON_STRINGPARSING_ARM64_H @@ -38766,17 +38474,118 @@ find_bs_bits_and_quote_bits(const uint8_t *src, }; } -template <> -WARN_UNUSED ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER LENIENT_MEM_SANITIZER - really_inline bool - parse_string(UNUSED const uint8_t *buf, - UNUSED size_t len, ParsedJson &pj, - UNUSED const uint32_t depth, - UNUSED uint32_t offset) { - PARSE_STRING(Architecture::ARM64, buf, len, pj, depth, offset); -} } // namespace simdjson -#endif + +#define TARGETED_ARCHITECTURE Architecture::ARM64 +#define TARGETED_REGION TARGET_ARM64 +// This file contains the common code every implementation uses +// It is intended to be included multiple times and compiled multiple times +// We assume the file in which it is include already includes +// "simdjson/stringparsing.h" (this simplifies amalgation) + +#ifdef TARGETED_ARCHITECTURE +#ifdef TARGETED_REGION + +TARGETED_REGION +namespace simdjson { + 
+template <> +WARN_UNUSED + really_inline bool + parse_string(UNUSED const uint8_t *buf, + UNUSED size_t len, ParsedJson &pj, + UNUSED const uint32_t depth, + UNUSED uint32_t offset) { + pj.write_tape(pj.current_string_buf_loc - pj.string_buf, '"'); + const uint8_t *src = &buf[offset + 1]; /* we know that buf at offset is a " */ + uint8_t *dst = pj.current_string_buf_loc + sizeof(uint32_t); + const uint8_t *const start_of_string = dst; + while (1) { + parse_string_helper helper = + find_bs_bits_and_quote_bits(src, dst); + if (((helper.bs_bits - 1) & helper.quote_bits) != 0) { + /* we encountered quotes first. Move dst to point to quotes and exit + */ + + /* find out where the quote is... */ + uint32_t quote_dist = trailing_zeroes(helper.quote_bits); + + /* NULL termination is still handy if you expect all your strings to + * be NULL terminated? */ + /* It comes at a small cost */ + dst[quote_dist] = 0; + + uint32_t str_length = (dst - start_of_string) + quote_dist; + memcpy(pj.current_string_buf_loc, &str_length, sizeof(uint32_t)); + /***************************** + * Above, check for overflow in case someone has a crazy string + * (>=4GB?) _ + * But only add the overflow check when the document itself exceeds + * 4GB + * Currently unneeded because we refuse to parse docs larger or equal + * to 4GB. + ****************************/ + + /* we advance the point, accounting for the fact that we have a NULL + * termination */ + pj.current_string_buf_loc = dst + quote_dist + 1; + return true; + } + if (((helper.quote_bits - 1) & helper.bs_bits) != 0) { + /* find out where the backspace is */ + uint32_t bs_dist = trailing_zeroes(helper.bs_bits); + uint8_t escape_char = src[bs_dist + 1]; + /* we encountered backslash first. Handle backslash */ + if (escape_char == 'u') { + /* move src/dst up to the start; they will be further adjusted + within the unicode codepoint handling code. 
*/ + src += bs_dist; + dst += bs_dist; + if (!handle_unicode_codepoint(&src, &dst)) { + return false; + } + } else { + /* simple 1:1 conversion. Will eat bs_dist+2 characters in input and + * write bs_dist+1 characters to output + * note this may reach beyond the part of the buffer we've actually + * seen. I think this is ok */ + uint8_t escape_result = escape_map[escape_char]; + if (escape_result == 0u) { + return false; /* bogus escape value is an error */ + } + dst[bs_dist] = escape_result; + src += bs_dist + 2; + dst += bs_dist + 1; + } + } else { + /* they are the same. Since they can't co-occur, it means we + * encountered neither. */ + if constexpr (TARGETED_ARCHITECTURE == Architecture::WESTMERE) { + src += 16; + dst += 16; + } else { + src += 32; + dst += 32; + } + } + } + /* can't be reached */ + return true; +} + +} // namespace simdjson +UNTARGET_REGION + +#else +#error TARGETED_REGION must be specified before including. +#endif // TARGETED_REGION +#else +#error TARGETED_ARCHITECTURE must be specified before including. 
+#endif // TARGETED_ARCHITECTURE +#undef TARGETED_ARCHITECTURE +#undef TARGETED_REGION + +#endif // IS_ARM64 #endif /* end file include/simdjson/stringparsing_arm64.h */ /* begin file include/simdjson/numberparsing.h */ @@ -39433,7 +39242,7 @@ really_inline bool is_valid_null_atom(const uint8_t *loc) { } template -WARN_UNUSED ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER LENIENT_MEM_SANITIZER int +WARN_UNUSED int unified_machine(const uint8_t *buf, size_t len, ParsedJson &pj); template @@ -39463,38 +39272,12 @@ int json_parse_implementation(const uint8_t *buf, size_t len, ParsedJson &pj, } bool reallocated = false; if (realloc_if_needed) { -#if ALLOW_SAME_PAGE_BUFFER_OVERRUN -// realloc is needed if the end of the memory crosses a page -#ifdef _MSC_VER - SYSTEM_INFO sysInfo; - GetSystemInfo(&sysInfo); - long page_size = sysInfo.dwPageSize; -#else - long page_size = sysconf(_SC_PAGESIZE); -#endif - ////////////// - // We want to check that buf + len - 1 and buf + len - 1 + SIMDJSON_PADDING - // are in the same page. - // That is, we want to check that - // (buf + len - 1) / page_size == (buf + len - 1 + SIMDJSON_PADDING) / - // page_size That's true if (buf + len - 1) % page_size + SIMDJSON_PADDING < - // page_size. - /////////// - if ((reinterpret_cast(buf + len - 1) % page_size) + - SIMDJSON_PADDING < - static_cast(page_size)) { -#else // SIMDJSON_SAFE_SAME_PAGE_READ_OVERRUN - if (true) { // if not SIMDJSON_SAFE_SAME_PAGE_READ_OVERRUN, we always - // reallocate -#endif const uint8_t *tmp_buf = buf; buf = (uint8_t *)allocate_padded_buffer(len); if (buf == NULL) return simdjson::MEMALLOC; memcpy((void *)buf, tmp_buf, len); reallocated = true; - } // if (true) OR if ( (reinterpret_cast(buf + len - 1) % - // page_size ) + SIMDJSON_PADDING < static_cast(page_size) ) { } // if(realloc_if_needed) { int stage1_is_ok = simdjson::find_structural_bits(buf, len, pj); if (stage1_is_ok != simdjson::SUCCESS) {