Update amalgamated cpp

John Keiser 2019-08-23 09:56:56 -07:00
parent b01222518d
commit 5e5592178d
3 changed files with 114 additions and 158 deletions

View File

@ -1,4 +1,4 @@
/* auto-generated on Sun Aug 18 15:06:50 DST 2019. Do not edit! */
/* auto-generated on Fri Aug 23 09:54:21 DST 2019. Do not edit! */
#include <iostream>
#include "simdjson.h"

View File

@ -1,4 +1,4 @@
/* auto-generated on Sun Aug 18 15:06:50 DST 2019. Do not edit! */
/* auto-generated on Fri Aug 23 09:54:21 DST 2019. Do not edit! */
#include "simdjson.h"
/* used for http://dmalloc.com/ Dmalloc - Debug Malloc Library */
@ -495,13 +495,13 @@ static const Architecture ARCHITECTURE = Architecture::ARM64;
#ifdef IS_X86_64
TARGET_HASWELL
namespace simdjson::haswell {
static const Architecture ARCHITECTURE = Architecture::HASWELL;
} // namespace simdjson::haswell
UNTARGET_REGION
#endif // IS_X86_64
@ -515,13 +515,12 @@ UNTARGET_REGION
#ifdef IS_X86_64
TARGET_WESTMERE
namespace simdjson::westmere {
static const Architecture ARCHITECTURE = Architecture::WESTMERE;
} // namespace simdjson::westmere
UNTARGET_REGION
#endif // IS_X86_64
@ -620,22 +619,25 @@ struct simd_input<Architecture::HASWELL> {
this->hi = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr + 32));
}
template <typename F>
really_inline uint64_t build_bitmask(F const& chunk_to_mask) {
uint64_t r0 = static_cast<uint32_t>(_mm256_movemask_epi8(chunk_to_mask(this->lo)));
uint64_t r1 = _mm256_movemask_epi8(chunk_to_mask(this->hi));
return r0 | (r1 << 32);
}
really_inline uint64_t eq(uint8_t m) {
const __m256i mask = _mm256_set1_epi8(m);
__m256i cmp_res_0 = _mm256_cmpeq_epi8(this->lo, mask);
uint64_t res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(cmp_res_0));
__m256i cmp_res_1 = _mm256_cmpeq_epi8(this->hi, mask);
uint64_t res_1 = _mm256_movemask_epi8(cmp_res_1);
return res_0 | (res_1 << 32);
return this->build_bitmask([&] (auto chunk) {
return _mm256_cmpeq_epi8(chunk, mask);
});
}
really_inline uint64_t lteq(uint8_t m) {
const __m256i maxval = _mm256_set1_epi8(m);
__m256i cmp_res_0 = _mm256_cmpeq_epi8(_mm256_max_epu8(maxval, this->lo), maxval);
uint64_t res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(cmp_res_0));
__m256i cmp_res_1 = _mm256_cmpeq_epi8(_mm256_max_epu8(maxval, this->hi), maxval);
uint64_t res_1 = _mm256_movemask_epi8(cmp_res_1);
return res_0 | (res_1 << 32);
return this->build_bitmask([&] (auto chunk) {
return _mm256_cmpeq_epi8(_mm256_max_epu8(maxval, chunk), maxval);
});
}
}; // struct simd_input
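To make the refactor concrete, here is an illustrative scalar sketch (not from this commit; the names are invented) of the pattern `build_bitmask` factors out of `eq` and `lteq`: apply a per-byte predicate to a 64-byte block and pack the results into one 64-bit mask, which the AVX2 member above does with `_mm256_movemask_epi8` plus a shift/or.

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>

// Scalar stand-in for the 64-byte chunk: bit i of the result corresponds to byte i.
template <typename F>
static uint64_t scalar_build_bitmask(const uint8_t block[64], F const &matches) {
  uint64_t mask = 0;
  for (int i = 0; i < 64; i++) {
    if (matches(block[i])) {
      mask |= uint64_t(1) << i;
    }
  }
  return mask;
}

int main() {
  uint8_t block[64];
  std::memset(block, ' ', sizeof(block));
  std::memcpy(block, "{\"key\":1}", 9);
  // eq('"') becomes just a predicate handed to the shared mask builder...
  uint64_t quotes = scalar_build_bitmask(block, [](uint8_t b) { return b == '"'; });
  // ...and lteq(0x20) is another predicate (the AVX2 version needs the
  // _mm256_max_epu8 trick because there is no unsigned byte compare).
  uint64_t le_space = scalar_build_bitmask(block, [](uint8_t b) { return b <= 0x20; });
  std::printf("quotes:      %016llx\n", (unsigned long long)quotes);
  std::printf("bytes<=0x20: %016llx\n", (unsigned long long)le_space);
  return 0;
}
```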
@ -670,30 +672,27 @@ struct simd_input<Architecture::WESTMERE> {
this->v3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 48));
}
template <typename F>
really_inline uint64_t build_bitmask(F const& chunk_to_mask) {
uint64_t r0 = static_cast<uint32_t>(_mm_movemask_epi8(chunk_to_mask(this->v0)));
uint64_t r1 = _mm_movemask_epi8(chunk_to_mask(this->v1));
uint64_t r2 = _mm_movemask_epi8(chunk_to_mask(this->v2));
uint64_t r3 = _mm_movemask_epi8(chunk_to_mask(this->v3));
return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48);
}
really_inline uint64_t eq(uint8_t m) {
const __m128i mask = _mm_set1_epi8(m);
__m128i cmp_res_0 = _mm_cmpeq_epi8(this->v0, mask);
uint64_t res_0 = _mm_movemask_epi8(cmp_res_0);
__m128i cmp_res_1 = _mm_cmpeq_epi8(this->v1, mask);
uint64_t res_1 = _mm_movemask_epi8(cmp_res_1);
__m128i cmp_res_2 = _mm_cmpeq_epi8(this->v2, mask);
uint64_t res_2 = _mm_movemask_epi8(cmp_res_2);
__m128i cmp_res_3 = _mm_cmpeq_epi8(this->v3, mask);
uint64_t res_3 = _mm_movemask_epi8(cmp_res_3);
return res_0 | (res_1 << 16) | (res_2 << 32) | (res_3 << 48);
return this->build_bitmask([&](auto chunk) {
return _mm_cmpeq_epi8(chunk, mask);
});
}
really_inline uint64_t lteq(uint8_t m) {
const __m128i maxval = _mm_set1_epi8(m);
__m128i cmp_res_0 = _mm_cmpeq_epi8(_mm_max_epu8(maxval, this->v0), maxval);
uint64_t res_0 = _mm_movemask_epi8(cmp_res_0);
__m128i cmp_res_1 = _mm_cmpeq_epi8(_mm_max_epu8(maxval, this->v1), maxval);
uint64_t res_1 = _mm_movemask_epi8(cmp_res_1);
__m128i cmp_res_2 = _mm_cmpeq_epi8(_mm_max_epu8(maxval, this->v2), maxval);
uint64_t res_2 = _mm_movemask_epi8(cmp_res_2);
__m128i cmp_res_3 = _mm_cmpeq_epi8(_mm_max_epu8(maxval, this->v3), maxval);
uint64_t res_3 = _mm_movemask_epi8(cmp_res_3);
return res_0 | (res_1 << 16) | (res_2 << 32) | (res_3 << 48);
return this->build_bitmask([&](auto chunk) {
return _mm_cmpeq_epi8(_mm_max_epu8(maxval, chunk), maxval);
});
}
}; // struct simd_input
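On the SSE path the same helper stitches four 16-bit movemask results into one 64-bit mask. A small standalone sketch (illustrative only; assumes an x86-64 compiler and `<immintrin.h>`) of exactly that packing, which previously appeared hand-written in both `eq` and `lteq`:

```cpp
#include <immintrin.h>
#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
  uint8_t buf[64];
  std::memset(buf, 'x', sizeof(buf));
  buf[3] = '"';
  buf[40] = '"';
  __m128i v0 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(buf + 0));
  __m128i v1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(buf + 16));
  __m128i v2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(buf + 32));
  __m128i v3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(buf + 48));
  const __m128i quote = _mm_set1_epi8('"');
  // Each movemask yields 16 bits; build_bitmask now does this shift/or in one place.
  uint64_t r0 = static_cast<uint32_t>(_mm_movemask_epi8(_mm_cmpeq_epi8(v0, quote)));
  uint64_t r1 = _mm_movemask_epi8(_mm_cmpeq_epi8(v1, quote));
  uint64_t r2 = _mm_movemask_epi8(_mm_cmpeq_epi8(v2, quote));
  uint64_t r3 = _mm_movemask_epi8(_mm_cmpeq_epi8(v3, quote));
  uint64_t mask = r0 | (r1 << 16) | (r2 << 32) | (r3 << 48);
  std::printf("%016llx\n", (unsigned long long)mask); // bits 3 and 40 are set
  return 0;
}
```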
@ -1600,7 +1599,7 @@ static really_inline void flatten_bits(uint32_t *base_ptr, uint32_t &base, uint3
#endif // SIMDJSON_NAIVE_FLATTEN
// This file contains the common code every implementation uses in stage1
// It is intended to be included multiple times and compiled multiple times
// We assume the file in which it is include already includes
// We assume the file in which it is included already includes
// "simdjson/stage1_find_marks.h" (this simplifies amalgation)
// return a bitvector indicating where we have characters that end an odd-length
@ -1612,7 +1611,7 @@ static really_inline void flatten_bits(uint32_t *base_ptr, uint32_t &base, uint3
// indicate whether we end an iteration on an odd-length sequence of
// backslashes, which modifies our subsequent search for odd-length
// sequences of backslashes in an obvious way.
static really_inline uint64_t find_odd_backslash_sequences(
really_inline uint64_t find_odd_backslash_sequences(
simd_input<ARCHITECTURE> in,
uint64_t &prev_iter_ends_odd_backslash) {
const uint64_t even_bits = 0x5555555555555555ULL;
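Only the opening line of `find_odd_backslash_sequences` appears in this hunk. As a hedged scalar reference (not the vectorized bit-trick implementation in the commit), the comment above amounts to this: a set bit marks a character that terminates an odd-length backslash run, i.e. an escaped character. The carry across 64-byte blocks via `prev_iter_ends_odd_backslash` is omitted here.

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>

static uint64_t odd_backslash_ends_scalar(const char *block /* 64 bytes */) {
  uint64_t result = 0;
  int run = 0;
  for (int i = 0; i < 64; i++) {
    if (block[i] == '\\') {
      run++;
    } else {
      if (run % 2 == 1) {
        result |= uint64_t(1) << i; // this character is escaped
      }
      run = 0;
    }
  }
  return result;
}

int main() {
  char block[64];
  std::memset(block, ' ', sizeof(block));
  // \" at positions 2-3 is an escaped quote (bit 3 set);
  // \\ at positions 5-6 leaves the quote at position 7 unescaped.
  std::memcpy(block, "\"a\\\"b\\\\\" c", 10);
  std::printf("%016llx\n", (unsigned long long)odd_backslash_ends_scalar(block));
  return 0;
}
```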
@ -1659,7 +1658,7 @@ static really_inline uint64_t find_odd_backslash_sequences(
// Note that we don't do any error checking to see if we have backslash
// sequences outside quotes; these
// backslash sequences (of any length) will be detected elsewhere.
static really_inline uint64_t find_quote_mask_and_bits(
really_inline uint64_t find_quote_mask_and_bits(
simd_input<ARCHITECTURE> in, uint64_t odd_ends,
uint64_t &prev_iter_inside_quote, uint64_t &quote_bits,
uint64_t &error_mask) {
@ -1682,7 +1681,7 @@ static really_inline uint64_t find_quote_mask_and_bits(
return quote_mask;
}
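The body of `find_quote_mask_and_bits` is not shown in this hunk; as a reminder of what the returned `quote_mask` represents, here is an illustrative scalar sketch (not the commit's code): a prefix XOR over the unescaped quote bits, marking every byte from an opening quote up to (but not including) its closing quote. The vectorized version computes the same prefix XOR with a carry-less multiply; the cross-block carry via `prev_iter_inside_quote` is omitted here.

```cpp
#include <cstdint>
#include <cstdio>

// Bit i of the result is 1 iff an odd number of quote bits appear at
// positions <= i, i.e. byte i is inside a string.
static uint64_t prefix_xor(uint64_t quote_bits) {
  uint64_t mask = 0;
  uint64_t state = 0;
  for (int i = 0; i < 64; i++) {
    state ^= (quote_bits >> i) & 1;
    mask |= state << i;
  }
  return mask;
}

int main() {
  uint64_t quote_bits = (1ULL << 1) | (1ULL << 5); // the quotes of "key" in {"key":1}
  std::printf("%016llx\n", (unsigned long long)prefix_xor(quote_bits)); // bits 1..4 set
  return 0;
}
```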
static really_inline uint64_t finalize_structurals(
really_inline uint64_t finalize_structurals(
uint64_t structurals, uint64_t whitespace, uint64_t quote_mask,
uint64_t quote_bits, uint64_t &prev_iter_ends_pseudo_pred) {
// mask off anything inside quotes
@ -1716,7 +1715,7 @@ static really_inline uint64_t finalize_structurals(
}
// Find structural bits in a 64-byte chunk.
static really_inline void find_structural_bits_64(
really_inline void find_structural_bits_64(
const uint8_t *buf, size_t idx, uint32_t *base_ptr, uint32_t &base,
uint64_t &prev_iter_ends_odd_backslash, uint64_t &prev_iter_inside_quote,
uint64_t &prev_iter_ends_pseudo_pred, uint64_t &structurals,
@ -1748,7 +1747,7 @@ static really_inline void find_structural_bits_64(
quote_bits, prev_iter_ends_pseudo_pred);
}
static int find_structural_bits(const uint8_t *buf, size_t len, simdjson::ParsedJson &pj) {
int find_structural_bits(const uint8_t *buf, size_t len, simdjson::ParsedJson &pj) {
if (len > pj.byte_capacity) {
std::cerr << "Your ParsedJson object only supports documents up to "
<< pj.byte_capacity << " bytes but you are trying to process "
@ -1877,77 +1876,60 @@ static really_inline void find_whitespace_and_structurals(simd_input<ARCHITECTUR
uint64_t &whitespace, uint64_t &structurals) {
#ifdef SIMDJSON_NAIVE_STRUCTURAL
// You should never need this naive approach, but it can be useful
// for research purposes
const __m256i mask_open_brace = _mm256_set1_epi8(0x7b);
__m256i struct_lo = _mm256_cmpeq_epi8(in.lo, mask_open_brace);
__m256i struct_hi = _mm256_cmpeq_epi8(in.hi, mask_open_brace);
const __m256i mask_close_brace = _mm256_set1_epi8(0x7d);
struct_lo = _mm256_or_si256(struct_lo, _mm256_cmpeq_epi8(in.lo, mask_close_brace));
struct_hi = _mm256_or_si256(struct_hi, _mm256_cmpeq_epi8(in.hi, mask_close_brace));
const __m256i mask_open_bracket = _mm256_set1_epi8(0x5b);
struct_lo = _mm256_or_si256(struct_lo, _mm256_cmpeq_epi8(in.lo, mask_open_bracket));
struct_hi = _mm256_or_si256(struct_hi, _mm256_cmpeq_epi8(in.hi, mask_open_bracket));
const __m256i mask_close_bracket = _mm256_set1_epi8(0x5d);
struct_lo = _mm256_or_si256(struct_lo, _mm256_cmpeq_epi8(in.lo, mask_close_bracket));
struct_hi = _mm256_or_si256(struct_hi, _mm256_cmpeq_epi8(in.hi, mask_close_bracket));
const __m256i mask_column = _mm256_set1_epi8(0x3a);
struct_lo = _mm256_or_si256(struct_lo, _mm256_cmpeq_epi8(in.lo, mask_column));
struct_hi = _mm256_or_si256(struct_hi, _mm256_cmpeq_epi8(in.hi, mask_column));
const __m256i mask_comma = _mm256_set1_epi8(0x2c);
struct_lo = _mm256_or_si256(struct_lo, _mm256_cmpeq_epi8(in.lo, mask_comma));
struct_hi = _mm256_or_si256(struct_hi, _mm256_cmpeq_epi8(in.hi, mask_comma));
uint64_t structural_res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(struct_lo));
uint64_t structural_res_1 = _mm256_movemask_epi8(struct_hi);
structurals = (structural_res_0 | (structural_res_1 << 32));
const __m256i mask_space = _mm256_set1_epi8(0x20);
__m256i space_lo = _mm256_cmpeq_epi8(in.lo, mask_space);
__m256i space_hi = _mm256_cmpeq_epi8(in.hi, mask_space);
const __m256i mask_linefeed = _mm256_set1_epi8(0x0a);
space_lo = _mm256_or_si256(space_lo, _mm256_cmpeq_epi8(in.lo, mask_linefeed));
space_hi = _mm256_or_si256(space_hi, _mm256_cmpeq_epi8(in.hi, mask_linefeed));
const __m256i mask_tab = _mm256_set1_epi8(0x09);
space_lo = _mm256_or_si256(space_lo, _mm256_cmpeq_epi8(in.lo, mask_tab));
space_hi = _mm256_or_si256(space_hi, _mm256_cmpeq_epi8(in.hi, mask_tab));
const __m256i mask_carriage = _mm256_set1_epi8(0x0d);
space_lo = _mm256_or_si256(space_lo, _mm256_cmpeq_epi8(in.lo, mask_carriage));
space_hi = _mm256_or_si256(space_hi, _mm256_cmpeq_epi8(in.hi, mask_carriage));
// You should never need this naive approach, but it can be useful
// for research purposes
const __m256i mask_open_brace = _mm256_set1_epi8(0x7b);
const __m256i mask_close_brace = _mm256_set1_epi8(0x7d);
const __m256i mask_open_bracket = _mm256_set1_epi8(0x5b);
const __m256i mask_close_bracket = _mm256_set1_epi8(0x5d);
const __m256i mask_column = _mm256_set1_epi8(0x3a);
const __m256i mask_comma = _mm256_set1_epi8(0x2c);
structurals = in.build_bitmask([&](auto chunk) {
__m256i structurals = _mm256_cmpeq_epi8(chunk, mask_open_brace);
structurals = _mm256_or_si256(structurals, _mm256_cmpeq_epi8(chunk, mask_close_brace));
structurals = _mm256_or_si256(structurals, _mm256_cmpeq_epi8(chunk, mask_open_bracket));
structurals = _mm256_or_si256(structurals, _mm256_cmpeq_epi8(chunk, mask_close_bracket));
structurals = _mm256_or_si256(structurals, _mm256_cmpeq_epi8(chunk, mask_column));
structurals = _mm256_or_si256(structurals, _mm256_cmpeq_epi8(chunk, mask_comma));
return structurals;
});
uint64_t ws_res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(space_lo));
uint64_t ws_res_1 = _mm256_movemask_epi8(space_hi);
whitespace = (ws_res_0 | (ws_res_1 << 32));
// end of naive approach
const __m256i mask_space = _mm256_set1_epi8(0x20);
const __m256i mask_linefeed = _mm256_set1_epi8(0x0a);
const __m256i mask_tab = _mm256_set1_epi8(0x09);
const __m256i mask_carriage = _mm256_set1_epi8(0x0d);
whitespace = in.build_bitmask([&](auto chunk) {
__m256i space = _mm256_cmpeq_epi8(chunk, mask_space);
space = _mm256_or_si256(space, _mm256_cmpeq_epi8(chunk, mask_linefeed));
space = _mm256_or_si256(space, _mm256_cmpeq_epi8(chunk, mask_tab));
space = _mm256_or_si256(space, _mm256_cmpeq_epi8(chunk, mask_carriage));
return space;
});
// end of naive approach
#else // SIMDJSON_NAIVE_STRUCTURAL
// clang-format off
const __m256i structural_table =
_mm256_setr_epi8(44, 125, 0, 0, 0xc0u, 0, 0, 0, 0, 0, 0, 0, 0, 0, 58, 123,
44, 125, 0, 0, 0xc0u, 0, 0, 0, 0, 0, 0, 0, 0, 0, 58, 123);
const __m256i white_table = _mm256_setr_epi8(
32, 100, 100, 100, 17, 100, 113, 2, 100, 9, 10, 112, 100, 13, 100, 100,
32, 100, 100, 100, 17, 100, 113, 2, 100, 9, 10, 112, 100, 13, 100, 100);
// clang-format on
const __m256i struct_offset = _mm256_set1_epi8(0xd4u);
const __m256i struct_mask = _mm256_set1_epi8(32);
__m256i lo_white = _mm256_cmpeq_epi8(in.lo, _mm256_shuffle_epi8(white_table, in.lo));
__m256i hi_white = _mm256_cmpeq_epi8(in.hi, _mm256_shuffle_epi8(white_table, in.hi));
uint64_t ws_res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(lo_white));
uint64_t ws_res_1 = _mm256_movemask_epi8(hi_white);
whitespace = (ws_res_0 | (ws_res_1 << 32));
__m256i lo_struct_r1 = _mm256_add_epi8(struct_offset, in.lo);
__m256i hi_struct_r1 = _mm256_add_epi8(struct_offset, in.hi);
__m256i lo_struct_r2 = _mm256_or_si256(in.lo, struct_mask);
__m256i hi_struct_r2 = _mm256_or_si256(in.hi, struct_mask);
__m256i lo_struct_r3 = _mm256_shuffle_epi8(structural_table, lo_struct_r1);
__m256i hi_struct_r3 = _mm256_shuffle_epi8(structural_table, hi_struct_r1);
__m256i lo_struct = _mm256_cmpeq_epi8(lo_struct_r2, lo_struct_r3);
__m256i hi_struct = _mm256_cmpeq_epi8(hi_struct_r2, hi_struct_r3);
// clang-format off
const __m256i structural_table =
_mm256_setr_epi8(44, 125, 0, 0, 0xc0u, 0, 0, 0, 0, 0, 0, 0, 0, 0, 58, 123,
44, 125, 0, 0, 0xc0u, 0, 0, 0, 0, 0, 0, 0, 0, 0, 58, 123);
const __m256i white_table = _mm256_setr_epi8(
32, 100, 100, 100, 17, 100, 113, 2, 100, 9, 10, 112, 100, 13, 100, 100,
32, 100, 100, 100, 17, 100, 113, 2, 100, 9, 10, 112, 100, 13, 100, 100);
// clang-format on
const __m256i struct_offset = _mm256_set1_epi8(0xd4u);
const __m256i struct_mask = _mm256_set1_epi8(32);
whitespace = in.build_bitmask([&](auto chunk) {
return _mm256_cmpeq_epi8(chunk, _mm256_shuffle_epi8(white_table, chunk));
});
structurals = in.build_bitmask([&](auto chunk) {
__m256i struct_r1 = _mm256_add_epi8(struct_offset, chunk);
__m256i struct_r2 = _mm256_or_si256(chunk, struct_mask);
__m256i struct_r3 = _mm256_shuffle_epi8(structural_table, struct_r1);
return _mm256_cmpeq_epi8(struct_r2, struct_r3);
});
uint64_t structural_res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(lo_struct));
uint64_t structural_res_1 = _mm256_movemask_epi8(hi_struct);
structurals = (structural_res_0 | (structural_res_1 << 32));
#endif // else SIMDJSON_NAIVE_STRUCTURAL
}
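The non-naive path classifies bytes with a single shuffle-based table lookup per chunk. A hedged scalar model (not simdjson code) of why `_mm256_cmpeq_epi8(chunk, _mm256_shuffle_epi8(white_table, chunk))` flags exactly the four JSON whitespace bytes: pshufb returns `white_table[b & 0x0f]` for bytes whose high bit is clear (and 0 otherwise), and the table is built so that only ' ', '\t', '\n', '\r' equal their own lookup.

```cpp
#include <cstdint>
#include <cstdio>

// One 128-bit lane of the white_table constant above.
static const uint8_t white_table[16] = {
    32, 100, 100, 100, 17, 100, 113, 2, 100, 9, 10, 112, 100, 13, 100, 100};

static bool is_json_whitespace(uint8_t b) {
  uint8_t looked_up = (b & 0x80) ? 0 : white_table[b & 0x0f]; // pshufb semantics
  return looked_up == b; // true only for ' ', '\t', '\n', '\r'
}

int main() {
  for (int b = 0; b < 256; b++) {
    bool expected = (b == ' ' || b == '\t' || b == '\n' || b == '\r');
    if (is_json_whitespace(uint8_t(b)) != expected) {
      std::printf("mismatch at 0x%02x\n", b);
      return 1;
    }
  }
  std::puts("table flags exactly the four JSON whitespace bytes");
  return 0;
}
```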
@ -2019,7 +2001,7 @@ static really_inline void flatten_bits(uint32_t *base_ptr, uint32_t &base, uint3
// This file contains the common code every implementation uses in stage1
// It is intended to be included multiple times and compiled multiple times
// We assume the file in which it is include already includes
// We assume the file in which it is included already includes
// "simdjson/stage1_find_marks.h" (this simplifies amalgation)
// return a bitvector indicating where we have characters that end an odd-length
@ -2031,7 +2013,7 @@ static really_inline void flatten_bits(uint32_t *base_ptr, uint32_t &base, uint3
// indicate whether we end an iteration on an odd-length sequence of
// backslashes, which modifies our subsequent search for odd-length
// sequences of backslashes in an obvious way.
static really_inline uint64_t find_odd_backslash_sequences(
really_inline uint64_t find_odd_backslash_sequences(
simd_input<ARCHITECTURE> in,
uint64_t &prev_iter_ends_odd_backslash) {
const uint64_t even_bits = 0x5555555555555555ULL;
@ -2078,7 +2060,7 @@ static really_inline uint64_t find_odd_backslash_sequences(
// Note that we don't do any error checking to see if we have backslash
// sequences outside quotes; these
// backslash sequences (of any length) will be detected elsewhere.
static really_inline uint64_t find_quote_mask_and_bits(
really_inline uint64_t find_quote_mask_and_bits(
simd_input<ARCHITECTURE> in, uint64_t odd_ends,
uint64_t &prev_iter_inside_quote, uint64_t &quote_bits,
uint64_t &error_mask) {
@ -2101,7 +2083,7 @@ static really_inline uint64_t find_quote_mask_and_bits(
return quote_mask;
}
static really_inline uint64_t finalize_structurals(
really_inline uint64_t finalize_structurals(
uint64_t structurals, uint64_t whitespace, uint64_t quote_mask,
uint64_t quote_bits, uint64_t &prev_iter_ends_pseudo_pred) {
// mask off anything inside quotes
@ -2135,7 +2117,7 @@ static really_inline uint64_t finalize_structurals(
}
// Find structural bits in a 64-byte chunk.
static really_inline void find_structural_bits_64(
really_inline void find_structural_bits_64(
const uint8_t *buf, size_t idx, uint32_t *base_ptr, uint32_t &base,
uint64_t &prev_iter_ends_odd_backslash, uint64_t &prev_iter_inside_quote,
uint64_t &prev_iter_ends_pseudo_pred, uint64_t &structurals,
@ -2167,7 +2149,7 @@ static really_inline void find_structural_bits_64(
quote_bits, prev_iter_ends_pseudo_pred);
}
static int find_structural_bits(const uint8_t *buf, size_t len, simdjson::ParsedJson &pj) {
int find_structural_bits(const uint8_t *buf, size_t len, simdjson::ParsedJson &pj) {
if (len > pj.byte_capacity) {
std::cerr << "Your ParsedJson object only supports documents up to "
<< pj.byte_capacity << " bytes but you are trying to process "
@ -2302,45 +2284,16 @@ static really_inline void find_whitespace_and_structurals(simd_input<ARCHITECTUR
const __m128i struct_offset = _mm_set1_epi8(0xd4u);
const __m128i struct_mask = _mm_set1_epi8(32);
__m128i white0 = _mm_cmpeq_epi8(in.v0, _mm_shuffle_epi8(white_table, in.v0));
__m128i white1 = _mm_cmpeq_epi8(in.v1, _mm_shuffle_epi8(white_table, in.v1));
__m128i white2 = _mm_cmpeq_epi8(in.v2, _mm_shuffle_epi8(white_table, in.v2));
__m128i white3 = _mm_cmpeq_epi8(in.v3, _mm_shuffle_epi8(white_table, in.v3));
uint64_t ws_res_0 = _mm_movemask_epi8(white0);
uint64_t ws_res_1 = _mm_movemask_epi8(white1);
uint64_t ws_res_2 = _mm_movemask_epi8(white2);
uint64_t ws_res_3 = _mm_movemask_epi8(white3);
whitespace = in.build_bitmask([&](auto chunk) {
return _mm_cmpeq_epi8(chunk, _mm_shuffle_epi8(white_table, chunk));
});
whitespace =
(ws_res_0 | (ws_res_1 << 16) | (ws_res_2 << 32) | (ws_res_3 << 48));
__m128i struct1_r1 = _mm_add_epi8(struct_offset, in.v0);
__m128i struct2_r1 = _mm_add_epi8(struct_offset, in.v1);
__m128i struct3_r1 = _mm_add_epi8(struct_offset, in.v2);
__m128i struct4_r1 = _mm_add_epi8(struct_offset, in.v3);
__m128i struct1_r2 = _mm_or_si128(in.v0, struct_mask);
__m128i struct2_r2 = _mm_or_si128(in.v1, struct_mask);
__m128i struct3_r2 = _mm_or_si128(in.v2, struct_mask);
__m128i struct4_r2 = _mm_or_si128(in.v3, struct_mask);
__m128i struct1_r3 = _mm_shuffle_epi8(structural_table, struct1_r1);
__m128i struct2_r3 = _mm_shuffle_epi8(structural_table, struct2_r1);
__m128i struct3_r3 = _mm_shuffle_epi8(structural_table, struct3_r1);
__m128i struct4_r3 = _mm_shuffle_epi8(structural_table, struct4_r1);
__m128i struct1 = _mm_cmpeq_epi8(struct1_r2, struct1_r3);
__m128i struct2 = _mm_cmpeq_epi8(struct2_r2, struct2_r3);
__m128i struct3 = _mm_cmpeq_epi8(struct3_r2, struct3_r3);
__m128i struct4 = _mm_cmpeq_epi8(struct4_r2, struct4_r3);
uint64_t structural_res_0 = _mm_movemask_epi8(struct1);
uint64_t structural_res_1 = _mm_movemask_epi8(struct2);
uint64_t structural_res_2 = _mm_movemask_epi8(struct3);
uint64_t structural_res_3 = _mm_movemask_epi8(struct4);
structurals = (structural_res_0 | (structural_res_1 << 16) |
(structural_res_2 << 32) | (structural_res_3 << 48));
structurals = in.build_bitmask([&](auto chunk) {
__m128i struct_r1 = _mm_add_epi8(struct_offset, chunk);
__m128i struct_r2 = _mm_or_si128(chunk, struct_mask);
__m128i struct_r3 = _mm_shuffle_epi8(structural_table, struct_r1);
return _mm_cmpeq_epi8(struct_r2, struct_r3);
});
}
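The structural classifier refactored above uses the same shuffle-lookup idea with two tweaks: the byte is offset by `struct_offset` (0xd4) before the table lookup and OR-ed with `struct_mask` (0x20) before the comparison, so that the pairs {/[ and }/] can share table entries. A scalar model (illustrative only; the 16-entry table is copied from the Haswell hunk earlier in this diff) that verifies it flags exactly the six structural characters:

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>

// One 128-bit lane of the structural_table constant.
static const uint8_t structural_table[16] = {
    44, 125, 0, 0, 0xc0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 58, 123};

static bool is_structural(uint8_t b) {
  uint8_t idx = uint8_t(b + 0xd4);                                      // struct_offset
  uint8_t looked_up = (idx & 0x80) ? 0 : structural_table[idx & 0x0f];  // pshufb semantics
  return uint8_t(b | 0x20) == looked_up;                                // struct_mask comparison
}

int main() {
  const char *structurals = "{}[]:,";
  for (int b = 1; b < 256; b++) {
    bool expected = (std::strchr(structurals, char(b)) != nullptr);
    if (is_structural(uint8_t(b)) != expected) {
      std::printf("mismatch at 0x%02x\n", b);
      return 1;
    }
  }
  std::puts("table flags exactly { } [ ] : ,");
  return 0;
}
```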
// This file contains a non-architecture-specific version of "flatten" used in stage1.
@ -2434,7 +2387,7 @@ static really_inline void flatten_bits(uint32_t *base_ptr, uint32_t &base, uint3
#endif // SIMDJSON_NAIVE_FLATTEN
// This file contains the common code every implementation uses in stage1
// It is intended to be included multiple times and compiled multiple times
// We assume the file in which it is include already includes
// We assume the file in which it is included already includes
// "simdjson/stage1_find_marks.h" (this simplifies amalgation)
// return a bitvector indicating where we have characters that end an odd-length
@ -2446,7 +2399,7 @@ static really_inline void flatten_bits(uint32_t *base_ptr, uint32_t &base, uint3
// indicate whether we end an iteration on an odd-length sequence of
// backslashes, which modifies our subsequent search for odd-length
// sequences of backslashes in an obvious way.
static really_inline uint64_t find_odd_backslash_sequences(
really_inline uint64_t find_odd_backslash_sequences(
simd_input<ARCHITECTURE> in,
uint64_t &prev_iter_ends_odd_backslash) {
const uint64_t even_bits = 0x5555555555555555ULL;
@ -2493,7 +2446,7 @@ static really_inline uint64_t find_odd_backslash_sequences(
// Note that we don't do any error checking to see if we have backslash
// sequences outside quotes; these
// backslash sequences (of any length) will be detected elsewhere.
static really_inline uint64_t find_quote_mask_and_bits(
really_inline uint64_t find_quote_mask_and_bits(
simd_input<ARCHITECTURE> in, uint64_t odd_ends,
uint64_t &prev_iter_inside_quote, uint64_t &quote_bits,
uint64_t &error_mask) {
@ -2516,7 +2469,7 @@ static really_inline uint64_t find_quote_mask_and_bits(
return quote_mask;
}
static really_inline uint64_t finalize_structurals(
really_inline uint64_t finalize_structurals(
uint64_t structurals, uint64_t whitespace, uint64_t quote_mask,
uint64_t quote_bits, uint64_t &prev_iter_ends_pseudo_pred) {
// mask off anything inside quotes
@ -2550,7 +2503,7 @@ static really_inline uint64_t finalize_structurals(
}
// Find structural bits in a 64-byte chunk.
static really_inline void find_structural_bits_64(
really_inline void find_structural_bits_64(
const uint8_t *buf, size_t idx, uint32_t *base_ptr, uint32_t &base,
uint64_t &prev_iter_ends_odd_backslash, uint64_t &prev_iter_inside_quote,
uint64_t &prev_iter_ends_pseudo_pred, uint64_t &structurals,
@ -2582,7 +2535,7 @@ static really_inline void find_structural_bits_64(
quote_bits, prev_iter_ends_pseudo_pred);
}
static int find_structural_bits(const uint8_t *buf, size_t len, simdjson::ParsedJson &pj) {
int find_structural_bits(const uint8_t *buf, size_t len, simdjson::ParsedJson &pj) {
if (len > pj.byte_capacity) {
std::cerr << "Your ParsedJson object only supports documents up to "
<< pj.byte_capacity << " bytes but you are trying to process "
@ -2803,7 +2756,6 @@ struct parse_string_helper {
#ifdef IS_ARM64
#include "amd64/architecture.h"
namespace simdjson::arm64 {

View File

@ -1,4 +1,4 @@
/* auto-generated on Sun Aug 18 15:06:50 DST 2019. Do not edit! */
/* auto-generated on Fri Aug 23 09:54:21 DST 2019. Do not edit! */
/* begin file include/simdjson/simdjson_version.h */
// /include/simdjson/simdjson_version.h automatically generated by release.py,
// do not change by hand
@ -36438,13 +36438,17 @@ public:
// (in case of repeated keys, this only finds the first one).
// We seek the key using C's strcmp so if your JSON strings contain
// NULL chars, this would trigger a false positive: if you expect that
// to be the case, take extra precautions.
// to be the case, take extra precautions.
// Furthermore, we do the comparison character-by-character
// without taking into account Unicode equivalence.
inline bool move_to_key(const char *key);
// when at {, go one level deep, looking for a given key
// if successful, we are left pointing at the value,
// if not, we are still pointing at the object ({)
// (in case of repeated keys, this only finds the first one).
// The string we search for can contain NULL values.
// Furthermore, we do the comparison character-by-character
// without taking into account Unicode equivalence.
inline bool move_to_key(const char *key, uint32_t length);
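The two-argument overload exists precisely because of the strcmp caveat above. A plain-C++ illustration (not the simdjson API) of the difference between NUL-terminated and length-based key comparison:

```cpp
#include <cstdio>
#include <cstring>

int main() {
  const char stored_key[] = {'a', '\0', 'b'}; // 3-byte key containing an embedded NUL
  const char searched[]   = {'a', '\0', 'c'}; // differs after the NUL
  // strcmp stops at the first NUL, so it reports a false positive here:
  std::printf("strcmp says equal: %d\n", std::strcmp(stored_key, searched) == 0);
  // a length-based comparison (what the new overload enables) is correct:
  std::printf("memcmp says equal: %d\n", std::memcmp(stored_key, searched, 3) == 0);
  return 0;
}
```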
// when at a key location within an object, this moves to the accompanying