Merge pull request #201 from lemire/Multiple_implementation_refactoring_stage2

Stage2 refactored to simplify multiple implementations
2019-07-03 17:32:44 -04:00 · 2019-07-03 17:32:44 -04:00 · 477b058f74
parent 43143f6434 0df6d83f08
commit 477b058f74
9 changed files with 669 additions and 647 deletions
--- a/benchmark/parse.cpp
+++ b/benchmark/parse.cpp
@ -157,7 +157,8 @@ int main(int argc, char *argv[]) {
      break;
    }
    unified.start();
-    isok = isok && (simdjson::SUCCESS == unified_machine(p.data(), p.size(), pj));
+    // The default template argument is simdjson::instruction_set::native.
+    isok = isok && (simdjson::SUCCESS == simdjson::unified_machine<>(p.data(), p.size(), pj));
    unified.end(results);
    cy2 += results[0];
    cl2 += results[1];
@ -188,7 +189,7 @@ int main(int argc, char *argv[]) {
    auto start = std::chrono::steady_clock::now();
    // The default template is simdjson::instruction_set::native.
    isok = (simdjson::find_structural_bits<>(p.data(), p.size(), pj) == simdjson::SUCCESS);
-    isok = isok && (simdjson::SUCCESS == unified_machine(p.data(), p.size(), pj));
+    isok = isok && (simdjson::SUCCESS == simdjson::unified_machine<>(p.data(), p.size(), pj));
    auto end = std::chrono::steady_clock::now();
    std::chrono::duration<double> secs = end - start;
    res[i] = secs.count();
--- a/include/simdjson/jsonparser.h
+++ b/include/simdjson/jsonparser.h
@ -26,7 +26,6 @@ using json_parse_functype = int (const uint8_t *buf, size_t len, ParsedJson &pj,
 // Pointer that holds the json_parse implementation corresponding to the available SIMD instruction set
 extern json_parse_functype *json_parse_ptr;

-
 // json_parse_implementation is the generic function, it is specialized for various 
 // SIMD instruction sets, e.g., as json_parse_implementation<instruction_set::avx2>
 // or json_parse_implementation<instruction_set::neon> 
@ -69,7 +68,7 @@ int json_parse_implementation(const uint8_t *buf, size_t len, ParsedJson &pj, bo
    pj.errorcode = stage1_is_ok;
    return pj.errorcode;
  } 
-  int res = unified_machine(buf, len, pj);
+  int res = unified_machine<T>(buf, len, pj);
  if(reallocated) { aligned_free((void*)buf);}
  return res;
 }
--- a/include/simdjson/numberparsing.h
+++ b/include/simdjson/numberparsing.h
@ -6,6 +6,12 @@
 #include "simdjson/parsedjson.h"
 #include "simdjson/portability.h"

+#ifdef JSON_TEST_NUMBERS // for unit testing
+void foundInvalidNumber(const uint8_t *buf);
+void foundInteger(int64_t result, const uint8_t *buf);
+void foundFloat(double result, const uint8_t *buf);
+#endif
+
 namespace simdjson {
 // Allowable floating-point values range from  std::numeric_limits<double>::lowest() 
 // to std::numeric_limits<double>::max(), so from 
@ -376,9 +382,6 @@ static never_inline bool parse_large_integer(const uint8_t *const buf,
  return is_structural_or_whitespace(*p);
 }

-
-
-
 // parse the number at buf + offset
 // define JSON_TEST_NUMBERS for unit testing
 //
--- a/include/simdjson/stage1_find_marks.h
+++ b/include/simdjson/stage1_find_marks.h
@ -33,11 +33,11 @@
 #define TRANSPOSE

 namespace simdjson {
-template<simdjson::instruction_set>
+template<instruction_set>
 struct simd_input;
 #ifdef __AVX2__
 template<>
-struct simd_input<simdjson::instruction_set::avx2>
+struct simd_input<instruction_set::avx2>
 {
  __m256i lo;
  __m256i hi;
@ -45,7 +45,7 @@ struct simd_input<simdjson::instruction_set::avx2>
 #endif

 #ifdef __ARM_NEON
-template<> struct simd_input<simdjson::instruction_set::neon>
+template<> struct simd_input<instruction_set::neon>
 {
 #ifndef TRANSPOSE
  uint8x16_t i0;
@ -111,7 +111,7 @@ uint64_t neonmovemask_bulk(uint8x16_t p0, uint8x16_t p1, uint8x16_t p2, uint8x16
 }
 #endif

-template<simdjson::instruction_set T>
+template<instruction_set T>
 uint64_t compute_quote_mask(uint64_t quote_bits);

 // In practice, if you have NEON or __PCLMUL__, you would
@ -121,7 +121,7 @@ uint64_t compute_quote_mask(uint64_t quote_bits);
 // Also: we don't know of an instance where AVX2 is supported but 
 // where clmul is not supported, so check for both, to be sure.
 #ifdef SIMDJSON_AVOID_CLMUL
-template<simdjson::instruction_set T> really_inline
+template<instruction_set T> really_inline
 uint64_t compute_quote_mask(uint64_t quote_bits)
 {
  uint64_t quote_mask = quote_bits ^ (quote_bits << 1);
@ -133,12 +133,12 @@ uint64_t compute_quote_mask(uint64_t quote_bits)
  return quote_mask;
 }
 #else
-template<simdjson::instruction_set>
+template<instruction_set>
 uint64_t compute_quote_mask(uint64_t quote_bits);

 #ifdef __AVX2__ 
 template<> really_inline
-uint64_t compute_quote_mask<simdjson::instruction_set::avx2>(uint64_t quote_bits) {
+uint64_t compute_quote_mask<instruction_set::avx2>(uint64_t quote_bits) {
  uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128(
      _mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFF), 0));
  return quote_mask;
@ -147,7 +147,7 @@ uint64_t compute_quote_mask<simdjson::instruction_set::avx2>(uint64_t quote_bits

 #ifdef __ARM_NEON
 template<> really_inline
-uint64_t compute_quote_mask<simdjson::instruction_set::neon>(uint64_t quote_bits) {
+uint64_t compute_quote_mask<instruction_set::neon>(uint64_t quote_bits) {
 #ifdef __PCLMUL__ // Might cause problems on runtime dispatch
  uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128(
                                          _mm_set_epi64x(0ULL, quote_bits),
@ -161,7 +161,7 @@ uint64_t compute_quote_mask<simdjson::instruction_set::neon>(uint64_t quote_bits
 #endif

 #ifdef SIMDJSON_UTF8VALIDATE
-template<simdjson::instruction_set T>really_inline
+template<instruction_set T>really_inline
 void check_utf8(simd_input<T> in,
                __m256i &has_error,
                struct avx_processed_utf_bytes &previous) {
@ -182,13 +182,13 @@ void check_utf8(simd_input<T> in,
 }
 #endif

-template<simdjson::instruction_set T>
+template<instruction_set T>
 simd_input<T> fill_input(const uint8_t * ptr);

 #ifdef __AVX2__
 template<> really_inline
-simd_input<simdjson::instruction_set::avx2> fill_input<simdjson::instruction_set::avx2>(const uint8_t * ptr) {
-  struct simd_input<simdjson::instruction_set::avx2> in;
+simd_input<instruction_set::avx2> fill_input<instruction_set::avx2>(const uint8_t * ptr) {
+  struct simd_input<instruction_set::avx2> in;
  in.lo = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr + 0));
  in.hi = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr + 32));
  return in;
@ -197,8 +197,8 @@ simd_input<simdjson::instruction_set::avx2> fill_input<simdjson::instruction_set

 #ifdef __ARM_NEON
 template<> really_inline
-simd_input<simdjson::instruction_set::neon> fill_input<simdjson::instruction_set::neon>(const uint8_t * ptr) {
-  struct simd_input<simdjson::instruction_set::neon> in;
+simd_input<instruction_set::neon> fill_input<instruction_set::neon>(const uint8_t * ptr) {
+  struct simd_input<instruction_set::neon> in;
 #ifndef TRANSPOSE
  in.i0 = vld1q_u8(ptr + 0);
  in.i1 = vld1q_u8(ptr + 16);
@ -213,12 +213,12 @@ simd_input<simdjson::instruction_set::neon> fill_input<simdjson::instruction_set

 // a straightforward comparison of a mask against input. 5 uops; would be
 // cheaper in AVX512.
-template<simdjson::instruction_set T>
+template<instruction_set T>
 uint64_t cmp_mask_against_input(simd_input<T> in, uint8_t m);

 #ifdef __AVX2__
 template<> really_inline
-uint64_t cmp_mask_against_input<simdjson::instruction_set::avx2>(simd_input<simdjson::instruction_set::avx2> in, uint8_t m) {
+uint64_t cmp_mask_against_input<instruction_set::avx2>(simd_input<instruction_set::avx2> in, uint8_t m) {

  const __m256i mask = _mm256_set1_epi8(m);
  __m256i cmp_res_0 = _mm256_cmpeq_epi8(in.lo, mask);
@ -231,7 +231,7 @@ uint64_t cmp_mask_against_input<simdjson::instruction_set::avx2>(simd_input<simd

 #ifdef __ARM_NEON
 template<> really_inline
-uint64_t cmp_mask_against_input<simdjson::instruction_set::neon>(simd_input<simdjson::instruction_set::neon> in, uint8_t m) {
+uint64_t cmp_mask_against_input<instruction_set::neon>(simd_input<instruction_set::neon> in, uint8_t m) {
  const uint8x16_t mask = vmovq_n_u8(m); 
  uint8x16_t cmp_res_0 = vceqq_u8(in.i.val[0], mask); 
  uint8x16_t cmp_res_1 = vceqq_u8(in.i.val[1], mask); 
@ -242,12 +242,12 @@ uint64_t cmp_mask_against_input<simdjson::instruction_set::neon>(simd_input<simd
 #endif

 // find all values less than or equal than the content of maxval (using unsigned arithmetic) 
-template<simdjson::instruction_set T>
+template<instruction_set T>
 uint64_t unsigned_lteq_against_input(simd_input<T> in, uint8_t m);

 #ifdef __AVX2__
 template<> really_inline
-uint64_t unsigned_lteq_against_input<simdjson::instruction_set::avx2>(simd_input<simdjson::instruction_set::avx2> in, uint8_t m) {
+uint64_t unsigned_lteq_against_input<instruction_set::avx2>(simd_input<instruction_set::avx2> in, uint8_t m) {
  const __m256i maxval = _mm256_set1_epi8(m);
  __m256i cmp_res_0 = _mm256_cmpeq_epi8(_mm256_max_epu8(maxval,in.lo),maxval);
  uint64_t res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(cmp_res_0));
@ -259,7 +259,7 @@ uint64_t unsigned_lteq_against_input<simdjson::instruction_set::avx2>(simd_input

 #ifdef __ARM_NEON
 template<> really_inline
-uint64_t unsigned_lteq_against_input<simdjson::instruction_set::neon>(simd_input<simdjson::instruction_set::neon> in, uint8_t m) {
+uint64_t unsigned_lteq_against_input<instruction_set::neon>(simd_input<instruction_set::neon> in, uint8_t m) {
  const uint8x16_t mask = vmovq_n_u8(m); 
  uint8x16_t cmp_res_0 = vcleq_u8(in.i.val[0], mask); 
  uint8x16_t cmp_res_1 = vcleq_u8(in.i.val[1], mask); 
@ -278,7 +278,7 @@ uint64_t unsigned_lteq_against_input<simdjson::instruction_set::neon>(simd_input
 // indicate whether we end an iteration on an odd-length sequence of
 // backslashes, which modifies our subsequent search for odd-length
 // sequences of backslashes in an obvious way.
-template<simdjson::instruction_set T> really_inline
+template<instruction_set T> really_inline
 uint64_t find_odd_backslash_sequences(simd_input<T> in, uint64_t &prev_iter_ends_odd_backslash) {
  const uint64_t even_bits = 0x5555555555555555ULL;
  const uint64_t odd_bits = ~even_bits;
@ -323,7 +323,7 @@ uint64_t find_odd_backslash_sequences(simd_input<T> in, uint64_t &prev_iter_ends
 // Note that we don't do any error checking to see if we have backslash
 // sequences outside quotes; these
 // backslash sequences (of any length) will be detected elsewhere.
-template<simdjson::instruction_set T> really_inline
+template<instruction_set T> really_inline
 uint64_t find_quote_mask_and_bits(simd_input<T> in, uint64_t odd_ends,
    uint64_t &prev_iter_inside_quote, uint64_t &quote_bits, uint64_t &error_mask) {
  quote_bits = cmp_mask_against_input<T>(in, '"');
@ -352,14 +352,14 @@ uint64_t find_quote_mask_and_bits(simd_input<T> in, uint64_t odd_ends,
 // we are also interested in the four whitespace characters
 // space 0x20, linefeed 0x0a, horizontal tab 0x09 and carriage return 0x0d
 // these go into the next 2 buckets of the comparison (8/16)
-template<simdjson::instruction_set T>
+template<instruction_set T>
 void find_whitespace_and_structurals(simd_input<T> in,
                                     uint64_t &whitespace,
                                     uint64_t &structurals);

 #ifdef __AVX2__
 template<> really_inline
-void find_whitespace_and_structurals<simdjson::instruction_set::avx2>(simd_input<simdjson::instruction_set::avx2> in,
+void find_whitespace_and_structurals<instruction_set::avx2>(simd_input<instruction_set::avx2> in,
                                                     uint64_t &whitespace,
                                                     uint64_t &structurals) {
 #ifdef SIMDJSON_NAIVE_STRUCTURAL
@ -451,8 +451,8 @@ void find_whitespace_and_structurals<simdjson::instruction_set::avx2>(simd_input

 #ifdef __ARM_NEON
 template<> really_inline
-void find_whitespace_and_structurals<simdjson::instruction_set::neon>(
-                                                  simd_input<simdjson::instruction_set::neon> in,
+void find_whitespace_and_structurals<instruction_set::neon>(
+                                                  simd_input<instruction_set::neon> in,
                                                  uint64_t &whitespace,
                                                  uint64_t &structurals) {
 #ifndef FUNKY_BAD_TABLE
@ -698,7 +698,7 @@ really_inline uint64_t finalize_structurals(
  return structurals;
 }

-template<simdjson::instruction_set T = simdjson::instruction_set::native>
+template<instruction_set T = instruction_set::native>
 WARN_UNUSED
 /*never_inline*/ int find_structural_bits(const uint8_t *buf, size_t len,
                                           ParsedJson &pj) {
@ -849,7 +849,7 @@ WARN_UNUSED
 #endif
 }

-template<simdjson::instruction_set T = simdjson::instruction_set::native>
+template<instruction_set T = instruction_set::native>
 WARN_UNUSED
 int find_structural_bits(const char *buf, size_t len, ParsedJson &pj) {
  return find_structural_bits<T>(reinterpret_cast<const uint8_t *>(buf), len, pj);
--- a/include/simdjson/stage2_build_tape.h
+++ b/include/simdjson/stage2_build_tape.h
@ -1,18 +1,588 @@
 #ifndef SIMDJSON_STAGE2_BUILD_TAPE_H
 #define SIMDJSON_STAGE2_BUILD_TAPE_H

+#include <cassert>
+#include <cstring>
+#include <iostream>
+
 #include "simdjson/common_defs.h"
+#include "simdjson/jsoncharutils.h"
+#include "simdjson/numberparsing.h"
+#include "simdjson/parsedjson.h"
+#include "simdjson/stringparsing.h"
+#include "simdjson/simdjson.h"
+
+#define PATH_SEP '/'

 namespace simdjson {
-struct ParsedJson;
-
 void init_state_machine();

 WARN_UNUSED
-int unified_machine(const uint8_t *buf, size_t len, ParsedJson &pj);
+really_inline bool is_valid_true_atom(const uint8_t *loc) {
+  uint64_t tv = *reinterpret_cast<const uint64_t *>("true    ");
+  uint64_t mask4 = 0x00000000ffffffff;
+  uint32_t error = 0;
+  uint64_t locval; // we want to avoid unaligned 64-bit loads (undefined in C/C++)
+  // this can read up to 7 bytes beyond the buffer size, but we require
+  // SIMDJSON_PADDING of padding
+  static_assert(sizeof(uint64_t) - 1 <= SIMDJSON_PADDING);
+  std::memcpy(&locval, loc, sizeof(uint64_t));
+  error = (locval & mask4) ^ tv;
+  error |= is_not_structural_or_whitespace(loc[4]);
+  return error == 0;
+}

 WARN_UNUSED
-int unified_machine(const char *buf, size_t len, ParsedJson &pj);
+really_inline bool is_valid_false_atom(const uint8_t *loc) {
+  // We have to use an integer constant: with the cast below, the trailing
+  // spaces would end up in the comparison and valid input would be rejected.
+  // uint64_t fv = *reinterpret_cast<const uint64_t *>("false   ");
+  // Using this constant (the same "false" bytes, but with the unused
+  // upper bytes zeroed out) solves that.
+  uint64_t fv = 0x00000065736c6166; // takes into account endianness
+  uint64_t mask5 = 0x000000ffffffffff;
+  // we can't use a 32-bit value for checking for errors, otherwise
+  // the last character of false (it being 5 bytes long!) would be
+  // ignored
+  uint64_t error = 0;
+  uint64_t locval; // we want to avoid unaligned 64-bit loads (undefined in C/C++)
+  // this can read up to 7 bytes beyond the buffer size, but we require 
+  // SIMDJSON_PADDING of padding
+  static_assert(sizeof(uint64_t) - 1 <= SIMDJSON_PADDING);
+  std::memcpy(&locval, loc, sizeof(uint64_t));
+  error = (locval & mask5) ^ fv;
+  error |= is_not_structural_or_whitespace(loc[5]);
+  return error == 0;
+}
+
+WARN_UNUSED
+really_inline bool is_valid_null_atom(const uint8_t *loc) {
+  uint64_t nv = *reinterpret_cast<const uint64_t *>("null    ");
+  uint64_t mask4 = 0x00000000ffffffff;
+  uint32_t error = 0;
+  uint64_t locval; // we want to avoid unaligned 64-bit loads (undefined in C/C++)
+  // this can read up to 7 bytes beyond the buffer size, but we require 
+  // SIMDJSON_PADDING of padding
+  static_assert(sizeof(uint64_t) - 1 <= SIMDJSON_PADDING);
+  std::memcpy(&locval, loc, sizeof(uint64_t));
+  error = (locval & mask4) ^ nv;
+  error |= is_not_structural_or_whitespace(loc[4]);
+  return error == 0;
+}
+
+
+/************
+ * The JSON is parsed to a tape, see the accompanying tape.md file
+ * for documentation.
+ ***********/
+template<instruction_set T = instruction_set::native>
+WARN_UNUSED  ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER LENIENT_MEM_SANITIZER
+int unified_machine(const uint8_t *buf, size_t len, ParsedJson &pj) {
+#ifndef ALLOW_SAME_PAGE_BUFFER_OVERRUN
+  memset((uint8_t*)buf + len, 0, SIMDJSON_PADDING); // to please valgrind
+#endif
+  uint32_t i = 0; // index of the structural character (0,1,2,3...)
+  uint32_t idx;   // location of the structural character in the input (buf)
+  uint8_t c; // used to track the (structural) character we are looking at, updated
+        // by UPDATE_CHAR macro
+  uint32_t depth = 0; // could have an arbitrary starting depth
+  pj.init(); // sets isvalid to false
+  if(pj.bytecapacity < len) {
+      pj.errorcode = simdjson::CAPACITY;
+      return pj.errorcode;
+  }
+// this macro reads the next structural character, updating idx, i and c.
+#define UPDATE_CHAR()                                                          \
+  {                                                                            \
+    idx = pj.structural_indexes[i++];                                          \
+    c = buf[idx];                                                              \
+  }
+
+
+  ////////////////////////////// START STATE /////////////////////////////
+#ifdef SIMDJSON_USE_COMPUTED_GOTO 
+  pj.ret_address[depth] = &&start_continue;
+#else
+  pj.ret_address[depth] = 's';
+#endif
+  pj.containing_scope_offset[depth] = pj.get_current_loc();
+  pj.write_tape(0, 'r'); // r for root, 0 is going to get overwritten
+  // the root is used, if nothing else, to capture the size of the tape
+  depth++; // everything starts at depth = 1, depth = 0 is just for the root, the root may contain an object, an array or something else.
+  if (depth >= pj.depthcapacity) {
+    goto fail;
+  }
+
+  UPDATE_CHAR();
+  switch (c) {
+  case '{':
+    pj.containing_scope_offset[depth] = pj.get_current_loc();
+#ifdef SIMDJSON_USE_COMPUTED_GOTO 
+    pj.ret_address[depth] = &&start_continue;
+#else
+    pj.ret_address[depth] = 's';
+#endif
+    depth++;
+    if (depth >= pj.depthcapacity) {
+      goto fail;
+    }
+    pj.write_tape(0, c); // strangely, moving this to object_begin slows things down
+    goto object_begin;
+  case '[':
+    pj.containing_scope_offset[depth] = pj.get_current_loc();
+#ifdef SIMDJSON_USE_COMPUTED_GOTO 
+    pj.ret_address[depth] = &&start_continue;
+#else
+    pj.ret_address[depth] = 's';
+#endif    
+    depth++;
+    if (depth >= pj.depthcapacity) {
+      goto fail;
+    }
+    pj.write_tape(0, c);
+    goto array_begin;
+#define SIMDJSON_ALLOWANYTHINGINROOT
+    // A JSON text is a serialized value.  Note that certain previous
+    // specifications of JSON constrained a JSON text to be an object or an
+    // array.  Implementations that generate only objects or arrays where a
+    // JSON text is called for will be interoperable in the sense that all
+    // implementations will accept these as conforming JSON texts.
+    // https://tools.ietf.org/html/rfc8259
+#ifdef SIMDJSON_ALLOWANYTHINGINROOT
+  case '"': {
+    if (!parse_string<T>(buf, len, pj, depth, idx)) {
+      goto fail;
+    }
+    break;
+  }
+  case 't': {
+    // we need to make a copy to make sure that the string is space terminated.
+    // this only applies to the JSON document made solely of the true value.
+    // this will almost never be called in practice
+    char * copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING));
+    if(copy == nullptr) { 
+      goto fail;
+    }
+    memcpy(copy, buf, len);
+    copy[len] = ' ';
+    if (!is_valid_true_atom(reinterpret_cast<const uint8_t *>(copy) + idx)) {
+      free(copy);
+      goto fail;
+    }
+    free(copy);
+    pj.write_tape(0, c);
+    break;
+  }
+  case 'f': {
+    // we need to make a copy to make sure that the string is space terminated.
+    // this only applies to the JSON document made solely of the false value.
+    // this will almost never be called in practice
+    char * copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING));
+    if(copy == nullptr) { 
+      goto fail;
+    }
+    memcpy(copy, buf, len);
+    copy[len] = ' ';
+    if (!is_valid_false_atom(reinterpret_cast<const uint8_t *>(copy) + idx)) {
+      free(copy);
+      goto fail;
+    }
+    free(copy);
+    pj.write_tape(0, c);
+    break;
+  }
+  case 'n': {
+    // we need to make a copy to make sure that the string is space terminated.
+    // this only applies to the JSON document made solely of the null value.
+    // this will almost never be called in practice
+    char * copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING));
+    if(copy == nullptr) { 
+      goto fail;
+    }
+    memcpy(copy, buf, len);
+    copy[len] = ' ';
+    if (!is_valid_null_atom(reinterpret_cast<const uint8_t *>(copy) + idx)) {
+      free(copy);
+      goto fail;
+    }
+    free(copy);
+    pj.write_tape(0, c);
+    break;
+  }
+  case '0': 
+  case '1':
+  case '2':
+  case '3':
+  case '4':
+  case '5':
+  case '6':
+  case '7':
+  case '8':
+  case '9': {
+    // we need to make a copy to make sure that the string is space terminated.
+    // this is done only for JSON documents made of a sole number
+    // this will almost never be called in practice. We terminate with a space
+    // because we do not want to allow NULLs in the middle of a number (whereas a
+    // space in the middle of a number would be identified in stage 1).
+    char * copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING));
+    if(copy == nullptr) { 
+      goto fail;
+    }
+    memcpy(copy, buf, len);
+    copy[len] = ' ';
+    if (!parse_number(reinterpret_cast<const uint8_t *>(copy), pj, idx, false)) {
+      free(copy);
+      goto fail;
+    }
+    free(copy);
+    break;
+  }
+  case '-': {
+    // we need to make a copy to make sure that the string is NULL terminated.
+    // this is done only for JSON documents made of a sole number
+    // this will almost never be called in practice
+    char * copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING));
+    if(copy == nullptr) { 
+      goto fail;
+    }
+    memcpy(copy, buf, len);
+    copy[len] = '\0';
+    if (!parse_number(reinterpret_cast<const uint8_t *>(copy), pj, idx, true)) {
+      free(copy);
+      goto fail;
+    }
+    free(copy);
+    break;
+  }
+#endif // ALLOWANYTHINGINROOT
+  default:
+    goto fail;
+  }
+start_continue:
+  // the string might not be NULL terminated.
+  if(i + 1 == pj.n_structural_indexes) {
+    goto succeed;
+  } else {
+    goto fail;
+  }
+  ////////////////////////////// OBJECT STATES /////////////////////////////
+
+object_begin:
+  UPDATE_CHAR();
+  switch (c) {
+  case '"': {
+    if (!parse_string<T>(buf, len, pj, depth, idx)) {
+      goto fail;
+    }
+    goto object_key_state;
+  }
+  case '}':
+    goto scope_end; // could also go to object_continue
+  default:
+    goto fail;
+  }
+
+object_key_state:
+  UPDATE_CHAR();
+  if (c != ':') {
+    goto fail;
+  }
+  UPDATE_CHAR();
+  switch (c) {
+  case '"': {
+    if (!parse_string<T>(buf, len, pj, depth, idx)) {
+      goto fail; 
+    }
+    break;
+  }
+  case 't':
+    if (!is_valid_true_atom(buf + idx)) {
+      goto fail;
+    }
+    pj.write_tape(0, c);
+    break;
+  case 'f':
+    if (!is_valid_false_atom(buf + idx)) {
+      goto fail;
+    }
+    pj.write_tape(0, c);
+    break;
+  case 'n':
+    if (!is_valid_null_atom(buf + idx)) {
+      goto fail;
+    }
+    pj.write_tape(0, c);
+    break;
+  case '0': 
+  case '1':
+  case '2':
+  case '3':
+  case '4':
+  case '5':
+  case '6':
+  case '7':
+  case '8':
+  case '9': {
+    if (!parse_number(buf, pj, idx, false)) {
+      goto fail;
+    }
+    break;
+  }
+  case '-': {
+    if (!parse_number(buf, pj, idx, true)) {
+      goto fail;
+    }
+    break;
+  }
+  case '{': {
+    pj.containing_scope_offset[depth] = pj.get_current_loc();
+    pj.write_tape(0, c); // here the compilers knows what c is so this gets optimized
+    // we have not yet encountered } so we need to come back for it
+#ifdef SIMDJSON_USE_COMPUTED_GOTO 
+    pj.ret_address[depth] = &&object_continue;
+#else
+    pj.ret_address[depth] = 'o';
+#endif
+    // we found an object inside an object, so we need to increment the depth
+    depth++;
+    if (depth >= pj.depthcapacity) {
+      goto fail;
+    }
+
+    goto object_begin;
+  }
+  case '[': {
+    pj.containing_scope_offset[depth] = pj.get_current_loc();
+    pj.write_tape(0, c);  // here the compilers knows what c is so this gets optimized
+    // we have not yet encountered } so we need to come back for it
+#ifdef SIMDJSON_USE_COMPUTED_GOTO 
+    pj.ret_address[depth] = &&object_continue;
+#else
+    pj.ret_address[depth] = 'o';
+#endif    
+    // we found an array inside an object, so we need to increment the depth
+    depth++;
+    if (depth >= pj.depthcapacity) {
+      goto fail;
+    }
+    goto array_begin;
+  }
+  default:
+    goto fail;
+  }
+
+object_continue:
+  UPDATE_CHAR();
+  switch (c) {
+  case ',':
+    UPDATE_CHAR();
+    if (c != '"') {
+      goto fail;
+    } else {
+      if (!parse_string<T>(buf, len, pj, depth, idx)) {
+        goto fail; 
+      }
+      goto object_key_state;
+    }
+  case '}':
+    goto scope_end;
+  default:
+    goto fail;
+  }
+
+  ////////////////////////////// COMMON STATE /////////////////////////////
+
+scope_end:
+  // write our tape location to the header scope
+  depth--;
+  pj.write_tape(pj.containing_scope_offset[depth], c);
+  pj.annotate_previousloc(pj.containing_scope_offset[depth],
+                          pj.get_current_loc());
+  // goto saved_state
+#ifdef SIMDJSON_USE_COMPUTED_GOTO
+  goto *pj.ret_address[depth];
+#else
+  if(pj.ret_address[depth] == 'a') {
+    goto array_continue;
+  } else if (pj.ret_address[depth] == 'o') {
+    goto object_continue;
+  } else goto start_continue;
+#endif
+
+  ////////////////////////////// ARRAY STATES /////////////////////////////
+array_begin:
+  UPDATE_CHAR();
+  if (c == ']') {
+    goto scope_end; // could also go to array_continue
+  }
+
+main_array_switch:
+  // we call UPDATE_CHAR on all paths in, so we can peek at c on the
+  // paths that can accept a close square brace (post-, and at start)
+  switch (c) {
+  case '"': {
+    if (!parse_string<T>(buf, len, pj, depth, idx)) {
+      goto fail;
+    }
+    break;
+  }
+  case 't':
+    if (!is_valid_true_atom(buf + idx)) {
+      goto fail;
+    }
+    pj.write_tape(0, c);
+    break; 
+  case 'f':
+    if (!is_valid_false_atom(buf + idx)) {
+      goto fail;
+    }
+    pj.write_tape(0, c);
+    break; 
+  case 'n':
+    if (!is_valid_null_atom(buf + idx)) {
+      goto fail;
+    }
+    pj.write_tape(0, c);
+    break; // goto array_continue;
+
+  case '0': 
+  case '1':
+  case '2':
+  case '3':
+  case '4':
+  case '5':
+  case '6':
+  case '7':
+  case '8':
+  case '9': {
+    if (!parse_number(buf, pj, idx, false)) {
+      goto fail;
+    }
+    break; // goto array_continue;
+  }
+  case '-': {
+    if (!parse_number(buf, pj, idx, true)) {
+      goto fail;
+    }
+    break; // goto array_continue;
+  }
+  case '{': {
+    // we have not yet encountered ] so we need to come back for it
+    pj.containing_scope_offset[depth] = pj.get_current_loc();
+    pj.write_tape(0, c); //  here the compilers knows what c is so this gets optimized
+#ifdef SIMDJSON_USE_COMPUTED_GOTO 
+    pj.ret_address[depth] = &&array_continue;
+#else
+    pj.ret_address[depth] = 'a';
+#endif
+    // we found an object inside an array, so we need to increment the depth
+    depth++;
+    if (depth >= pj.depthcapacity) {
+      goto fail;
+    }
+
+    goto object_begin;
+  }
+  case '[': {
+    // we have not yet encountered ] so we need to come back for it
+    pj.containing_scope_offset[depth] = pj.get_current_loc();
+    pj.write_tape(0, c); // here the compilers knows what c is so this gets optimized
+#ifdef SIMDJSON_USE_COMPUTED_GOTO 
+    pj.ret_address[depth] = &&array_continue;
+#else
+    pj.ret_address[depth] = 'a';
+#endif
+    // we found an array inside an array, so we need to increment the depth
+    depth++;
+    if (depth >= pj.depthcapacity) {
+      goto fail;
+    }
+    goto array_begin;
+  }
+  default:
+    goto fail;
+  }
+
+array_continue:
+  UPDATE_CHAR();
+  switch (c) {
+  case ',':
+    UPDATE_CHAR();
+    goto main_array_switch;
+  case ']':
+    goto scope_end;
+  default:
+    goto fail;
+  }
+
+  ////////////////////////////// FINAL STATES /////////////////////////////
+
+succeed:
+  depth --;
+  if(depth != 0) {
+    fprintf(stderr, "internal bug\n");
+    abort();
+  }
+  if(pj.containing_scope_offset[depth] != 0) {
+    fprintf(stderr, "internal bug\n");
+    abort();
+  }
+  pj.annotate_previousloc(pj.containing_scope_offset[depth],
+                          pj.get_current_loc());
+  pj.write_tape(pj.containing_scope_offset[depth], 'r'); // r is root
+
+  pj.isvalid  = true;
+  pj.errorcode = simdjson::SUCCESS;
+  return pj.errorcode;
+fail:
+  // we do not need the next line because this is done by pj.init(), pessimistically.
+  // pj.isvalid  = false;
+  // At this point in the code, we have all the time in the world.
+  // Note that we know exactly where we are in the document so we could,
+  // without any overhead on the processing code, report a specific location.
+  // We could even trigger special code paths to assess what happened carefully,
+  // all without any added cost.
+  if (depth >= pj.depthcapacity) {
+    pj.errorcode = simdjson::DEPTH_ERROR;
+    return pj.errorcode;
+  }
+  switch(c) {
+    case '"':
+      pj.errorcode = simdjson::STRING_ERROR; 
+      return pj.errorcode;
+    case '0':
+    case '1':
+    case '2':
+    case '3':
+    case '4':
+    case '5':
+    case '6':
+    case '7':
+    case '8':
+    case '9': 
+    case '-': 
+      pj.errorcode = simdjson::NUMBER_ERROR;
+      return pj.errorcode;
+    case 't':
+      pj.errorcode = simdjson::T_ATOM_ERROR;
+      return pj.errorcode;
+    case 'n':
+      pj.errorcode = simdjson::N_ATOM_ERROR;
+      return pj.errorcode;
+    case 'f':
+      pj.errorcode = simdjson::F_ATOM_ERROR;
+      return pj.errorcode;
+    default: 
+      break;
+  }
+  pj.errorcode = simdjson::TAPE_ERROR;
+  return pj.errorcode; 
+}
+
+template<instruction_set T = instruction_set::native>
+int unified_machine(const char *buf, size_t len, ParsedJson &pj) {
+  return unified_machine<T>(reinterpret_cast<const uint8_t*>(buf), len, pj);
+}
 }

 #endif
--- a/include/simdjson/stringparsing.h
+++ b/include/simdjson/stringparsing.h
@ -5,6 +5,11 @@
 #include "simdjson/jsoncharutils.h"
 #include "simdjson/parsedjson.h"

+#ifdef JSON_TEST_STRINGS
+void foundString(const uint8_t *buf, const uint8_t *parsed_begin, const uint8_t *parsed_end);
+void foundBadString(const uint8_t *buf);
+#endif
+
 namespace simdjson {
 // begin copypasta
 // These chars yield themselves: " \ /
@ -76,19 +81,19 @@ really_inline bool handle_unicode_codepoint(const uint8_t **src_ptr, uint8_t **d
 #include <arm_neon.h>
 #endif

-WARN_UNUSED ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER LENIENT_MEM_SANITIZER
-really_inline  bool parse_string(UNUSED const uint8_t *buf, UNUSED size_t len,
-                                ParsedJson &pj, UNUSED const uint32_t depth, UNUSED uint32_t offset) {
-#ifdef SIMDJSON_SKIPSTRINGPARSING // for performance analysis, it is sometimes useful to skip parsing
-  pj.write_tape(0, '"');// don't bother with the string parsing at all
-  return true; // always succeeds
-#else
-  pj.write_tape(pj.current_string_buf_loc - pj.string_buf, '"');
-  const uint8_t *src = &buf[offset + 1]; // we know that buf at offset is a "
-  uint8_t *dst = pj.current_string_buf_loc + sizeof(uint32_t);
-  const uint8_t *const start_of_string = dst;
-  while (1) {
+// Holds the backslash and quote locations found in one scanned block, as 32-bit masks.
+struct parse_string_helper {
+  uint32_t bs_bits; // bit i is set when src[i] is a backslash
+  uint32_t quote_bits; // bit i is set when src[i] is a double quote
+};
+
+// Finds where the backslashes and quotes are located in the block at src. Declaration only: the avx2 and neon specializations below provide the definitions.
+template<instruction_set>
+parse_string_helper find_bs_bits_and_quote_bits(const uint8_t *src, uint8_t *dst);
+
 #ifdef __AVX2__
+template<> really_inline
+parse_string_helper find_bs_bits_and_quote_bits<instruction_set::avx2> (const uint8_t *src, uint8_t *dst) {
    // this can read up to 31 bytes beyond the buffer size, but we require 
    // SIMDJSON_PADDING of padding
    static_assert(sizeof(__m256i) - 1 <= SIMDJSON_PADDING);
@ -96,12 +101,17 @@ really_inline  bool parse_string(UNUSED const uint8_t *buf, UNUSED size_t len,
    // store to dest unconditionally - we can overwrite the bits we don't like
    // later
    _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), v);
-    auto bs_bits =
-        static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi8(v, _mm256_set1_epi8('\\'))));
    auto quote_mask = _mm256_cmpeq_epi8(v, _mm256_set1_epi8('"'));
-    auto quote_bits =
-        static_cast<uint32_t>(_mm256_movemask_epi8(quote_mask));
-#else
+    return {
+      static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi8(v, _mm256_set1_epi8('\\')))), // bs_bits
+      static_cast<uint32_t>(_mm256_movemask_epi8(quote_mask)) // quote_bits
+    };
+}
+#endif
+
+#ifdef __ARM_NEON
+template<> really_inline
+parse_string_helper find_bs_bits_and_quote_bits<instruction_set::neon> (const uint8_t *src, uint8_t *dst) {
    // this can read up to 31 bytes beyond the buffer size, but we require 
    // SIMDJSON_PADDING of padding
    static_assert(2 * sizeof(uint8x16_t) - 1 <= SIMDJSON_PADDING);
@ -128,14 +138,32 @@ really_inline  bool parse_string(UNUSED const uint8_t *buf, UNUSED size_t len,
    uint8x16_t sum1 = vpaddq_u8(cmp_qt_0, cmp_qt_1);
    sum0 = vpaddq_u8(sum0, sum1);
    sum0 = vpaddq_u8(sum0, sum0);
-    auto bs_bits =  vgetq_lane_u32(vreinterpretq_u32_u8(sum0), 0);
-    auto quote_bits =  vgetq_lane_u32(vreinterpretq_u32_u8(sum0), 1);
+    return {
+      vgetq_lane_u32(vreinterpretq_u32_u8(sum0), 0), // bs_bits
+      vgetq_lane_u32(vreinterpretq_u32_u8(sum0), 1) // quote_bits
+    };
+}
 #endif
-    if(((bs_bits - 1) & quote_bits) != 0 ) {
+
+template<instruction_set T>
+WARN_UNUSED ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER LENIENT_MEM_SANITIZER really_inline 
+bool parse_string(UNUSED const uint8_t *buf, UNUSED size_t len,
+                                ParsedJson &pj, UNUSED const uint32_t depth, UNUSED uint32_t offset) {
+#ifdef SIMDJSON_SKIPSTRINGPARSING // for performance analysis, it is sometimes useful to skip parsing
+  pj.write_tape(0, '"');// don't bother with the string parsing at all
+  return true; // always succeeds
+#else
+  pj.write_tape(pj.current_string_buf_loc - pj.string_buf, '"');
+  const uint8_t *src = &buf[offset + 1]; // we know that buf at offset is a "
+  uint8_t *dst = pj.current_string_buf_loc + sizeof(uint32_t);
+  const uint8_t *const start_of_string = dst;
+  while (1) {
+    parse_string_helper helper = find_bs_bits_and_quote_bits<T>(src, dst);
+    if(((helper.bs_bits - 1) & helper.quote_bits) != 0 ) {
      // we encountered quotes first. Move dst to point to quotes and exit

      // find out where the quote is...
-      uint32_t quote_dist = trailingzeroes(quote_bits);
+      uint32_t quote_dist = trailingzeroes(helper.quote_bits);

      // NULL termination is still handy if you expect all your strings to be NULL terminated?
      // It comes at a small cost
@ -158,9 +186,9 @@ really_inline  bool parse_string(UNUSED const uint8_t *buf, UNUSED size_t len,
 #endif // JSON_TEST_STRINGS
      return true;
    } 
-    if(((quote_bits - 1) & bs_bits ) != 0 ) {
+    if(((helper.quote_bits - 1) & helper.bs_bits ) != 0 ) {
      // find out where the backspace is
-      uint32_t bs_dist = trailingzeroes(bs_bits);
+      uint32_t bs_dist = trailingzeroes(helper.bs_bits);
      uint8_t escape_char = src[bs_dist + 1];
      // we encountered backslash first. Handle backslash
      if (escape_char == 'u') {
--- a/src/stage2_build_tape.cpp
+++ b/src/stage2_build_tape.cpp
@ -1,580 +1 @@
-#include <cassert>
-#include <cstring>
-
-#include "simdjson/common_defs.h"
-#include "simdjson/jsoncharutils.h"
-#include "simdjson/numberparsing.h"
-#include "simdjson/parsedjson.h"
-#include "simdjson/stringparsing.h"
-#include "simdjson/simdjson.h"
-
-#include <iostream>
-#define PATH_SEP '/'
-
-namespace simdjson {
-
-WARN_UNUSED
-really_inline bool is_valid_true_atom(const uint8_t *loc) {
-  uint64_t tv = *reinterpret_cast<const uint64_t *>("true    ");
-  uint64_t mask4 = 0x00000000ffffffff;
-  uint32_t error = 0;
-  uint64_t locval; // we want to avoid unaligned 64-bit loads (undefined in C/C++)
-  // this can read up to 7 bytes beyond the buffer size, but we require 
-  // SIMDJSON_PADDING of padding
-  static_assert(sizeof(uint64_t) - 1 <= SIMDJSON_PADDING);
-  std::memcpy(&locval, loc, sizeof(uint64_t));
-  error = (locval & mask4) ^ tv;
-  error |= is_not_structural_or_whitespace(loc[4]);
-  return error == 0;
-}
-
-WARN_UNUSED
-really_inline bool is_valid_false_atom(const uint8_t *loc) {
-  // We have to use an integer constant because the space in the cast
-  // below would lead to values illegally being qualified
-  // uint64_t fv = *reinterpret_cast<const uint64_t *>("false   ");
-  // using this constant (that is the same false) but nulls out the
-  // unused bits solves that
-  uint64_t fv = 0x00000065736c6166; // takes into account endianness
-  uint64_t mask5 = 0x000000ffffffffff;
-  // we can't use the 32 bit value for checking for errors otherwise
-  // the last character of false (it being 5 byte long!) would be
-  // ignored
-  uint64_t error = 0;
-  uint64_t locval; // we want to avoid unaligned 64-bit loads (undefined in C/C++)
-  // this can read up to 7 bytes beyond the buffer size, but we require 
-  // SIMDJSON_PADDING of padding
-  static_assert(sizeof(uint64_t) - 1 <= SIMDJSON_PADDING);
-  std::memcpy(&locval, loc, sizeof(uint64_t));
-  error = (locval & mask5) ^ fv;
-  error |= is_not_structural_or_whitespace(loc[5]);
-  return error == 0;
-}
-
-WARN_UNUSED
-really_inline bool is_valid_null_atom(const uint8_t *loc) {
-  uint64_t nv = *reinterpret_cast<const uint64_t *>("null    ");
-  uint64_t mask4 = 0x00000000ffffffff;
-  uint32_t error = 0;
-  uint64_t locval; // we want to avoid unaligned 64-bit loads (undefined in C/C++)
-  // this can read up to 7 bytes beyond the buffer size, but we require 
-  // SIMDJSON_PADDING of padding
-  static_assert(sizeof(uint64_t) - 1 <= SIMDJSON_PADDING);
-  std::memcpy(&locval, loc, sizeof(uint64_t));
-  error = (locval & mask4) ^ nv;
-  error |= is_not_structural_or_whitespace(loc[4]);
-  return error == 0;
-}
-
-
-/************
- * The JSON is parsed to a tape, see the accompanying tape.md file
- * for documentation.
- ***********/
-WARN_UNUSED  ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER LENIENT_MEM_SANITIZER
-int unified_machine(const uint8_t *buf, size_t len, ParsedJson &pj) {
-#ifndef ALLOW_SAME_PAGE_BUFFER_OVERRUN
-  memset((uint8_t*)buf + len, 0, SIMDJSON_PADDING); // to please valgrind
-#endif
-  uint32_t i = 0; // index of the structural character (0,1,2,3...)
-  uint32_t idx;   // location of the structural character in the input (buf)
-  uint8_t c; // used to track the (structural) character we are looking at, updated
-        // by UPDATE_CHAR macro
-  uint32_t depth = 0; // could have an arbitrary starting depth
-  pj.init(); // sets isvalid to false
-  if(pj.bytecapacity < len) {
-      pj.errorcode = CAPACITY;
-      return pj.errorcode;
-  }
-// this macro reads the next structural character, updating idx, i and c.
-#define UPDATE_CHAR()                                                          \
-  {                                                                            \
-    idx = pj.structural_indexes[i++];                                          \
-    c = buf[idx];                                                              \
-  }
-
-
-  ////////////////////////////// START STATE /////////////////////////////
-#ifdef SIMDJSON_USE_COMPUTED_GOTO 
-  pj.ret_address[depth] = &&start_continue;
-#else
-  pj.ret_address[depth] = 's';
-#endif
-  pj.containing_scope_offset[depth] = pj.get_current_loc();
-  pj.write_tape(0, 'r'); // r for root, 0 is going to get overwritten
-  // the root is used, if nothing else, to capture the size of the tape
-  depth++; // everything starts at depth = 1, depth = 0 is just for the root, the root may contain an object, an array or something else.
-  if (depth >= pj.depthcapacity) {
-    goto fail;
-  }
-
-  UPDATE_CHAR();
-  switch (c) {
-  case '{':
-    pj.containing_scope_offset[depth] = pj.get_current_loc();
-#ifdef SIMDJSON_USE_COMPUTED_GOTO 
-    pj.ret_address[depth] = &&start_continue;
-#else
-    pj.ret_address[depth] = 's';
-#endif
-    depth++;
-    if (depth >= pj.depthcapacity) {
-      goto fail;
-    }
-    pj.write_tape(0, c); // strangely, moving this to object_begin slows things down
-    goto object_begin;
-  case '[':
-    pj.containing_scope_offset[depth] = pj.get_current_loc();
-#ifdef SIMDJSON_USE_COMPUTED_GOTO 
-    pj.ret_address[depth] = &&start_continue;
-#else
-    pj.ret_address[depth] = 's';
-#endif    
-    depth++;
-    if (depth >= pj.depthcapacity) {
-      goto fail;
-    }
-    pj.write_tape(0, c);
-    goto array_begin;
-#define SIMDJSON_ALLOWANYTHINGINROOT
-    // A JSON text is a serialized value.  Note that certain previous
-    // specifications of JSON constrained a JSON text to be an object or an
-    // array.  Implementations that generate only objects or arrays where a
-    // JSON text is called for will be interoperable in the sense that all
-    // implementations will accept these as conforming JSON texts.
-    // https://tools.ietf.org/html/rfc8259
-#ifdef SIMDJSON_ALLOWANYTHINGINROOT
-  case '"': {
-    if (!parse_string(buf, len, pj, depth, idx)) {
-      goto fail;
-    }
-    break;
-  }
-  case 't': {
-    // we need to make a copy to make sure that the string is space terminated.
-    // this only applies to the JSON document made solely of the true value.
-    // this will almost never be called in practice
-    char * copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING));
-    if(copy == nullptr) { 
-      goto fail;
-    }
-    memcpy(copy, buf, len);
-    copy[len] = ' ';
-    if (!is_valid_true_atom(reinterpret_cast<const uint8_t *>(copy) + idx)) {
-      free(copy);
-      goto fail;
-    }
-    free(copy);
-    pj.write_tape(0, c);
-    break;
-  }
-  case 'f': {
-    // we need to make a copy to make sure that the string is space terminated.
-    // this only applies to the JSON document made solely of the false value.
-    // this will almost never be called in practice
-    char * copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING));
-    if(copy == nullptr) { 
-      goto fail;
-    }
-    memcpy(copy, buf, len);
-    copy[len] = ' ';
-    if (!is_valid_false_atom(reinterpret_cast<const uint8_t *>(copy) + idx)) {
-      free(copy);
-      goto fail;
-    }
-    free(copy);
-    pj.write_tape(0, c);
-    break;
-  }
-  case 'n': {
-    // we need to make a copy to make sure that the string is space terminated.
-    // this only applies to the JSON document made solely of the null value.
-    // this will almost never be called in practice
-    char * copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING));
-    if(copy == nullptr) { 
-      goto fail;
-    }
-    memcpy(copy, buf, len);
-    copy[len] = ' ';
-    if (!is_valid_null_atom(reinterpret_cast<const uint8_t *>(copy) + idx)) {
-      free(copy);
-      goto fail;
-    }
-    free(copy);
-    pj.write_tape(0, c);
-    break;
-  }
-  case '0': 
-  case '1':
-  case '2':
-  case '3':
-  case '4':
-  case '5':
-  case '6':
-  case '7':
-  case '8':
-  case '9': {
-    // we need to make a copy to make sure that the string is space terminated.
-    // this is done only for JSON documents made of a sole number
-    // this will almost never be called in practice. We terminate with a space
-    // because we do not want to allow NULLs in the middle of a number (whereas a
-    // space in the middle of a number would be identified in stage 1).
-    char * copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING));
-    if(copy == nullptr) { 
-      goto fail;
-    }
-    memcpy(copy, buf, len);
-    copy[len] = ' ';
-    if (!parse_number(reinterpret_cast<const uint8_t *>(copy), pj, idx, false)) {
-      free(copy);
-      goto fail;
-    }
-    free(copy);
-    break;
-  }
-  case '-': {
-    // we need to make a copy to make sure that the string is NULL terminated.
-    // this is done only for JSON documents made of a sole number
-    // this will almost never be called in practice
-    char * copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING));
-    if(copy == nullptr) { 
-      goto fail;
-    }
-    memcpy(copy, buf, len);
-    copy[len] = '\0';
-    if (!parse_number(reinterpret_cast<const uint8_t *>(copy), pj, idx, true)) {
-      free(copy);
-      goto fail;
-    }
-    free(copy);
-    break;
-  }
-#endif // ALLOWANYTHINGINROOT
-  default:
-    goto fail;
-  }
-start_continue:
-  // the string might not be NULL terminated.
-  if(i + 1 == pj.n_structural_indexes) {
-    goto succeed;
-  } else {
-    goto fail;
-  }
-  ////////////////////////////// OBJECT STATES /////////////////////////////
-
-object_begin:
-  UPDATE_CHAR();
-  switch (c) {
-  case '"': {
-    if (!parse_string(buf, len, pj, depth, idx)) {
-      goto fail;
-    }
-    goto object_key_state;
-  }
-  case '}':
-    goto scope_end; // could also go to object_continue
-  default:
-    goto fail;
-  }
-
-object_key_state:
-  UPDATE_CHAR();
-  if (c != ':') {
-    goto fail;
-  }
-  UPDATE_CHAR();
-  switch (c) {
-  case '"': {
-    if (!parse_string(buf, len, pj, depth, idx)) {
-      goto fail; 
-    }
-    break;
-  }
-  case 't':
-    if (!is_valid_true_atom(buf + idx)) {
-      goto fail;
-    }
-    pj.write_tape(0, c);
-    break;
-  case 'f':
-    if (!is_valid_false_atom(buf + idx)) {
-      goto fail;
-    }
-    pj.write_tape(0, c);
-    break;
-  case 'n':
-    if (!is_valid_null_atom(buf + idx)) {
-      goto fail;
-    }
-    pj.write_tape(0, c);
-    break;
-  case '0': 
-  case '1':
-  case '2':
-  case '3':
-  case '4':
-  case '5':
-  case '6':
-  case '7':
-  case '8':
-  case '9': {
-    if (!parse_number(buf, pj, idx, false)) {
-      goto fail;
-    }
-    break;
-  }
-  case '-': {
-    if (!parse_number(buf, pj, idx, true)) {
-      goto fail;
-    }
-    break;
-  }
-  case '{': {
-    pj.containing_scope_offset[depth] = pj.get_current_loc();
-    pj.write_tape(0, c); // here the compilers knows what c is so this gets optimized
-    // we have not yet encountered } so we need to come back for it
-#ifdef SIMDJSON_USE_COMPUTED_GOTO 
-    pj.ret_address[depth] = &&object_continue;
-#else
-    pj.ret_address[depth] = 'o';
-#endif
-    // we found an object inside an object, so we need to increment the depth
-    depth++;
-    if (depth >= pj.depthcapacity) {
-      goto fail;
-    }
-
-    goto object_begin;
-  }
-  case '[': {
-    pj.containing_scope_offset[depth] = pj.get_current_loc();
-    pj.write_tape(0, c);  // here the compilers knows what c is so this gets optimized
-    // we have not yet encountered } so we need to come back for it
-#ifdef SIMDJSON_USE_COMPUTED_GOTO 
-    pj.ret_address[depth] = &&object_continue;
-#else
-    pj.ret_address[depth] = 'o';
-#endif    
-    // we found an array inside an object, so we need to increment the depth
-    depth++;
-    if (depth >= pj.depthcapacity) {
-      goto fail;
-    }
-    goto array_begin;
-  }
-  default:
-    goto fail;
-  }
-
-object_continue:
-  UPDATE_CHAR();
-  switch (c) {
-  case ',':
-    UPDATE_CHAR();
-    if (c != '"') {
-      goto fail;
-    } else {
-      if (!parse_string(buf, len, pj, depth, idx)) {
-        goto fail; 
-      }
-      goto object_key_state;
-    }
-  case '}':
-    goto scope_end;
-  default:
-    goto fail;
-  }
-
-  ////////////////////////////// COMMON STATE /////////////////////////////
-
-scope_end:
-  // write our tape location to the header scope
-  depth--;
-  pj.write_tape(pj.containing_scope_offset[depth], c);
-  pj.annotate_previousloc(pj.containing_scope_offset[depth],
-                          pj.get_current_loc());
-  // goto saved_state
-#ifdef SIMDJSON_USE_COMPUTED_GOTO
-  goto *pj.ret_address[depth];
-#else
-  if(pj.ret_address[depth] == 'a') {
-    goto array_continue;
-  } else if (pj.ret_address[depth] == 'o') {
-    goto object_continue;
-  } else goto start_continue;
-#endif
-
-  ////////////////////////////// ARRAY STATES /////////////////////////////
-array_begin:
-  UPDATE_CHAR();
-  if (c == ']') {
-    goto scope_end; // could also go to array_continue
-  }
-
-main_array_switch:
-  // we call update char on all paths in, so we can peek at c on the
-  // on paths that can accept a close square brace (post-, and at start)
-  switch (c) {
-  case '"': {
-    if (!parse_string(buf, len, pj, depth, idx)) {
-      goto fail;
-    }
-    break;
-  }
-  case 't':
-    if (!is_valid_true_atom(buf + idx)) {
-      goto fail;
-    }
-    pj.write_tape(0, c);
-    break; 
-  case 'f':
-    if (!is_valid_false_atom(buf + idx)) {
-      goto fail;
-    }
-    pj.write_tape(0, c);
-    break; 
-  case 'n':
-    if (!is_valid_null_atom(buf + idx)) {
-      goto fail;
-    }
-    pj.write_tape(0, c);
-    break; // goto array_continue;
-
-  case '0': 
-  case '1':
-  case '2':
-  case '3':
-  case '4':
-  case '5':
-  case '6':
-  case '7':
-  case '8':
-  case '9': {
-    if (!parse_number(buf, pj, idx, false)) {
-      goto fail;
-    }
-    break; // goto array_continue;
-  }
-  case '-': {
-    if (!parse_number(buf, pj, idx, true)) {
-      goto fail;
-    }
-    break; // goto array_continue;
-  }
-  case '{': {
-    // we have not yet encountered ] so we need to come back for it
-    pj.containing_scope_offset[depth] = pj.get_current_loc();
-    pj.write_tape(0, c); //  here the compilers knows what c is so this gets optimized
-#ifdef SIMDJSON_USE_COMPUTED_GOTO 
-    pj.ret_address[depth] = &&array_continue;
-#else
-    pj.ret_address[depth] = 'a';
-#endif
-    // we found an object inside an array, so we need to increment the depth
-    depth++;
-    if (depth >= pj.depthcapacity) {
-      goto fail;
-    }
-
-    goto object_begin;
-  }
-  case '[': {
-    // we have not yet encountered ] so we need to come back for it
-    pj.containing_scope_offset[depth] = pj.get_current_loc();
-    pj.write_tape(0, c); // here the compilers knows what c is so this gets optimized
-#ifdef SIMDJSON_USE_COMPUTED_GOTO 
-    pj.ret_address[depth] = &&array_continue;
-#else
-    pj.ret_address[depth] = 'a';
-#endif
-    // we found an array inside an array, so we need to increment the depth
-    depth++;
-    if (depth >= pj.depthcapacity) {
-      goto fail;
-    }
-    goto array_begin;
-  }
-  default:
-    goto fail;
-  }
-
-array_continue:
-  UPDATE_CHAR();
-  switch (c) {
-  case ',':
-    UPDATE_CHAR();
-    goto main_array_switch;
-  case ']':
-    goto scope_end;
-  default:
-    goto fail;
-  }
-
-  ////////////////////////////// FINAL STATES /////////////////////////////
-
-succeed:
-  depth --;
-  if(depth != 0) {
-    fprintf(stderr, "internal bug\n");
-    abort();
-  }
-  if(pj.containing_scope_offset[depth] != 0) {
-    fprintf(stderr, "internal bug\n");
-    abort();
-  }
-  pj.annotate_previousloc(pj.containing_scope_offset[depth],
-                          pj.get_current_loc());
-  pj.write_tape(pj.containing_scope_offset[depth], 'r'); // r is root
-
-  pj.isvalid  = true;
-  pj.errorcode = SUCCESS;
-  return pj.errorcode;
-fail:
-  // we do not need the next line because this is done by pj.init(), pessimistically.
-  // pj.isvalid  = false;
-  // At this point in the code, we have all the time in the world.
-  // Note that we know exactly where we are in the document so we could,
-  // without any overhead on the processing code, report a specific location.
-  // We could even trigger special code paths to assess what happened carefully,
-  // all without any added cost.
-  if (depth >= pj.depthcapacity) {
-    pj.errorcode = DEPTH_ERROR;
-    return pj.errorcode;
-  }
-  switch(c) {
-    case '"':
-      pj.errorcode = STRING_ERROR; 
-      return pj.errorcode;
-    case '0':
-    case '1':
-    case '2':
-    case '3':
-    case '4':
-    case '5':
-    case '6':
-    case '7':
-    case '8':
-    case '9': 
-    case '-': 
-      pj.errorcode = NUMBER_ERROR;
-      return pj.errorcode;
-    case 't':
-      pj.errorcode = T_ATOM_ERROR;
-      return pj.errorcode;
-    case 'n':
-      pj.errorcode = N_ATOM_ERROR;
-      return pj.errorcode;
-    case 'f':
-      pj.errorcode = F_ATOM_ERROR;
-      return pj.errorcode;
-    default: 
-      break;
-  }
-  pj.errorcode = TAPE_ERROR;
-  return pj.errorcode; 
-}
-
-int unified_machine(const char *buf, size_t len, ParsedJson &pj) {
-  return unified_machine(reinterpret_cast<const uint8_t*>(buf), len, pj);
-}
-}
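+// Intentionally left empty: unified_machine is now a template defined in a header. The file is kept in case we want to reuse it soon, and because removing it would mean editing many build configuration files.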
--- a/tests/numberparsingcheck.cpp
+++ b/tests/numberparsingcheck.cpp
@ -38,7 +38,7 @@ bool is_in_bad_list(const char *buf) {
  return false;
 }

-inline void foundInvalidNumber(const uint8_t *buf) {
+void foundInvalidNumber(const uint8_t *buf) {
  invalid_count++;
  char *endptr;
  double expected = strtod((const char *)buf, &endptr);
@ -53,7 +53,7 @@ inline void foundInvalidNumber(const uint8_t *buf) {
  }
 }

-inline void foundInteger(int64_t result, const uint8_t *buf) {
+void foundInteger(int64_t result, const uint8_t *buf) {
  int_count++;
  char *endptr;
  long long expected = strtoll((const char *)buf, &endptr, 10);
@ -64,7 +64,7 @@ inline void foundInteger(int64_t result, const uint8_t *buf) {
  }
 }

-inline void foundFloat(double result, const uint8_t *buf) {
+void foundFloat(double result, const uint8_t *buf) {
  char *endptr;
  float_count++;
  double expected = strtod((const char *)buf, &endptr);
--- a/tests/stringparsingcheck.cpp
+++ b/tests/stringparsingcheck.cpp
@ -203,7 +203,7 @@ static bool parse_string(const char *p, char *output, char **end) {
 // end of borrowed code
 char *bigbuffer; // global variable

-inline void foundBadString(const uint8_t *buf) {
+void foundBadString(const uint8_t *buf) {
  bad_string++;
  char *end;
  if (parse_string((const char *)buf, bigbuffer, &end)) {
@ -226,7 +226,7 @@ void print_cmp_hex(const char *s1, const char *s2, size_t len) {
  }
 }

-inline void foundString(const uint8_t *buf, const uint8_t *parsed_begin,
+void foundString(const uint8_t *buf, const uint8_t *parsed_begin,
                        const uint8_t *parsed_end) {
  size_t thislen = parsed_end - parsed_begin;
  total_string_length += thislen;