Merge pull request #201 from lemire/Multiple_implementation_refactoring_stage2

Stage2 refactored to simplify multiple implementations
2019-07-03 17:32:44 -04:00 · 2019-07-03 17:32:44 -04:00 · 477b058f74
parent 43143f6434 0df6d83f08
commit 477b058f74
9 changed files with 669 additions and 647 deletions
--- a/benchmark/parse.cpp
+++ b/benchmark/parse.cpp
@ -157,7 +157,8 @@ int main(int argc, char *argv[]) {
      break;
    }
    unified.start();
-    isok = isok && (simdjson::SUCCESS == unified_machine(p.data(), p.size(), pj));
+    // The default template is simdjson::instruction_set::native.
    isok = isok && (simdjson::SUCCESS == simdjson::unified_machine<>(p.data(), p.size(), pj));
    unified.end(results);
    cy2 += results[0];
    cl2 += results[1];
@ -188,7 +189,7 @@ int main(int argc, char *argv[]) {
    auto start = std::chrono::steady_clock::now();
    // The default template is simdjson::instruction_set::native.
    isok = (simdjson::find_structural_bits<>(p.data(), p.size(), pj) == simdjson::SUCCESS);
-    isok = isok && (simdjson::SUCCESS == unified_machine(p.data(), p.size(), pj));
+    isok = isok && (simdjson::SUCCESS == simdjson::unified_machine<>(p.data(), p.size(), pj));
    auto end = std::chrono::steady_clock::now();
    std::chrono::duration<double> secs = end - start;
    res[i] = secs.count();
--- a/include/simdjson/jsonparser.h
+++ b/include/simdjson/jsonparser.h
@ -26,7 +26,6 @@ using json_parse_functype = int (const uint8_t *buf, size_t len, ParsedJson &pj,
 // Pointer that holds the json_parse implementation corresponding to the available SIMD instruction set
 extern json_parse_functype *json_parse_ptr;
 // json_parse_implementation is the generic function, it is specialized for various 
 // SIMD instruction sets, e.g., as json_parse_implementation<instruction_set::avx2>
 // or json_parse_implementation<instruction_set::neon> 
@ -69,7 +68,7 @@ int json_parse_implementation(const uint8_t *buf, size_t len, ParsedJson &pj, bo
    pj.errorcode = stage1_is_ok;
    return pj.errorcode;
  } 
-  int res = unified_machine(buf, len, pj);
+  int res = unified_machine<T>(buf, len, pj);
  if(reallocated) { aligned_free((void*)buf);}
  return res;
 }
--- a/include/simdjson/numberparsing.h
+++ b/include/simdjson/numberparsing.h
@ -6,6 +6,12 @@
 #include "simdjson/parsedjson.h"
 #include "simdjson/portability.h"
 #ifdef JSON_TEST_NUMBERS // for unit testing
 void foundInvalidNumber(const uint8_t *buf);
 void foundInteger(int64_t result, const uint8_t *buf);
 void foundFloat(double result, const uint8_t *buf);
 #endif
 namespace simdjson {
 // Allowable floating-point values range from  std::numeric_limits<double>::lowest() 
 // to std::numeric_limits<double>::max(), so from 
@ -376,9 +382,6 @@ static never_inline bool parse_large_integer(const uint8_t *const buf,
  return is_structural_or_whitespace(*p);
 }
 // parse the number at buf + offset
 // define JSON_TEST_NUMBERS for unit testing
 //
--- a/include/simdjson/stage1_find_marks.h
+++ b/include/simdjson/stage1_find_marks.h
@ -33,11 +33,11 @@
 #define TRANSPOSE
 namespace simdjson {
-template<simdjson::instruction_set>
+template<instruction_set>
 struct simd_input;
 #ifdef __AVX2__
 template<>
-struct simd_input<simdjson::instruction_set::avx2>
+struct simd_input<instruction_set::avx2>
 {
  __m256i lo;
  __m256i hi;
@ -45,7 +45,7 @@ struct simd_input<simdjson::instruction_set::avx2>
 #endif
 #ifdef __ARM_NEON
-template<> struct simd_input<simdjson::instruction_set::neon>
+template<> struct simd_input<instruction_set::neon>
 {
 #ifndef TRANSPOSE
  uint8x16_t i0;
@ -111,7 +111,7 @@ uint64_t neonmovemask_bulk(uint8x16_t p0, uint8x16_t p1, uint8x16_t p2, uint8x16
 }
 #endif
-template<simdjson::instruction_set T>
+template<instruction_set T>
 uint64_t compute_quote_mask(uint64_t quote_bits);
 // In practice, if you have NEON or __PCLMUL__, you would
@ -121,7 +121,7 @@ uint64_t compute_quote_mask(uint64_t quote_bits);
 // Also: we don't know of an instance where AVX2 is supported but 
 // where clmul is not supported, so check for both, to be sure.
 #ifdef SIMDJSON_AVOID_CLMUL
-template<simdjson::instruction_set T> really_inline
+template<instruction_set T> really_inline
 uint64_t compute_quote_mask(uint64_t quote_bits)
 {
  uint64_t quote_mask = quote_bits ^ (quote_bits << 1);
@ -133,12 +133,12 @@ uint64_t compute_quote_mask(uint64_t quote_bits)
  return quote_mask;
 }
 #else
-template<simdjson::instruction_set>
+template<instruction_set>
 uint64_t compute_quote_mask(uint64_t quote_bits);
 #ifdef __AVX2__ 
 template<> really_inline
-uint64_t compute_quote_mask<simdjson::instruction_set::avx2>(uint64_t quote_bits) {
+uint64_t compute_quote_mask<instruction_set::avx2>(uint64_t quote_bits) {
  uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128(
      _mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFF), 0));
  return quote_mask;
@ -147,7 +147,7 @@ uint64_t compute_quote_mask<simdjson::instruction_set::avx2>(uint64_t quote_bits
 #ifdef __ARM_NEON
 template<> really_inline
-uint64_t compute_quote_mask<simdjson::instruction_set::neon>(uint64_t quote_bits) {
+uint64_t compute_quote_mask<instruction_set::neon>(uint64_t quote_bits) {
 #ifdef __PCLMUL__ // Might cause problems on runtime dispatch
  uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128(
                                          _mm_set_epi64x(0ULL, quote_bits),
@ -161,7 +161,7 @@ uint64_t compute_quote_mask<simdjson::instruction_set::neon>(uint64_t quote_bits
 #endif
 #ifdef SIMDJSON_UTF8VALIDATE
-template<simdjson::instruction_set T>really_inline
+template<instruction_set T>really_inline
 void check_utf8(simd_input<T> in,
                __m256i &has_error,
                struct avx_processed_utf_bytes &previous) {
@ -182,13 +182,13 @@ void check_utf8(simd_input<T> in,
 }
 #endif
-template<simdjson::instruction_set T>
+template<instruction_set T>
 simd_input<T> fill_input(const uint8_t * ptr);
 #ifdef __AVX2__
 template<> really_inline
-simd_input<simdjson::instruction_set::avx2> fill_input<simdjson::instruction_set::avx2>(const uint8_t * ptr) {
+simd_input<instruction_set::avx2> fill_input<instruction_set::avx2>(const uint8_t * ptr) {
-  struct simd_input<simdjson::instruction_set::avx2> in;
+  struct simd_input<instruction_set::avx2> in;
  in.lo = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr + 0));
  in.hi = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr + 32));
  return in;
@ -197,8 +197,8 @@ simd_input<simdjson::instruction_set::avx2> fill_input<simdjson::instruction_set
 #ifdef __ARM_NEON
 template<> really_inline
-simd_input<simdjson::instruction_set::neon> fill_input<simdjson::instruction_set::neon>(const uint8_t * ptr) {
+simd_input<instruction_set::neon> fill_input<instruction_set::neon>(const uint8_t * ptr) {
-  struct simd_input<simdjson::instruction_set::neon> in;
+  struct simd_input<instruction_set::neon> in;
 #ifndef TRANSPOSE
  in.i0 = vld1q_u8(ptr + 0);
  in.i1 = vld1q_u8(ptr + 16);
@ -213,12 +213,12 @@ simd_input<simdjson::instruction_set::neon> fill_input<simdjson::instruction_set
 // a straightforward comparison of a mask against input. 5 uops; would be
 // cheaper in AVX512.
-template<simdjson::instruction_set T>
+template<instruction_set T>
 uint64_t cmp_mask_against_input(simd_input<T> in, uint8_t m);
 #ifdef __AVX2__
 template<> really_inline
-uint64_t cmp_mask_against_input<simdjson::instruction_set::avx2>(simd_input<simdjson::instruction_set::avx2> in, uint8_t m) {
+uint64_t cmp_mask_against_input<instruction_set::avx2>(simd_input<instruction_set::avx2> in, uint8_t m) {
  const __m256i mask = _mm256_set1_epi8(m);
  __m256i cmp_res_0 = _mm256_cmpeq_epi8(in.lo, mask);
@ -231,7 +231,7 @@ uint64_t cmp_mask_against_input<simdjson::instruction_set::avx2>(simd_input<simd
 #ifdef __ARM_NEON
 template<> really_inline
-uint64_t cmp_mask_against_input<simdjson::instruction_set::neon>(simd_input<simdjson::instruction_set::neon> in, uint8_t m) {
+uint64_t cmp_mask_against_input<instruction_set::neon>(simd_input<instruction_set::neon> in, uint8_t m) {
  const uint8x16_t mask = vmovq_n_u8(m); 
  uint8x16_t cmp_res_0 = vceqq_u8(in.i.val[0], mask); 
  uint8x16_t cmp_res_1 = vceqq_u8(in.i.val[1], mask); 
@ -242,12 +242,12 @@ uint64_t cmp_mask_against_input<simdjson::instruction_set::neon>(simd_input<simd
 #endif
 // find all values less than or equal to m (using unsigned arithmetic)
-template<simdjson::instruction_set T>
+template<instruction_set T>
 uint64_t unsigned_lteq_against_input(simd_input<T> in, uint8_t m);
 #ifdef __AVX2__
 template<> really_inline
-uint64_t unsigned_lteq_against_input<simdjson::instruction_set::avx2>(simd_input<simdjson::instruction_set::avx2> in, uint8_t m) {
+uint64_t unsigned_lteq_against_input<instruction_set::avx2>(simd_input<instruction_set::avx2> in, uint8_t m) {
  const __m256i maxval = _mm256_set1_epi8(m);
  __m256i cmp_res_0 = _mm256_cmpeq_epi8(_mm256_max_epu8(maxval,in.lo),maxval);
  uint64_t res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(cmp_res_0));
@ -259,7 +259,7 @@ uint64_t unsigned_lteq_against_input<simdjson::instruction_set::avx2>(simd_input
 #ifdef __ARM_NEON
 template<> really_inline
-uint64_t unsigned_lteq_against_input<simdjson::instruction_set::neon>(simd_input<simdjson::instruction_set::neon> in, uint8_t m) {
+uint64_t unsigned_lteq_against_input<instruction_set::neon>(simd_input<instruction_set::neon> in, uint8_t m) {
  const uint8x16_t mask = vmovq_n_u8(m); 
  uint8x16_t cmp_res_0 = vcleq_u8(in.i.val[0], mask); 
  uint8x16_t cmp_res_1 = vcleq_u8(in.i.val[1], mask); 
@ -278,7 +278,7 @@ uint64_t unsigned_lteq_against_input<simdjson::instruction_set::neon>(simd_input
 // indicate whether we end an iteration on an odd-length sequence of
 // backslashes, which modifies our subsequent search for odd-length
 // sequences of backslashes in an obvious way.
-template<simdjson::instruction_set T> really_inline
+template<instruction_set T> really_inline
 uint64_t find_odd_backslash_sequences(simd_input<T> in, uint64_t &prev_iter_ends_odd_backslash) {
  const uint64_t even_bits = 0x5555555555555555ULL;
  const uint64_t odd_bits = ~even_bits;
@ -323,7 +323,7 @@ uint64_t find_odd_backslash_sequences(simd_input<T> in, uint64_t &prev_iter_ends
 // Note that we don't do any error checking to see if we have backslash
 // sequences outside quotes; these
 // backslash sequences (of any length) will be detected elsewhere.
-template<simdjson::instruction_set T> really_inline
+template<instruction_set T> really_inline
 uint64_t find_quote_mask_and_bits(simd_input<T> in, uint64_t odd_ends,
    uint64_t &prev_iter_inside_quote, uint64_t &quote_bits, uint64_t &error_mask) {
  quote_bits = cmp_mask_against_input<T>(in, '"');
@ -352,14 +352,14 @@ uint64_t find_quote_mask_and_bits(simd_input<T> in, uint64_t odd_ends,
 // we are also interested in the four whitespace characters
 // space 0x20, linefeed 0x0a, horizontal tab 0x09 and carriage return 0x0d
 // these go into the next 2 buckets of the comparison (8/16)
-template<simdjson::instruction_set T>
+template<instruction_set T>
 void find_whitespace_and_structurals(simd_input<T> in,
                                     uint64_t &whitespace,
                                     uint64_t &structurals);
 #ifdef __AVX2__
 template<> really_inline
-void find_whitespace_and_structurals<simdjson::instruction_set::avx2>(simd_input<simdjson::instruction_set::avx2> in,
+void find_whitespace_and_structurals<instruction_set::avx2>(simd_input<instruction_set::avx2> in,
                                                     uint64_t &whitespace,
                                                     uint64_t &structurals) {
 #ifdef SIMDJSON_NAIVE_STRUCTURAL
@ -451,8 +451,8 @@ void find_whitespace_and_structurals<simdjson::instruction_set::avx2>(simd_input
 #ifdef __ARM_NEON
 template<> really_inline
-void find_whitespace_and_structurals<simdjson::instruction_set::neon>(
+void find_whitespace_and_structurals<instruction_set::neon>(
-                                                  simd_input<simdjson::instruction_set::neon> in,
+                                                  simd_input<instruction_set::neon> in,
                                                  uint64_t &whitespace,
                                                  uint64_t &structurals) {
 #ifndef FUNKY_BAD_TABLE
@ -698,7 +698,7 @@ really_inline uint64_t finalize_structurals(
  return structurals;
 }
-template<simdjson::instruction_set T = simdjson::instruction_set::native>
+template<instruction_set T = instruction_set::native>
 WARN_UNUSED
 /*never_inline*/ int find_structural_bits(const uint8_t *buf, size_t len,
                                           ParsedJson &pj) {
@ -849,7 +849,7 @@ WARN_UNUSED
 #endif
 }
-template<simdjson::instruction_set T = simdjson::instruction_set::native>
+template<instruction_set T = instruction_set::native>
 WARN_UNUSED
 int find_structural_bits(const char *buf, size_t len, ParsedJson &pj) {
  return find_structural_bits<T>(reinterpret_cast<const uint8_t *>(buf), len, pj);
--- a/include/simdjson/stage2_build_tape.h
+++ b/include/simdjson/stage2_build_tape.h
@ -1,18 +1,588 @@
 #ifndef SIMDJSON_STAGE2_BUILD_TAPE_H
 #define SIMDJSON_STAGE2_BUILD_TAPE_H
 #include <cassert>
 #include <cstring>
 #include <iostream>
 #include "simdjson/common_defs.h"
 #include "simdjson/jsoncharutils.h"
 #include "simdjson/numberparsing.h"
 #include "simdjson/parsedjson.h"
 #include "simdjson/stringparsing.h"
 #include "simdjson/simdjson.h"
 #define PATH_SEP '/'
 namespace simdjson {
 struct ParsedJson;
 void init_state_machine();
 WARN_UNUSED
-int unified_machine(const uint8_t *buf, size_t len, ParsedJson &pj);
+really_inline bool is_valid_true_atom(const uint8_t *loc) {
  // Returns true when the four bytes at loc spell "true" and
  // is_not_structural_or_whitespace(loc[4]) is 0 for the byte that follows.
  //
  // Both 64-bit words are filled with memcpy: dereferencing a uint64_t pointer
  // aimed at a string literal, or at an arbitrary offset of the input, is a
  // type-punned and possibly unaligned load (undefined in C/C++).
  //
  // The 8-byte copy from loc can read up to 7 bytes beyond the buffer size,
  // but we require SIMDJSON_PADDING of padding.
  static_assert(sizeof(uint64_t) - 1 <= SIMDJSON_PADDING);
  uint64_t tv;
  std::memcpy(&tv, "true    ", sizeof(uint64_t));
  uint64_t locval;
  std::memcpy(&locval, loc, sizeof(uint64_t));
  const uint64_t mask4 = 0x00000000ffffffff;
  // The narrowing to 32 bits is deliberate: only the bits selected by mask4
  // take part in the comparison, the other half of tv (the padding spaces) is
  // dropped by the conversion.
  uint32_t error = static_cast<uint32_t>((locval & mask4) ^ tv);
  error |= is_not_structural_or_whitespace(loc[4]);
  return error == 0;
 }
 WARN_UNUSED
-int unified_machine(const char *buf, size_t len, ParsedJson &pj);
+really_inline bool is_valid_false_atom(const uint8_t *loc) {
  // Returns true when the five bytes at loc spell "false" and
  // is_not_structural_or_whitespace(loc[5]) is 0 for the byte that follows.
  //
  // The expected word is written as an integer constant instead of being
  // loaded from the literal "false   ": the constant holds the five letters
  // in its low 40 bits (little-endian byte order) and zeroes in the unused
  // upper bits, so it can be compared directly against the masked input.
  const uint64_t false_word = 0x00000065736c6166;
  const uint64_t low_five_bytes = 0x000000ffffffffff;
  // The 8-byte copy below can read up to 7 bytes beyond the buffer size, but
  // we require SIMDJSON_PADDING of padding. memcpy is used because a direct
  // unaligned 64-bit load is undefined in C/C++.
  static_assert(sizeof(uint64_t) - 1 <= SIMDJSON_PADDING);
  uint64_t input_word;
  std::memcpy(&input_word, loc, sizeof(uint64_t));
  // "false" is 5 bytes long, so the mismatch must be tracked on 64 bits: a
  // 32-bit accumulator would ignore its last character.
  const uint64_t mismatch = (input_word & low_five_bytes) ^ false_word;
  const uint64_t bad_terminator = is_not_structural_or_whitespace(loc[5]);
  return (mismatch | bad_terminator) == 0;
 }
 WARN_UNUSED
 really_inline bool is_valid_null_atom(const uint8_t *loc) {
  // Returns true when the four bytes at loc spell "null" and
  // is_not_structural_or_whitespace(loc[4]) is 0 for the byte that follows.
  //
  // Both 64-bit words are filled with memcpy: dereferencing a uint64_t pointer
  // aimed at a string literal, or at an arbitrary offset of the input, is a
  // type-punned and possibly unaligned load (undefined in C/C++).
  //
  // The 8-byte copy from loc can read up to 7 bytes beyond the buffer size,
  // but we require SIMDJSON_PADDING of padding.
  static_assert(sizeof(uint64_t) - 1 <= SIMDJSON_PADDING);
  uint64_t nv;
  std::memcpy(&nv, "null    ", sizeof(uint64_t));
  uint64_t locval;
  std::memcpy(&locval, loc, sizeof(uint64_t));
  const uint64_t mask4 = 0x00000000ffffffff;
  // The narrowing to 32 bits is deliberate: only the bits selected by mask4
  // take part in the comparison, the other half of nv (the padding spaces) is
  // dropped by the conversion.
  uint32_t error = static_cast<uint32_t>((locval & mask4) ^ nv);
  error |= is_not_structural_or_whitespace(loc[4]);
  return error == 0;
 }
 /************
 * Stage 2 of the parser: the JSON is parsed to a tape, see the accompanying
 * tape.md file for documentation of the tape format.
 *
 * The function visits the positions listed in pj.structural_indexes one at a
 * time (UPDATE_CHAR), looks at the byte found at each position in buf and
 * drives a goto-based state machine that writes tape entries into pj.
 * NOTE(review): pj.structural_indexes is presumably filled by
 * find_structural_bits before this is called -- confirm against the callers.
 *
 * Template parameter:
 *   T    instruction set; it is only forwarded to parse_string<T>.
 * Parameters:
 *   buf  input bytes. Unless ALLOW_SAME_PAGE_BUFFER_OVERRUN is defined, the
 *        SIMDJSON_PADDING bytes that follow buf + len are overwritten with
 *        zeros (through a cast that removes const), so they must be writable.
 *   len  number of input bytes, padding excluded.
 *   pj   receives the tape; pj.init() is called first, pj.isvalid is set to
 *        true only on success and pj.errorcode always holds the returned code.
 * Returns (the value is also stored in pj.errorcode):
 *   simdjson::SUCCESS      the document was written to the tape
 *   simdjson::CAPACITY     pj.bytecapacity < len
 *   simdjson::DEPTH_ERROR  depth reached pj.depthcapacity
 *   simdjson::STRING_ERROR, NUMBER_ERROR, T_ATOM_ERROR, N_ATOM_ERROR,
 *   simdjson::F_ATOM_ERROR the failure happened while the last structural
 *                          character read was '"', a digit or '-', 't', 'n'
 *                          or 'f' respectively
 *   simdjson::TAPE_ERROR   any other failure
 ***********/
 template<instruction_set T = instruction_set::native>
 WARN_UNUSED  ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER LENIENT_MEM_SANITIZER
 int unified_machine(const uint8_t *buf, size_t len, ParsedJson &pj) {
 #ifndef ALLOW_SAME_PAGE_BUFFER_OVERRUN
  // zero the padding that follows the input; note that this writes through
  // a pointer that was received as const
  memset((uint8_t*)buf + len, 0, SIMDJSON_PADDING); // to please valgrind
 #endif
  uint32_t i = 0; // index of the structural character (0,1,2,3...)
  uint32_t idx;   // location of the structural character in the input (buf)
  uint8_t c; // used to track the (structural) character we are looking at, updated
        // by UPDATE_CHAR macro
  uint32_t depth = 0; // could have an arbitrary starting depth
  pj.init(); // sets isvalid to false
  // refuse inputs that are longer than what pj was sized for
  if(pj.bytecapacity < len) {
      pj.errorcode = simdjson::CAPACITY;
      return pj.errorcode;
  }
 // this macro reads the next structural character, updating idx, i and c.
 #define UPDATE_CHAR()                                                          \
  {                                                                            \
    idx = pj.structural_indexes[i++];                                          \
    c = buf[idx];                                                              \
  }
  ////////////////////////////// START STATE /////////////////////////////
  // pj.ret_address[depth] records the state to resume once the scope opened
  // at that depth is closed: a label address when computed gotos are enabled,
  // otherwise a tag character ('s' = start, 'o' = object, 'a' = array).
 #ifdef SIMDJSON_USE_COMPUTED_GOTO 
  pj.ret_address[depth] = &&start_continue;
 #else
  pj.ret_address[depth] = 's';
 #endif
  pj.containing_scope_offset[depth] = pj.get_current_loc();
  pj.write_tape(0, 'r'); // r for root, 0 is going to get overwritten
  // the root is used, if nothing else, to capture the size of the tape
  depth++; // everything starts at depth = 1, depth = 0 is just for the root, the root may contain an object, an array or something else.
  if (depth >= pj.depthcapacity) {
    goto fail;
  }
  UPDATE_CHAR();
  // dispatch on the first structural character of the document
  switch (c) {
  case '{':
    pj.containing_scope_offset[depth] = pj.get_current_loc();
 #ifdef SIMDJSON_USE_COMPUTED_GOTO 
    pj.ret_address[depth] = &&start_continue;
 #else
    pj.ret_address[depth] = 's';
 #endif
    depth++;
    if (depth >= pj.depthcapacity) {
      goto fail;
    }
    pj.write_tape(0, c); // strangely, moving this to object_begin slows things down
    goto object_begin;
  case '[':
    pj.containing_scope_offset[depth] = pj.get_current_loc();
 #ifdef SIMDJSON_USE_COMPUTED_GOTO 
    pj.ret_address[depth] = &&start_continue;
 #else
    pj.ret_address[depth] = 's';
 #endif    
    depth++;
    if (depth >= pj.depthcapacity) {
      goto fail;
    }
    pj.write_tape(0, c);
    goto array_begin;
 #define SIMDJSON_ALLOWANYTHINGINROOT
    // A JSON text is a serialized value.  Note that certain previous
    // specifications of JSON constrained a JSON text to be an object or an
    // array.  Implementations that generate only objects or arrays where a
    // JSON text is called for will be interoperable in the sense that all
    // implementations will accept these as conforming JSON texts.
    // https://tools.ietf.org/html/rfc8259
    //
    // The cases below accept a scalar as the root value. The atom and number
    // cases validate a temporary copy of the input in which the byte at
    // offset len is replaced by a terminator. If that copy cannot be
    // allocated, control goes to fail and the error code is derived from c.
 #ifdef SIMDJSON_ALLOWANYTHINGINROOT
  case '"': {
    if (!parse_string<T>(buf, len, pj, depth, idx)) {
      goto fail;
    }
    break;
  }
  case 't': {
    // we need to make a copy to make sure that the string is space terminated.
    // this only applies to the JSON document made solely of the true value.
    // this will almost never be called in practice
    char * copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING));
    if(copy == nullptr) { 
      goto fail;
    }
    memcpy(copy, buf, len);
    copy[len] = ' ';
    if (!is_valid_true_atom(reinterpret_cast<const uint8_t *>(copy) + idx)) {
      free(copy);
      goto fail;
    }
    free(copy);
    pj.write_tape(0, c);
    break;
  }
  case 'f': {
    // we need to make a copy to make sure that the string is space terminated.
    // this only applies to the JSON document made solely of the false value.
    // this will almost never be called in practice
    char * copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING));
    if(copy == nullptr) { 
      goto fail;
    }
    memcpy(copy, buf, len);
    copy[len] = ' ';
    if (!is_valid_false_atom(reinterpret_cast<const uint8_t *>(copy) + idx)) {
      free(copy);
      goto fail;
    }
    free(copy);
    pj.write_tape(0, c);
    break;
  }
  case 'n': {
    // we need to make a copy to make sure that the string is space terminated.
    // this only applies to the JSON document made solely of the null value.
    // this will almost never be called in practice
    char * copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING));
    if(copy == nullptr) { 
      goto fail;
    }
    memcpy(copy, buf, len);
    copy[len] = ' ';
    if (!is_valid_null_atom(reinterpret_cast<const uint8_t *>(copy) + idx)) {
      free(copy);
      goto fail;
    }
    free(copy);
    pj.write_tape(0, c);
    break;
  }
  case '0': 
  case '1':
  case '2':
  case '3':
  case '4':
  case '5':
  case '6':
  case '7':
  case '8':
  case '9': {
    // we need to make a copy to make sure that the string is space terminated.
    // this is done only for JSON documents made of a sole number
    // this will almost never be called in practice. We terminate with a space
    // because we do not want to allow NULLs in the middle of a number (whereas a
    // space in the middle of a number would be identified in stage 1).
    char * copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING));
    if(copy == nullptr) { 
      goto fail;
    }
    memcpy(copy, buf, len);
    copy[len] = ' ';
    if (!parse_number(reinterpret_cast<const uint8_t *>(copy), pj, idx, false)) {
      free(copy);
      goto fail;
    }
    free(copy);
    break;
  }
  case '-': {
    // we need to make a copy to make sure that the string is NULL terminated.
    // this is done only for JSON documents made of a sole number
    // this will almost never be called in practice
    // NOTE(review): this case terminates the copy with '\0' whereas the
    // digit case above uses ' ' -- confirm that the difference is intended.
    char * copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING));
    if(copy == nullptr) { 
      goto fail;
    }
    memcpy(copy, buf, len);
    copy[len] = '\0';
    if (!parse_number(reinterpret_cast<const uint8_t *>(copy), pj, idx, true)) {
      free(copy);
      goto fail;
    }
    free(copy);
    break;
  }
 #endif // ALLOWANYTHINGINROOT
  default:
    goto fail;
  }
 start_continue:
  // Reached after a scalar root value, or from scope_end once the root
  // object/array has been closed.
  // the string might not be NULL terminated.
  // NOTE(review): success requires exactly one structural index to be left
  // unread; presumably stage 1 appends a trailing index -- confirm.
  if(i + 1 == pj.n_structural_indexes) {
    goto succeed;
  } else {
    goto fail;
  }
  ////////////////////////////// OBJECT STATES /////////////////////////////
 object_begin:
  // just after '{': expect a first key or an immediate '}'
  UPDATE_CHAR();
  switch (c) {
  case '"': {
    if (!parse_string<T>(buf, len, pj, depth, idx)) {
      goto fail;
    }
    goto object_key_state;
  }
  case '}':
    goto scope_end; // could also go to object_continue
  default:
    goto fail;
  }
 object_key_state:
  // a key has just been parsed: expect ':' followed by a value
  UPDATE_CHAR();
  if (c != ':') {
    goto fail;
  }
  UPDATE_CHAR();
  switch (c) {
  case '"': {
    if (!parse_string<T>(buf, len, pj, depth, idx)) {
      goto fail; 
    }
    break;
  }
  case 't':
    if (!is_valid_true_atom(buf + idx)) {
      goto fail;
    }
    pj.write_tape(0, c);
    break;
  case 'f':
    if (!is_valid_false_atom(buf + idx)) {
      goto fail;
    }
    pj.write_tape(0, c);
    break;
  case 'n':
    if (!is_valid_null_atom(buf + idx)) {
      goto fail;
    }
    pj.write_tape(0, c);
    break;
  case '0': 
  case '1':
  case '2':
  case '3':
  case '4':
  case '5':
  case '6':
  case '7':
  case '8':
  case '9': {
    if (!parse_number(buf, pj, idx, false)) {
      goto fail;
    }
    break;
  }
  case '-': {
    if (!parse_number(buf, pj, idx, true)) {
      goto fail;
    }
    break;
  }
  case '{': {
    pj.containing_scope_offset[depth] = pj.get_current_loc();
    pj.write_tape(0, c); // here the compilers knows what c is so this gets optimized
    // we have not yet encountered } so we need to come back for it
 #ifdef SIMDJSON_USE_COMPUTED_GOTO 
    pj.ret_address[depth] = &&object_continue;
 #else
    pj.ret_address[depth] = 'o';
 #endif
    // we found an object inside an object, so we need to increment the depth
    depth++;
    if (depth >= pj.depthcapacity) {
      goto fail;
    }
    goto object_begin;
  }
  case '[': {
    pj.containing_scope_offset[depth] = pj.get_current_loc();
    pj.write_tape(0, c);  // here the compilers knows what c is so this gets optimized
    // we have not yet encountered } so we need to come back for it
 #ifdef SIMDJSON_USE_COMPUTED_GOTO 
    pj.ret_address[depth] = &&object_continue;
 #else
    pj.ret_address[depth] = 'o';
 #endif    
    // we found an array inside an object, so we need to increment the depth
    depth++;
    if (depth >= pj.depthcapacity) {
      goto fail;
    }
    goto array_begin;
  }
  default:
    goto fail;
  }
  // a scalar value falls out of the switch straight into object_continue
 object_continue:
  // after a value: expect ',' followed by the next key, or the closing '}'
  UPDATE_CHAR();
  switch (c) {
  case ',':
    UPDATE_CHAR();
    if (c != '"') {
      goto fail;
    } else {
      if (!parse_string<T>(buf, len, pj, depth, idx)) {
        goto fail; 
      }
      goto object_key_state;
    }
  case '}':
    goto scope_end;
  default:
    goto fail;
  }
  ////////////////////////////// COMMON STATE /////////////////////////////
 scope_end:
  // every jump to this label directly follows a match of c against '}' or
  // ']', so c is the closing character that gets written to the tape here
  // write our tape location to the header scope
  depth--;
  pj.write_tape(pj.containing_scope_offset[depth], c);
  pj.annotate_previousloc(pj.containing_scope_offset[depth],
                          pj.get_current_loc());
  // goto saved_state
 #ifdef SIMDJSON_USE_COMPUTED_GOTO
  goto *pj.ret_address[depth];
 #else
  if(pj.ret_address[depth] == 'a') {
    goto array_continue;
  } else if (pj.ret_address[depth] == 'o') {
    goto object_continue;
  } else goto start_continue;
 #endif
  ////////////////////////////// ARRAY STATES /////////////////////////////
 array_begin:
  // just after '[': an immediate ']' closes the (empty) array
  UPDATE_CHAR();
  if (c == ']') {
    goto scope_end; // could also go to array_continue
  }
 main_array_switch:
  // we call update char on all paths in, so we can peek at c on the
  // on paths that can accept a close square brace (post-, and at start)
  switch (c) {
  case '"': {
    if (!parse_string<T>(buf, len, pj, depth, idx)) {
      goto fail;
    }
    break;
  }
  case 't':
    if (!is_valid_true_atom(buf + idx)) {
      goto fail;
    }
    pj.write_tape(0, c);
    break; 
  case 'f':
    if (!is_valid_false_atom(buf + idx)) {
      goto fail;
    }
    pj.write_tape(0, c);
    break; 
  case 'n':
    if (!is_valid_null_atom(buf + idx)) {
      goto fail;
    }
    pj.write_tape(0, c);
    break; // goto array_continue;
  case '0': 
  case '1':
  case '2':
  case '3':
  case '4':
  case '5':
  case '6':
  case '7':
  case '8':
  case '9': {
    if (!parse_number(buf, pj, idx, false)) {
      goto fail;
    }
    break; // goto array_continue;
  }
  case '-': {
    if (!parse_number(buf, pj, idx, true)) {
      goto fail;
    }
    break; // goto array_continue;
  }
  case '{': {
    // we have not yet encountered ] so we need to come back for it
    pj.containing_scope_offset[depth] = pj.get_current_loc();
    pj.write_tape(0, c); //  here the compilers knows what c is so this gets optimized
 #ifdef SIMDJSON_USE_COMPUTED_GOTO 
    pj.ret_address[depth] = &&array_continue;
 #else
    pj.ret_address[depth] = 'a';
 #endif
    // we found an object inside an array, so we need to increment the depth
    depth++;
    if (depth >= pj.depthcapacity) {
      goto fail;
    }
    goto object_begin;
  }
  case '[': {
    // we have not yet encountered ] so we need to come back for it
    pj.containing_scope_offset[depth] = pj.get_current_loc();
    pj.write_tape(0, c); // here the compilers knows what c is so this gets optimized
 #ifdef SIMDJSON_USE_COMPUTED_GOTO 
    pj.ret_address[depth] = &&array_continue;
 #else
    pj.ret_address[depth] = 'a';
 #endif
    // we found an array inside an array, so we need to increment the depth
    depth++;
    if (depth >= pj.depthcapacity) {
      goto fail;
    }
    goto array_begin;
  }
  default:
    goto fail;
  }
  // a scalar element falls out of the switch straight into array_continue
 array_continue:
  // after an element: expect ',' followed by another element, or ']'
  UPDATE_CHAR();
  switch (c) {
  case ',':
    UPDATE_CHAR();
    goto main_array_switch;
  case ']':
    goto scope_end;
  default:
    goto fail;
  }
  ////////////////////////////// FINAL STATES /////////////////////////////
 succeed:
  // step back to the root level; the two checks below abort the process if
  // the bookkeeping of the state machine is inconsistent
  depth --;
  if(depth != 0) {
    fprintf(stderr, "internal bug\n");
    abort();
  }
  if(pj.containing_scope_offset[depth] != 0) {
    fprintf(stderr, "internal bug\n");
    abort();
  }
  pj.annotate_previousloc(pj.containing_scope_offset[depth],
                          pj.get_current_loc());
  pj.write_tape(pj.containing_scope_offset[depth], 'r'); // r is root
  pj.isvalid  = true;
  pj.errorcode = simdjson::SUCCESS;
  return pj.errorcode;
 fail:
  // we do not need the next line because this is done by pj.init(), pessimistically.
  // pj.isvalid  = false;
  // At this point in the code, we have all the time in the world.
  // Note that we know exactly where we are in the document so we could,
  // without any overhead on the processing code, report a specific location.
  // We could even trigger special code paths to assess what happened carefully,
  // all without any added cost.
  // The depth test comes first; it also covers the one jump to fail that
  // happens before c has been assigned.
  if (depth >= pj.depthcapacity) {
    pj.errorcode = simdjson::DEPTH_ERROR;
    return pj.errorcode;
  }
  // Otherwise the code is chosen from the last structural character read,
  // which is not necessarily the construct that caused the failure.
  switch(c) {
    case '"':
      pj.errorcode = simdjson::STRING_ERROR; 
      return pj.errorcode;
    case '0':
    case '1':
    case '2':
    case '3':
    case '4':
    case '5':
    case '6':
    case '7':
    case '8':
    case '9': 
    case '-': 
      pj.errorcode = simdjson::NUMBER_ERROR;
      return pj.errorcode;
    case 't':
      pj.errorcode = simdjson::T_ATOM_ERROR;
      return pj.errorcode;
    case 'n':
      pj.errorcode = simdjson::N_ATOM_ERROR;
      return pj.errorcode;
    case 'f':
      pj.errorcode = simdjson::F_ATOM_ERROR;
      return pj.errorcode;
    default: 
      break;
  }
  pj.errorcode = simdjson::TAPE_ERROR;
  return pj.errorcode; 
 }
 template<instruction_set T = instruction_set::native>
 int unified_machine(const char *buf, size_t len, ParsedJson &pj) {
  return unified_machine<T>(reinterpret_cast<const uint8_t*>(buf), len, pj);
 }
 }
 #endif
--- a/include/simdjson/stringparsing.h
+++ b/include/simdjson/stringparsing.h
@ -5,6 +5,11 @@
 #include "simdjson/jsoncharutils.h"
 #include "simdjson/parsedjson.h"
 #ifdef JSON_TEST_STRINGS
 void foundString(const uint8_t *buf, const uint8_t *parsed_begin, const uint8_t *parsed_end);
 void foundBadString(const uint8_t *buf);
 #endif
 namespace simdjson {
 // begin copypasta
 // These chars yield themselves: " \ /
@ -76,19 +81,19 @@ really_inline bool handle_unicode_codepoint(const uint8_t **src_ptr, uint8_t **d
 #include <arm_neon.h>
 #endif
-WARN_UNUSED ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER LENIENT_MEM_SANITIZER
+// Holds backslashes and quotes locations.
-really_inline  bool parse_string(UNUSED const uint8_t *buf, UNUSED size_t len,
+struct parse_string_helper {
-                                ParsedJson &pj, UNUSED const uint32_t depth, UNUSED uint32_t offset) {
+  uint32_t bs_bits;
-#ifdef SIMDJSON_SKIPSTRINGPARSING // for performance analysis, it is sometimes useful to skip parsing
+  uint32_t quote_bits;
-  pj.write_tape(0, '"');// don't bother with the string parsing at all
+};
-  return true; // always succeeds
+
-#else
+// Finds where the backslashes and quotes are located.
-  pj.write_tape(pj.current_string_buf_loc - pj.string_buf, '"');
+template<instruction_set>
-  const uint8_t *src = &buf[offset + 1]; // we know that buf at offset is a "
+parse_string_helper find_bs_bits_and_quote_bits(const uint8_t *src, uint8_t *dst);
-  uint8_t *dst = pj.current_string_buf_loc + sizeof(uint32_t);
+
  const uint8_t *const start_of_string = dst;
  while (1) {
 #ifdef __AVX2__
 template<> really_inline
 parse_string_helper find_bs_bits_and_quote_bits<instruction_set::avx2> (const uint8_t *src, uint8_t *dst) {
    // this can read up to 31 bytes beyond the buffer size, but we require 
    // SIMDJSON_PADDING of padding
    static_assert(sizeof(__m256i) - 1 <= SIMDJSON_PADDING);
@ -96,12 +101,17 @@ really_inline  bool parse_string(UNUSED const uint8_t *buf, UNUSED size_t len,
    // store to dest unconditionally - we can overwrite the bits we don't like
    // later
    _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), v);
    auto bs_bits =
        static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi8(v, _mm256_set1_epi8('\\'))));
    auto quote_mask = _mm256_cmpeq_epi8(v, _mm256_set1_epi8('"'));
-    auto quote_bits =
+    return {
-        static_cast<uint32_t>(_mm256_movemask_epi8(quote_mask));
+      static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi8(v, _mm256_set1_epi8('\\')))), // bs_bits
-#else
+      static_cast<uint32_t>(_mm256_movemask_epi8(quote_mask)) // quote_bits
    };
 }
 #endif
 #ifdef __ARM_NEON
 template<> really_inline
 parse_string_helper find_bs_bits_and_quote_bits<instruction_set::neon> (const uint8_t *src, uint8_t *dst) {
    // this can read up to 31 bytes beyond the buffer size, but we require 
    // SIMDJSON_PADDING of padding
    static_assert(2 * sizeof(uint8x16_t) - 1 <= SIMDJSON_PADDING);
@ -128,14 +138,32 @@ really_inline  bool parse_string(UNUSED const uint8_t *buf, UNUSED size_t len,
    uint8x16_t sum1 = vpaddq_u8(cmp_qt_0, cmp_qt_1);
    sum0 = vpaddq_u8(sum0, sum1);
    sum0 = vpaddq_u8(sum0, sum0);
-    auto bs_bits =  vgetq_lane_u32(vreinterpretq_u32_u8(sum0), 0);
+    return {
-    auto quote_bits =  vgetq_lane_u32(vreinterpretq_u32_u8(sum0), 1);
+      vgetq_lane_u32(vreinterpretq_u32_u8(sum0), 0), // bs_bits
      vgetq_lane_u32(vreinterpretq_u32_u8(sum0), 1) // quote_bits
    };
 }
 #endif
-    if(((bs_bits - 1) & quote_bits) != 0 ) {
+
 template<instruction_set T>
 WARN_UNUSED ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER LENIENT_MEM_SANITIZER really_inline 
 bool parse_string(UNUSED const uint8_t *buf, UNUSED size_t len,
                                ParsedJson &pj, UNUSED const uint32_t depth, UNUSED uint32_t offset) {
 #ifdef SIMDJSON_SKIPSTRINGPARSING // for performance analysis, it is sometimes useful to skip parsing
  pj.write_tape(0, '"');// don't bother with the string parsing at all
  return true; // always succeeds
 #else
  pj.write_tape(pj.current_string_buf_loc - pj.string_buf, '"');
  const uint8_t *src = &buf[offset + 1]; // we know that buf at offset is a "
  uint8_t *dst = pj.current_string_buf_loc + sizeof(uint32_t);
  const uint8_t *const start_of_string = dst;
  while (1) {
    parse_string_helper helper = find_bs_bits_and_quote_bits<T>(src, dst);
    if(((helper.bs_bits - 1) & helper.quote_bits) != 0 ) {
      // we encountered quotes first. Move dst to point to quotes and exit
      // find out where the quote is...
-      uint32_t quote_dist = trailingzeroes(quote_bits);
+      uint32_t quote_dist = trailingzeroes(helper.quote_bits);
      // NULL termination is still handy if you expect all your strings to be NULL terminated?
      // It comes at a small cost
@ -158,9 +186,9 @@ really_inline  bool parse_string(UNUSED const uint8_t *buf, UNUSED size_t len,
 #endif // JSON_TEST_STRINGS
      return true;
    } 
-    if(((quote_bits - 1) & bs_bits ) != 0 ) {
+    if(((helper.quote_bits - 1) & helper.bs_bits ) != 0 ) {
     // find out where the backslash is
-      uint32_t bs_dist = trailingzeroes(bs_bits);
+      uint32_t bs_dist = trailingzeroes(helper.bs_bits);
      uint8_t escape_char = src[bs_dist + 1];
      // we encountered backslash first. Handle backslash
      if (escape_char == 'u') {
--- a/src/stage2_build_tape.cpp
+++ b/src/stage2_build_tape.cpp
@ -1,580 +1 @@
-#include <cassert>
+// File kept in case we want to reuse it soon. (many configuration files to edit)
 #include <cstring>
 #include "simdjson/common_defs.h"
 #include "simdjson/jsoncharutils.h"
 #include "simdjson/numberparsing.h"
 #include "simdjson/parsedjson.h"
 #include "simdjson/stringparsing.h"
 #include "simdjson/simdjson.h"
 #include <iostream>
 #define PATH_SEP '/'
 namespace simdjson {
 // Checks that the four bytes at loc spell the JSON literal "true" and that
 // the byte right after it (loc[4]) is accepted by
 // is_not_structural_or_whitespace, i.e. that call yields 0.
 //
 // loc: pointer to the first byte of the candidate atom; loc[0..4] are read.
 // Returns true when the atom is valid, false otherwise.
 WARN_UNUSED
 really_inline bool is_valid_true_atom(const uint8_t *loc) {
  // Both the expected pattern and the input are loaded with memcpy: this
  // avoids dereferencing a string literal through a uint64_t pointer
  // (unaligned / aliasing-violating load) and makes the comparison
  // independent of the host byte order.
  uint32_t expected;
  std::memcpy(&expected, "true", sizeof(expected));
  uint32_t found;
  std::memcpy(&found, loc, sizeof(found));
  uint32_t error = found ^ expected; // zero iff the four bytes match
  error |= is_not_structural_or_whitespace(loc[4]);
  return error == 0;
 }
 // Checks that the five bytes at loc spell the JSON literal "false" and that
 // the byte right after it (loc[5]) is accepted by
 // is_not_structural_or_whitespace, i.e. that call yields 0.
 //
 // loc: pointer to the first byte of the candidate atom; loc[0..5] are read.
 // Returns true when the atom is valid, false otherwise.
 WARN_UNUSED
 really_inline bool is_valid_false_atom(const uint8_t *loc) {
  // "false" is five bytes long, so a 32-bit comparison would ignore its last
  // character. We copy exactly five bytes into zero-initialized 64-bit words
  // on both sides; the unused upper bytes stay zero and the comparison does
  // not depend on the host byte order (no hard-coded little-endian constant).
  const size_t atom_len = 5;
  uint64_t expected = 0;
  std::memcpy(&expected, "false", atom_len);
  uint64_t found = 0;
  std::memcpy(&found, loc, atom_len);
  uint64_t error = found ^ expected; // zero iff the five bytes match
  error |= is_not_structural_or_whitespace(loc[5]);
  return error == 0;
 }
 // Checks that the four bytes at loc spell the JSON literal "null" and that
 // the byte right after it (loc[4]) is accepted by
 // is_not_structural_or_whitespace, i.e. that call yields 0.
 //
 // loc: pointer to the first byte of the candidate atom; loc[0..4] are read.
 // Returns true when the atom is valid, false otherwise.
 WARN_UNUSED
 really_inline bool is_valid_null_atom(const uint8_t *loc) {
  // Both the expected pattern and the input are loaded with memcpy: this
  // avoids dereferencing a string literal through a uint64_t pointer
  // (unaligned / aliasing-violating load) and makes the comparison
  // independent of the host byte order.
  uint32_t expected;
  std::memcpy(&expected, "null", sizeof(expected));
  uint32_t found;
  std::memcpy(&found, loc, sizeof(found));
  uint32_t error = found ^ expected; // zero iff the four bytes match
  error |= is_not_structural_or_whitespace(loc[4]);
  return error == 0;
 }
 /************
 * The JSON is parsed to a tape, see the accompanying tape.md file
 * for documentation.
 *
 * unified_machine walks the structural characters whose positions are
 * listed in pj.structural_indexes (presumably filled by an earlier stage
 * such as find_structural_bits - confirm against the caller) and writes the
 * matching tape entries into pj. It is a goto-based state machine: each
 * nested object or array stores a return state in pj.ret_address[depth],
 * and scope_end jumps back to that state once the scope is closed.
 *
 * buf: the JSON input. Unless ALLOW_SAME_PAGE_BUFFER_OVERRUN is defined,
 *      SIMDJSON_PADDING bytes starting at buf + len are zeroed, so the
 *      caller must supply a writable buffer with that much extra room.
 * len: number of input bytes in buf.
 * pj:  receives the tape; pj.init() is called first.
 *
 * Returns pj.errorcode: SUCCESS (pj.isvalid is then set to true), CAPACITY
 * when pj.bytecapacity < len, DEPTH_ERROR when depth has reached
 * pj.depthcapacity, and otherwise STRING_ERROR, NUMBER_ERROR, T_ATOM_ERROR,
 * N_ATOM_ERROR, F_ATOM_ERROR or TAPE_ERROR, chosen from the structural
 * character that was being processed when the failure was detected.
 ***********/
 WARN_UNUSED  ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER LENIENT_MEM_SANITIZER
 int unified_machine(const uint8_t *buf, size_t len, ParsedJson &pj) {
 #ifndef ALLOW_SAME_PAGE_BUFFER_OVERRUN
  // note: this writes past len through a cast that drops const
  memset((uint8_t*)buf + len, 0, SIMDJSON_PADDING); // to please valgrind
 #endif
  uint32_t i = 0; // index of the structural character (0,1,2,3...)
  uint32_t idx;   // location of the structural character in the input (buf)
  uint8_t c; // used to track the (structural) character we are looking at, updated
        // by UPDATE_CHAR macro
  uint32_t depth = 0; // could have an arbitrary starting depth
  pj.init(); // sets isvalid to false
  if(pj.bytecapacity < len) {
      pj.errorcode = CAPACITY;
      return pj.errorcode;
  }
 // this macro reads the next structural character, updating idx, i and c.
 #define UPDATE_CHAR()                                                          \
  {                                                                            \
    idx = pj.structural_indexes[i++];                                          \
    c = buf[idx];                                                              \
  }
  ////////////////////////////// START STATE /////////////////////////////
  // ret_address[depth] records where scope_end resumes: a label address with
  // computed gotos, otherwise a tag character ('s' start, 'o' object,
  // 'a' array).
 #ifdef SIMDJSON_USE_COMPUTED_GOTO 
  pj.ret_address[depth] = &&start_continue;
 #else
  pj.ret_address[depth] = 's';
 #endif
  pj.containing_scope_offset[depth] = pj.get_current_loc();
  pj.write_tape(0, 'r'); // r for root, 0 is going to get overwritten
  // the root is used, if nothing else, to capture the size of the tape
  depth++; // everything starts at depth = 1, depth = 0 is just for the root, the root may contain an object, an array or something else.
  if (depth >= pj.depthcapacity) {
    // c has not been assigned yet; the fail handler checks depth before it
    // looks at c, so c is not read on this path.
    goto fail;
  }
  UPDATE_CHAR();
  switch (c) {
  case '{':
    pj.containing_scope_offset[depth] = pj.get_current_loc();
 #ifdef SIMDJSON_USE_COMPUTED_GOTO 
    pj.ret_address[depth] = &&start_continue;
 #else
    pj.ret_address[depth] = 's';
 #endif
    depth++;
    if (depth >= pj.depthcapacity) {
      goto fail;
    }
    pj.write_tape(0, c); // strangely, moving this to object_begin slows things down
    goto object_begin;
  case '[':
    pj.containing_scope_offset[depth] = pj.get_current_loc();
 #ifdef SIMDJSON_USE_COMPUTED_GOTO 
    pj.ret_address[depth] = &&start_continue;
 #else
    pj.ret_address[depth] = 's';
 #endif    
    depth++;
    if (depth >= pj.depthcapacity) {
      goto fail;
    }
    pj.write_tape(0, c);
    goto array_begin;
 #define SIMDJSON_ALLOWANYTHINGINROOT
    // A JSON text is a serialized value.  Note that certain previous
    // specifications of JSON constrained a JSON text to be an object or an
    // array.  Implementations that generate only objects or arrays where a
    // JSON text is called for will be interoperable in the sense that all
    // implementations will accept these as conforming JSON texts.
    // https://tools.ietf.org/html/rfc8259
 #ifdef SIMDJSON_ALLOWANYTHINGINROOT
  case '"': {
    if (!parse_string(buf, len, pj, depth, idx)) {
      goto fail;
    }
    break;
  }
  case 't': {
    // we need to make a copy to make sure that the string is space terminated.
    // this only applies to the JSON document made solely of the true value.
    // this will almost never be called in practice
    // note: only the first len + 1 bytes of the copy are initialized below.
    char * copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING));
    if(copy == nullptr) { 
      goto fail;
    }
    memcpy(copy, buf, len);
    copy[len] = ' ';
    if (!is_valid_true_atom(reinterpret_cast<const uint8_t *>(copy) + idx)) {
      free(copy);
      goto fail;
    }
    free(copy);
    pj.write_tape(0, c);
    break;
  }
  case 'f': {
    // we need to make a copy to make sure that the string is space terminated.
    // this only applies to the JSON document made solely of the false value.
    // this will almost never be called in practice
    char * copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING));
    if(copy == nullptr) { 
      goto fail;
    }
    memcpy(copy, buf, len);
    copy[len] = ' ';
    if (!is_valid_false_atom(reinterpret_cast<const uint8_t *>(copy) + idx)) {
      free(copy);
      goto fail;
    }
    free(copy);
    pj.write_tape(0, c);
    break;
  }
  case 'n': {
    // we need to make a copy to make sure that the string is space terminated.
    // this only applies to the JSON document made solely of the null value.
    // this will almost never be called in practice
    char * copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING));
    if(copy == nullptr) { 
      goto fail;
    }
    memcpy(copy, buf, len);
    copy[len] = ' ';
    if (!is_valid_null_atom(reinterpret_cast<const uint8_t *>(copy) + idx)) {
      free(copy);
      goto fail;
    }
    free(copy);
    pj.write_tape(0, c);
    break;
  }
  case '0': 
  case '1':
  case '2':
  case '3':
  case '4':
  case '5':
  case '6':
  case '7':
  case '8':
  case '9': {
    // we need to make a copy to make sure that the string is space terminated.
    // this is done only for JSON documents made of a sole number
    // this will almost never be called in practice. We terminate with a space
    // because we do not want to allow NULLs in the middle of a number (whereas a
    // space in the middle of a number would be identified in stage 1).
    char * copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING));
    if(copy == nullptr) { 
      goto fail;
    }
    memcpy(copy, buf, len);
    copy[len] = ' ';
    if (!parse_number(reinterpret_cast<const uint8_t *>(copy), pj, idx, false)) {
      free(copy);
      goto fail;
    }
    free(copy);
    break;
  }
  case '-': {
    // we need to make a copy to make sure that the string is NULL terminated.
    // this is done only for JSON documents made of a sole number
    // this will almost never be called in practice
    // NOTE(review): unlike the unsigned case above, the terminator here is
    // '\0' rather than a space - confirm this difference is intended.
    char * copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING));
    if(copy == nullptr) { 
      goto fail;
    }
    memcpy(copy, buf, len);
    copy[len] = '\0';
    if (!parse_number(reinterpret_cast<const uint8_t *>(copy), pj, idx, true)) {
      free(copy);
      goto fail;
    }
    free(copy);
    break;
  }
 #endif // ALLOWANYTHINGINROOT
  default:
    goto fail;
  }
 start_continue:
  // the string might not be NULL terminated.
  // The root value is complete: succeed only when exactly one structural
  // index remains unread (presumably a trailing sentinel entry - confirm).
  if(i + 1 == pj.n_structural_indexes) {
    goto succeed;
  } else {
    goto fail;
  }
  ////////////////////////////// OBJECT STATES /////////////////////////////
 object_begin:
  // just after '{': expect either a first key or an immediate '}'
  UPDATE_CHAR();
  switch (c) {
  case '"': {
    if (!parse_string(buf, len, pj, depth, idx)) {
      goto fail;
    }
    goto object_key_state;
  }
  case '}':
    goto scope_end; // could also go to object_continue
  default:
    goto fail;
  }
 object_key_state:
  // a key has just been parsed: expect ':' followed by a value
  UPDATE_CHAR();
  if (c != ':') {
    goto fail;
  }
  UPDATE_CHAR();
  switch (c) {
  case '"': {
    if (!parse_string(buf, len, pj, depth, idx)) {
      goto fail; 
    }
    break;
  }
  case 't':
    if (!is_valid_true_atom(buf + idx)) {
      goto fail;
    }
    pj.write_tape(0, c);
    break;
  case 'f':
    if (!is_valid_false_atom(buf + idx)) {
      goto fail;
    }
    pj.write_tape(0, c);
    break;
  case 'n':
    if (!is_valid_null_atom(buf + idx)) {
      goto fail;
    }
    pj.write_tape(0, c);
    break;
  case '0': 
  case '1':
  case '2':
  case '3':
  case '4':
  case '5':
  case '6':
  case '7':
  case '8':
  case '9': {
    if (!parse_number(buf, pj, idx, false)) {
      goto fail;
    }
    break;
  }
  case '-': {
    if (!parse_number(buf, pj, idx, true)) {
      goto fail;
    }
    break;
  }
  case '{': {
    pj.containing_scope_offset[depth] = pj.get_current_loc();
    pj.write_tape(0, c); // here the compilers knows what c is so this gets optimized
    // we have not yet encountered } so we need to come back for it
 #ifdef SIMDJSON_USE_COMPUTED_GOTO 
    pj.ret_address[depth] = &&object_continue;
 #else
    pj.ret_address[depth] = 'o';
 #endif
    // we found an object inside an object, so we need to increment the depth
    depth++;
    if (depth >= pj.depthcapacity) {
      goto fail;
    }
    goto object_begin;
  }
  case '[': {
    pj.containing_scope_offset[depth] = pj.get_current_loc();
    pj.write_tape(0, c);  // here the compilers knows what c is so this gets optimized
    // we have not yet encountered } so we need to come back for it
 #ifdef SIMDJSON_USE_COMPUTED_GOTO 
    pj.ret_address[depth] = &&object_continue;
 #else
    pj.ret_address[depth] = 'o';
 #endif    
    // we found an array inside an object, so we need to increment the depth
    depth++;
    if (depth >= pj.depthcapacity) {
      goto fail;
    }
    goto array_begin;
  }
  default:
    goto fail;
  }
 object_continue:
  // a key/value pair is complete: expect ',' plus another key, or '}'
  UPDATE_CHAR();
  switch (c) {
  case ',':
    UPDATE_CHAR();
    if (c != '"') {
      goto fail;
    } else {
      if (!parse_string(buf, len, pj, depth, idx)) {
        goto fail; 
      }
      goto object_key_state;
    }
  case '}':
    goto scope_end;
  default:
    goto fail;
  }
  ////////////////////////////// COMMON STATE /////////////////////////////
 scope_end:
  // write our tape location to the header scope
  depth--;
  pj.write_tape(pj.containing_scope_offset[depth], c);
  pj.annotate_previousloc(pj.containing_scope_offset[depth],
                          pj.get_current_loc());
  // goto saved_state
 #ifdef SIMDJSON_USE_COMPUTED_GOTO
  goto *pj.ret_address[depth];
 #else
  if(pj.ret_address[depth] == 'a') {
    goto array_continue;
  } else if (pj.ret_address[depth] == 'o') {
    goto object_continue;
  } else goto start_continue;
 #endif
  ////////////////////////////// ARRAY STATES /////////////////////////////
 array_begin:
  UPDATE_CHAR();
  if (c == ']') {
    goto scope_end; // could also go to array_continue
  }
 main_array_switch:
  // we call update char on all paths in, so we can peek at c on the
  // on paths that can accept a close square brace (post-, and at start)
  switch (c) {
  case '"': {
    if (!parse_string(buf, len, pj, depth, idx)) {
      goto fail;
    }
    break;
  }
  case 't':
    if (!is_valid_true_atom(buf + idx)) {
      goto fail;
    }
    pj.write_tape(0, c);
    break; 
  case 'f':
    if (!is_valid_false_atom(buf + idx)) {
      goto fail;
    }
    pj.write_tape(0, c);
    break; 
  case 'n':
    if (!is_valid_null_atom(buf + idx)) {
      goto fail;
    }
    pj.write_tape(0, c);
    break; // goto array_continue;
  case '0': 
  case '1':
  case '2':
  case '3':
  case '4':
  case '5':
  case '6':
  case '7':
  case '8':
  case '9': {
    if (!parse_number(buf, pj, idx, false)) {
      goto fail;
    }
    break; // goto array_continue;
  }
  case '-': {
    if (!parse_number(buf, pj, idx, true)) {
      goto fail;
    }
    break; // goto array_continue;
  }
  case '{': {
    // we have not yet encountered ] so we need to come back for it
    pj.containing_scope_offset[depth] = pj.get_current_loc();
    pj.write_tape(0, c); //  here the compilers knows what c is so this gets optimized
 #ifdef SIMDJSON_USE_COMPUTED_GOTO 
    pj.ret_address[depth] = &&array_continue;
 #else
    pj.ret_address[depth] = 'a';
 #endif
    // we found an object inside an array, so we need to increment the depth
    depth++;
    if (depth >= pj.depthcapacity) {
      goto fail;
    }
    goto object_begin;
  }
  case '[': {
    // we have not yet encountered ] so we need to come back for it
    pj.containing_scope_offset[depth] = pj.get_current_loc();
    pj.write_tape(0, c); // here the compilers knows what c is so this gets optimized
 #ifdef SIMDJSON_USE_COMPUTED_GOTO 
    pj.ret_address[depth] = &&array_continue;
 #else
    pj.ret_address[depth] = 'a';
 #endif
    // we found an array inside an array, so we need to increment the depth
    depth++;
    if (depth >= pj.depthcapacity) {
      goto fail;
    }
    goto array_begin;
  }
  default:
    goto fail;
  }
 array_continue:
  // an array element is complete: expect ',' plus another element, or ']'
  UPDATE_CHAR();
  switch (c) {
  case ',':
    UPDATE_CHAR();
    goto main_array_switch;
  case ']':
    goto scope_end;
  default:
    goto fail;
  }
  ////////////////////////////// FINAL STATES /////////////////////////////
 succeed:
  depth --;
  // sanity checks: we must be back at the root scope, whose tape offset is 0
  if(depth != 0) {
    fprintf(stderr, "internal bug\n");
    abort();
  }
  if(pj.containing_scope_offset[depth] != 0) {
    fprintf(stderr, "internal bug\n");
    abort();
  }
  pj.annotate_previousloc(pj.containing_scope_offset[depth],
                          pj.get_current_loc());
  pj.write_tape(pj.containing_scope_offset[depth], 'r'); // r is root
  pj.isvalid  = true;
  pj.errorcode = SUCCESS;
  return pj.errorcode;
 fail:
  // we do not need the next line because this is done by pj.init(), pessimistically.
  // pj.isvalid  = false;
  // At this point in the code, we have all the time in the world.
  // Note that we know exactly where we are in the document so we could,
  // without any overhead on the processing code, report a specific location.
  // We could even trigger special code paths to assess what happened carefully,
  // all without any added cost.
  // The depth check comes first; it also covers the early failure that can
  // happen before c has been assigned.
  if (depth >= pj.depthcapacity) {
    pj.errorcode = DEPTH_ERROR;
    return pj.errorcode;
  }
  // otherwise classify the error from the last structural character read
  switch(c) {
    case '"':
      pj.errorcode = STRING_ERROR; 
      return pj.errorcode;
    case '0':
    case '1':
    case '2':
    case '3':
    case '4':
    case '5':
    case '6':
    case '7':
    case '8':
    case '9': 
    case '-': 
      pj.errorcode = NUMBER_ERROR;
      return pj.errorcode;
    case 't':
      pj.errorcode = T_ATOM_ERROR;
      return pj.errorcode;
    case 'n':
      pj.errorcode = N_ATOM_ERROR;
      return pj.errorcode;
    case 'f':
      pj.errorcode = F_ATOM_ERROR;
      return pj.errorcode;
    default: 
      break;
  }
  pj.errorcode = TAPE_ERROR;
  return pj.errorcode; 
 }
 // Convenience overload for char buffers: reinterprets buf as bytes and
 // forwards to the uint8_t version, returning its result unchanged.
 int unified_machine(const char *buf, size_t len, ParsedJson &pj) {
  const uint8_t *const bytes = reinterpret_cast<const uint8_t *>(buf);
  return unified_machine(bytes, len, pj);
 }
 }
--- a/tests/numberparsingcheck.cpp
+++ b/tests/numberparsingcheck.cpp
@ -38,7 +38,7 @@ bool is_in_bad_list(const char *buf) {
  return false;
 }
-inline void foundInvalidNumber(const uint8_t *buf) {
+void foundInvalidNumber(const uint8_t *buf) {
  invalid_count++;
  char *endptr;
  double expected = strtod((const char *)buf, &endptr);
@ -53,7 +53,7 @@ inline void foundInvalidNumber(const uint8_t *buf) {
  }
 }
-inline void foundInteger(int64_t result, const uint8_t *buf) {
+void foundInteger(int64_t result, const uint8_t *buf) {
  int_count++;
  char *endptr;
  long long expected = strtoll((const char *)buf, &endptr, 10);
@ -64,7 +64,7 @@ inline void foundInteger(int64_t result, const uint8_t *buf) {
  }
 }
-inline void foundFloat(double result, const uint8_t *buf) {
+void foundFloat(double result, const uint8_t *buf) {
  char *endptr;
  float_count++;
  double expected = strtod((const char *)buf, &endptr);
--- a/tests/stringparsingcheck.cpp
+++ b/tests/stringparsingcheck.cpp
@ -203,7 +203,7 @@ static bool parse_string(const char *p, char *output, char **end) {
 // end of borrowed code
 char *bigbuffer; // global variable
-inline void foundBadString(const uint8_t *buf) {
+void foundBadString(const uint8_t *buf) {
  bad_string++;
  char *end;
  if (parse_string((const char *)buf, bigbuffer, &end)) {
@ -226,7 +226,7 @@ void print_cmp_hex(const char *s1, const char *s2, size_t len) {
  }
 }
-inline void foundString(const uint8_t *buf, const uint8_t *parsed_begin,
+void foundString(const uint8_t *buf, const uint8_t *parsed_begin,
                        const uint8_t *parsed_end) {
  size_t thislen = parsed_end - parsed_begin;
  total_string_length += thislen;