Fix for issues 32, 50, 131, 137

* Improving portability. * Revisiting faulty logic regarding same-page overruns. * Disabling same-page overruns under VS. * Clarifying the documentation * Fix for issue 131 + being more explicit regarding memory realloc. * Fix for issue 137. * removing "using namespace std" throughout. Fix for 50 * Introducing typed malloc/free. * Introducing a custom class (padded_string) that solves several minor usability issues. * Updating amalgamation for testing.
2019-05-09 17:59:51 -04:00 · 2019-05-09 17:59:51 -04:00 · e370a65383
parent c5a3f9ccd4
commit e370a65383
31 changed files with 1109 additions and 366 deletions
--- a/README.md
+++ b/README.md
@ -66,7 +66,7 @@ Under Windows, we build some tools using the windows/dirent_portable.h file (whi
 const char * filename = ... //

 // use whatever means you want to get a string (UTF-8) of your JSON document
-std::string_view p = get_corpus(filename); // you are responsible for freeing p.data()
+padded_string p = get_corpus(filename); 
 ParsedJson pj;
 pj.allocateCapacity(p.size()); // allocate memory for parsing up to p.size() bytes
 const int res = json_parse(p, pj); // do the parsing, return 0 on success
@ -75,8 +75,6 @@ if (res != 0) {
    // You can use the "simdjson/simdjson.h" header to access the error message
    std::cout << "Error parsing:" << simdjson::errorMsg(res) << std::endl;
 }
-// You can safely delete the string content
-aligned_free((void*)p.data());
 // the ParsedJson document can be used here
 // pj can be reused with other json_parse calls.
 ```
@ -90,21 +88,49 @@ of memory allocation with each new JSON document:
 /...

 const char * filename = ... //
-std::string_view p = get_corpus(filename);
+padded_string p = get_corpus(filename);
 ParsedJson pj = build_parsed_json(p); // do the parsing
-// you no longer need p at this point, can do aligned_free((void*)p.data())
 if( ! pj.isValid() ) {
    // something went wrong
 }
-aligned_free((void*)p.data());
 ```

-You can call `json_parse` and `build_parsed_json`, passing a standard `std::string` object.
+Though the `padded_string` class is recommended for best performance, you can call `json_parse` and `build_parsed_json`, passing a standard `std::string` object.


-## Memory overallocation `
+```C
+#include "simdjson/jsonparser.h"

-As needed, the `json_parse` and `build_parsed_json` functions copy the input data to a temporary buffer readable up to SIMDJSON_PADDING bytes beyond the end of the data. To avoid this potentially expensive copy, overallocate your own input data and then call the `json_parse` and `build_parsed_json` functions with an extra parameter value set to `false` (e.g., `build_parsed_json(p,false)` and  `parsed_json(p,pj,false)`). In such instance, no temporary copy is made. The `get_corpus` function does this automatically as well as the provide `char * allocate_padded_buffer(size_t length)` function to achieve the desired effect. 
+/...
+std::string mystring = ... //
+ParsedJson pj;
+pj.allocateCapacity(mystring.size()); // allocate memory for parsing up to mystring.size() bytes
+// std::string may not overallocate so a copy will be needed
+const int res = json_parse(mystring, pj); // do the parsing, return 0 on success
+// parsing is done!
+if (res != 0) {
+    // You can use the "simdjson/simdjson.h" header to access the error message
+    std::cout << "Error parsing:" << simdjson::errorMsg(res) << std::endl;
+}
+// pj can be reused with other json_parse calls.
+```
+
+or
+
+```C
+#include "simdjson/jsonparser.h"
+
+/...
+
+std::string mystring = ... //
+// std::string may not overallocate so a copy will be needed
+ParsedJson pj = build_parsed_json(mystring); // do the parsing
+if( ! pj.isValid() ) {
+    // something went wrong
+}
+```
+
+As needed, the `json_parse` and `build_parsed_json` functions copy the input data to a temporary buffer readable up to SIMDJSON_PADDING bytes beyond the end of the data. 

 ## Usage: easy single-header version

@ -118,14 +144,13 @@ copy the files in your project in your include path. You can then include them q
 #include "simdjson.cpp"
 int main(int argc, char *argv[]) {
  const char * filename = argv[1];
-  std::string_view p = get_corpus(filename);
+  padded_string p = get_corpus(filename);
  ParsedJson pj = build_parsed_json(p); // do the parsing
  if( ! pj.isValid() ) {
    std::cout << "not valid" << std::endl;
  } else {
    std::cout << "valid" << std::endl;
  }
-  aligned_free((void*)p.data());
  return EXIT_SUCCESS;
 }
 ```
--- a/amalgamation.sh
+++ b/amalgamation.sh
@ -28,6 +28,7 @@ $SCRIPTPATH/include/simdjson/simdjson_version.h
 $SCRIPTPATH/include/simdjson/simdjson.h
 $SCRIPTPATH/include/simdjson/portability.h
 $SCRIPTPATH/include/simdjson/common_defs.h
+$SCRIPTPATH/include/simdjson/padded_string.h
 $SCRIPTPATH/include/simdjson/jsoncharutils.h
 $SCRIPTPATH/include/simdjson/jsonformatutils.h
 $SCRIPTPATH/include/simdjson/jsonioutil.h
@ -100,7 +101,7 @@ cat <<< '
 #include "simdjson.cpp"
 int main(int argc, char *argv[]) {
  const char * filename = argv[1];
-  std::string_view p = get_corpus(filename);
+  padded_string p = get_corpus(filename);
  ParsedJson pj = build_parsed_json(p); // do the parsing
  if( ! pj.isValid() ) {
    std::cout << "not valid" << std::endl;
--- a/benchmark/distinctuseridcompetition.cpp
+++ b/benchmark/distinctuseridcompetition.cpp
@ -14,17 +14,16 @@
 #include "sajson.h"

 using namespace rapidjson;
-using namespace std;

 bool equals(const char *s1, const char *s2) { return strcmp(s1, s2) == 0; }

-void remove_duplicates(vector<int64_t> &v) {
+void remove_duplicates(std::vector<int64_t> &v) {
  std::sort(v.begin(), v.end());
  auto last = std::unique(v.begin(), v.end());
  v.erase(last, v.end());
 }

-void print_vec(vector<int64_t> &v) {
+void print_vec(const std::vector<int64_t> &v) {
  for (auto i : v) {
    std::cout << i << " ";
  }
@ -73,7 +72,7 @@ void simdjson_traverse(std::vector<int64_t> &answer, ParsedJson::iterator &i) {
  }
 }

-std::vector<int64_t> simdjson_computestats(const std::string_view &p) {
+std::vector<int64_t> simdjson_computestats(const padded_string &p) {
  std::vector<int64_t> answer;
  ParsedJson pj = build_parsed_json(p);
  if (!pj.isValid()) {
@ -134,7 +133,7 @@ void sajson_traverse(std::vector<int64_t> &answer, const sajson::value &node) {
  }
 }

-std::vector<int64_t> sasjon_computestats(const std::string_view &p) {
+std::vector<int64_t> sasjon_computestats(const padded_string &p) {
  std::vector<int64_t> answer;
  char *buffer = (char *)malloc(p.size());
  memcpy(buffer, p.data(), p.size());
@ -187,7 +186,7 @@ void rapid_traverse(std::vector<int64_t> &answer, const rapidjson::Value &v) {
  }
 }

-std::vector<int64_t> rapid_computestats(const std::string_view &p) {
+std::vector<int64_t> rapid_computestats(const padded_string &p) {
  std::vector<int64_t> answer;
  char *buffer = (char *)malloc(p.size() + 1);
  memcpy(buffer, p.data(), p.size());
@ -220,19 +219,19 @@ int main(int argc, char *argv[]) {
      abort();
    }
  if (optind >= argc) {
-    cerr << "Using different parsers, we compute the content statistics of "
-            "JSON documents.\n";
-    cerr << "Usage: " << argv[0] << " <jsonfile>\n";
-    cerr << "Or " << argv[0] << " -v <jsonfile>\n";
+    std::cerr << "Using different parsers, we compute the content statistics of "
+            "JSON documents." << std::endl;
+    std::cerr << "Usage: " << argv[0] << " <jsonfile>" << std::endl;
+    std::cerr << "Or " << argv[0] << " -v <jsonfile>" << std::endl;
    exit(1);
  }
  const char *filename = argv[optind];
  if (optind + 1 < argc) {
-    cerr << "warning: ignoring everything after " << argv[optind + 1] << endl;
+    std::cerr << "warning: ignoring everything after " << argv[optind + 1] << std::endl;
  }
-  std::string_view p;
+  padded_string p;
  try {
-    p = get_corpus(filename);
+    get_corpus(filename).swap(p);
  } catch (const std::exception &e) { // caught by reference to base
    std::cout << "Could not load the file " << filename << std::endl;
    return EXIT_FAILURE;
@ -279,5 +278,4 @@ int main(int argc, char *argv[]) {
            !justdata);
  BEST_TIME("sasjon  ", sasjon_computestats(p).size(), size, , repeat, volume,
            !justdata);
-  aligned_free((void*)p.data());
 }
--- a/benchmark/minifiercompetition.cpp
+++ b/benchmark/minifiercompetition.cpp
@ -16,7 +16,6 @@


 using namespace rapidjson;
-using namespace std;

 std::string rapidstringmeInsitu(char *json) {
  Document d;
@ -62,13 +61,13 @@ int main(int argc, char *argv[]) {
        abort ();
      }
  if (optind >= argc) {
-    cerr << "Usage: " << argv[0] << " <jsonfile>" << endl;
+    std::cerr << "Usage: " << argv[0] << " <jsonfile>" << std::endl;
    exit(1);
  }
  const char * filename = argv[optind];
-  std::string_view p;
+  padded_string p;
  try {
-    p = get_corpus(filename);
+    get_corpus(filename).swap(p);
  } catch (const std::exception& e) { // caught by reference to base
    std::cout << "Could not load the file " << filename << std::endl;
    return EXIT_FAILURE;
@ -140,7 +139,8 @@ int main(int argc, char *argv[]) {
    fprintf(stderr, "failed to allocate memory\n");
    return EXIT_FAILURE;
  } 
-  BEST_TIME("simdjson orig", json_parse((const uint8_t*)buffer, p.size(), pj), true, memcpy(buffer, p.data(), p.size()), repeat, volume, !justdata);
+  bool automated_reallocation = false; 
+  BEST_TIME("simdjson orig", json_parse((const uint8_t*)buffer, p.size(), pj, automated_reallocation), true, memcpy(buffer, p.data(), p.size()), repeat, volume, !justdata);
  
  ParsedJson pj2;
  bool isallocok2 = pj2.allocateCapacity(p.size(), 1024);
@ -148,9 +148,8 @@ int main(int argc, char *argv[]) {
    fprintf(stderr, "failed to allocate memory\n");
    return EXIT_FAILURE;
  } 
-
-  BEST_TIME("simdjson despaced", json_parse((const uint8_t*)buffer, minisize, pj2), true, memcpy(buffer, minibuffer, p.size()), repeat, volume, !justdata);
-  aligned_free((void*)p.data());
+  automated_reallocation = false; 
+  BEST_TIME("simdjson despaced", json_parse((const uint8_t*)buffer, minisize, pj2, automated_reallocation), true, memcpy(buffer, minibuffer, p.size()), repeat, volume, !justdata);
  free(buffer);
  free(ast_buffer);
  free(minibuffer);
--- a/benchmark/parse.cpp
+++ b/benchmark/parse.cpp
@ -33,7 +33,6 @@
 #include "simdjson/parsedjson.h"
 #include "simdjson/stage1_find_marks.h"
 #include "simdjson/stage2_build_tape.h"
-using namespace std;

 int main(int argc, char *argv[]) {
  bool verbose = false;
@ -69,26 +68,26 @@ int main(int argc, char *argv[]) {
  int optind = 1;
 #endif
  if (optind >= argc) {
-    cerr << "Usage: " << argv[0] << " <jsonfile>" << endl;
+    std::cerr << "Usage: " << argv[0] << " <jsonfile>" << std::endl;
    exit(1);
  }
  const char *filename = argv[optind];
  if (optind + 1 < argc) {
-    cerr << "warning: ignoring everything after " << argv[optind + 1] << endl;
+    std::cerr << "warning: ignoring everything after " << argv[optind + 1] << std::endl;
  }
  if (verbose) {
-    cout << "[verbose] loading " << filename << endl;
-}
-  std::string_view p;
+    std::cout << "[verbose] loading " << filename << std::endl;
+  }
+  padded_string p;
  try {
-    p = get_corpus(filename);
+    get_corpus(filename).swap(p);
  } catch (const std::exception &e) { // caught by reference to base
    std::cout << "Could not load the file " << filename << std::endl;
    return EXIT_FAILURE;
  }
  if (verbose) {
-    cout << "[verbose] loaded " << filename << " (" << p.size() << " bytes)"
-         << endl;
+    std::cout << "[verbose] loaded " << filename << " (" << p.size() << " bytes)"
+         << std::endl;
 }
 #if defined(DEBUG)
  const uint32_t iterations = 1;
@ -96,7 +95,7 @@ int main(int argc, char *argv[]) {
  const uint32_t iterations =
      forceoneiteration ? 1 : (p.size() < 1 * 1000 * 1000 ? 1000 : 10);
 #endif
-  vector<double> res;
+  std::vector<double> res;
  res.resize(iterations);

 #if !defined(__linux__)
@ -107,14 +106,14 @@ int main(int argc, char *argv[]) {
 #endif

 #ifndef SQUASH_COUNTERS
-  vector<int> evts;
+  std::vector<int> evts;
  evts.push_back(PERF_COUNT_HW_CPU_CYCLES);
  evts.push_back(PERF_COUNT_HW_INSTRUCTIONS);
  evts.push_back(PERF_COUNT_HW_BRANCH_MISSES);
  evts.push_back(PERF_COUNT_HW_CACHE_REFERENCES);
  evts.push_back(PERF_COUNT_HW_CACHE_MISSES);
  LinuxEvents<PERF_TYPE_HARDWARE> unified(evts);
-  vector<unsigned long long> results;
+  std::vector<unsigned long long> results;
  results.resize(evts.size());
  unsigned long cy0 = 0, cy1 = 0, cy2 = 0;
  unsigned long cl0 = 0, cl1 = 0, cl2 = 0;
@ -126,8 +125,8 @@ int main(int argc, char *argv[]) {

  for (uint32_t i = 0; i < iterations; i++) {
    if (verbose) {
-      cout << "[verbose] iteration # " << i << endl;
-}
+      std::cout << "[verbose] iteration # " << i << std::endl;
+    }
 #ifndef SQUASH_COUNTERS
    unified.start();
 #endif
@ -135,7 +134,6 @@ int main(int argc, char *argv[]) {
    bool allocok = pj.allocateCapacity(p.size());
    if (!allocok) {
      std::cerr << "failed to allocate memory" << std::endl;
-      aligned_free((void *)p.data());
      return EXIT_FAILURE;
    }
 #ifndef SQUASH_COUNTERS
@ -147,7 +145,7 @@ int main(int argc, char *argv[]) {
    cmis0 += results[4];
 #endif
    if (verbose) {
-      cout << "[verbose] allocated memory for parsed JSON " << endl;
+      std::cout << "[verbose] allocated memory for parsed JSON " << std::endl;
 }

    auto start = std::chrono::steady_clock::now();
@ -163,7 +161,7 @@ int main(int argc, char *argv[]) {
    cref1 += results[3];
    cmis1 += results[4];
    if (!isok) {
-      cout << "Failed during stage 1\n";
+      std::cout << "Failed during stage 1" << std::endl;
      break;
    }
    unified.start();
@ -178,7 +176,7 @@ int main(int argc, char *argv[]) {
    cref2 += results[3];
    cmis2 += results[4];
    if (!isok) {
-      cout << "Failed during stage 2\n";
+      std::cout << "Failed during stage 2" << std::endl;
      break;
    }
 #endif
@ -190,7 +188,6 @@ int main(int argc, char *argv[]) {
  ParsedJson pj = build_parsed_json(p); // do the parsing again to get the stats
  if (!pj.isValid()) {
    std::cerr << "Could not parse. " << std::endl;
-    aligned_free((void *)p.data());
    return EXIT_FAILURE;
  }
 #ifndef SQUASH_COUNTERS
@ -202,7 +199,6 @@ int main(int argc, char *argv[]) {
    float cpbtotal = (double)total / (iterations * p.size());
    char *newfile = (char *)malloc(strlen(filename) + 1);
    if (newfile == NULL) {
-      aligned_free((void *)p.data());
      return EXIT_FAILURE;
    }
    ::strcpy(newfile, filename);
@ -255,9 +251,9 @@ int main(int argc, char *argv[]) {
 #endif
  double min_result = *min_element(res.begin(), res.end());
  if (!justdata) {
-    cout << "Min:  " << min_result << " bytes read: " << p.size()
+    std::cout << "Min:  " << min_result << " bytes read: " << p.size()
         << " Gigabytes/second: " << (p.size()) / (min_result * 1000000000.0)
-         << "\n";
+         << std::endl;
 }
  if (jsonoutput) {
    isok = isok && pj.printjson(std::cout);
@ -265,7 +261,6 @@ int main(int argc, char *argv[]) {
  if (dump) {
    isok = isok && pj.dump_raw_tape(std::cout);
  }
-  aligned_free((void *)p.data());
  if (!isok) {
    fprintf(stderr, " Parsing failed. \n ");
    return EXIT_FAILURE;
--- a/benchmark/parseandstatcompetition.cpp
+++ b/benchmark/parseandstatcompetition.cpp
@ -12,7 +12,6 @@
 #include "sajson.h"

 using namespace rapidjson;
-using namespace std;

 struct stat_s {
  size_t number_count;
@ -45,7 +44,7 @@ void print_stat(const stat_t &s) {
 }

 __attribute__ ((noinline))
-stat_t simdjson_computestats(const std::string_view &p) {
+stat_t simdjson_computestats(const padded_string &p) {
  stat_t answer;
  ParsedJson pj = build_parsed_json(p);
  answer.valid = pj.isValid();
@ -147,7 +146,7 @@ void sajson_traverse(stat_t &stats, const sajson::value &node) {
 }

 __attribute__ ((noinline))
-stat_t sasjon_computestats(const std::string_view &p) {
+stat_t sasjon_computestats(const padded_string &p) {
  stat_t answer;
  char *buffer = (char *)malloc(p.size());
  memcpy(buffer, p.data(), p.size());
@ -205,7 +204,7 @@ void rapid_traverse(stat_t &stats, const rapidjson::Value &v) {
 }

 __attribute__ ((noinline))
-stat_t rapid_computestats(const std::string_view &p) {
+stat_t rapid_computestats(const padded_string &p) {
  stat_t answer;
  char *buffer = (char *)malloc(p.size() + 1);
  memcpy(buffer, p.data(), p.size());
@ -244,19 +243,19 @@ int main(int argc, char *argv[]) {
      abort();
    }
  if (optind >= argc) {
-    cerr << "Using different parsers, we compute the content statistics of "
-            "JSON documents.\n";
-    cerr << "Usage: " << argv[0] << " <jsonfile>\n";
-    cerr << "Or " << argv[0] << " -v <jsonfile>\n";
+    std::cerr << "Using different parsers, we compute the content statistics of "
+            "JSON documents." << std::endl;
+    std::cerr << "Usage: " << argv[0] << " <jsonfile>" << std::endl;
+    std::cerr << "Or " << argv[0] << " -v <jsonfile>" << std::endl;
    exit(1);
  }
  const char *filename = argv[optind];
  if (optind + 1 < argc) {
-    cerr << "warning: ignoring everything after " << argv[optind + 1] << endl;
+    std::cerr << "warning: ignoring everything after " << argv[optind + 1]  << std::endl;
  }
-  std::string_view p;
+  padded_string p;
  try {
-    p = get_corpus(filename);
+    get_corpus(filename).swap(p);
  } catch (const std::exception &e) { // caught by reference to base
    std::cout << "Could not load the file " << filename << std::endl;
    return EXIT_FAILURE;
@ -300,5 +299,4 @@ int main(int argc, char *argv[]) {
            !justdata);
  BEST_TIME("sasjon  ", sasjon_computestats(p).valid, true, , repeat, volume,
            !justdata);
-  aligned_free((void*)p.data());
 }
--- a/benchmark/parsingcompetition.cpp
+++ b/benchmark/parsingcompetition.cpp
@ -44,7 +44,6 @@ extern "C" {
 #endif 

 using namespace rapidjson;
-using namespace std;


 #ifdef ALLPARSER
@ -77,19 +76,19 @@ int main(int argc, char *argv[]) {
      abort();
    }
  if (optind >= argc) {
-    cerr << "Usage: " << argv[0] << " <jsonfile>\n";
-    cerr << "Or " << argv[0] << " -v <jsonfile>\n";
-    cerr << "To enable parsers that are not standard compliant, use the -a "
-            "flag\n";
+    std::cerr << "Usage: " << argv[0] << " <jsonfile>" << std::endl;
+    std::cerr << "Or " << argv[0] << " -v <jsonfile>" << std::endl;
+    std::cerr << "To enable parsers that are not standard compliant, use the -a "
+            "flag" << std::endl;
    exit(1);
  }
  const char *filename = argv[optind];
  if (optind + 1 < argc) {
-    cerr << "warning: ignoring everything after " << argv[optind + 1] << endl;
+    std::cerr << "warning: ignoring everything after " << argv[optind + 1] << std::endl;
  }
-  std::string_view p;
+  padded_string p;
  try {
-    p = get_corpus(filename);
+    get_corpus(filename).swap(p);
  } catch (const std::exception &e) { // caught by reference to base
    std::cout << "Could not load the file " << filename << std::endl;
    return EXIT_FAILURE;
@ -158,15 +157,15 @@ int main(int argc, char *argv[]) {
            true, memcpy(buffer, p.data(), p.size()), repeat, volume, !justdata);
 #ifdef __linux__
  if(!justdata) {
-      vector<int> evts;
+      std::vector<int> evts;
      evts.push_back(PERF_COUNT_HW_CPU_CYCLES);
      evts.push_back(PERF_COUNT_HW_INSTRUCTIONS);
      evts.push_back(PERF_COUNT_HW_BRANCH_MISSES);
      evts.push_back(PERF_COUNT_HW_CACHE_REFERENCES);
      evts.push_back(PERF_COUNT_HW_CACHE_MISSES);
      LinuxEvents<PERF_TYPE_HARDWARE> unified(evts);
-      vector<unsigned long long> results;
-      vector<unsigned long long> stats;
+      std::vector<unsigned long long> results;
+      std::vector<unsigned long long> stats;
      results.resize(evts.size());
      stats.resize(evts.size());
      std::fill(stats.begin(), stats.end(), 0);// unnecessary
@ -227,10 +226,8 @@ int main(int argc, char *argv[]) {



-  auto * tokens = make_unique<jsmntok_t[](p.size());
-  if(tokens == NULL) {
-    printf("Failed to alloc memory for jsmn\n");
-  } else {
+  {
+    std::unique_ptr<jsmntok_t[]> tokens = std::make_unique<jsmntok_t[]>(p.size());
    jsmn_parser parser;
    jsmn_init(&parser);
    memcpy(buffer, p.data(), p.size());
@ -239,7 +236,6 @@ int main(int argc, char *argv[]) {
              (jsmn_parse(&parser, buffer, p.size(), tokens.get(), p.size()) > 0), true,
              jsmn_init(&parser), repeat, volume, !justdata);
  }
-
  memcpy(buffer, p.data(), p.size());
  buffer[p.size()] = '\0';
  cJSON * tree = cJSON_Parse(buffer);
@ -260,7 +256,6 @@ int main(int argc, char *argv[]) {
  if(!justdata) BEST_TIME("memcpy            ",
            (memcpy(buffer, p.data(), p.size()) == buffer), true, , repeat,
            volume, !justdata);
-  aligned_free((void *)p.data());
  free(ast_buffer);
  free(buffer);
 }
--- a/benchmark/statisticalmodel.cpp
+++ b/benchmark/statisticalmodel.cpp
@ -8,8 +8,6 @@
 #include "linux-perf-events.h"
 #endif

-using namespace std;
-
 size_t count_nonasciibytes(const uint8_t *input, size_t length) {
  size_t count = 0;
  for (size_t i = 0; i < length; i++) {
@ -44,7 +42,7 @@ struct stat_s {

 using stat_t = struct stat_s;

-stat_t simdjson_computestats(const std::string_view &p) {
+stat_t simdjson_computestats(const padded_string &p) {
  stat_t answer;
  ParsedJson pj = build_parsed_json(p);
  answer.valid = pj.isValid();
@ -126,8 +124,8 @@ int main(int argc, char *argv[]) {
  int optind = 1;
 #endif
  if (optind >= argc) {
-    cerr << "Reads json, prints stats. " << endl;
-    cerr << "Usage: " << argv[0] << " <jsonfile>" << endl;
+    std::cerr << "Reads json, prints stats. " << std::endl;
+    std::cerr << "Usage: " << argv[0] << " <jsonfile>" << std::endl;

    exit(1);
  }
@ -136,9 +134,9 @@ int main(int argc, char *argv[]) {
    std::cerr << "warning: ignoring everything after " << argv[optind + 1]
              << std::endl;
  }
-  std::string_view p;
+  padded_string p;
  try {
-    p = get_corpus(filename);
+    get_corpus(filename).swap(p);
  } catch (const std::exception &e) { // caught by reference to base
    std::cerr << "Could not load the file " << filename << std::endl;
    return EXIT_FAILURE;
@ -172,13 +170,13 @@ int main(int argc, char *argv[]) {
    return EXIT_FAILURE;
  }
  const uint32_t iterations = p.size() < 1 * 1000 * 1000 ? 1000 : 50;
-  vector<int> evts;
+  std::vector<int> evts;
  evts.push_back(PERF_COUNT_HW_CPU_CYCLES);
  evts.push_back(PERF_COUNT_HW_INSTRUCTIONS);
  LinuxEvents<PERF_TYPE_HARDWARE> unified(evts);
  unsigned long cy1 = 0, cy2 = 0;
  unsigned long cl1 = 0, cl2 = 0;
-  vector<unsigned long long> results;
+  std::vector<unsigned long long> results;
  results.resize(evts.size());
  for (uint32_t i = 0; i < iterations; i++) {
    unified.start();
--- a/include/simdjson/common_defs.h
+++ b/include/simdjson/common_defs.h
@ -50,20 +50,22 @@

 #else

-// The following is likely unnecessarily complex.
-#ifdef __SANITIZE_ADDRESS__
-#define ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER  __attribute__((no_sanitize("address")))
-#elif defined(__has_feature)
-#  if (__has_feature(address_sanitizer))
-#define ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER  __attribute__((no_sanitize("address")))
-#  endif 
-#endif 
-
 // for non-Visual Studio compilers, we assume that same-page buffer overrun is fine:
 #ifndef ALLOW_SAME_PAGE_BUFFER_OVERRUN
 #define ALLOW_SAME_PAGE_BUFFER_OVERRUN
 #endif 

+// The following is likely unnecessarily complex.
+#ifdef __SANITIZE_ADDRESS__
+// we have GCC, stuck with https://gcc.gnu.org/bugzilla/show_bug.cgi?id=67368
+#undef ALLOW_SAME_PAGE_BUFFER_OVERRUN
+#elif defined(__has_feature)
+// we have CLANG?
+#  if (__has_feature(address_sanitizer))
+#define ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER  __attribute__((no_sanitize("address")))
+#  endif 
+#endif 
+
 #define really_inline inline __attribute__((always_inline, unused))
 #define never_inline inline __attribute__((noinline, unused))

--- a/include/simdjson/jsonioutil.h
+++ b/include/simdjson/jsonioutil.h
@ -9,13 +9,7 @@
 #include <string>


-// low-level function to allocate memory with padding so we can read passed the "length" bytes
-// safely.
-// if you must provide a pointer to some data, create it with this function:
-// length is the max. size in bytes of the string
-// caller is responsible to free the memory (free(...))
-char * allocate_padded_buffer(size_t length);
-
+#include "simdjson/padded_string.h"



@ -34,7 +28,7 @@ char * allocate_padded_buffer(size_t length);
 //        aligned_free((void*)p.data());
 //        std::cout << "Could not load the file " << filename << std::endl;
 //      }
-std::string_view  get_corpus(const std::string& filename);
+padded_string get_corpus(const std::string& filename);


 #endif
--- a/include/simdjson/jsonminifier.h
+++ b/include/simdjson/jsonminifier.h
@ -19,4 +19,8 @@ static inline size_t jsonminify(const std::string_view & p, char *out) {
    return jsonminify(p.data(), p.size(), out);
 }

+static inline size_t jsonminify(const padded_string & p, char *out) { // padded_string overload: forwards p's data and size to the pointer/length jsonminify
+    return jsonminify(p.data(), p.size(), out);
+}
+
 #endif
--- a/include/simdjson/jsonparser.h
+++ b/include/simdjson/jsonparser.h
@ -2,6 +2,7 @@
 #define SIMDJSON_JSONPARSER_H
 #include <string>
 #include "simdjson/common_defs.h"
+#include "simdjson/padded_string.h"
 #include "simdjson/jsonioutil.h"
 #include "simdjson/parsedjson.h"
 #include "simdjson/stage1_find_marks.h"
@ -46,10 +47,10 @@ inline int json_parse(const char * buf, size_t len, ParsedJson &pj, bool realloc
 // The input s should be readable up to s.data() + s.size() + SIMDJSON_PADDING  if reallocifneeded is false,
 // all bytes at and after s.data()+s.size() are ignored (can be garbage).
 // The ParsedJson object can be reused.
-WARN_UNUSED
-inline int json_parse(const std::string_view &s, ParsedJson &pj, bool reallocifneeded = true) {
-  return json_parse(s.data(), s.size(), pj, reallocifneeded);
-}
+//WARN_UNUSED
+//inline int json_parse(const std::string_view &s, ParsedJson &pj, bool reallocifneeded = true) {
+//  return json_parse(s.data(), s.size(), pj, reallocifneeded);
+//}



@ -65,6 +66,14 @@ inline int json_parse(const std::string &s, ParsedJson &pj) {
  return json_parse(s.data(), s.length(), pj, true);
 }

+// Parse a document found in the padded_string s.
+// You need to preallocate ParsedJson with a capacity of s.size() (e.g., pj.allocateCapacity(s.size())).
+// Returns 0 on success; a non-zero value is an error code (see simdjson::errorMsg). You can also check validity
+// by calling pj.isValid(). The same ParsedJson can be reused for other documents.
+WARN_UNUSED
+inline int json_parse(const padded_string &s, ParsedJson &pj) {
+  return json_parse(s.data(), s.length(), pj, false); // reallocifneeded = false: no temporary copy of the input is requested
+}


 // Build a ParsedJson object. You can check validity
@ -96,9 +105,9 @@ WARN_UNUSED
 // (a copy of the input string is made).
 // The input s should be readable up to s.data() + s.size() + SIMDJSON_PADDING  if reallocifneeded is false,
 // all bytes at and after s.data()+s.size() are ignored (can be garbage).
-inline ParsedJson build_parsed_json(const std::string_view &s, bool reallocifneeded = true) {
-  return build_parsed_json(s.data(), s.size(), reallocifneeded);
-}
+//inline ParsedJson build_parsed_json(const std::string_view &s, bool reallocifneeded = true) {
+ // return build_parsed_json(s.data(), s.size(), reallocifneeded);
+//}

 // Parse a document found in in string s.
 // You need to preallocate ParsedJson with a capacity of len (e.g., pj.allocateCapacity(len)).
@ -113,6 +122,15 @@ inline ParsedJson build_parsed_json(const std::string &s) {
 }


+// Build a ParsedJson object for the document found in the padded_string s.
+// Unlike json_parse, this returns the ParsedJson itself rather than an error code:
+// check validity by calling isValid() on the returned object.
+// The data and length of s are forwarded with reallocifneeded = false.
+WARN_UNUSED
+inline ParsedJson build_parsed_json(const padded_string &s) {
+  return build_parsed_json(s.data(), s.length(), false); // false: no temporary copy of the input is requested
+}
+


 #endif
--- a/include/simdjson/numberparsing.h
+++ b/include/simdjson/numberparsing.h
@ -99,7 +99,7 @@ const bool structural_or_whitespace_or_exponent_or_decimal_negated[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};

 really_inline bool
-is_not_structural_or_whitespace_or_exponent_or_decimal(unsigned char c) {
+is_not_structural_or_whitespace_or_exponent_or_decimal_or_null(unsigned char c) {
  return structural_or_whitespace_or_exponent_or_decimal_negated[c];
 }

@ -115,6 +115,9 @@ is_not_structural_or_whitespace_or_exponent_or_decimal(unsigned char c) {
 // http://0x80.pl/articles/swar-digits-validate.html
 static inline bool is_made_of_eight_digits_fast(const char *chars) {
  uint64_t val;
+  // this can read up to 7 bytes beyond the buffer size, but we require 
+  // SIMDJSON_PADDING of padding
+  static_assert(7 <= SIMDJSON_PADDING);
  memcpy(&val, chars, 8);
  // a branchy method might be faster:
  // return (( val & 0xF0F0F0F0F0F0F0F0 ) == 0x3030303030303030)
@ -128,6 +131,9 @@ static inline bool is_made_of_eight_digits_fast(const char *chars) {
 // this is more efficient apparently than the scalar code above (fewer instructions)
 static inline bool is_made_of_eight_digits_fast(const char *chars) {
  __m64 val;
+  // this can read up to 7 bytes beyond the buffer size, but we require 
+  // SIMDJSON_PADDING of padding
+  static_assert(7 <= SIMDJSON_PADDING);
  memcpy(&val, chars, 8);
  __m64 base = _mm_sub_pi8(val,_mm_set1_pi8('0'));
  __m64 basecmp = _mm_subs_pu8(base,_mm_set1_pi8(9));
@ -388,7 +394,7 @@ static really_inline bool parse_number(const uint8_t *const buf,
  uint64_t i; // an unsigned int avoids signed overflows (which are bad)
  if (*p == '0') { // 0 cannot be followed by an integer
    ++p;
-    if (is_not_structural_or_whitespace_or_exponent_or_decimal(*p)) {
+    if (is_not_structural_or_whitespace_or_exponent_or_decimal_or_null(*p)) {
 #ifdef JSON_TEST_NUMBERS // for unit testing
      foundInvalidNumber(buf + offset);
 #endif
--- a/include/simdjson/padded_string.h
+++ b/include/simdjson/padded_string.h
@ -0,0 +1,67 @@
+#ifndef SIMDJSON_PADDING_STRING_H
+#define SIMDJSON_PADDING_STRING_H
+#include "simdjson/portability.h"
+#include <memory>
+#include <cstring>
+// low-level function to allocate memory with padding so we can read past the
+// "length" bytes safely. If you must provide a pointer to some data, create it
+// with this function: length is the max. size in bytes of the string. The caller
+// is responsible for freeing the memory (padded_string uses aligned_free_char).
+char *allocate_padded_buffer(size_t length);
+
+// Move-only owner of a buffer obtained from allocate_padded_buffer(); the destructor releases it with aligned_free_char().
+// Copies are deliberately forbidden (copy constructor/assignment are deleted): rely on swap() or the move constructor.
+// NOTE(review): constructors do not report allocation failure; data() is then nullptr while size() keeps the requested length.
+class padded_string {
+public:
+  explicit padded_string() noexcept : viable_size(0), data_ptr(nullptr) {} // empty string: size 0, no buffer
+  explicit padded_string(size_t length) noexcept // allocates a buffer for length bytes; only the terminator is written here
+      : viable_size(length), data_ptr(allocate_padded_buffer(length)) {
+
+    if (data_ptr != nullptr)
+      data_ptr[length] = '\0'; // easier when you need a c_str
+  }
+  explicit padded_string(char *data, size_t length) noexcept // copies length bytes from data into a newly allocated buffer
+      : viable_size(length), data_ptr(allocate_padded_buffer(length)) {
+    if (data_ptr != nullptr) {
+      memcpy(data_ptr, data, length);
+      data_ptr[length] = '\0'; // easier when you need a c_str
+    }
+  }
+  padded_string(std::string s) noexcept // non-explicit: allows implicit conversion from std::string; copies s.size() bytes
+      : viable_size(s.size()), data_ptr(allocate_padded_buffer(s.size())) {
+    if (data_ptr != nullptr) {
+      memcpy(data_ptr, s.data(), s.size());
+      data_ptr[s.size()] = '\0'; // easier when you need a c_str
+    }
+  }
+  padded_string(padded_string &&o) noexcept // move constructor: adopts o's buffer pointer and size
+      : viable_size(o.viable_size), data_ptr(o.data_ptr) {
+    o.data_ptr = nullptr; // we take ownership; NOTE(review): o.viable_size is left unchanged, so o.size() still reports the old length
+  }
+  void swap(padded_string &o) { // exchanges both the size and the buffer pointer with o
+    size_t tmp_viable_size = viable_size;
+    char *tmp_data_ptr = data_ptr;
+    viable_size = o.viable_size;
+    data_ptr = o.data_ptr;
+    o.data_ptr = tmp_data_ptr;
+    o.viable_size = tmp_viable_size;
+  }
+
+  ~padded_string() { aligned_free_char(data_ptr); } // releases the owned buffer; data_ptr is nullptr for default-constructed or moved-from objects
+
+  size_t size() const { return viable_size; } // the length given at construction
+
+  size_t length() const { return viable_size; } // same value as size()
+
+  char *data() const { return data_ptr; } // the owned buffer; note it is writable even through a const padded_string
+
+private:
+  padded_string &operator=(const padded_string &o) = delete; // copy assignment is forbidden
+  padded_string(const padded_string &o) = delete; // copy construction is forbidden
+
+  size_t viable_size;
+  char *data_ptr;
+};
+
+#endif
--- a/include/simdjson/portability.h
+++ b/include/simdjson/portability.h
@ -95,6 +95,9 @@ static inline void *aligned_malloc(size_t alignment, size_t size) {
 	return p;
 }

+// Typed convenience wrapper: forwards to aligned_malloc() and hands the
+// result back as char* so callers do not have to cast.
+static inline char *aligned_malloc_char(size_t alignment, size_t size) {
+	void *block = aligned_malloc(alignment, size);
+	return (char *)block;
+}

 #ifdef __AVX2__

@ -131,4 +134,10 @@ static inline void aligned_free(void *memblock) {
 #endif
 }

+
+
+// Releases a char* buffer by forwarding it to aligned_free().
+static inline void aligned_free_char(char *memblock) {
+	void *block = memblock;
+	aligned_free(block);
+}
+
 #endif // SIMDJSON_PORTABILITY_H
--- a/include/simdjson/stringparsing.h
+++ b/include/simdjson/stringparsing.h
@ -89,6 +89,9 @@ really_inline  bool parse_string(UNUSED const uint8_t *buf, UNUSED size_t len,
  const uint8_t *const start_of_string = dst;
  while (1) {
 #ifdef __AVX2__
+    // this can read up to 31 bytes beyond the buffer size, but we require 
+    // SIMDJSON_PADDING of padding
+    static_assert(sizeof(__m256i) - 1 <= SIMDJSON_PADDING);
    __m256i v = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src));
    // store to dest unconditionally - we can overwrite the bits we don't like
    // later
@ -99,6 +102,9 @@ really_inline  bool parse_string(UNUSED const uint8_t *buf, UNUSED size_t len,
    auto quote_bits =
        static_cast<uint32_t>(_mm256_movemask_epi8(quote_mask));
 #else
+    // this can read up to 31 bytes beyond the buffer size, but we require 
+    // SIMDJSON_PADDING of padding
+    static_assert(2 * sizeof(uint8x16_t) - 1 <= SIMDJSON_PADDING);
    uint8x16_t v0 = vld1q_u8(src);
    uint8x16_t v1 = vld1q_u8(src+16);
    vst1q_u8(dst, v0);
--- a/jsonchecker/pass16.json
+++ b/jsonchecker/pass16.json
@ -1 +1 @@
-0
+0
--- a/singleheader/amalgamation_demo.cpp
+++ b/singleheader/amalgamation_demo.cpp
@ -1,11 +1,11 @@
-/* auto-generated on Wed 13 Mar 2019 21:02:37 EDT. Do not edit! */
+/* auto-generated on Thu May  9 17:40:56 EDT 2019. Do not edit! */

 #include <iostream>
 #include "simdjson.h"
 #include "simdjson.cpp"
 int main(int argc, char *argv[]) {
  const char * filename = argv[1];
-  std::string_view p = get_corpus(filename);
+  padded_string p = get_corpus(filename);
  ParsedJson pj = build_parsed_json(p); // do the parsing
  if( ! pj.isValid() ) {
    std::cout << "not valid" << std::endl;
--- a/singleheader/simdjson.cpp
+++ b/singleheader/simdjson.cpp
@ -1,4 +1,4 @@
-/* auto-generated on Wed 13 Mar 2019 21:02:37 EDT. Do not edit! */
+/* auto-generated on Thu May  9 17:40:56 EDT 2019. Do not edit! */
 #include "simdjson.h"

 /* used for http://dmalloc.com/ Dmalloc - Debug Malloc Library */
@ -15,28 +15,27 @@ char * allocate_padded_buffer(size_t length) {
    //return (char *) malloc(length + SIMDJSON_PADDING);
    // However, we might as well align to cache lines...
    size_t totalpaddedlength = length + SIMDJSON_PADDING;
-    char *padded_buffer = (char *) aligned_malloc(64, totalpaddedlength);
+    char *padded_buffer = aligned_malloc_char(64, totalpaddedlength);
    return padded_buffer;
 }

-std::string_view get_corpus(const std::string& filename) {
+padded_string get_corpus(const std::string& filename) {
  std::FILE *fp = std::fopen(filename.c_str(), "rb");
  if (fp != nullptr) {
    std::fseek(fp, 0, SEEK_END);
    size_t len = std::ftell(fp);
-    char * buf = allocate_padded_buffer(len);
-    if(buf == nullptr) {
+    padded_string s(len);
+    if(s.data() == nullptr) {
      std::fclose(fp);
      throw  std::runtime_error("could not allocate memory");
    }
    std::rewind(fp);
-    size_t readb = std::fread(buf, 1, len, fp);
+    size_t readb = std::fread(s.data(), 1, len, fp);
    std::fclose(fp);
    if(readb != len) {
-      aligned_free(buf);
      throw  std::runtime_error("could not read the data");
    }
-    return std::string_view(buf,len);
+    return s;
  }
  throw  std::runtime_error("could not load corpus");
 }
@ -308,16 +307,27 @@ int json_parse(const uint8_t *buf, size_t len, ParsedJson &pj, bool reallocifnee
  }
  bool reallocated = false;
  if(reallocifneeded) {
-      // realloc is needed if the end of the memory crosses a page
+#ifdef ALLOW_SAME_PAGE_BUFFER_OVERRUN
+	  // realloc is needed if the end of the memory crosses a page
 #ifdef _MSC_VER
 	  SYSTEM_INFO sysInfo; 
 	  GetSystemInfo(&sysInfo); 
 	  long pagesize = sysInfo.dwPageSize;
 #else
-     long pagesize = sysconf (_SC_PAGESIZE); 
+    long pagesize = sysconf (_SC_PAGESIZE); 
 #endif
-	 if ( (reinterpret_cast<uintptr_t>(buf + len - 1) % pagesize ) < SIMDJSON_PADDING ) {
-       const uint8_t *tmpbuf  = buf;
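+  //////////////
+  // We want to check whether buf + len - 1 and buf + len - 1 + SIMDJSON_PADDING
+  // are in the same page, that is, whether
+  // (buf + len - 1) / pagesize == (buf + len - 1 + SIMDJSON_PADDING) / pagesize.
+  // That's true if (buf + len - 1) % pagesize + SIMDJSON_PADDING < pagesize.
+  // NOTE(review): the condition below is therefore true when the padded read
+  // stays within the same page, yet that is the branch that reallocates.
+  // The test looks inverted with respect to "realloc is needed if the end of
+  // the memory crosses a page" -- please confirm.
+  ///////////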
+	 if ( (reinterpret_cast<uintptr_t>(buf + len - 1) % pagesize ) + SIMDJSON_PADDING < static_cast<uintptr_t>(pagesize) ) {
+#else // SIMDJSON_SAFE_SAME_PAGE_READ_OVERRUN
+     if(true) { // if not SIMDJSON_SAFE_SAME_PAGE_READ_OVERRUN, we always reallocate
+#endif
+	   const uint8_t *tmpbuf  = buf;
       buf = (uint8_t *) allocate_padded_buffer(len);
       if(buf == NULL) return simdjson::MEMALLOC;
       memcpy((void*)buf,tmpbuf,len);
@ -350,8 +360,18 @@ ParsedJson build_parsed_json(const uint8_t *buf, size_t len, bool reallocifneede
 /* begin file src/stage1_find_marks.cpp */
 #include <cassert>

+
+#ifdef __AVX2__
+
 #ifndef SIMDJSON_SKIPUTF8VALIDATION
 #define SIMDJSON_UTF8VALIDATE
+
+#endif
+#else
+// currently we don't UTF8 validate for ARM
+// also we assume that if you're not __AVX2__ 
+// you're ARM, which is a bit dumb. TODO: Fix...
+#include <arm_neon.h>
 #endif

 // It seems that many parsers do UTF-8 validation.
@ -359,13 +379,51 @@ ParsedJson build_parsed_json(const uint8_t *buf, size_t len, bool reallocifneede
 // allows it.
 #ifdef SIMDJSON_UTF8VALIDATE
 #endif
-using namespace std;

-really_inline void check_utf8(__m256i input_lo, __m256i input_hi,
+#define TRANSPOSE
+
+// One block of input held in SIMD registers, as filled in by fill_input().
+// The layout is selected at compile time:
+//  - AVX2: two 256-bit halves (lo = first 32 bytes, hi = next 32 bytes);
+//  - NEON without TRANSPOSE: four 128-bit registers i0..i3;
+//  - NEON with TRANSPOSE: a single uint8x16x4_t loaded with vld4q_u8.
+// Any other target is rejected with #error.
+struct simd_input {
+#ifdef __AVX2__
+  __m256i lo;
+  __m256i hi;
+#elif defined(__ARM_NEON)
+#ifndef TRANSPOSE
+  uint8x16_t i0;
+  uint8x16_t i1;
+  uint8x16_t i2;
+  uint8x16_t i3;
+#else
+  uint8x16x4_t i;
+#endif
+#else
+#error "It's called SIMDjson for a reason, bro"
+#endif
+};
+
+// Loads the 64 bytes starting at ptr into a simd_input.
+// AVX2 issues two unaligned 32-byte loads; NEON issues four 16-byte loads, or
+// a single vld4q_u8 when TRANSPOSE is defined.
+really_inline simd_input fill_input(const uint8_t * ptr) {
+  simd_input block;
+#ifdef __AVX2__
+  const __m256i *first_half = reinterpret_cast<const __m256i *>(&ptr[0]);
+  const __m256i *second_half = reinterpret_cast<const __m256i *>(&ptr[32]);
+  block.lo = _mm256_loadu_si256(first_half);
+  block.hi = _mm256_loadu_si256(second_half);
+#elif defined(__ARM_NEON)
+#ifndef TRANSPOSE
+  block.i0 = vld1q_u8(&ptr[0]);
+  block.i1 = vld1q_u8(&ptr[16]);
+  block.i2 = vld1q_u8(&ptr[32]);
+  block.i3 = vld1q_u8(&ptr[48]);
+#else
+  block.i = vld4q_u8(ptr);
+#endif
+#endif
+  return block;
+}
+
+#ifdef SIMDJSON_UTF8VALIDATE
+really_inline void check_utf8(simd_input in,
                              __m256i &has_error,
                              struct avx_processed_utf_bytes &previous) {
  __m256i highbit = _mm256_set1_epi8(0x80);
-  if ((_mm256_testz_si256(_mm256_or_si256(input_lo, input_hi), highbit)) == 1) {
+  if ((_mm256_testz_si256(_mm256_or_si256(in.lo, in.hi), highbit)) == 1) {
    // it is ascii, we just check continuation
    has_error = _mm256_or_si256(
        _mm256_cmpgt_epi8(
@ -375,30 +433,101 @@ really_inline void check_utf8(__m256i input_lo, __m256i input_hi,
        has_error);
  } else {
    // it is not ascii so we have to do heavy work
-    previous = avxcheckUTF8Bytes(input_lo, &previous, &has_error);
-    previous = avxcheckUTF8Bytes(input_hi, &previous, &has_error);
+    previous = avxcheckUTF8Bytes(in.lo, &previous, &has_error);
+    previous = avxcheckUTF8Bytes(in.hi, &previous, &has_error);
  }
 }
+#endif
+
+#ifdef __ARM_NEON
+// Packs a 16-lane byte vector into a 16-bit mask: lane k is ANDed with the
+// bit of weight 1 << (k % 8), then three rounds of pairwise adds fold each
+// group of eight lanes into one byte. The low 16 bits of the folded vector
+// are returned.
+// NOTE(review): presumably every lane is 0x00 or 0xFF (a comparison result),
+// so that bit k of the result mirrors lane k -- confirm against callers.
+uint16_t neonmovemask(uint8x16_t input) {
+  const uint8x16_t lane_bits = { 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
+                                 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
+  const uint8x16_t selected = vandq_u8(input, lane_bits);
+  const uint8x16_t pairs = vpaddq_u8(selected, selected);
+  const uint8x16_t quads = vpaddq_u8(pairs, pairs);
+  const uint8x16_t octets = vpaddq_u8(quads, quads);
+  return vgetq_lane_u16(vreinterpretq_u16_u8(octets), 0);
+}
+
+// Packs four 16-lane byte vectors into one 64-bit mask and returns the low
+// 64 bits of the folded vector.
+// NOTE(review): presumably every lane of p0..p3 is 0x00 or 0xFF (the callers
+// visible in this file pass comparison/test results) -- confirm.
+// Two layouts are supported, chosen by TRANSPOSE:
+//  - without TRANSPOSE, each lane keeps the bit of weight 1 << (lane % 8) and
+//    three rounds of pairwise adds fold eight lanes into each output byte;
+//  - with TRANSPOSE, p0..p3 each contribute one bit per nibble (bits 0/4, 1/5,
+//    2/6 and 3/7 respectively), the four vectors are merged into one, and a
+//    single pairwise add joins the low and high nibbles of adjacent lanes.
+really_inline
+uint64_t neonmovemask_bulk(uint8x16_t p0, uint8x16_t p1, uint8x16_t p2, uint8x16_t p3) {
+#ifndef TRANSPOSE
+  const uint8x16_t bitmask = { 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
+                               0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
+  uint8x16_t t0 = vandq_u8(p0, bitmask);
+  uint8x16_t t1 = vandq_u8(p1, bitmask);
+  uint8x16_t t2 = vandq_u8(p2, bitmask);
+  uint8x16_t t3 = vandq_u8(p3, bitmask);
+  // three pairwise-add rounds: 64 lanes -> 32 -> 16 -> 8 meaningful bytes
+  uint8x16_t sum0 = vpaddq_u8(t0, t1);
+  uint8x16_t sum1 = vpaddq_u8(t2, t3);
+  sum0 = vpaddq_u8(sum0, sum1);
+  sum0 = vpaddq_u8(sum0, sum0);
+  return vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0);
+#else
+  // even lanes use the low-nibble bit, odd lanes the matching high-nibble bit
+  const uint8x16_t bitmask1 = { 0x01, 0x10, 0x01, 0x10, 0x01, 0x10, 0x01, 0x10,
+                                0x01, 0x10, 0x01, 0x10, 0x01, 0x10, 0x01, 0x10};
+  const uint8x16_t bitmask2 = { 0x02, 0x20, 0x02, 0x20, 0x02, 0x20, 0x02, 0x20,
+                                0x02, 0x20, 0x02, 0x20, 0x02, 0x20, 0x02, 0x20};
+  const uint8x16_t bitmask3 = { 0x04, 0x40, 0x04, 0x40, 0x04, 0x40, 0x04, 0x40,
+                                0x04, 0x40, 0x04, 0x40, 0x04, 0x40, 0x04, 0x40};
+  const uint8x16_t bitmask4 = { 0x08, 0x80, 0x08, 0x80, 0x08, 0x80, 0x08, 0x80,
+                                0x08, 0x80, 0x08, 0x80, 0x08, 0x80, 0x08, 0x80};
+#if 0
+  // disabled alternative: mask each input separately, then OR them together
+  uint8x16_t t0 = vandq_u8(p0, bitmask1);
+  uint8x16_t t1 = vandq_u8(p1, bitmask2);
+  uint8x16_t t2 = vandq_u8(p2, bitmask3);
+  uint8x16_t t3 = vandq_u8(p3, bitmask4);
+  uint8x16_t tmp = vorrq_u8(vorrq_u8(t0, t1), vorrq_u8(t2, t3));
+#else
+  // active variant: bitwise-select chain; each vbslq_u8 takes the bits set in
+  // its mask from the new input and keeps the remaining bits accumulated so far
+  uint8x16_t t0 = vandq_u8(p0, bitmask1);
+  uint8x16_t t1 = vbslq_u8(bitmask2, p1, t0);
+  uint8x16_t t2 = vbslq_u8(bitmask3, p2, t1);
+  uint8x16_t tmp = vbslq_u8(bitmask4, p3, t2);
+#endif
+  // adjacent lanes hold disjoint nibbles, so adding them merges them
+  uint8x16_t sum = vpaddq_u8(tmp, tmp);
+  return vgetq_lane_u64(vreinterpretq_u64_u8(sum), 0);
+#endif
+}
+#endif

 // a straightforward comparison of a mask against input. 5 uops; would be
 // cheaper in AVX512.
-really_inline uint64_t cmp_mask_against_input(__m256i input_lo,
-                                              __m256i input_hi, __m256i mask) {
-  __m256i cmp_res_0 = _mm256_cmpeq_epi8(input_lo, mask);
+really_inline uint64_t cmp_mask_against_input(simd_input in, uint8_t m) {
+#ifdef __AVX2__
+  const __m256i mask = _mm256_set1_epi8(m);
+  __m256i cmp_res_0 = _mm256_cmpeq_epi8(in.lo, mask);
  uint64_t res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(cmp_res_0));
-  __m256i cmp_res_1 = _mm256_cmpeq_epi8(input_hi, mask);
+  __m256i cmp_res_1 = _mm256_cmpeq_epi8(in.hi, mask);
  uint64_t res_1 = _mm256_movemask_epi8(cmp_res_1);
  return res_0 | (res_1 << 32);
+#elif defined(__ARM_NEON)
+  const uint8x16_t mask = vmovq_n_u8(m); 
+  uint8x16_t cmp_res_0 = vceqq_u8(in.i.val[0], mask); 
+  uint8x16_t cmp_res_1 = vceqq_u8(in.i.val[1], mask); 
+  uint8x16_t cmp_res_2 = vceqq_u8(in.i.val[2], mask); 
+  uint8x16_t cmp_res_3 = vceqq_u8(in.i.val[3], mask); 
+  return neonmovemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3);
+#endif
 }

 // find all values less than or equal than the content of maxval (using unsigned arithmetic) 
-really_inline uint64_t unsigned_lteq_against_input(__m256i input_lo,
-                                              __m256i input_hi, __m256i maxval) {
-  __m256i cmp_res_0 = _mm256_cmpeq_epi8(_mm256_max_epu8(maxval,input_lo),maxval);
+really_inline uint64_t unsigned_lteq_against_input(simd_input in, uint8_t m) {
+#ifdef __AVX2__
+  const __m256i maxval = _mm256_set1_epi8(m);
+  __m256i cmp_res_0 = _mm256_cmpeq_epi8(_mm256_max_epu8(maxval,in.lo),maxval);
  uint64_t res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(cmp_res_0));
-  __m256i cmp_res_1 = _mm256_cmpeq_epi8(_mm256_max_epu8(maxval,input_hi),maxval);
+  __m256i cmp_res_1 = _mm256_cmpeq_epi8(_mm256_max_epu8(maxval,in.hi),maxval);
  uint64_t res_1 = _mm256_movemask_epi8(cmp_res_1);
  return res_0 | (res_1 << 32);
+#elif defined(__ARM_NEON)
+  const uint8x16_t mask = vmovq_n_u8(m); 
+  uint8x16_t cmp_res_0 = vcleq_u8(in.i.val[0], mask); 
+  uint8x16_t cmp_res_1 = vcleq_u8(in.i.val[1], mask); 
+  uint8x16_t cmp_res_2 = vcleq_u8(in.i.val[2], mask); 
+  uint8x16_t cmp_res_3 = vcleq_u8(in.i.val[3], mask); 
+  return neonmovemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3);
+#endif
 }

 // return a bitvector indicating where we have characters that end an odd-length
@ -411,12 +540,11 @@ really_inline uint64_t unsigned_lteq_against_input(__m256i input_lo,
 // backslashes, which modifies our subsequent search for odd-length
 // sequences of backslashes in an obvious way.
 really_inline uint64_t
-find_odd_backslash_sequences(__m256i input_lo, __m256i input_hi,
+find_odd_backslash_sequences(simd_input in,
                             uint64_t &prev_iter_ends_odd_backslash) {
  const uint64_t even_bits = 0x5555555555555555ULL;
  const uint64_t odd_bits = ~even_bits;
-  uint64_t bs_bits =
-      cmp_mask_against_input(input_lo, input_hi, _mm256_set1_epi8('\\'));
+  uint64_t bs_bits = cmp_mask_against_input(in, '\\');
  uint64_t start_edges = bs_bits & ~(bs_bits << 1);
  // flip lowest if we have an odd-length run at the end of the prior
  // iteration
@ -457,22 +585,24 @@ find_odd_backslash_sequences(__m256i input_lo, __m256i input_hi,
 // Note that we don't do any error checking to see if we have backslash
 // sequences outside quotes; these
 // backslash sequences (of any length) will be detected elsewhere.
-really_inline uint64_t find_quote_mask_and_bits(
-    __m256i input_lo, __m256i input_hi, uint64_t odd_ends,
+really_inline uint64_t find_quote_mask_and_bits(simd_input in, uint64_t odd_ends,
    uint64_t &prev_iter_inside_quote, uint64_t &quote_bits, uint64_t &error_mask) {
-  quote_bits =
-      cmp_mask_against_input(input_lo, input_hi, _mm256_set1_epi8('"'));
+  quote_bits = cmp_mask_against_input(in, '"');
  quote_bits = quote_bits & ~odd_ends;
  // remove from the valid quoted region the unescapted characters.
+#ifdef __AVX2__
  uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128(
      _mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFF), 0));
+#elif defined(__ARM_NEON)
+  uint64_t quote_mask = vmull_p64( -1ULL, quote_bits);
+#endif
  quote_mask ^= prev_iter_inside_quote;
  // All Unicode characters may be placed within the
  // quotation marks, except for the characters that MUST be escaped:
  // quotation mark, reverse solidus, and the control characters (U+0000
  //through U+001F).
  // https://tools.ietf.org/html/rfc8259
-  uint64_t unescaped = unsigned_lteq_against_input(input_lo, input_hi, _mm256_set1_epi8(0x1F));
+  uint64_t unescaped = unsigned_lteq_against_input(in, 0x1F);
  error_mask |= quote_mask & unescaped;
  // right shift of a signed value expected to be well-defined and standard
  // compliant as of C++20,
@ -482,8 +612,7 @@ really_inline uint64_t find_quote_mask_and_bits(
  return quote_mask;
 }

-really_inline void find_whitespace_and_structurals(const __m256i input_lo,
-                                                   __m256i input_hi,
+really_inline void find_whitespace_and_structurals(simd_input in,
                                                   uint64_t &whitespace,
                                                   uint64_t &structurals) {
  // do a 'shufti' to detect structural JSON characters
@ -493,26 +622,27 @@ really_inline void find_whitespace_and_structurals(const __m256i input_lo,
  // we are also interested in the four whitespace characters
  // space 0x20, linefeed 0x0a, horizontal tab 0x09 and carriage return 0x0d
  // these go into the next 2 buckets of the comparison (8/16)
+#ifdef __AVX2__
  const __m256i low_nibble_mask = _mm256_setr_epi8(
-      16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0, 16, 0, 0, 0, 0, 0, 0, 0,
-      0, 8, 12, 1, 2, 9, 0, 0);
+      16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0, 
+      16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0);
  const __m256i high_nibble_mask = _mm256_setr_epi8(
-      8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0, 8, 0, 18, 4, 0, 1, 0, 1,
-      0, 0, 0, 3, 2, 1, 0, 0);
+      8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0, 
+      8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0);

  __m256i structural_shufti_mask = _mm256_set1_epi8(0x7);
  __m256i whitespace_shufti_mask = _mm256_set1_epi8(0x18);

  __m256i v_lo = _mm256_and_si256(
-      _mm256_shuffle_epi8(low_nibble_mask, input_lo),
+      _mm256_shuffle_epi8(low_nibble_mask, in.lo),
      _mm256_shuffle_epi8(high_nibble_mask,
-                          _mm256_and_si256(_mm256_srli_epi32(input_lo, 4),
+                          _mm256_and_si256(_mm256_srli_epi32(in.lo, 4),
                                           _mm256_set1_epi8(0x7f))));

  __m256i v_hi = _mm256_and_si256(
-      _mm256_shuffle_epi8(low_nibble_mask, input_hi),
+      _mm256_shuffle_epi8(low_nibble_mask, in.hi),
      _mm256_shuffle_epi8(high_nibble_mask,
-                          _mm256_and_si256(_mm256_srli_epi32(input_hi, 4),
+                          _mm256_and_si256(_mm256_srli_epi32(in.hi, 4),
                                           _mm256_set1_epi8(0x7f))));
  __m256i tmp_lo = _mm256_cmpeq_epi8(
      _mm256_and_si256(v_lo, structural_shufti_mask), _mm256_set1_epi8(0));
@ -532,6 +662,124 @@ really_inline void find_whitespace_and_structurals(const __m256i input_lo,
  uint64_t ws_res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(tmp_ws_lo));
  uint64_t ws_res_1 = _mm256_movemask_epi8(tmp_ws_hi);
  whitespace = ~(ws_res_0 | (ws_res_1 << 32));
+#elif defined(__ARM_NEON)
+#ifndef FUNKY_BAD_TABLE
+  const uint8x16_t low_nibble_mask = (uint8x16_t){ 
+      16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0};
+  const uint8x16_t high_nibble_mask = (uint8x16_t){ 
+      8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0};
+  const uint8x16_t structural_shufti_mask = vmovq_n_u8(0x7); 
+  const uint8x16_t whitespace_shufti_mask = vmovq_n_u8(0x18); 
+  const uint8x16_t low_nib_and_mask = vmovq_n_u8(0xf); 
+
+  uint8x16_t nib_0_lo = vandq_u8(in.i.val[0], low_nib_and_mask);
+  uint8x16_t nib_0_hi = vshrq_n_u8(in.i.val[0], 4);
+  uint8x16_t shuf_0_lo = vqtbl1q_u8(low_nibble_mask, nib_0_lo);
+  uint8x16_t shuf_0_hi = vqtbl1q_u8(high_nibble_mask, nib_0_hi);
+  uint8x16_t v_0 = vandq_u8(shuf_0_lo, shuf_0_hi);
+
+  uint8x16_t nib_1_lo = vandq_u8(in.i.val[1], low_nib_and_mask);
+  uint8x16_t nib_1_hi = vshrq_n_u8(in.i.val[1], 4);
+  uint8x16_t shuf_1_lo = vqtbl1q_u8(low_nibble_mask, nib_1_lo);
+  uint8x16_t shuf_1_hi = vqtbl1q_u8(high_nibble_mask, nib_1_hi);
+  uint8x16_t v_1 = vandq_u8(shuf_1_lo, shuf_1_hi);
+
+  uint8x16_t nib_2_lo = vandq_u8(in.i.val[2], low_nib_and_mask);
+  uint8x16_t nib_2_hi = vshrq_n_u8(in.i.val[2], 4);
+  uint8x16_t shuf_2_lo = vqtbl1q_u8(low_nibble_mask, nib_2_lo);
+  uint8x16_t shuf_2_hi = vqtbl1q_u8(high_nibble_mask, nib_2_hi);
+  uint8x16_t v_2 = vandq_u8(shuf_2_lo, shuf_2_hi);
+
+  uint8x16_t nib_3_lo = vandq_u8(in.i.val[3], low_nib_and_mask);
+  uint8x16_t nib_3_hi = vshrq_n_u8(in.i.val[3], 4);
+  uint8x16_t shuf_3_lo = vqtbl1q_u8(low_nibble_mask, nib_3_lo);
+  uint8x16_t shuf_3_hi = vqtbl1q_u8(high_nibble_mask, nib_3_hi);
+  uint8x16_t v_3 = vandq_u8(shuf_3_lo, shuf_3_hi);
+
+  uint8x16_t tmp_0 = vtstq_u8(v_0, structural_shufti_mask);
+  uint8x16_t tmp_1 = vtstq_u8(v_1, structural_shufti_mask);
+  uint8x16_t tmp_2 = vtstq_u8(v_2, structural_shufti_mask);
+  uint8x16_t tmp_3 = vtstq_u8(v_3, structural_shufti_mask);
+  structurals = neonmovemask_bulk(tmp_0, tmp_1, tmp_2, tmp_3);
+
+  uint8x16_t tmp_ws_0 = vtstq_u8(v_0, whitespace_shufti_mask);
+  uint8x16_t tmp_ws_1 = vtstq_u8(v_1, whitespace_shufti_mask);
+  uint8x16_t tmp_ws_2 = vtstq_u8(v_2, whitespace_shufti_mask);
+  uint8x16_t tmp_ws_3 = vtstq_u8(v_3, whitespace_shufti_mask);
+  whitespace = neonmovemask_bulk(tmp_ws_0, tmp_ws_1, tmp_ws_2, tmp_ws_3);
+#else
+  // I think this one is garbage. In order to save the expense
+  // of another shuffle, I use an equally expensive shift, and 
+  // this gets glued to the end of the dependency chain. Seems a bit
+  // slower for no good reason.
+  //
+  // need to use a weird arrangement. Bytes in this bitvector
+  // are in conventional order, but bits are reversed as we are
+  // using a signed left shift (that is a +ve value from 0..7) to
+  // shift upwards to 0x80 in the bit. So we need to reverse bits.
+  
+  // note no structural/whitespace has the high bit on
+  // so it's OK to put the high 5 bits into our TBL shuffle
+  //
+
+  // structurals are { 0x7b } 0x7d : 0x3a [ 0x5b ] 0x5d , 0x2c
+  // or in 5 bit, 3 bit form thats
+  // (15,3) (15, 5) (7,2) (11,3) (11,5) (5,4) 
+  // bit-reversing (subtract low 3 bits from 7) yields:
+  // (15,4) (15, 2) (7,5) (11,4) (11,2) (5,3) 
+  
+  const uint8x16_t structural_bitvec = (uint8x16_t){ 
+      0, 0, 0, 0, 
+      0, 8, 0, 32, 
+      0, 0, 0, 20, 
+      0, 0, 0, 20};
+  // we are also interested in the four whitespace characters
+  // space 0x20, linefeed 0x0a, horizontal tab 0x09 and carriage return 0x0d
+  // (4,0) (1, 2) (1, 1) (1, 5)
+  // bit-reversing (subtract low 3 bits from 7) yields:
+  // (4,7) (1, 5) (1, 6) (1, 2)
+  
+  const uint8x16_t whitespace_bitvec = (uint8x16_t){ 
+      0, 100, 0, 0, 
+      128, 0, 0, 0, 
+      0, 0, 0, 0, 
+      0, 0, 0, 0};
+  const uint8x16_t low_3bits_and_mask = vmovq_n_u8(0x7); 
+  const uint8x16_t high_1bit_tst_mask = vmovq_n_u8(0x80); 
+
+  int8x16_t low_3bits_0 = vreinterpretq_s8_u8(vandq_u8(in.i.val[0], low_3bits_and_mask));
+  uint8x16_t high_5bits_0 = vshrq_n_u8(in.i.val[0], 3);
+  uint8x16_t shuffle_structural_0 = vshlq_u8(vqtbl1q_u8(structural_bitvec, high_5bits_0), low_3bits_0);
+  uint8x16_t shuffle_ws_0 = vshlq_u8(vqtbl1q_u8(whitespace_bitvec, high_5bits_0), low_3bits_0);
+  uint8x16_t tmp_0 = vtstq_u8(shuffle_structural_0, high_1bit_tst_mask);
+  uint8x16_t tmp_ws_0 = vtstq_u8(shuffle_ws_0, high_1bit_tst_mask);
+
+  int8x16_t low_3bits_1 = vreinterpretq_s8_u8(vandq_u8(in.i.val[1], low_3bits_and_mask));
+  uint8x16_t high_5bits_1 = vshrq_n_u8(in.i.val[1], 3);
+  uint8x16_t shuffle_structural_1 = vshlq_u8(vqtbl1q_u8(structural_bitvec, high_5bits_1), low_3bits_1);
+  uint8x16_t shuffle_ws_1 = vshlq_u8(vqtbl1q_u8(whitespace_bitvec, high_5bits_1), low_3bits_1);
+  uint8x16_t tmp_1 = vtstq_u8(shuffle_structural_1, high_1bit_tst_mask);
+  uint8x16_t tmp_ws_1 = vtstq_u8(shuffle_ws_1, high_1bit_tst_mask);
+
+  int8x16_t low_3bits_2 = vreinterpretq_s8_u8(vandq_u8(in.i.val[2], low_3bits_and_mask));
+  uint8x16_t high_5bits_2 = vshrq_n_u8(in.i.val[2], 3);
+  uint8x16_t shuffle_structural_2 = vshlq_u8(vqtbl1q_u8(structural_bitvec, high_5bits_2), low_3bits_2);
+  uint8x16_t shuffle_ws_2 = vshlq_u8(vqtbl1q_u8(whitespace_bitvec, high_5bits_2), low_3bits_2);
+  uint8x16_t tmp_2 = vtstq_u8(shuffle_structural_2, high_1bit_tst_mask);
+  uint8x16_t tmp_ws_2 = vtstq_u8(shuffle_ws_2, high_1bit_tst_mask);
+
+  int8x16_t low_3bits_3 = vreinterpretq_s8_u8(vandq_u8(in.i.val[3], low_3bits_and_mask));
+  uint8x16_t high_5bits_3 = vshrq_n_u8(in.i.val[3], 3);
+  uint8x16_t shuffle_structural_3 = vshlq_u8(vqtbl1q_u8(structural_bitvec, high_5bits_3), low_3bits_3);
+  uint8x16_t shuffle_ws_3 = vshlq_u8(vqtbl1q_u8(whitespace_bitvec, high_5bits_3), low_3bits_3);
+  uint8x16_t tmp_3 = vtstq_u8(shuffle_structural_3, high_1bit_tst_mask);
+  uint8x16_t tmp_ws_3 = vtstq_u8(shuffle_ws_3, high_1bit_tst_mask);
+
+  structurals = neonmovemask_bulk(tmp_0, tmp_1, tmp_2, tmp_3);
+  whitespace = neonmovemask_bulk(tmp_ws_0, tmp_ws_1, tmp_ws_2, tmp_ws_3);
+#endif
+
+#endif
 }

 // flatten out values in 'bits' assuming that they are are to have values of idx
@ -608,9 +856,9 @@ WARN_UNUSED
 /*never_inline*/ bool find_structural_bits(const uint8_t *buf, size_t len,
                                           ParsedJson &pj) {
  if (len > pj.bytecapacity) {
-    cerr << "Your ParsedJson object only supports documents up to "
+    std::cerr << "Your ParsedJson object only supports documents up to "
         << pj.bytecapacity << " bytes but you are trying to process " << len
-         << " bytes\n";
+         << " bytes" << std::endl;
    return false;
  }
  uint32_t *base_ptr = pj.structural_indexes;
@ -654,32 +902,26 @@ WARN_UNUSED
 #ifndef _MSC_VER
    __builtin_prefetch(buf + idx + 128);
 #endif
-    __m256i input_lo =
-        _mm256_loadu_si256(reinterpret_cast<const __m256i *>(buf + idx + 0));
-    __m256i input_hi =
-        _mm256_loadu_si256(reinterpret_cast<const __m256i *>(buf + idx + 32));
-
+    simd_input in = fill_input(buf+idx);
 #ifdef SIMDJSON_UTF8VALIDATE
-    check_utf8(input_lo, input_hi, has_error, previous);
+    check_utf8(in, has_error, previous);
 #endif
-
    // detect odd sequences of backslashes
    uint64_t odd_ends = find_odd_backslash_sequences(
-        input_lo, input_hi, prev_iter_ends_odd_backslash);
+        in, prev_iter_ends_odd_backslash);

    // detect insides of quote pairs ("quote_mask") and also our quote_bits
    // themselves
    uint64_t quote_bits;
    uint64_t quote_mask = find_quote_mask_and_bits(
-        input_lo, input_hi, odd_ends, prev_iter_inside_quote, quote_bits, error_mask);
+        in, odd_ends, prev_iter_inside_quote, quote_bits, error_mask);

    // take the previous iterations structural bits, not our current iteration,
    // and flatten
    flatten_bits(base_ptr, base, idx, structurals);

    uint64_t whitespace;
-    find_whitespace_and_structurals(input_lo, input_hi, whitespace,
-                                    structurals);
+    find_whitespace_and_structurals(in, whitespace, structurals);

    // fixup structurals to reflect quotes and add pseudo-structural characters
    structurals = finalize_structurals(structurals, whitespace, quote_mask,
@ -695,38 +937,39 @@ WARN_UNUSED
    uint8_t tmpbuf[64];
    memset(tmpbuf, 0x20, 64);
    memcpy(tmpbuf, buf + idx, len - idx);
-    __m256i input_lo =
-        _mm256_loadu_si256(reinterpret_cast<const __m256i *>(tmpbuf + 0));
-    __m256i input_hi =
-        _mm256_loadu_si256(reinterpret_cast<const __m256i *>(tmpbuf + 32));
-
+    simd_input in = fill_input(tmpbuf);
 #ifdef SIMDJSON_UTF8VALIDATE
-    check_utf8(input_lo, input_hi, has_error, previous);
+    check_utf8(in, has_error, previous);
 #endif

    // detect odd sequences of backslashes
    uint64_t odd_ends = find_odd_backslash_sequences(
-        input_lo, input_hi, prev_iter_ends_odd_backslash);
+        in, prev_iter_ends_odd_backslash);

    // detect insides of quote pairs ("quote_mask") and also our quote_bits
    // themselves
    uint64_t quote_bits;
    uint64_t quote_mask = find_quote_mask_and_bits(
-        input_lo, input_hi, odd_ends, prev_iter_inside_quote, quote_bits, error_mask);
+        in, odd_ends, prev_iter_inside_quote, quote_bits, error_mask);

    // take the previous iterations structural bits, not our current iteration,
    // and flatten
    flatten_bits(base_ptr, base, idx, structurals);

    uint64_t whitespace;
-    find_whitespace_and_structurals(input_lo, input_hi, whitespace,
-                                    structurals);
+    find_whitespace_and_structurals(in, whitespace, structurals);

    // fixup structurals to reflect quotes and add pseudo-structural characters
    structurals = finalize_structurals(structurals, whitespace, quote_mask,
                                       quote_bits, prev_iter_ends_pseudo_pred);
    idx += 64;
  }
+
+  // is last string quote closed?
+  if (prev_iter_inside_quote) {
+      return false;
+  }
+
  // finally, flatten out the remaining structurals from the last iteration
  flatten_bits(base_ptr, base, idx, structurals);

@ -734,6 +977,7 @@ WARN_UNUSED
  // a valid JSON file cannot have zero structural indexes - we should have
  // found something
  if (pj.n_structural_indexes == 0u) {
+printf("wacky exit\n");
    return false;
  }
  if (base_ptr[pj.n_structural_indexes - 1] > len) {
@ -748,6 +992,7 @@ WARN_UNUSED
  // make it safe to dereference one beyond this array
  base_ptr[pj.n_structural_indexes] = 0;  
  if (error_mask) {
+printf("had error mask\n");
    return false;
  }
 #ifdef SIMDJSON_UTF8VALIDATE
@ -762,13 +1007,6 @@ bool find_structural_bits(const char *buf, size_t len, ParsedJson &pj) {
 }
 /* end file src/stage1_find_marks.cpp */
 /* begin file src/stage2_build_tape.cpp */
-#ifdef _MSC_VER
-/* Microsoft C/C++-compatible compiler */
-#include <intrin.h>
-#else
-#include <x86intrin.h>
-#endif
-
 #include <cassert>
 #include <cstring>

@ -777,14 +1015,15 @@ bool find_structural_bits(const char *buf, size_t len, ParsedJson &pj) {
 #define PATH_SEP '/'


-using namespace std;
-
 WARN_UNUSED
 really_inline bool is_valid_true_atom(const uint8_t *loc) {
  uint64_t tv = *reinterpret_cast<const uint64_t *>("true    ");
  uint64_t mask4 = 0x00000000ffffffff;
  uint32_t error = 0;
  uint64_t locval; // we want to avoid unaligned 64-bit loads (undefined in C/C++)
+  // this can read up to 7 bytes beyond the buffer size, but we require 
+  // SIMDJSON_PADDING of padding
+  static_assert(sizeof(uint64_t) - 1 <= SIMDJSON_PADDING);
  std::memcpy(&locval, loc, sizeof(uint64_t));
  error = (locval & mask4) ^ tv;
  error |= is_not_structural_or_whitespace(loc[4]);
@ -793,10 +1032,21 @@ really_inline bool is_valid_true_atom(const uint8_t *loc) {

 WARN_UNUSED
 really_inline bool is_valid_false_atom(const uint8_t *loc) {
-  uint64_t fv = *reinterpret_cast<const uint64_t *>("false   ");
+  // We have to use an integer constant because the space in the cast
+  // below would lead to values illegally being qualified
+  // uint64_t fv = *reinterpret_cast<const uint64_t *>("false   ");
+  // using this constant (that is the same false) but nulls out the
+  // unused bits solves that
+  uint64_t fv = 0x00000065736c6166; // takes into account endianness
  uint64_t mask5 = 0x000000ffffffffff;
-  uint32_t error = 0;
+  // we can't use the 32 bit value for checking for errors otherwise
+  // the last character of false (it being 5 byte long!) would be
+  // ignored
+  uint64_t error = 0;
  uint64_t locval; // we want to avoid unaligned 64-bit loads (undefined in C/C++)
+  // this can read up to 7 bytes beyond the buffer size, but we require 
+  // SIMDJSON_PADDING of padding
+  static_assert(sizeof(uint64_t) - 1 <= SIMDJSON_PADDING);
  std::memcpy(&locval, loc, sizeof(uint64_t));
  error = (locval & mask5) ^ fv;
  error |= is_not_structural_or_whitespace(loc[5]);
@ -809,6 +1059,9 @@ really_inline bool is_valid_null_atom(const uint8_t *loc) {
  uint64_t mask4 = 0x00000000ffffffff;
  uint32_t error = 0;
  uint64_t locval; // we want to avoid unaligned 64-bit loads (undefined in C/C++)
+  // this can read up to 7 bytes beyond the buffer size, but we require 
+  // SIMDJSON_PADDING of padding
+  static_assert(sizeof(uint64_t) - 1 <= SIMDJSON_PADDING);
  std::memcpy(&locval, loc, sizeof(uint64_t));
  error = (locval & mask4) ^ nv;
  error |= is_not_structural_or_whitespace(loc[4]);
@ -820,7 +1073,7 @@ really_inline bool is_valid_null_atom(const uint8_t *loc) {
 * The JSON is parsed to a tape, see the accompanying tape.md file
 * for documentation.
 ***********/
-WARN_UNUSED
+WARN_UNUSED  ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER
 int unified_machine(const uint8_t *buf, size_t len, ParsedJson &pj) {
  uint32_t i = 0; // index of the structural character (0,1,2,3...)
  uint32_t idx;   // location of the structural character in the input (buf)
@ -1587,26 +1840,32 @@ bool ParsedJson::dump_raw_tape(std::ostream &os) {
 #include <iterator>

 ParsedJson::iterator::iterator(ParsedJson &pj_) : pj(pj_), depth(0), location(0), tape_length(0), depthindex(nullptr) {
-        if(pj.isValid()) {
-            depthindex = new scopeindex_t[pj.depthcapacity];
-            if(depthindex == nullptr) { return;
-}
-            depthindex[0].start_of_scope = location;
-            current_val = pj.tape[location++];
-            current_type = (current_val >> 56);
-            depthindex[0].scope_type = current_type;
-            if (current_type == 'r') {
-              tape_length = current_val & JSONVALUEMASK;
-              if(location < tape_length) {
+        if(!pj.isValid()) {
+            throw InvalidJSON();
+        }
+        depthindex = new scopeindex_t[pj.depthcapacity];
+        // new[] throws on failure, so depthindex is never null past this point.
+        // depthindex[0] records the scope of the first tape word; the type tag
+        // of a tape word is taken from its bits 56 and up, and for the root
+        // ('r') the bits under JSONVALUEMASK give the tape length.
+        depthindex[0].start_of_scope = location;
+        current_val = pj.tape[location++];
+        current_type = (current_val >> 56);
+        depthindex[0].scope_type = current_type;
+        if (current_type == 'r') {
+            tape_length = current_val & JSONVALUEMASK;
+            if(location < tape_length) {
                current_val = pj.tape[location];
                current_type = (current_val >> 56);
                depth++;
                depthindex[depth].start_of_scope = location;
                depthindex[depth].scope_type = current_type;
              }
-            }
+        } else {
+            delete[] depthindex; // should never happen; the destructor will not run, so free here
+            throw InvalidJSON();
        }
-    }
+}

 ParsedJson::iterator::~iterator() {
      delete[] depthindex;
@ -1614,14 +1873,12 @@ ParsedJson::iterator::~iterator() {

 ParsedJson::iterator::iterator(const iterator &o):
    pj(o.pj), depth(o.depth), location(o.location),
-    tape_length(o.tape_length), current_type(o.current_type),
+    tape_length(0), current_type(o.current_type),
    current_val(o.current_val), depthindex(nullptr) {
    depthindex = new scopeindex_t[pj.depthcapacity];
-    if(depthindex != nullptr) {
-        memcpy(depthindex, o.depthindex, pj.depthcapacity * sizeof(depthindex[0]));
-    } else {
-        tape_length = 0;
-    }
+    // new[] throws on failure instead of returning nullptr, so no null check is needed before the copy
+    memcpy(depthindex, o.depthindex, pj.depthcapacity * sizeof(depthindex[0]));
+    tape_length = o.tape_length;
 }

 ParsedJson::iterator::iterator(iterator &&o):
@ -1741,6 +1998,18 @@ bool ParsedJson::iterator::is_double() const {
    return get_type() == 'd';
 }

+bool ParsedJson::iterator::is_true() const {
+    return get_type() == 't';
+}
+
+bool ParsedJson::iterator::is_false() const {
+    return get_type() == 'f';
+}
+
+bool ParsedJson::iterator::is_null() const {
+    return get_type() == 'n';
+}
+
 bool ParsedJson::iterator::is_object_or_array(uint8_t type) {
    return (type == '[' || (type == '{'));
 }
--- a/singleheader/simdjson.h
+++ b/singleheader/simdjson.h
@ -1,4 +1,4 @@
-/* auto-generated on Wed 13 Mar 2019 21:02:37 EDT. Do not edit! */
+/* auto-generated on Thu May  9 17:40:56 EDT 2019. Do not edit! */
 /* begin file include/simdjson/simdjson_version.h */
 // /include/simdjson/simdjson_version.h automatically generated by release.py, do not change by hand 
 #ifndef SIMDJSON_INCLUDE_SIMDJSON_VERSION 
@ -27,18 +27,11 @@ struct simdjson {
  static const std::string& errorMsg(const int);
 };

-#endif
-/* end file include/simdjson/simdjson.h */
+#endif/* end file include/simdjson/simdjson.h */
 /* begin file include/simdjson/portability.h */
 #ifndef SIMDJSON_PORTABILITY_H
 #define SIMDJSON_PORTABILITY_H

-#if defined(_MSC_VER)
-#include <intrin.h>
-#else
-#include <x86intrin.h>
-#endif
-
 #ifdef _MSC_VER
 /* Microsoft C/C++-compatible compiler */
 #include <intrin.h>
@ -75,7 +68,11 @@ static inline int hamming(uint64_t input_num) {

 #else
 #include <cstdint>
+#include <cstdlib>
+
+#if defined(__BMI2__) || defined(__POPCOUNT__) || defined(__AVX2__)
 #include <x86intrin.h>
+#endif

 static inline bool add_overflow(uint64_t  value1, uint64_t  value2, uint64_t *result) {
 	return __builtin_uaddll_overflow(value1, value2, (unsigned long long*)result);
@ -86,28 +83,34 @@ static inline bool mul_overflow(uint64_t  value1, uint64_t  value2, uint64_t *re

 /* result might be undefined when input_num is zero */
 static inline int trailingzeroes(uint64_t input_num) {
-#ifdef __BMI__
+#ifdef __BMI2__
 	return _tzcnt_u64(input_num);
 #else
-#warning "BMI is missing?"
 	return __builtin_ctzll(input_num);
 #endif
 }

 /* result might be undefined when input_num is zero */
 static inline int leadingzeroes(uint64_t  input_num) {
+#ifdef __BMI2__
 	return _lzcnt_u64(input_num);
+#else
+	return __builtin_clzll(input_num);
+#endif
 }

 /* result might be undefined when input_num is zero */
 static inline int hamming(uint64_t input_num) {
+#ifdef __POPCOUNT__
 	return _popcnt64(input_num);
+#else
+	return __builtin_popcountll(input_num);
+#endif
 }

 #endif // _MSC_VER


-
 // portable version of  posix_memalign
 static inline void *aligned_malloc(size_t alignment, size_t size) {
 	void *p;
@ -123,6 +126,11 @@ static inline void *aligned_malloc(size_t alignment, size_t size) {
 	return p;
 }

+static inline char *aligned_malloc_char(size_t alignment, size_t size) {
+	return static_cast<char *>(aligned_malloc(alignment, size)); // typed wrapper over aligned_malloc
+}
+
+#ifdef __AVX2__

 #ifndef __clang__
 #ifndef _MSC_VER
@ -144,6 +152,7 @@ static inline void _mm256_storeu2_m128i(__m128i *__addr_hi, __m128i *__addr_lo,
 #endif
 #endif

+#endif // AVX_2

 static inline void aligned_free(void *memblock) {
    if(memblock == nullptr) { return; }
@ -156,6 +165,12 @@ static inline void aligned_free(void *memblock) {
 #endif
 }

+
+
+static inline void aligned_free_char(char *memblock) {
+	aligned_free(static_cast<void *>(memblock)); // typed wrapper over aligned_free
+}
+
 #endif // SIMDJSON_PORTABILITY_H
 /* end file include/simdjson/portability.h */
 /* begin file include/simdjson/common_defs.h */
@ -169,7 +184,13 @@ static inline void aligned_free(void *memblock) {
 #define SIMDJSON_MAXSIZE_BYTES 0xFFFFFFFF

 // the input buf should be readable up to buf + SIMDJSON_PADDING
+#ifdef __AVX2__
 #define SIMDJSON_PADDING  sizeof(__m256i)
+#else
+// this is a stopgap; there should be a better description of the
+// main loop and its behavior that abstracts over this
+#define SIMDJSON_PADDING  32
+#endif

 #ifndef _MSC_VER
 // Implemented using Labels as Values which works in GCC and CLANG (and maybe
@ -187,8 +208,8 @@ static inline void aligned_free(void *memblock) {
 #define ISALIGNED_N(ptr, n) (((uintptr_t)(ptr) & ((n)-1)) == 0)

 #ifdef _MSC_VER
-
-
+// Visual Studio won't allow it:
+//#define ALLOW_SAME_PAGE_BUFFER_OVERRUN
 #define really_inline inline
 #define never_inline __declspec(noinline)

@ -204,6 +225,22 @@ static inline void aligned_free(void *memblock) {

 #else

+// for non-Visual Studio compilers, we assume that same-page buffer overrun is fine:
+#ifndef ALLOW_SAME_PAGE_BUFFER_OVERRUN
+#define ALLOW_SAME_PAGE_BUFFER_OVERRUN
+#endif 
+
+// The following is likely unnecessarily complex.
+#ifdef __SANITIZE_ADDRESS__
+// we have GCC, stuck with https://gcc.gnu.org/bugzilla/show_bug.cgi?id=67368
+#undef ALLOW_SAME_PAGE_BUFFER_OVERRUN
+#elif defined(__has_feature)
+// we have CLANG?
+#  if (__has_feature(address_sanitizer))
+#define ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER  __attribute__((no_sanitize("address")))
+#  endif 
+#endif 
+
 #define really_inline inline __attribute__((always_inline, unused))
 #define never_inline inline __attribute__((noinline, unused))

@ -219,8 +256,80 @@ static inline void aligned_free(void *memblock) {

 #endif  // MSC_VER

+// if it does not apply, make it an empty macro
+#ifndef ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER
+#define ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER
+#endif
 #endif // SIMDJSON_COMMON_DEFS_H
 /* end file include/simdjson/common_defs.h */
+/* begin file include/simdjson/padded_string.h */
+#ifndef SIMDJSON_PADDING_STRING_H
+#define SIMDJSON_PADDING_STRING_H
+#include <memory>
+#include <cstring>
+// low-level function to allocate memory with padding so we can read past the
+// "length" bytes safely. If you must provide a pointer to some data, create it
+// with this function: length is the max. size in bytes of the string. The caller
+// is responsible for freeing the memory with aligned_free(...), not free(...).
+char *allocate_padded_buffer(size_t length);
+
+// Simple string with padded allocation.
+// We deliberately forbid copies, users should rely on swap or move
+// constructors.
+class padded_string {
+public:
+  explicit padded_string() noexcept : viable_size(0), data_ptr(nullptr) {}
+  // allocates a buffer for `length` bytes (contents are not set here);
+  // data() is nullptr if allocate_padded_buffer returned nullptr
+  explicit padded_string(size_t length) noexcept
+      : viable_size(length), data_ptr(allocate_padded_buffer(length)) {
+    if (data_ptr != nullptr)
+      data_ptr[length] = '\0'; // easier when you need a c_str
+  }
+  // copies `length` bytes from `data` into a freshly allocated buffer
+  explicit padded_string(const char *data, size_t length) noexcept
+      : viable_size(length), data_ptr(allocate_padded_buffer(length)) {
+    if (data_ptr != nullptr) {
+      memcpy(data_ptr, data, length);
+      data_ptr[length] = '\0'; // easier when you need a c_str
+    }
+  }
+  // copies the bytes of `s`; taken by reference so `s` itself is not copied
+  padded_string(const std::string &s) noexcept
+      : viable_size(s.size()), data_ptr(allocate_padded_buffer(s.size())) {
+    if (data_ptr != nullptr) {
+      memcpy(data_ptr, s.data(), s.size());
+      data_ptr[s.size()] = '\0'; // easier when you need a c_str
+    }
+  }
+  // takes ownership of o's buffer and leaves o empty (size 0, no buffer)
+  padded_string(padded_string &&o) noexcept
+      : viable_size(o.viable_size), data_ptr(o.data_ptr) {
+    o.viable_size = 0;
+    o.data_ptr = nullptr;
+  }
+  void swap(padded_string &o) noexcept { // exchanges buffer and size with o
+    size_t tmp_viable_size = viable_size;
+    char *tmp_data_ptr = data_ptr;
+    viable_size = o.viable_size;
+    data_ptr = o.data_ptr;
+    o.data_ptr = tmp_data_ptr;
+    o.viable_size = tmp_viable_size;
+  }
+  ~padded_string() { aligned_free_char(data_ptr); }
+
+  size_t size() const { return viable_size; } // number of bytes in the string
+  size_t length() const { return viable_size; } // same value as size()
+  char *data() const { return data_ptr; } // nullptr when no buffer is held
+private:
+  padded_string &operator=(const padded_string &o) = delete;
+  padded_string(const padded_string &o) = delete;
+
+  size_t viable_size;
+  char *data_ptr;
+};
+
+#endif
+/* end file include/simdjson/padded_string.h */
 /* begin file include/simdjson/jsoncharutils.h */
 #ifndef SIMDJSON_JSONCHARUTILS_H
 #define SIMDJSON_JSONCHARUTILS_H
@ -273,34 +382,166 @@ really_inline uint32_t is_structural_or_whitespace(uint8_t c) {
  return structural_or_whitespace[c];
 }

-const signed char digittoval[256] = {
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0,  1,  2,  3,  4,  5,  6,  7,  8,
-    9,  -1, -1, -1, -1, -1, -1, -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1};
-
+const uint32_t digittoval32[886] = {
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0x0,        0x1,        0x2,        0x3,        0x4, 0x5,
+    0x6,        0x7,        0x8,        0x9,        0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xa,
+    0xb,        0xc,        0xd,        0xe,        0xf,        0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xa,        0xb,        0xc,        0xd,        0xe,
+    0xf,        0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0x0,        0x10,       0x20,       0x30,       0x40, 0x50,
+    0x60,       0x70,       0x80,       0x90,       0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xa0,
+    0xb0,       0xc0,       0xd0,       0xe0,       0xf0,       0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xa0,       0xb0,       0xc0,       0xd0,       0xe0,
+    0xf0,       0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0x0,        0x100,      0x200,      0x300,      0x400, 0x500,
+    0x600,      0x700,      0x800,      0x900,      0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xa00,
+    0xb00,      0xc00,      0xd00,      0xe00,      0xf00,      0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xa00,      0xb00,      0xc00,      0xd00,      0xe00,
+    0xf00,      0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0x0,        0x1000,     0x2000,     0x3000,     0x4000, 0x5000,
+    0x6000,     0x7000,     0x8000,     0x9000,     0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xa000,
+    0xb000,     0xc000,     0xd000,     0xe000,     0xf000,     0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xa000,     0xb000,     0xc000,     0xd000,     0xe000,
+    0xf000,     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF};
 // returns a value with the high 16 bits set if not valid
 // otherwise returns the conversion of the 4 hex digits at src into the bottom 16 bits of the 32-bit
 // return register
+// digittoval32 holds four overlapping 256-entry tables at offsets 0, 210, 420 and 630 (one per digit position, values pre-shifted); non-hex bytes map to 0xFFFFFFFF
+// see https://lemire.me/blog/2019/04/17/parsing-short-hexadecimal-strings-efficiently/ 
 static inline uint32_t hex_to_u32_nocheck(const uint8_t *src) {// strictly speaking, static inline is a C-ism
-  // all these will sign-extend the chars looked up, placing 1-bits into the high 28 bits of every
-  // invalid value. After the shifts, this will *still* result in the outcome that the high 16 bits of any
-  // value with any invalid char will be all 1's. We check for this in the caller.
-  int32_t v1 = digittoval[src[0]];
-  int32_t v2 = digittoval[src[1]];
-  int32_t v3 = digittoval[src[2]];
-  int32_t v4 = digittoval[src[3]];
-  return static_cast<uint32_t>(v1 << 12 | v2 << 8 | v3 << 4 | v4);
+  uint32_t v1 = digittoval32[630 + src[0]];
+  uint32_t v2 = digittoval32[420 + src[1]];
+  uint32_t v3 = digittoval32[210 + src[2]];
+  uint32_t v4 = digittoval32[0 + src[3]];
+  return v1 | v2 | v3 | v4;
 }

 // given a code point cp, writes to c
@ -557,13 +798,6 @@ static inline void print_with_escapes(const char *src, std::ostream &os,
 #include <string>


-// low-level function to allocate memory with padding so we can read passed the "length" bytes
-// safely.
-// if you must provide a pointer to some data, create it with this function:
-// length is the max. size in bytes of the string
-// caller is responsible to free the memory (free(...))
-char * allocate_padded_buffer(size_t length);
-



@ -573,16 +807,16 @@ char * allocate_padded_buffer(size_t length);
 // throws exceptions in case of failure
 // first element of the pair is a string (null terminated)
 // whereas the second element is the length.
-// caller is responsible to free (free((void*)result.data())))
+// the returned padded_string owns the buffer and frees it in its destructor
 // 
 // throws an exception if the file cannot be opened, use try/catch
 //      try {
 //        p = get_corpus(filename);
 //      } catch (const std::exception& e) { 
-//        free((void*)p.data());//use aligned_free if you plan to use VisualStudio
+//        // p owns its memory: nothing to free here
 //        std::cout << "Could not load the file " << filename << std::endl;
 //      }
-std::string_view  get_corpus(const std::string& filename);
+padded_string get_corpus(const std::string& filename);


 #endif
@ -35789,29 +36023,31 @@ static inline void avxcheckOverlong(__m256i current_bytes,
                                    __m256i *has_error) {
  __m256i off1_hibits = push_last_byte_of_a_to_b(previous_hibits, hibits);
  __m256i initial_mins = _mm256_shuffle_epi8(
-      _mm256_setr_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128,
-                       -128, -128, -128, // 10xx => false
-                       0xC2, -128,       // 110x
-                       0xE1,             // 1110
-                       0xF1, -128, -128, -128, -128, -128, -128, -128, -128,
+      _mm256_setr_epi8(-128, -128, -128, -128, -128, -128, -128, -128,
                       -128, -128, -128, -128, // 10xx => false
                       0xC2, -128,             // 110x
                       0xE1,                   // 1110
-                       0xF1),
+                       0xF1,                   // 1111
+                       -128, -128, -128, -128, -128, -128, -128, -128,
+                       -128, -128, -128, -128, // 10xx => false
+                       0xC2, -128,             // 110x
+                       0xE1,                   // 1110
+                       0xF1),                  // 1111
      off1_hibits);

  __m256i initial_under = _mm256_cmpgt_epi8(initial_mins, off1_current_bytes);

  __m256i second_mins = _mm256_shuffle_epi8(
-      _mm256_setr_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128,
-                       -128, -128, -128, // 10xx => false
-                       127, 127,         // 110x => true
-                       0xA0,             // 1110
-                       0x90, -128, -128, -128, -128, -128, -128, -128, -128,
+      _mm256_setr_epi8(-128, -128, -128, -128, -128, -128, -128, -128,
                       -128, -128, -128, -128, // 10xx => false
                       127, 127,               // 110x => true
                       0xA0,                   // 1110
-                       0x90),
+                       0x90,                   // 1111
+                       -128, -128, -128, -128, -128, -128, -128, -128,
+                       -128, -128, -128, -128, // 10xx => false
+                       127, 127,               // 110x => true
+                       0xA0,                   // 1110
+                       0x90),                  // 1111
      off1_hibits);
  __m256i second_under = _mm256_cmpgt_epi8(second_mins, current_bytes);
  *has_error = _mm256_or_si256(*has_error,
@ -35885,6 +36121,10 @@ static inline size_t jsonminify(const std::string_view & p, char *out) {
    return jsonminify(p.data(), p.size(), out);
 }

+static inline size_t jsonminify(const padded_string & p, char *out) {
+    return jsonminify(p.data(), p.size(), out);
+}
+
 #endif
 /* end file include/simdjson/jsonminifier.h */
 /* begin file include/simdjson/parsedjson.h */
@ -35976,7 +36216,14 @@ public:
      tape[saved_loc] |= val;
  }

+  // Exception thrown when a ParsedJson that is not valid is used where a
+  // valid document is required; what() returns a fixed message.
+  struct InvalidJSON : public std::exception {
+    const char *what() const noexcept override { return "JSON document is invalid"; }
+  };
+
  struct iterator {
+    // might throw InvalidJSON if ParsedJson is invalid
    explicit iterator(ParsedJson &pj_);
    ~iterator();

@ -36034,6 +36281,12 @@ public:

    bool is_double() const;

+    bool is_true() const;
+
+    bool is_false() const;
+
+    bool is_null() const;
+
    static bool is_object_or_array(uint8_t type);

    // when at {, go one level deep, looking for a given key
@ -36234,9 +36487,13 @@ really_inline bool handle_unicode_codepoint(const uint8_t **src_ptr, uint8_t **d
  return offset > 0;
 }

-WARN_UNUSED
-really_inline  bool parse_string(const uint8_t *buf, UNUSED size_t len,
-                                ParsedJson &pj, UNUSED const uint32_t depth, uint32_t offset) {
+#ifdef __ARM_NEON
+#include <arm_neon.h>
+#endif
+
+WARN_UNUSED ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER
+really_inline  bool parse_string(UNUSED const uint8_t *buf, UNUSED size_t len,
+                                ParsedJson &pj, UNUSED const uint32_t depth, UNUSED uint32_t offset) {
 #ifdef SIMDJSON_SKIPSTRINGPARSING // for performance analysis, it is sometimes useful to skip parsing
  pj.write_tape(0, '"');// don't bother with the string parsing at all
  return true; // always succeeds
@ -36246,6 +36503,10 @@ really_inline  bool parse_string(const uint8_t *buf, UNUSED size_t len,
  uint8_t *dst = pj.current_string_buf_loc + sizeof(uint32_t);
  const uint8_t *const start_of_string = dst;
  while (1) {
+#ifdef __AVX2__
+    // this can read up to 31 bytes beyond the buffer size, but we require 
+    // SIMDJSON_PADDING of padding
+    static_assert(sizeof(__m256i) - 1 <= SIMDJSON_PADDING);
    __m256i v = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src));
    // store to dest unconditionally - we can overwrite the bits we don't like
    // later
@ -36255,6 +36516,36 @@ really_inline  bool parse_string(const uint8_t *buf, UNUSED size_t len,
    auto quote_mask = _mm256_cmpeq_epi8(v, _mm256_set1_epi8('"'));
    auto quote_bits =
        static_cast<uint32_t>(_mm256_movemask_epi8(quote_mask));
+#else
+    // this can read up to 31 bytes beyond the buffer size, but we require 
+    // SIMDJSON_PADDING of padding
+    static_assert(2 * sizeof(uint8x16_t) - 1 <= SIMDJSON_PADDING);
+    uint8x16_t v0 = vld1q_u8(src);
+    uint8x16_t v1 = vld1q_u8(src+16);
+    vst1q_u8(dst, v0);
+    vst1q_u8(dst+16, v1);
+    
+    uint8x16_t bs_mask = vmovq_n_u8('\\');
+    uint8x16_t qt_mask = vmovq_n_u8('"');
+    const uint8x16_t bitmask = { 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
+                                 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
+    uint8x16_t cmp_bs_0 = vceqq_u8(v0, bs_mask);
+    uint8x16_t cmp_bs_1 = vceqq_u8(v1, bs_mask);
+    uint8x16_t cmp_qt_0 = vceqq_u8(v0, qt_mask);
+    uint8x16_t cmp_qt_1 = vceqq_u8(v1, qt_mask);
+    
+    cmp_bs_0 = vandq_u8(cmp_bs_0, bitmask);
+    cmp_bs_1 = vandq_u8(cmp_bs_1, bitmask);
+    cmp_qt_0 = vandq_u8(cmp_qt_0, bitmask);
+    cmp_qt_1 = vandq_u8(cmp_qt_1, bitmask);
+
+    uint8x16_t sum0 = vpaddq_u8(cmp_bs_0, cmp_bs_1);
+    uint8x16_t sum1 = vpaddq_u8(cmp_qt_0, cmp_qt_1);
+    sum0 = vpaddq_u8(sum0, sum1);
+    sum0 = vpaddq_u8(sum0, sum0);
+    auto bs_bits =  vgetq_lane_u32(vreinterpretq_u32_u8(sum0), 0);
+    auto quote_bits =  vgetq_lane_u32(vreinterpretq_u32_u8(sum0), 1);
+#endif
    if(((bs_bits - 1) & quote_bits) != 0 ) {
      // we encountered quotes first. Move dst to point to quotes and exit

@ -36414,7 +36705,7 @@ static inline bool is_integer(char c) {
 // probably frequent and it is harder than it looks. We are building all of this
 // just to differentiate between 0x1 (invalid), 0,1 (valid) 0e1 (valid)...
 const bool structural_or_whitespace_or_exponent_or_decimal_negated[256] = {
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1,
@ -36427,11 +36718,13 @@ const bool structural_or_whitespace_or_exponent_or_decimal_negated[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};

 really_inline bool
-is_not_structural_or_whitespace_or_exponent_or_decimal(unsigned char c) {
+is_not_structural_or_whitespace_or_exponent_or_decimal_or_null(unsigned char c) {
  return structural_or_whitespace_or_exponent_or_decimal_negated[c];
 }

+#ifdef __AVX2__
 #define SWAR_NUMBER_PARSING
+#endif

 #ifdef SWAR_NUMBER_PARSING

@ -36441,6 +36734,9 @@ is_not_structural_or_whitespace_or_exponent_or_decimal(unsigned char c) {
 // http://0x80.pl/articles/swar-digits-validate.html
 static inline bool is_made_of_eight_digits_fast(const char *chars) {
  uint64_t val;
+  // this can read up to 7 bytes beyond the buffer size, but we require 
+  // SIMDJSON_PADDING of padding
+  static_assert(7 <= SIMDJSON_PADDING);
  memcpy(&val, chars, 8);
  // a branchy method might be faster:
  // return (( val & 0xF0F0F0F0F0F0F0F0 ) == 0x3030303030303030)
@ -36454,6 +36750,9 @@ static inline bool is_made_of_eight_digits_fast(const char *chars) {
 // this is more efficient apparently than the scalar code above (fewer instructions)
 static inline bool is_made_of_eight_digits_fast(const char *chars) {
  __m64 val;
+  // this can read up to 7 bytes beyond the buffer size, but we require 
+  // SIMDJSON_PADDING of padding
+  static_assert(7 <= SIMDJSON_PADDING);
  memcpy(&val, chars, 8);
  __m64 base = _mm_sub_pi8(val,_mm_set1_pi8('0'));
  __m64 basecmp = _mm_subs_pu8(base,_mm_set1_pi8(9));
@ -36461,6 +36760,23 @@ static inline bool is_made_of_eight_digits_fast(const char *chars) {
 }
 #endif

+// clang-format off
+/***
+Should parse_eight_digits_unrolled be out of the question, one could
+use a standard approach like the following:
+
+static inline uint32_t newparse_eight_digits_unrolled(const char *chars) {
+   uint64_t val;
+   memcpy(&val, chars, sizeof(uint64_t));  
+   val = (val & 0x0F0F0F0F0F0F0F0F) * 2561 >> 8;
+   val = (val & 0x00FF00FF00FF00FF) * 6553601 >> 16;
+   return (val & 0x0000FFFF0000FFFF) * 42949672960001 >> 32;
+}
+
+credit: https://johnnylee-sde.github.io/Fast-numeric-string-to-int/
+*/
+// clang-format on
+
 static inline uint32_t parse_eight_digits_unrolled(const char *chars) {
  // this actually computes *16* values so we are being wasteful.
  const __m128i ascii0 = _mm_set1_epi8('0');
@ -36575,14 +36891,14 @@ parse_float(const uint8_t *const buf,
 #endif
      return false;
    }
-    int exponent = (negexp ? -expnumber : expnumber);
-    if ((exponent > 308) || (exponent < -308)) {
+    if (expnumber > 308) {
 // we refuse to parse this
 #ifdef JSON_TEST_NUMBERS // for unit testing
      foundInvalidNumber(buf + offset);
 #endif
      return false;
    }
+    int exponent = (negexp ? -expnumber : expnumber);
    i *= power_of_ten[308 + exponent];
  }
  if(is_not_structural_or_whitespace(*p)) {
@ -36694,10 +37010,10 @@ static really_inline bool parse_number(const uint8_t *const buf,
  }
  const char *const startdigits = p;

-  int64_t i;
+  uint64_t i; // an unsigned int avoids signed overflows (which are bad)
  if (*p == '0') { // 0 cannot be followed by an integer
    ++p;
-    if (is_not_structural_or_whitespace_or_exponent_or_decimal(*p)) {
+    if (is_not_structural_or_whitespace_or_exponent_or_decimal_or_null(*p)) {
 #ifdef JSON_TEST_NUMBERS // for unit testing
      foundInvalidNumber(buf + offset);
 #endif
@ -36744,7 +37060,6 @@ static really_inline bool parse_number(const uint8_t *const buf,
    if (is_made_of_eight_digits_fast(p)) {
      i = i * 100000000 + parse_eight_digits_unrolled(p);
      p += 8;
-      // exponent -= 8;
    }
 #endif
    while (is_integer(*p)) {
@ -36792,9 +37107,15 @@ static really_inline bool parse_number(const uint8_t *const buf,
 #endif
      return false;
    }
+    if(expnumber > 308) {
+// we refuse to parse this
+#ifdef JSON_TEST_NUMBERS // for unit testing
+        foundInvalidNumber(buf + offset);
+#endif
+        return false;       
+    }
    exponent += (negexp ? -expnumber : expnumber);
  }
-  i = negative ? -i : i;
  if ((exponent != 0) || (expnumber != 0)) {
    if (unlikely(digitcount >= 19)) { // this is uncommon!!!
      // this is almost never going to get called!!!
@ -36811,16 +37132,9 @@ static really_inline bool parse_number(const uint8_t *const buf,
      foundFloat(0.0, buf + offset);
 #endif
    } else {
-      if ((exponent > 308) || (exponent < -308)) {
-// we refuse to parse this
-#ifdef JSON_TEST_NUMBERS // for unit testing
-        foundInvalidNumber(buf + offset);
-#endif
-        return false;
-      }
      double d = i;
+      d = negative ? -d : d;
      d *= power_of_ten[308 + exponent];
-      // d = negative ? -d : d;
      pj.write_tape_double(d);
 #ifdef JSON_TEST_NUMBERS // for unit testing
      foundFloat(d, buf + offset);
@ -36831,6 +37145,7 @@ static really_inline bool parse_number(const uint8_t *const buf,
      return parse_large_integer(buf, pj, offset,
                                 found_minus);
    }
+    i = negative ? 0-i : i;
    pj.write_tape_s64(i);
 #ifdef JSON_TEST_NUMBERS // for unit testing
    foundInteger(i, buf + offset);
@ -36862,20 +37177,23 @@ int unified_machine(const char *buf, size_t len, ParsedJson &pj);
 /* begin file include/simdjson/jsonparser.h */
 #ifndef SIMDJSON_JSONPARSER_H
 #define SIMDJSON_JSONPARSER_H
+#include <string>

-
-// Parse a document found in buf, need to preallocate ParsedJson.
+// Parse a document found in buf. 
+// You need to preallocate ParsedJson with a capacity of len (e.g., pj.allocateCapacity(len)).
 // Return 0 on success, an error code from simdjson/simdjson.h otherwise
-// You can also check validit by calling pj.isValid(). The same ParsedJson can be reused for other documents.
+// You can also check validity by calling pj.isValid(). The same ParsedJson can be reused for other documents.
 //
 // If reallocifneeded is true (default) then a temporary buffer is created when needed during processing
 // (a copy of the input string is made).
 // The input buf should be readable up to buf + len + SIMDJSON_PADDING if reallocifneeded is false,
 // all bytes at and after buf + len  are ignored (can be garbage).
+// The ParsedJson object can be reused.
 WARN_UNUSED
 int json_parse(const uint8_t *buf, size_t len, ParsedJson &pj, bool reallocifneeded = true);

-// Parse a document found in buf, need to preallocate ParsedJson.
+// Parse a document found in buf.
+// You need to preallocate ParsedJson with a capacity of len (e.g., pj.allocateCapacity(len)).
 // Return 0 on success, an error code from simdjson/simdjson.h otherwise. You can also check validity
 // by calling pj.isValid(). The same ParsedJson can be reused for other documents.
 //
@ -36883,22 +37201,48 @@ int json_parse(const uint8_t *buf, size_t len, ParsedJson &pj, bool reallocifnee
 // (a copy of the input string is made).
 // The input buf should be readable up to buf + len + SIMDJSON_PADDING  if reallocifneeded is false,
 // all bytes at and after buf + len  are ignored (can be garbage).
+// The ParsedJson object can be reused.
 WARN_UNUSED
 inline int json_parse(const char * buf, size_t len, ParsedJson &pj, bool reallocifneeded = true) {
  return json_parse(reinterpret_cast<const uint8_t *>(buf), len, pj, reallocifneeded);
 }

-// Parse a document found in buf, need to preallocate ParsedJson.
+// Parse a document found in buf.
+// You need to preallocate ParsedJson with a capacity of len (e.g., pj.allocateCapacity(len)).
 // Return 0 on success, an error code from simdjson/simdjson.h otherwise. You can also check validity
 // by calling pj.isValid(). The same ParsedJson can be reused for other documents.
 //
 // If reallocifneeded is true (default) then a temporary buffer is created when needed during processing
 // (a copy of the input string is made).
-// the input s should be readable up to s.data() + s.size() + SIMDJSON_PADDING  if reallocifneeded is false,
+// The input s should be readable up to s.data() + s.size() + SIMDJSON_PADDING  if reallocifneeded is false,
 // all bytes at and after s.data()+s.size() are ignored (can be garbage).
+// The ParsedJson object can be reused.
+//WARN_UNUSED
+//inline int json_parse(const std::string_view &s, ParsedJson &pj, bool reallocifneeded = true) {
+//  return json_parse(s.data(), s.size(), pj, reallocifneeded);
+//}
+
+
+
+// Parse a document found in string s.
+// You need to preallocate ParsedJson with a capacity of s.length() (e.g., pj.allocateCapacity(s.length())).
+// Return 0 on success, an error code from simdjson/simdjson.h otherwise. You can also check validity
+// by calling pj.isValid(). The same ParsedJson can be reused for other documents.
+//
+// A temporary buffer is created when needed during processing
+// (a copy of the input string is made).
 WARN_UNUSED
-inline int json_parse(const std::string_view &s, ParsedJson &pj, bool reallocifneeded = true) {
-  return json_parse(s.data(), s.size(), pj, reallocifneeded);
+inline int json_parse(const std::string &s, ParsedJson &pj) {
+  return json_parse(s.data(), s.length(), pj, true);
+}
+
+// Parse a document found in string s.
+// You need to preallocate ParsedJson with a capacity of s.length() (e.g., pj.allocateCapacity(s.length())).
+// Return 0 on success, an error code from simdjson/simdjson.h otherwise. You can also check validity
+// by calling pj.isValid(). The same ParsedJson can be reused for other documents.
+WARN_UNUSED
+inline int json_parse(const padded_string &s, ParsedJson &pj) {
+  return json_parse(s.data(), s.length(), pj, false);
 }


@ -36931,9 +37275,33 @@ WARN_UNUSED
 // (a copy of the input string is made).
 // The input s should be readable up to s.data() + s.size() + SIMDJSON_PADDING  if reallocifneeded is false,
 // all bytes at and after s.data()+s.size() are ignored (can be garbage).
-inline ParsedJson build_parsed_json(const std::string_view &s, bool reallocifneeded = true) {
-  return build_parsed_json(s.data(), s.size(), reallocifneeded);
+//inline ParsedJson build_parsed_json(const std::string_view &s, bool reallocifneeded = true) {
+ // return build_parsed_json(s.data(), s.size(), reallocifneeded);
+//}
+
+// Parse a document found in string s.
+// The result is returned as a new ParsedJson (the caller does not pass one in).
+// There is no integer return code: check the outcome by calling isValid() on the
+// returned ParsedJson.
+//
+// A temporary buffer is created when needed during processing
+// (a copy of the input string is made).
+WARN_UNUSED
+inline ParsedJson build_parsed_json(const std::string &s) {
+  return build_parsed_json(s.data(), s.length(), true);
 }

+
+// Parse a document found in string s.
+// The result is returned as a new ParsedJson (the caller does not pass one in).
+// There is no integer return code: check the outcome by calling isValid() on the
+// returned ParsedJson.
+WARN_UNUSED
+inline ParsedJson build_parsed_json(const padded_string &s) {
+  return build_parsed_json(s.data(), s.length(), false);
+}
+
+
+
 #endif
 /* end file include/simdjson/jsonparser.h */
--- a/src/jsonioutil.cpp
+++ b/src/jsonioutil.cpp
@ -7,28 +7,27 @@ char * allocate_padded_buffer(size_t length) {
    //return (char *) malloc(length + SIMDJSON_PADDING);
    // However, we might as well align to cache lines...
    size_t totalpaddedlength = length + SIMDJSON_PADDING;
-    char *padded_buffer = (char *) aligned_malloc(64, totalpaddedlength);
+    char *padded_buffer = aligned_malloc_char(64, totalpaddedlength);
    return padded_buffer;
 }

-std::string_view get_corpus(const std::string& filename) {
+padded_string get_corpus(const std::string& filename) {
  std::FILE *fp = std::fopen(filename.c_str(), "rb");
  if (fp != nullptr) {
    std::fseek(fp, 0, SEEK_END);
    size_t len = std::ftell(fp);
-    char * buf = allocate_padded_buffer(len);
-    if(buf == nullptr) {
+    padded_string s(len);
+    if(s.data() == nullptr) {
      std::fclose(fp);
      throw  std::runtime_error("could not allocate memory");
    }
    std::rewind(fp);
-    size_t readb = std::fread(buf, 1, len, fp);
+    size_t readb = std::fread(s.data(), 1, len, fp);
    std::fclose(fp);
    if(readb != len) {
-      aligned_free(buf);
      throw  std::runtime_error("could not read the data");
    }
-    return std::string_view(buf,len);
+    return s;
  }
  throw  std::runtime_error("could not load corpus");
 }
--- a/src/stage1_find_marks.cpp
+++ b/src/stage1_find_marks.cpp
@ -23,7 +23,6 @@
 #ifdef SIMDJSON_UTF8VALIDATE
 #include "simdjson/simdutf8check.h"
 #endif
-using namespace std;

 #define TRANSPOSE

@ -501,9 +500,9 @@ WARN_UNUSED
 /*never_inline*/ bool find_structural_bits(const uint8_t *buf, size_t len,
                                           ParsedJson &pj) {
  if (len > pj.bytecapacity) {
-    cerr << "Your ParsedJson object only supports documents up to "
+    std::cerr << "Your ParsedJson object only supports documents up to "
         << pj.bytecapacity << " bytes but you are trying to process " << len
-         << " bytes\n";
+         << " bytes" << std::endl;
    return false;
  }
  uint32_t *base_ptr = pj.structural_indexes;
--- a/src/stage2_build_tape.cpp
+++ b/src/stage2_build_tape.cpp
@ -12,14 +12,15 @@
 #define PATH_SEP '/'


-using namespace std;
-
 WARN_UNUSED
 really_inline bool is_valid_true_atom(const uint8_t *loc) {
  uint64_t tv = *reinterpret_cast<const uint64_t *>("true    ");
  uint64_t mask4 = 0x00000000ffffffff;
  uint32_t error = 0;
  uint64_t locval; // we want to avoid unaligned 64-bit loads (undefined in C/C++)
+  // this can read up to 7 bytes beyond the buffer size, but we require 
+  // SIMDJSON_PADDING of padding
+  static_assert(sizeof(uint64_t) - 1 <= SIMDJSON_PADDING);
  std::memcpy(&locval, loc, sizeof(uint64_t));
  error = (locval & mask4) ^ tv;
  error |= is_not_structural_or_whitespace(loc[4]);
@ -40,6 +41,9 @@ really_inline bool is_valid_false_atom(const uint8_t *loc) {
  // ignored
  uint64_t error = 0;
  uint64_t locval; // we want to avoid unaligned 64-bit loads (undefined in C/C++)
+  // this can read up to 7 bytes beyond the buffer size, but we require 
+  // SIMDJSON_PADDING of padding
+  static_assert(sizeof(uint64_t) - 1 <= SIMDJSON_PADDING);
  std::memcpy(&locval, loc, sizeof(uint64_t));
  error = (locval & mask5) ^ fv;
  error |= is_not_structural_or_whitespace(loc[5]);
@ -52,6 +56,9 @@ really_inline bool is_valid_null_atom(const uint8_t *loc) {
  uint64_t mask4 = 0x00000000ffffffff;
  uint32_t error = 0;
  uint64_t locval; // we want to avoid unaligned 64-bit loads (undefined in C/C++)
+  // this can read up to 7 bytes beyond the buffer size, but we require 
+  // SIMDJSON_PADDING of padding
+  static_assert(sizeof(uint64_t) - 1 <= SIMDJSON_PADDING);
  std::memcpy(&locval, loc, sizeof(uint64_t));
  error = (locval & mask4) ^ nv;
  error |= is_not_structural_or_whitespace(loc[4]);
--- a/tests/allparserscheckfile.cpp
+++ b/tests/allparserscheckfile.cpp
@ -40,7 +40,6 @@ bool fastjson_parse(const char *input) {


 using namespace rapidjson;
-using namespace std;

 int main(int argc, char *argv[]) {
  bool verbose = false;
@ -55,14 +54,14 @@ int main(int argc, char *argv[]) {
        abort ();
      }
  if (optind >= argc) {
-    cerr << "Usage: " << argv[0] << " <jsonfile>\n";
-    cerr << "Or " << argv[0] << " -v <jsonfile>\n";
+    std::cerr << "Usage: " << argv[0] << " <jsonfile>" << std::endl;
+    std::cerr << "Or " << argv[0] << " -v <jsonfile>" << std::endl;
    exit(1);
  }
  const char * filename = argv[optind];
-  std::string_view p;
+  padded_string p;
  try {
-    p = get_corpus(filename);
+    get_corpus(filename).swap(p);
  } catch (const std::exception& e) { // caught by reference to base
    std::cout << "Could not load the file " << filename << std::endl;
    return EXIT_FAILURE;
@ -83,7 +82,7 @@ int main(int argc, char *argv[]) {
    std::cerr << "can't allocate memory" << std::endl;
    return EXIT_FAILURE;
  }
-  bool ours_correct = json_parse(p, pj) == 0; // returns 0 on success
+  bool ours_correct = (json_parse(p, pj) == 0); // returns 0 on success

  rapidjson::Document d;

@ -103,7 +102,7 @@ int main(int argc, char *argv[]) {
  void *state;
  bool ultrajson_correct = ((UJDecode(buffer, p.size(), NULL, &state) == NULL) == false);

-  auto tokens = make_unique<jsmntok_t[]>(p.size());
+  auto tokens = std::make_unique<jsmntok_t[]>(p.size());
  bool jsmn_correct = false;
  if(tokens == nullptr) {
    printf("Failed to alloc memory for jsmn\n");
@ -145,7 +144,6 @@ int main(int argc, char *argv[]) {
  printf("cjson                      : %s \n", cjson_correct ? "correct":"invalid");
  printf("jsoncpp                    : %s \n", isjsoncppok ? "correct":"invalid");

-  aligned_free((void*)p.data());
  free(buffer);
  return EXIT_SUCCESS;
 }
--- a/tests/jsoncheck.cpp
+++ b/tests/jsoncheck.cpp
@ -65,9 +65,9 @@ bool validate(const char *dirname) {
      } else {
        strcpy(fullpath + dirlen, name);
      }
-      std::string_view p;
+      padded_string p;
      try {
-        p = get_corpus(fullpath);
+        get_corpus(fullpath).swap(p);
      } catch (const std::exception& e) {
        std::cerr << "Could not load the file " << fullpath << std::endl;
        return EXIT_FAILURE;
@ -80,7 +80,6 @@ bool validate(const char *dirname) {
      }
      ++howmany;
      const int parseRes = json_parse(p, pj);
-      aligned_free((void*)p.data());
      printf("%s\n", parseRes == 0 ? "ok" : "invalid");
      if(contains("EXCLUDE",name)) {
        // skipping
--- a/tests/numberparsingcheck.cpp
+++ b/tests/numberparsingcheck.cpp
@ -79,8 +79,8 @@ inline void foundFloat(double result, const uint8_t *buf) {
    parse_error |= PARSE_ERROR;
  }
  // we want to get some reasonable relative accuracy
-  else if (fabs(expected - result) / fmin(fabs(expected), fabs(result)) >
-      1e-14) {
+  else if (fabs(expected - result)  >
+      1e-14 * fmin(fabs(expected), fabs(result))) {
    fprintf(stderr, "parsed %.128e from \n", result);
    fprintf(stderr, "       %.32s whereas strtod gives\n", buf);
    fprintf(stderr, "       %.128e,", expected);
@ -128,9 +128,9 @@ bool validate(const char *dirname) {
      } else {
        strcpy(fullpath + dirlen, name);
      }
-      std::string_view p;
+      padded_string p;
      try {
-        p = get_corpus(fullpath);
+        get_corpus(fullpath).swap(p);
      } catch (const std::exception& e) { 
        std::cout << "Could not load the file " << fullpath << std::endl;
        return EXIT_FAILURE;
@ -154,7 +154,6 @@ bool validate(const char *dirname) {
               float_count, invalid_count,
               int_count + float_count + invalid_count);
      }
-      aligned_free((void*)p.data());
      free(fullpath);
    }
  }
--- a/tests/singleheadertest.cpp
+++ b/tests/singleheadertest.cpp
@ -3,7 +3,7 @@

 int main() {
  const char * filename = JSON_TEST_PATH; 
-  std::string_view p = get_corpus(filename);
+  padded_string p = get_corpus(filename);
  ParsedJson pj = build_parsed_json(p); // do the parsing
  if( ! pj.isValid() ) {
    return EXIT_FAILURE;
@ -16,6 +16,5 @@ int main() {
    std::cerr << simdjson::errorMsg(res) << std::endl;
    return EXIT_FAILURE;
  }
-  aligned_free((void*)p.data());
  return EXIT_SUCCESS;
-}
+}
--- a/tests/stringparsingcheck.cpp
+++ b/tests/stringparsingcheck.cpp
@ -325,9 +325,9 @@ bool validate(const char *dirname) {
      } else {
        strcpy(fullpath + dirlen, name);
      }
-      std::string_view p;
+      padded_string p;
      try {
-        p = get_corpus(fullpath);
+        get_corpus(fullpath).swap(p);
      } catch (const std::exception& e) { 
        std::cout << "Could not load the file " << fullpath << std::endl;
        return EXIT_FAILURE;
@ -341,7 +341,6 @@ bool validate(const char *dirname) {
      bigbuffer = (char *) malloc(p.size());
      if(bigbuffer == NULL) {
        std::cerr << "can't allocate memory" << std::endl;
-        aligned_free((void*)p.data());
        return false;
      }
      bad_string = 0;
@ -350,7 +349,6 @@ bool validate(const char *dirname) {
      empty_string = 0;
      bool isok = json_parse(p, pj);
      free(bigbuffer);
-      aligned_free((void*)p.data());
      if (good_string > 0) {
        printf("File %40s %s --- bad strings: %10zu \tgood strings: %10zu\t "
               "empty strings: %10zu "
--- a/tools/json2json.cpp
+++ b/tools/json2json.cpp
@ -5,8 +5,6 @@
 #include "simdjson/jsonioutil.h"
 #include "simdjson/jsonparser.h"

-using namespace std;
-
 void compute_dump(ParsedJson::iterator &pjh) {
  if (pjh.is_object()) {
    std::cout << "{";
@ -64,19 +62,19 @@ int main(int argc, char *argv[]) {
  int optind = 1;
 #endif
  if (optind >= argc) {
-    cerr << "Reads json in, out the result of the parsing. " << endl;
-    cerr << "Usage: " << argv[0] << " <jsonfile>" << endl;
-    cerr << "The -d flag dumps the raw content of the tape." << endl;
+    std::cerr << "Reads json in, out the result of the parsing. " << std::endl;
+    std::cerr << "Usage: " << argv[0] << " <jsonfile>" << std::endl;
+    std::cerr << "The -d flag dumps the raw content of the tape." << std::endl;

    exit(1);
  }
  const char *filename = argv[optind];
  if (optind + 1 < argc) {
-    cerr << "warning: ignoring everything after " << argv[optind + 1] << endl;
+    std::cerr << "warning: ignoring everything after " << argv[optind + 1] << std::endl;
  }
-  std::string_view p;
+  padded_string p;
  try {
-    p = get_corpus(filename);
+    get_corpus(filename).swap(p);
  } catch (const std::exception &e) { // caught by reference to base
    std::cout << "Could not load the file " << filename << std::endl;
    return EXIT_FAILURE;
@ -88,7 +86,6 @@ int main(int argc, char *argv[]) {
    return EXIT_FAILURE;
  }
  int res = json_parse(p, pj); // do the parsing, return false on error
-  aligned_free((void *)p.data());
  if (res) {
    std::cerr << " Parsing failed. " << std::endl;
    return EXIT_FAILURE;
--- a/tools/jsonstats.cpp
+++ b/tools/jsonstats.cpp
@ -3,8 +3,6 @@
 #include "simdjson/jsonioutil.h"
 #include "simdjson/jsonparser.h"

-using namespace std;
-
 size_t count_nonasciibytes(const uint8_t* input, size_t length) {
  size_t count = 0;
  for(size_t i = 0; i < length; i++) {
@ -43,7 +41,7 @@ using stat_t = struct stat_s;



-stat_t simdjson_computestats(const std::string_view &p) {
+stat_t simdjson_computestats(const padded_string &p) {
  stat_t answer;
  ParsedJson pj = build_parsed_json(p);
  answer.valid = pj.isValid();
@ -119,18 +117,17 @@ stat_t simdjson_computestats(const std::string_view &p) {
 int main(int argc, char *argv[]) {
  int optind = 1;
  if (optind >= argc) {
-    cerr << "Reads json, prints stats. " << endl;
-    cerr << "Usage: " << argv[0] << " <jsonfile>" << endl;
-
+    std::cerr << "Reads json, prints stats. " << std::endl;
+    std::cerr << "Usage: " << argv[0] << " <jsonfile>" << std::endl;
    exit(1);
  }
  const char *filename = argv[optind];
  if (optind + 1 < argc) {
    std::cerr << "warning: ignoring everything after " << argv[optind + 1] << std::endl;
  }
-  std::string_view p;
+  padded_string p;
  try {
-    p = get_corpus(filename);
+    get_corpus(filename).swap(p);
  } catch (const std::exception &e) { // caught by reference to base
    std::cerr << "Could not load the file " << filename << std::endl;
    return EXIT_FAILURE;
--- a/tools/minify.cpp
+++ b/tools/minify.cpp
@ -8,15 +8,14 @@ int main(int argc, char *argv[]) {
    std::cerr << "Usage: " << argv[0] << " <jsonfile>\n";
    exit(1);
  }
-  std::string_view p;
+  padded_string p;
  std::string filename = argv[argc - 1];
  try{
-    p = get_corpus(filename);
+    get_corpus(filename).swap(p);
  } catch (const std::exception& e) { 
        std::cout << "Could not load the file " << filename << std::endl;
        return EXIT_FAILURE;
  }
-  jsonminify(p, const_cast<char *>(p.data()));
+  jsonminify(p, p.data());
  printf("%s",p.data());
-  aligned_free((void*)p.data());
 }