Fixing issue. (#193)

2019-06-20 18:49:47 -04:00 · 2019-06-20 18:49:47 -04:00 · d7f7f1b200
parent 8914b12db5
commit d7f7f1b200
17 changed files with 144 additions and 55 deletions
--- a/README.md
+++ b/README.md
@ -92,6 +92,7 @@ padded_string p = get_corpus(filename);
 ParsedJson pj = build_parsed_json(p); // do the parsing
 if( ! pj.isValid() ) {
    // something went wrong
+    std::cout << pj.getErrorMsg() << std::endl;
 }
 ```

@ -127,6 +128,7 @@ std::string mystring = ... //
 ParsedJson pj = build_parsed_json(mystring); // do the parsing
 if( ! pj.isValid() ) {
    // something went wrong
+    std::cout << pj.getErrorMsg() << std::endl;
 }
 ```

@ -148,6 +150,7 @@ int main(int argc, char *argv[]) {
  ParsedJson pj = build_parsed_json(p); // do the parsing
  if( ! pj.isValid() ) {
    std::cout << "not valid" << std::endl;
+    std::cout << pj.getErrorMsg() << std::endl;
  } else {
    std::cout << "valid" << std::endl;
  }
--- a/benchmark/parse.cpp
+++ b/benchmark/parse.cpp
@ -144,7 +144,7 @@ int main(int argc, char *argv[]) {
      std::cout << "[verbose] allocated memory for parsed JSON " << std::endl;
    }
    unified.start();
-    isok = find_structural_bits(p.data(), p.size(), pj);
+    isok = (find_structural_bits(p.data(), p.size(), pj) == simdjson::SUCCESS);
    unified.end(results);
    cy1 += results[0];
    cl1 += results[1];
@ -185,18 +185,20 @@ int main(int argc, char *argv[]) {
    }

    auto start = std::chrono::steady_clock::now();
-    isok = find_structural_bits(p.data(), p.size(), pj);
+    isok = (find_structural_bits(p.data(), p.size(), pj) == simdjson::SUCCESS);
    isok = isok && (simdjson::SUCCESS == unified_machine(p.data(), p.size(), pj));
    auto end = std::chrono::steady_clock::now();
    std::chrono::duration<double> secs = end - start;
    res[i] = secs.count();
    if(! isok) {
+      std::cerr << pj.getErrorMsg() << std::endl;
      std::cerr << "Could not parse. " << std::endl;
      return EXIT_FAILURE;
    }
  }  
  ParsedJson pj = build_parsed_json(p); // do the parsing again to get the stats
  if (!pj.isValid()) {
+    std::cerr << pj.getErrorMsg() << std::endl;
    std::cerr << "Could not parse. " << std::endl;
    return EXIT_FAILURE;
  }
--- a/benchmark/statisticalmodel.cpp
+++ b/benchmark/statisticalmodel.cpp
@ -180,7 +180,7 @@ int main(int argc, char *argv[]) {
  results.resize(evts.size());
  for (uint32_t i = 0; i < iterations; i++) {
    unified.start();
-    bool isok = find_structural_bits(p.data(), p.size(), pj);
+    bool isok = (find_structural_bits(p.data(), p.size(), pj) == simdjson::SUCCESS);
    unified.end(results);
    
    cy1 += results[0];
--- a/include/simdjson/jsoncharutils.h
+++ b/include/simdjson/jsoncharutils.h
@ -11,7 +11,7 @@

 // these are the chars that can follow a true/false/null or number atom
 // and nothing else
-const uint32_t structural_or_whitespace_negated[256] = {
+const uint32_t structural_or_whitespace_or_null_negated[256] = {
    0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
@ -28,13 +28,37 @@ const uint32_t structural_or_whitespace_negated[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};

+// return non-zero if not a structural, whitespace, or null char
+// zero otherwise
+really_inline uint32_t is_not_structural_or_whitespace_or_null(uint8_t c) {
+  return structural_or_whitespace_or_null_negated[c];
+}
+
+
+const uint32_t structural_or_whitespace_negated[256] = {
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
+
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1,
+
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+
 // return non-zero if not a structural or whitespace char
 // zero otherwise
 really_inline uint32_t is_not_structural_or_whitespace(uint8_t c) {
  return structural_or_whitespace_negated[c];
 }

-const uint32_t structural_or_whitespace[256] = {
+const uint32_t structural_or_whitespace_or_null[256] = {
    1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@ -47,6 +71,24 @@ const uint32_t structural_or_whitespace[256] = {
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

+really_inline uint32_t is_structural_or_whitespace_or_null(uint8_t c) {
+  return structural_or_whitespace_or_null[c];
+}
+
+
+const uint32_t structural_or_whitespace[256] = {
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+
 really_inline uint32_t is_structural_or_whitespace(uint8_t c) {
  return structural_or_whitespace[c];
 }
--- a/include/simdjson/numberparsing.h
+++ b/include/simdjson/numberparsing.h
@ -90,7 +90,7 @@ static inline bool is_integer(char c) {
 // probably frequent and it is hard than it looks. We are building all of this
 // just to differentiate between 0x1 (invalid), 0,1 (valid) 0e1 (valid)...
 const bool structural_or_whitespace_or_exponent_or_decimal_negated[256] = {
-    0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1,
@ -103,7 +103,7 @@ const bool structural_or_whitespace_or_exponent_or_decimal_negated[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};

 really_inline bool
-is_not_structural_or_whitespace_or_exponent_or_decimal_or_null(unsigned char c) {
+is_not_structural_or_whitespace_or_exponent_or_decimal(unsigned char c) {
  return structural_or_whitespace_or_exponent_or_decimal_negated[c];
 }

@ -380,6 +380,12 @@ static never_inline bool parse_large_integer(const uint8_t *const buf,

 // parse the number at buf + offset
 // define JSON_TEST_NUMBERS for unit testing
+//
+// It is assumed that the number is followed by a structural character (one of
+// , : [ ] { }) or a whitespace character. If that is not the case (e.g., when the
+// JSON document is made of a single number), then it is necessary to copy the
+// content and append a space before calling this function.
+//
 static really_inline bool parse_number(const uint8_t *const buf,
                                       ParsedJson &pj,
                                       const uint32_t offset,
@ -405,7 +411,7 @@ static really_inline bool parse_number(const uint8_t *const buf,
  uint64_t i; // an unsigned int avoids signed overflows (which are bad)
  if (*p == '0') { // 0 cannot be followed by an integer
    ++p;
-    if (is_not_structural_or_whitespace_or_exponent_or_decimal_or_null(*p)) {
+    if (is_not_structural_or_whitespace_or_exponent_or_decimal(*p)) {
 #ifdef JSON_TEST_NUMBERS // for unit testing
      foundInvalidNumber(buf + offset);
 #endif
@ -430,7 +436,6 @@ static really_inline bool parse_number(const uint8_t *const buf,
      ++p;
    }
  }
-
  int64_t exponent = 0;
  bool is_float = false;
  if ('.' == *p) {
--- a/include/simdjson/parsedjson.h
+++ b/include/simdjson/parsedjson.h
@ -6,7 +6,7 @@
 #include <cstring>
 #include <iomanip>
 #include <iostream>
-
+#include "simdjson/simdjson.h"
 #include "simdjson/common_defs.h"
 #include "simdjson/jsonformatutils.h"
 #include "simdjson/portability.h"
@ -34,8 +34,16 @@ public:
  WARN_UNUSED
  bool allocateCapacity(size_t len, size_t maxdepth = DEFAULTMAXDEPTH);

+  // returns true if the document parsed was valid
  bool isValid() const;

+  // returns the error code of the last parsing attempt (see simdjson.h);
+  // returns simdjson::UNITIALIZED if no parsing was attempted
+  int getErrorCode() const;
+
+  // returns the human-readable message for the code given by getErrorCode()
+  std::string getErrorMsg() const;
+
  // deallocate memory and set capacity to zero, called automatically by the
  // destructor
  void deallocate();
@ -297,6 +305,7 @@ private:
  uint8_t *string_buf; // should be at least bytecapacity
  uint8_t *current_string_buf_loc;
  bool isvalid{false};
+  int errorcode{simdjson::UNITIALIZED};

 private :

--- a/include/simdjson/simdjson.h
+++ b/include/simdjson/simdjson.h
@ -15,7 +15,12 @@ struct simdjson {
    F_ATOM_ERROR, // Problem while parsing an atom starting with the letter 'f'
    N_ATOM_ERROR, // Problem while parsing an atom starting with the letter 'n'
    NUMBER_ERROR, // Problem while parsing a number
-    UTF8_ERROR // the input is not valid UTF-8
+    UTF8_ERROR, // the input is not valid UTF-8
+    UNITIALIZED, // unknown error, or uninitialized document
+    EMPTY, // no structural document found
+    UNESCAPED_CHARS, // found unescaped characters in a string.
+    UNCLOSED_STRING, // missing quote at the end
+    UNEXPECTED_ERROR // indicative of a bug in simdjson
  };
  static const std::string& errorMsg(const int);
 };
--- a/include/simdjson/stage1_find_marks.h
+++ b/include/simdjson/stage1_find_marks.h
@ -6,9 +6,9 @@
 struct ParsedJson;

 WARN_UNUSED
-bool find_structural_bits(const uint8_t *buf, size_t len, ParsedJson &pj);
+int find_structural_bits(const uint8_t *buf, size_t len, ParsedJson &pj);

 WARN_UNUSED
-bool find_structural_bits(const char *buf, size_t len, ParsedJson &pj);
+int find_structural_bits(const char *buf, size_t len, ParsedJson &pj);

 #endif
--- a/jsonchecker/fail66.json
+++ b/jsonchecker/fail66.json
--- a/jsonchecker/fail67.json
+++ b/jsonchecker/fail67.json
@ -0,0 +1 @@
+44
--- a/jsonchecker/fail68.json
+++ b/jsonchecker/fail68.json
--- a/jsonchecker/fail69.json
+++ b/jsonchecker/fail69.json
--- a/src/jsonparser.cpp
+++ b/src/jsonparser.cpp
@ -42,10 +42,11 @@ int json_parse(const uint8_t *buf, size_t len, ParsedJson &pj, bool reallocifnee
       reallocated = true;
     }
  }
-  bool stage1_is_ok = find_structural_bits(buf, len, pj);
-  if(!stage1_is_ok) {
-    return simdjson::UTF8_ERROR;
-  }
+  int stage1_is_ok = find_structural_bits(buf, len, pj);
+  if(stage1_is_ok != simdjson::SUCCESS) {
+    pj.errorcode = stage1_is_ok;
+    return pj.errorcode;
+  } 
  int res = unified_machine(buf, len, pj);
  if(reallocated) { aligned_free((void*)buf);}
  return res;
@ -56,9 +57,7 @@ ParsedJson build_parsed_json(const uint8_t *buf, size_t len, bool reallocifneede
  ParsedJson pj;
  bool ok = pj.allocateCapacity(len);
  if(ok) {
-    int res = json_parse(buf, len, pj, reallocifneeded);
-    ok = res == simdjson::SUCCESS;
-    assert(ok == pj.isValid());
+    (void)json_parse(buf, len, pj, reallocifneeded);
  } else {
    std::cerr << "failure during memory allocation " << std::endl;
  }
--- a/src/parsedjson.cpp
+++ b/src/parsedjson.cpp
@ -92,6 +92,14 @@ bool ParsedJson::isValid() const {
    return isvalid;
 }

+int ParsedJson::getErrorCode() const {
+    return errorcode;
+}
+
+std::string ParsedJson::getErrorMsg() const {
+  return simdjson::errorMsg(errorcode);
+}
+
 void ParsedJson::deallocate() {
    bytecapacity = 0;
    depthcapacity = 0;
--- a/src/simdjson.cpp
+++ b/src/simdjson.cpp
@ -11,7 +11,11 @@ const std::map<int, const std::string> errorStrings = {
    {simdjson::F_ATOM_ERROR, "Problem while parsing an atom starting with the letter 'f'"},
    {simdjson::N_ATOM_ERROR, "Problem while parsing an atom starting with the letter 'n'"},
    {simdjson::NUMBER_ERROR, "Problem while parsing a number"},
-    {simdjson::UTF8_ERROR, "The input is not valid UTF-8"}
+    {simdjson::UTF8_ERROR, "The input is not valid UTF-8"},
+    {simdjson::UNITIALIZED, "Unitialized"},
+    {simdjson::EMPTY, "Empty"},
+    {simdjson::UNESCAPED_CHARS, "Within strings, some characters must be escapted, we found unescapted characters"},
+    {simdjson::UNEXPECTED_ERROR, "Unexpected error, consider reporting this problem as you may have found a bug in simdjson"},
 };

 const std::string& simdjson::errorMsg(const int errorCode) {
--- a/src/stage1_find_marks.cpp
+++ b/src/stage1_find_marks.cpp
@ -609,7 +609,7 @@ really_inline uint64_t finalize_structurals(
  // following it.

  // a qualified predecessor is something that can happen 1 position before an
-  // psuedo-structural character
+  // pseudo-structural character
  uint64_t pseudo_pred = structurals | whitespace;

  uint64_t shifted_pseudo_pred =
@ -626,13 +626,13 @@ really_inline uint64_t finalize_structurals(
 }

 WARN_UNUSED
-/*never_inline*/ bool find_structural_bits(const uint8_t *buf, size_t len,
+/*never_inline*/ int find_structural_bits(const uint8_t *buf, size_t len,
                                           ParsedJson &pj) {
  if (len > pj.bytecapacity) {
    std::cerr << "Your ParsedJson object only supports documents up to "
         << pj.bytecapacity << " bytes but you are trying to process " << len
         << " bytes" << std::endl;
-    return false;
+    return simdjson::CAPACITY;
  }
  uint32_t *base_ptr = pj.structural_indexes;
  uint32_t base = 0;
@ -740,7 +740,7 @@ WARN_UNUSED

  // is last string quote closed?
  if (prev_iter_inside_quote) {
-      return false;
+      return simdjson::UNCLOSED_STRING;
  }

  // finally, flatten out the remaining structurals from the last iteration
@ -750,12 +750,12 @@ WARN_UNUSED
  // a valid JSON file cannot have zero structural indexes - we should have
  // found something
  if (pj.n_structural_indexes == 0u) {
-printf("wacky exit\n");
-    return false;
+    fprintf(stderr, "Empty document?\n");
+    return simdjson::EMPTY;
  }
  if (base_ptr[pj.n_structural_indexes - 1] > len) {
    fprintf(stderr, "Internal bug\n");
-    return false;
+    return simdjson::UNEXPECTED_ERROR;
  }
  if (len != base_ptr[pj.n_structural_indexes - 1]) {
    // the string might not be NULL terminated, but we add a virtual NULL ending
@ -765,16 +765,16 @@ printf("wacky exit\n");
  // make it safe to dereference one beyond this array
  base_ptr[pj.n_structural_indexes] = 0;  
  if (error_mask) {
-printf("had error mask\n");
-    return false;
+    fprintf(stderr, "Unescaped characters\n");
+    return simdjson::UNESCAPED_CHARS;
  }
 #ifdef SIMDJSON_UTF8VALIDATE
-  return _mm256_testz_si256(has_error, has_error) != 0;
+    return _mm256_testz_si256(has_error, has_error) == 0 ? simdjson::UTF8_ERROR : simdjson::SUCCESS;
 #else
-  return true;
+  return simdjson::SUCCESS;
 #endif
 }

-bool find_structural_bits(const char *buf, size_t len, ParsedJson &pj) {
+int find_structural_bits(const char *buf, size_t len, ParsedJson &pj) {
  return find_structural_bits(reinterpret_cast<const uint8_t *>(buf), len, pj);
 }
--- a/src/stage2_build_tape.cpp
+++ b/src/stage2_build_tape.cpp
@ -80,9 +80,10 @@ int unified_machine(const uint8_t *buf, size_t len, ParsedJson &pj) {
  uint8_t c; // used to track the (structural) character we are looking at, updated
        // by UPDATE_CHAR macro
  uint32_t depth = 0; // could have an arbitrary starting depth
-  pj.init();
+  pj.init(); // sets isvalid to false
  if(pj.bytecapacity < len) {
-      return simdjson::CAPACITY;
+      pj.errorcode = simdjson::CAPACITY;
+      return pj.errorcode;
  }
 // this macro reads the next structural character, updating idx, i and c.
 #define UPDATE_CHAR()                                                          \
@ -149,7 +150,7 @@ int unified_machine(const uint8_t *buf, size_t len, ParsedJson &pj) {
    break;
  }
  case 't': {
-    // we need to make a copy to make sure that the string is NULL terminated.
+    // we need to make a copy to make sure that the string is space terminated.
    // this only applies to the JSON document made solely of the true value.
    // this will almost never be called in practice
    char * copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING));
@ -157,7 +158,7 @@ int unified_machine(const uint8_t *buf, size_t len, ParsedJson &pj) {
      goto fail;
    }
    memcpy(copy, buf, len);
-    copy[len] = '\0';
+    copy[len] = ' ';
    if (!is_valid_true_atom(reinterpret_cast<const uint8_t *>(copy) + idx)) {
      free(copy);
      goto fail;
@ -167,7 +168,7 @@ int unified_machine(const uint8_t *buf, size_t len, ParsedJson &pj) {
    break;
  }
  case 'f': {
-    // we need to make a copy to make sure that the string is NULL terminated.
+    // we need to make a copy to make sure that the string is space terminated.
    // this only applies to the JSON document made solely of the false value.
    // this will almost never be called in practice
    char * copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING));
@ -175,7 +176,7 @@ int unified_machine(const uint8_t *buf, size_t len, ParsedJson &pj) {
      goto fail;
    }
    memcpy(copy, buf, len);
-    copy[len] = '\0';
+    copy[len] = ' ';
    if (!is_valid_false_atom(reinterpret_cast<const uint8_t *>(copy) + idx)) {
      free(copy);
      goto fail;
@ -185,7 +186,7 @@ int unified_machine(const uint8_t *buf, size_t len, ParsedJson &pj) {
    break;
  }
  case 'n': {
-    // we need to make a copy to make sure that the string is NULL terminated.
+    // we need to make a copy to make sure that the string is space terminated.
    // this only applies to the JSON document made solely of the null value.
    // this will almost never be called in practice
    char * copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING));
@ -193,7 +194,7 @@ int unified_machine(const uint8_t *buf, size_t len, ParsedJson &pj) {
      goto fail;
    }
    memcpy(copy, buf, len);
-    copy[len] = '\0';
+    copy[len] = ' ';
    if (!is_valid_null_atom(reinterpret_cast<const uint8_t *>(copy) + idx)) {
      free(copy);
      goto fail;
@ -212,15 +213,17 @@ int unified_machine(const uint8_t *buf, size_t len, ParsedJson &pj) {
  case '7':
  case '8':
  case '9': {
-    // we need to make a copy to make sure that the string is NULL terminated.
+    // we need to make a copy to make sure that the string is space terminated.
    // this is done only for JSON documents made of a sole number
-    // this will almost never be called in practice
+    // this will almost never be called in practice. We terminate with a space
+    // because we do not want to allow NUL characters in the middle of a number
+    // (whereas a space in the middle of a number would be identified in stage 1).
    char * copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING));
    if(copy == nullptr) { 
      goto fail;
    }
    memcpy(copy, buf, len);
-    copy[len] = '\0';
+    copy[len] = ' ';
    if (!parse_number(reinterpret_cast<const uint8_t *>(copy), pj, idx, false)) {
      free(copy);
      goto fail;
@ -522,22 +525,25 @@ succeed:
                          pj.get_current_loc());
  pj.write_tape(pj.containing_scope_offset[depth], 'r'); // r is root

-
-
  pj.isvalid  = true;
-  return simdjson::SUCCESS;
+  pj.errorcode = simdjson::SUCCESS;
+  return pj.errorcode;
 fail:
+  // we do not need the next line because this is done by pj.init(), pessimistically.
+  // pj.isvalid  = false;
  // At this point in the code, we have all the time in the world.
  // Note that we know exactly where we are in the document so we could,
  // without any overhead on the processing code, report a specific location.
  // We could even trigger special code paths to assess what happened carefully,
  // all without any added cost.
  if (depth >= pj.depthcapacity) {
-    return simdjson::DEPTH_ERROR;
+    pj.errorcode = simdjson::DEPTH_ERROR;
+    return pj.errorcode;
  }
  switch(c) {
-    case '"': 
-      return simdjson::STRING_ERROR;
+    case '"':
+      pj.errorcode = simdjson::STRING_ERROR; 
+      return pj.errorcode;
    case '0':
    case '1':
    case '2':
@ -549,17 +555,22 @@ fail:
    case '8':
    case '9': 
    case '-': 
-      return simdjson::NUMBER_ERROR;
+      pj.errorcode = simdjson::NUMBER_ERROR;
+      return pj.errorcode;
    case 't':
-      return simdjson::T_ATOM_ERROR;
+      pj.errorcode = simdjson::T_ATOM_ERROR;
+      return pj.errorcode;
    case 'n':
-      return simdjson::N_ATOM_ERROR;
+      pj.errorcode = simdjson::N_ATOM_ERROR;
+      return pj.errorcode;
    case 'f':
-      return simdjson::F_ATOM_ERROR;
+      pj.errorcode = simdjson::F_ATOM_ERROR;
+      return pj.errorcode;
    default: 
      break;
  }
-  return simdjson::TAPE_ERROR; 
+  pj.errorcode = simdjson::TAPE_ERROR;
+  return pj.errorcode; 
 }

 int unified_machine(const char *buf, size_t len, ParsedJson &pj) {