Even safer.

2018-12-10 20:54:31 -05:00 · 2018-12-10 20:54:31 -05:00 · e4703a383b
parent 8a2281269c
commit e4703a383b
10 changed files with 75 additions and 22 deletions
--- a/README.md
+++ b/README.md
@ -70,7 +70,7 @@ To simplify the engineering, we make some assumptions.
 - We support UTF-8 (and thus ASCII), nothing else (no Latin, no UTF-16).
 - We assume AVX2 support which is available in all recent mainstream x86 processors produced by AMD and Intel. No support for non-x86 processors is included.
 - We only support GNU GCC and LLVM Clang at this time. There is no support for Microsoft Visual Studio, though it should not be difficult.
- We expect the input memory pointer to be padded (e.g., with spaces) so that it can be read entirely in blocks of 512 bits (a cache line). In practice, this means that users may allocate the memory where the JSON bytes are located using the `allocate_padded_buffer` function or the equivalent. Of course, the data you may want to process could be on a buffer that does have this padding. However, copying the data is relatively cheap (much cheaper than parsing JSON), and we can eventually remove this constraint.
+- We expect the input memory to be readable up to 32 bytes beyond the end of the JSON document (to support fast vector loads). All bytes beyond the end of the JSON document are ignored (can be garbage) and the JSON document does not need to be null-terminated. You can allocate a properly overallocated memory region with the provided `allocate_padded_buffer` function or simply by allocating your memory with extra capacity (`malloc(length + SIMDJSON_PADDING)`).

 ## Features

@ -78,7 +78,7 @@ To simplify the engineering, we make some assumptions.
 - We do full UTF-8 validation as part of the parsing. (Parsers like fastjson, gason and dropbox json11 do not do UTF-8 validation.)
 - We fully validate the numbers. (Parsers like gason and ultrajson will accept `[0e+]` as valid JSON.)
 - We validate string content for unescaped characters. (Parsers like fastjson and ultrajson accept unescaped line breaks and tags in strings.)
- The input string is unmodified.
+- The input string is unmodified. (Parsers like sajson and RapidJSON overwrite the input string.)

 ## Architecture

--- a/include/simdjson/common_defs.h
+++ b/include/simdjson/common_defs.h
@ -2,6 +2,10 @@

 #include <cassert>

+// the input buf should be readable up to buf + len + SIMDJSON_PADDING
+#define SIMDJSON_PADDING  sizeof(__m256i) 
+
+
 typedef unsigned char u8;
 typedef unsigned short u16;
 typedef unsigned int u32;
--- a/include/simdjson/jsonparser.h
+++ b/include/simdjson/jsonparser.h
@ -7,18 +7,27 @@
 #include "simdjson/stage2_flatten.h"
 #include "simdjson/stage34_unified.h"

+
+
+
 // Parse a document found in buf, need to preallocate ParsedJson.
 // Return false in case of a failure. You can also check validity 
 // by calling pj.isValid(). The same ParsedJson can be reused.
+// the input buf should be readable up to buf + len + SIMDJSON_PADDING 
+// all bytes at and after buf + len  are ignored (can be garbage)
 WARN_UNUSED
 bool json_parse(const u8 *buf, size_t len, ParsedJson &pj);

+// the input buf should be readable up to buf + len + SIMDJSON_PADDING 
+// all bytes at and after buf + len  are ignored (can be garbage)
 WARN_UNUSED
 static inline bool json_parse(const char * buf, size_t len, ParsedJson &pj) {
  return json_parse((const u8 *) buf, len, pj);
 }

 // convenience function
+// the input s should be readable up to s.data() + s.size() + SIMDJSON_PADDING 
+// all bytes at and after s.data()+s.size() are ignored (can be garbage)
 WARN_UNUSED
 static inline bool json_parse(const std::string_view &s, ParsedJson &pj) {
  return json_parse(s.data(), s.size(), pj);
@ -27,16 +36,22 @@ static inline bool json_parse(const std::string_view &s, ParsedJson &pj) {

 // Build a ParsedJson object. You can check validity 
 // by calling pj.isValid(). This does memory allocation.
+// the input buf should be readable up to buf + len + SIMDJSON_PADDING 
+// all bytes at and after buf + len  are ignored (can be garbage)
 WARN_UNUSED
 ParsedJson build_parsed_json(const u8 *buf, size_t len);

 WARN_UNUSED
+// the input buf should be readable up to buf + len + SIMDJSON_PADDING 
+// all bytes at and after buf + len  are ignored (can be garbage)
 static inline ParsedJson build_parsed_json(const char * buf, size_t len) {
  return build_parsed_json((const u8 *) buf, len);
 }

 // convenience function
 WARN_UNUSED
+// the input s should be readable up to s.data() + s.size() + SIMDJSON_PADDING 
+// all bytes at and after s.data()+s.size() are ignored (can be garbage)
 static inline ParsedJson build_parsed_json(const std::string_view &s) {
  return build_parsed_json(s.data(), s.size());
 }
--- a/jsonchecker/pass06.json
+++ b/jsonchecker/pass06.json
@ -1 +1 @@
-true
+true
--- a/jsonchecker/pass07.json
+++ b/jsonchecker/pass07.json
@ -1 +1 @@
-null
+null
--- a/jsonchecker/pass08.json
+++ b/jsonchecker/pass08.json
@ -1 +1 @@
-1
+1
--- a/jsonchecker/pass09.json
+++ b/jsonchecker/pass09.json
@ -1 +1 @@
-false
+false
--- a/src/jsonioutil.cpp
+++ b/src/jsonioutil.cpp
@ -3,14 +3,15 @@


 char * allocate_padded_buffer(size_t length) {
-    char *aligned_buffer;
-    size_t paddedlength = ROUNDUP_N(length, 64);
-    // allocate an extra sizeof(__m256i) just so we can always use AVX safely
-    size_t totalpaddedlength = paddedlength + 1 + sizeof(__m256i);
-    if (posix_memalign((void **)&aligned_buffer, 64, totalpaddedlength)) {
-      throw std::runtime_error("Could not allocate sufficient memory");
+    // we could do a simple malloc
+    //return (char *) malloc(length + SIMDJSON_PADDING);
+    // However, we might as well align to cache lines...
+    char *padded_buffer;
+    size_t totalpaddedlength = length + SIMDJSON_PADDING;
+    if (posix_memalign((void **)&padded_buffer, 64, totalpaddedlength)) {
+      return NULL;
    };
-    return aligned_buffer;
+    return padded_buffer;
 }

 std::string_view get_corpus(std::string filename) {
--- a/src/jsonparser.cpp
+++ b/src/jsonparser.cpp
@ -14,9 +14,10 @@ bool json_parse(const u8 *buf, size_t len, ParsedJson &pj) {
    isok = flatten_indexes(len, pj);
  } else {
    return false;
-  }
+  }//printf("ok\n");
  if (isok) {
    isok = unified_machine(buf, len, pj);
+    //printf("ok %d \n",isok);
  } else {
    return false;
  }
--- a/src/stage34_unified.cpp
+++ b/src/stage34_unified.cpp
@ -125,24 +125,54 @@ bool unified_machine(const u8 *buf, size_t len, ParsedJson &pj) {
    }
    break;
  }
-  case 't':
-    if (!is_valid_true_atom(buf + idx)) {
+  case 't': {
+    // we need to make a copy to make sure that the string is NULL terminated.
+    // this only applies to the JSON document made solely of the true value.
+    // this will almost never be called in practice
+    char * copy = (char *) malloc(len + SIMDJSON_PADDING);
+    if(copy == NULL) goto fail;
+    memcpy(copy, buf, len);
+    copy[len] = '\0';
+    if (!is_valid_true_atom((const u8 *)copy + idx)) {
+      free(copy);
      goto fail;
    }
+    free(copy);
    pj.write_tape(0, c);
    break;
-  case 'f':
-    if (!is_valid_false_atom(buf + idx)) {
+  }
+  case 'f': {
+    // we need to make a copy to make sure that the string is NULL terminated.
+    // this only applies to the JSON document made solely of the false value.
+    // this will almost never be called in practice
+    char * copy = (char *) malloc(len + SIMDJSON_PADDING);
+    if(copy == NULL) goto fail;
+    memcpy(copy, buf, len);
+    copy[len] = '\0';
+    if (!is_valid_false_atom((const u8 *)copy + idx)) {
+      free(copy);
      goto fail;
    }
+    free(copy);
    pj.write_tape(0, c);
    break;
-  case 'n':
-    if (!is_valid_null_atom(buf + idx)) {
+  }
+  case 'n': {
+    // we need to make a copy to make sure that the string is NULL terminated.
+    // this only applies to the JSON document made solely of the null value.
+    // this will almost never be called in practice
+    char * copy = (char *) malloc(len + SIMDJSON_PADDING);
+    if(copy == NULL) goto fail;
+    memcpy(copy, buf, len);
+    copy[len] = '\0';
+    if (!is_valid_null_atom((const u8 *)copy + idx)) {
+      free(copy);
      goto fail;
    }
+    free(copy);
    pj.write_tape(0, c);
    break;
+  }
  case '0': 
  case '1':
  case '2':
@ -155,7 +185,8 @@ bool unified_machine(const u8 *buf, size_t len, ParsedJson &pj) {
  case '9': {
    // we need to make a copy to make sure that the string is NULL terminated.
    // this is done only for JSON documents made of a sole number
-    char * copy = (char *) malloc(len + 1 + 64);
+    // this will almost never be called in practice
+    char * copy = (char *) malloc(len + SIMDJSON_PADDING);
    if(copy == NULL) goto fail;
    memcpy(copy, buf, len);
    copy[len] = '\0';
@ -169,7 +200,8 @@ bool unified_machine(const u8 *buf, size_t len, ParsedJson &pj) {
  case '-': {
    // we need to make a copy to make sure that the string is NULL terminated.
    // this is done only for JSON documents made of a sole number
-    char * copy = (char *) malloc(len + 1 + 64);
+    // this will almost never be called in practice
+    char * copy = (char *) malloc(len + SIMDJSON_PADDING);
    if(copy == NULL) goto fail;
    memcpy(copy, buf, len);
    copy[len] = '\0';