Cleaning.

2018-12-10 16:47:02 -05:00 · 2018-12-10 16:47:02 -05:00 · 05636f3a1d
parent 7fda77d51a
commit 05636f3a1d
8 changed files with 47 additions and 22 deletions
--- a/README.md
+++ b/README.md
@ -71,7 +71,6 @@ To simplify the engineering, we make some assumptions.
 - We assume AVX2 support which is available in all recent mainstream x86 processors produced by AMD and Intel. No support for non-x86 processors is included.
 - We only support GNU GCC and LLVM Clang at this time. There is no support for Microsoft Visual Studio, though it should not be difficult.
 - We expect the input memory pointer to be padded (e.g., with spaces) so that it can be read entirely in blocks of 512 bits (a cache line). In practice, this means that users should allocate the memory where the JSON bytes are located using the `allocate_aligned_buffer` function or the equivalent. Of course, the data you want to process may be in a buffer that does not have this padding. However, copying the data is relatively cheap (much cheaper than parsing JSON), and we can eventually remove this constraint.
- The input string should be NULL terminated.

 ## Features

--- a/benchmark/parsingcompetition.cpp
+++ b/benchmark/parsingcompetition.cpp
@ -101,15 +101,15 @@ int main(int argc, char *argv[]) {
  BEST_TIME("RapidJSON", 
      d.Parse<kParseValidateEncodingFlag>((const char *)buffer).HasParseError(),
      false, memcpy(buffer, p.data(), p.size()), repeat, volume, true);
-  BEST_TIME("RapidJSON Insitu", d.ParseInsitu<kParseValidateEncodingFlag>(buffer).HasParseError(), false,
+  BEST_TIME("RapidJSON (insitu)", d.ParseInsitu<kParseValidateEncodingFlag>(buffer).HasParseError(), false,
            memcpy(buffer, p.data(), p.size()), repeat, volume, true);

-  BEST_TIME("sajson (dynamic mem)", sajson::parse(sajson::dynamic_allocation(), sajson::mutable_string_view(p.size(), buffer)).is_valid(), true, memcpy(buffer, p.data(), p.size()), repeat, volume, true);
+  BEST_TIME("sajson (dynamic mem, insitu)", sajson::parse(sajson::dynamic_allocation(), sajson::mutable_string_view(p.size(), buffer)).is_valid(), true, memcpy(buffer, p.data(), p.size()), repeat, volume, true);

  size_t astbuffersize = p.size();
  size_t * ast_buffer = (size_t *) malloc(astbuffersize * sizeof(size_t));

-  BEST_TIME("sajson (static alloc)", sajson::parse(sajson::bounded_allocation(ast_buffer, astbuffersize), sajson::mutable_string_view(p.size(), buffer)).is_valid(), true, memcpy(buffer, p.data(), p.size()), repeat, volume, true);
+  BEST_TIME("sajson (static alloc, insitu)", sajson::parse(sajson::bounded_allocation(ast_buffer, astbuffersize), sajson::mutable_string_view(p.size(), buffer)).is_valid(), true, memcpy(buffer, p.data(), p.size()), repeat, volume, true);
  std::string json11err;
  if(all) BEST_TIME("dropbox (json11)     ",  (( json11::Json::parse(buffer,json11err).is_null() ) || ( ! json11err.empty() )), false, memcpy(buffer, p.data(), p.size()), repeat, volume, true);

--- a/include/simdjson/jsonformatutils.h
+++ b/include/simdjson/jsonformatutils.h
@ -7,6 +7,14 @@
 static inline void print_with_escapes(const unsigned char *src) {
  while (*src) {
    switch (*src) {
+    case '\b':
+      putchar('\\');
+      putchar('b');
+      break;
+    case '\f':
+      putchar('\\');
+      putchar('f');
+      break;
    case '\n':
      putchar('\\');
      putchar('n');
@ -40,6 +48,14 @@ static inline void print_with_escapes(const unsigned char *src) {
 static inline void print_with_escapes(const unsigned char *src, std::ostream &os) {
  while (*src) {
    switch (*src) {
+    case '\b':
+      os << '\\';
+      os << 'b';
+      break;
+    case '\f':
+      os << '\\';
+      os << 'f';
+      break;
    case '\n':
      os << '\\';
      os << 'n';
--- a/include/simdjson/jsonparser.h
+++ b/include/simdjson/jsonparser.h
@ -10,7 +10,6 @@
 // Parse a document found in buf, need to preallocate ParsedJson.
 // Return false in case of a failure. You can also check validity 
 // by calling pj.isValid(). The same ParsedJson can be reused.
-// The string should be NULL terminated.
 WARN_UNUSED
 bool json_parse(const u8 *buf, size_t len, ParsedJson &pj);

--- a/include/simdjson/parsedjson.h
+++ b/include/simdjson/parsedjson.h
@ -132,20 +132,21 @@ public:
    size_t *inobjectidx = new size_t[depthcapacity];
    int depth = 1; // only root at level 0
    inobjectidx[depth] = 0;
+    inobject[depth] = false;
    for (; tapeidx < howmany; tapeidx++) {
      tape_val = tape[tapeidx];
      u64 payload = tape_val & JSONVALUEMASK;
      type = (tape_val >> 56);
      if (!inobject[depth]) {
        if ((inobjectidx[depth] > 0) && (type != ']'))
-          os << ",  ";
+          os << ",";
        inobjectidx[depth]++;
      } else { // if (inobject) {
        if ((inobjectidx[depth] > 0) && ((inobjectidx[depth] & 1) == 0) &&
            (type != '}'))
-          os << ",  ";
+          os << ",";
        if (((inobjectidx[depth] & 1) == 1))
-          os << " : ";
+          os << ":";
        inobjectidx[depth]++;
      }
      switch (type) {
@ -176,7 +177,6 @@ public:
        os << "false";
        break;
      case '{': // we have an object
-        os << '\n';
        os << '{';
        depth++;
        inobject[depth] = true;
@ -187,7 +187,6 @@ public:
        os << '}';
        break;
      case '[': // we start an array
-        os << '\n';
        os << '['; 
        depth++;
        inobject[depth] = false;
@ -237,6 +236,7 @@ public:
        os << "string \""; 
        print_with_escapes((const unsigned char *)(string_buf + payload));
        os << '"';
+        os << '\n';
        break;
      case 'l': // we have a long int
        if (tapeidx + 1 >= howmany)
--- a/src/jsonioutil.cpp
+++ b/src/jsonioutil.cpp
@ -25,7 +25,7 @@ std::string_view get_corpus(std::string filename) {
    }
    std::rewind(fp);
    std::fread(buf, 1, len, fp);
-    buf[len] = '\0';
+    //buf[len] = '\0';
    std::fclose(fp);
    return std::string_view(buf,len);
  }
--- a/src/stage1_find_marks.cpp
+++ b/src/stage1_find_marks.cpp
@ -13,8 +13,7 @@
 #define UTF8VALIDATE
 // It seems that many parsers do UTF-8 validation.
 // RapidJSON does not do it by default, but a flag
-// allows it. It appears that sajson might do utf-8
-// validation
+// allows it. 
 #ifdef UTF8VALIDATE
 #include "simdjson/simdutf8check.h"
 #endif
@ -61,7 +60,7 @@ WARN_UNUSED
  // effectively the very first char is considered to follow "whitespace" for the
  // purposes of pseudo-structural character detection
  u64 prev_iter_ends_pseudo_pred = 1ULL;
-  size_t lenminus64 = len + 1 < 64 ? 0 : len + 1  - 64; // len + 1 because of the NULL termination
+  size_t lenminus64 = len < 64 ? 0 : len - 64; 
  size_t idx = 0;
  for (; idx < lenminus64; idx += 64) {
    __builtin_prefetch(buf + idx + 128);
@ -256,10 +255,10 @@ WARN_UNUSED
  /// but otherwise the string needs to be properly padded or else we
  /// risk invalidating the UTF-8 checks.
  ////////////
-  if (idx < len + 1) { // +1 due to NULL termination
+  if (idx < len) { 
    u8 tmpbuf[64];
    memset(tmpbuf,0x20,64);
-    memcpy(tmpbuf,buf+idx,len - idx + 1);// +1 due to NULL termination
+    memcpy(tmpbuf,buf+idx,len - idx);
    m256 input_lo = _mm256_loadu_si256((const m256 *)(tmpbuf + 0));
    m256 input_hi = _mm256_loadu_si256((const m256 *)(tmpbuf + 32));
 #ifdef UTF8VALIDATE
@ -403,10 +402,6 @@ WARN_UNUSED
    structurals &= ~(quote_bits & ~quote_mask);
    *(u64 *)(pj.structurals + idx / 8) = structurals;
  }
-  if(buf[len] != '\0') {
-      std::cerr << "Your string should be NULL terminated." << std::endl;
-      return false;
-  }
 #ifdef UTF8VALIDATE
  return _mm256_testz_si256(has_error, has_error);
 #else
--- a/src/stage34_unified.cpp
+++ b/src/stage34_unified.cpp
@ -153,15 +153,31 @@ bool unified_machine(const u8 *buf, size_t len, ParsedJson &pj) {
  case '7':
  case '8':
  case '9': {
-    if (!parse_number(buf, pj, idx, false)) {
+    // we need to make a copy to make sure that the string is NULL terminated.
+    // this is done only for JSON documents made of a sole number
+    char * copy = (char *) malloc(len + 1);
+    memcpy(copy, buf, len);
+    copy[len] = '\0';
+    if(copy == NULL) goto fail;
+    if (!parse_number((const u8 *)copy, pj, idx, false)) {
+      free(copy);
      goto fail;
    }
+    free(copy);
    break;
  }
  case '-': {
-    if (!parse_number(buf, pj, idx, true)) {
+    // we need to make a copy to make sure that the string is NULL terminated.
+    // this is done only for JSON documents made of a sole number
+    char * copy = (char *) malloc(len + 1);
+    memcpy(copy, buf, len);
+    copy[len] = '\0';
+    if(copy == NULL) goto fail;
+    if (!parse_number((const u8 *)copy, pj, idx, true)) {
+      free(copy);
      goto fail;
    }
+    free(copy);
    break;
  }
 #endif // ALLOWANYTHINGINROOT