Ok. Let us benchmark this thing.

2018-11-27 15:05:50 -05:00 · 2018-11-27 15:05:50 -05:00 · 58ac242770
parent a43b0772e1
commit 58ac242770
8 changed files with 27 additions and 286 deletions
--- a/benchmark/parse.cpp
+++ b/benchmark/parse.cpp
@@ -149,7 +149,7 @@ int main(int argc, char *argv[]) {
  }

 #ifndef SQUASH_COUNTERS
-  printf("number of bytes %ld number of structural chars %d ratio %.3f\n",
+  printf("number of bytes %ld number of structural chars %u ratio %.3f\n",
         p.second, pj.n_structural_indexes,
         (double)pj.n_structural_indexes / p.second);
  unsigned long total = cy1 + cy2 + cy3;
--- a/include/jsonparser/common_defs.h
+++ b/include/jsonparser/common_defs.h
@@ -40,7 +40,7 @@ typedef __m256i m256;
 #define unlikely(x) __builtin_expect(!!(x), 0)
 #endif

-static inline u32 ctz64(u64 x) {
+/*static inline u32 ctz64(u64 x) {
  assert(x); // behaviour not defined for x == 0
 #if defined(_WIN64)
  unsigned long r;
@@ -56,4 +56,4 @@ static inline u32 ctz64(u64 x) {
 #else
  return (u32)__builtin_ctzll(x);
 #endif
-}
+}*/
--- a/include/jsonparser/jsoncharutils.h
+++ b/include/jsonparser/jsoncharutils.h
@@ -50,7 +50,7 @@ const char digittoval[256] = {
    -1, -1, -1, -1, -1, -1, -1, -1, -1};

 // return true if we have a valid hex between 0000 and FFFF
-inline bool hex_to_u32(const u8 *src, u32 *res) {
+/*inline bool hex_to_u32(const u8 *src, u32 *res) {
  u8 v1 = src[0];
  u8 v2 = src[1];
  u8 v3 = src[2];
@@ -58,7 +58,7 @@ inline bool hex_to_u32(const u8 *src, u32 *res) {
  *res = digittoval[v1] << 12 | digittoval[v2] << 8 | digittoval[v3] << 4 |
         digittoval[v4];
  return (int32_t)(*res) >= 0;
-}
+}*/

 // returns a value with the highest bit set if it is not valid
 uint32_t hex_to_u32_nocheck(const u8 *src) {
--- a/include/jsonparser/numberparsing.h
+++ b/include/jsonparser/numberparsing.h
@@ -147,6 +147,8 @@ static inline uint32_t parse_eight_digits_unrolled(const char *chars) {
 //
 // This function will almost never be called!!!
 //
+// Note: a redesign could avoid this function entirely.
+//
 static never_inline bool
 parse_highprecision_float(const u8 *const buf, UNUSED size_t len,
                          ParsedJson &pj, UNUSED const u32 depth, const u32 offset,
--- a/include/jsonparser/simdjson_internal.h
+++ b/include/jsonparser/simdjson_internal.h
@@ -97,30 +97,32 @@ public:
    // 

    // this should be considered a private function
-    void write_tape(u64 val, u8 c) {
+    inline void write_tape(u64 val, u8 c) {
        tape[current_loc++] =  val | (((u64)c) << 56);
        //tape[tape_locs[depth]] = val | (((u64)c) << 56);
        //tape_locs[depth]++;
    }


-    void write_tape_s64(s64 i) {
-        *((s64 *)current_number_buf_loc) = i;// safe because array will be 8-byte aligned, could use memcpy
+    inline void write_tape_s64(s64 i) {
+        memcpy(current_number_buf_loc, &i, sizeof(s64));
+        //*((s64 *)current_number_buf_loc) = i;// safe because array will be 8-byte aligned, could use memcpy
        current_number_buf_loc += sizeof(s64);
        write_tape(current_number_buf_loc - number_buf, 'l');
    }

-    void write_tape_double(double d) {
-        *((double *)current_number_buf_loc) = d;// safe because array will be 8-byte aligned, could use memcpy
+    inline void write_tape_double(double d) {
+        memcpy(current_number_buf_loc, &d, sizeof(double));
+        //*((double *)current_number_buf_loc) = d;// safe because array will be 8-byte aligned, could use memcpy
        current_number_buf_loc += sizeof(double);
        write_tape(current_number_buf_loc - number_buf, 'd');
    }

-    u32 get_current_loc() {
+    inline u32 get_current_loc() {
        return current_loc;
    }

-    void annotate_previousloc(u32 saved_loc,u64 val) {
+    inline void annotate_previousloc(u32 saved_loc,u64 val) {
        tape[saved_loc] |= val;
    }

@@ -167,7 +169,7 @@ public:


 #ifdef DEBUG
-inline void dump256(m256 d, const std::string msg) {
+inline void dump256(m256 d, const std::string& msg) {
  for (u32 i = 0; i < 32; i++) {
    std::cout << std::setw(3) << (int)*(((u8 *)(&d)) + i);
    if (!((i + 1) % 8))
@@ -181,14 +183,14 @@ inline void dump256(m256 d, const std::string msg) {
 }

 // dump bits low to high
-inline void dumpbits(u64 v, const std::string msg) {
+inline void dumpbits(u64 v, const std::string& msg) {
  for (u32 i = 0; i < 64; i++) {
    std::cout << (((v >> (u64)i) & 0x1ULL) ? "1" : "_");
  }
  std::cout << " " << msg << "\n";
 }

-inline void dumpbits32(u32 v, const std::string msg) {
+inline void dumpbits32(u32 v, const std::string& msg) {
  for (u32 i = 0; i < 32; i++) {
    std::cout << (((v >> (u32)i) & 0x1ULL) ? "1" : "_");
  }
@@ -201,14 +203,14 @@ inline void dumpbits32(u32 v, const std::string msg) {
 #endif

 // dump bits low to high
-inline void dumpbits_always(u64 v, const std::string msg) {
+inline void dumpbits_always(u64 v, const std::string& msg) {
  for (u32 i = 0; i < 64; i++) {
    std::cout << (((v >> (u64)i) & 0x1ULL) ? "1" : "_");
  }
  std::cout << " " << msg << "\n";
 }

-inline void dumpbits32_always(u32 v, const std::string msg) {
+inline void dumpbits32_always(u32 v, const std::string& msg) {
  for (u32 i = 0; i < 32; i++) {
    std::cout << (((v >> (u32)i) & 0x1ULL) ? "1" : "_");
  }
--- a/include/jsonparser/simdutf8check.h
+++ b/include/jsonparser/simdutf8check.h
@@ -24,168 +24,7 @@
 */

 // all byte values must be no larger than 0xF4
-static inline void checkSmallerThan0xF4(__m128i current_bytes,
-                                        __m128i *has_error) {
-  // unsigned, saturates to 0 below max
-  *has_error = _mm_or_si128(*has_error,
-                            _mm_subs_epu8(current_bytes, _mm_set1_epi8(0xF4)));
-}

-static inline __m128i continuationLengths(__m128i high_nibbles) {
-  return _mm_shuffle_epi8(
-      _mm_setr_epi8(1, 1, 1, 1, 1, 1, 1, 1, // 0xxx (ASCII)
-                    0, 0, 0, 0,             // 10xx (continuation)
-                    2, 2,                   // 110x
-                    3,                      // 1110
-                    4), // 1111, next should be 0 (not checked here)
-      high_nibbles);
-}
-
-static inline __m128i carryContinuations(__m128i initial_lengths,
-                                         __m128i previous_carries) {
-
-  __m128i right1 =
-      _mm_subs_epu8(_mm_alignr_epi8(initial_lengths, previous_carries, 16 - 1),
-                    _mm_set1_epi8(1));
-  __m128i sum = _mm_add_epi8(initial_lengths, right1);
-
-  __m128i right2 = _mm_subs_epu8(_mm_alignr_epi8(sum, previous_carries, 16 - 2),
-                                 _mm_set1_epi8(2));
-  return _mm_add_epi8(sum, right2);
-}
-
-static inline void checkContinuations(__m128i initial_lengths, __m128i carries,
-                                      __m128i *has_error) {
-
-  // overlap || underlap
-  // carry > length && length > 0 || !(carry > length) && !(length > 0)
-  // (carries > length) == (lengths > 0)
-  __m128i overunder =
-      _mm_cmpeq_epi8(_mm_cmpgt_epi8(carries, initial_lengths),
-                     _mm_cmpgt_epi8(initial_lengths, _mm_setzero_si128()));
-
-  *has_error = _mm_or_si128(*has_error, overunder);
-}
-
-// when 0xED is found, next byte must be no larger than 0x9F
-// when 0xF4 is found, next byte must be no larger than 0x8F
-// next byte must be continuation, ie sign bit is set, so signed < is ok
-static inline void checkFirstContinuationMax(__m128i current_bytes,
-                                             __m128i off1_current_bytes,
-                                             __m128i *has_error) {
-  __m128i maskED = _mm_cmpeq_epi8(off1_current_bytes, _mm_set1_epi8(0xED));
-  __m128i maskF4 = _mm_cmpeq_epi8(off1_current_bytes, _mm_set1_epi8(0xF4));
-
-  __m128i badfollowED =
-      _mm_and_si128(_mm_cmpgt_epi8(current_bytes, _mm_set1_epi8(0x9F)), maskED);
-  __m128i badfollowF4 =
-      _mm_and_si128(_mm_cmpgt_epi8(current_bytes, _mm_set1_epi8(0x8F)), maskF4);
-
-  *has_error = _mm_or_si128(*has_error, _mm_or_si128(badfollowED, badfollowF4));
-}
-
-// map off1_hibits => error condition
-// hibits     off1    cur
-// C       => < C2 && true
-// E       => < E1 && < A0
-// F       => < F1 && < 90
-// else      false && false
-static inline void checkOverlong(__m128i current_bytes,
-                                 __m128i off1_current_bytes, __m128i hibits,
-                                 __m128i previous_hibits, __m128i *has_error) {
-  __m128i off1_hibits = _mm_alignr_epi8(hibits, previous_hibits, 16 - 1);
-  __m128i initial_mins = _mm_shuffle_epi8(
-      _mm_setr_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
-                    -128, -128, // 10xx => false
-                    0xC2, -128, // 110x
-                    0xE1,       // 1110
-                    0xF1),
-      off1_hibits);
-
-  __m128i initial_under = _mm_cmpgt_epi8(initial_mins, off1_current_bytes);
-
-  __m128i second_mins = _mm_shuffle_epi8(
-      _mm_setr_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
-                    -128, -128, // 10xx => false
-                    127, 127,   // 110x => true
-                    0xA0,       // 1110
-                    0x90),
-      off1_hibits);
-  __m128i second_under = _mm_cmpgt_epi8(second_mins, current_bytes);
-  *has_error =
-      _mm_or_si128(*has_error, _mm_and_si128(initial_under, second_under));
-}
-
-struct processed_utf_bytes {
-  __m128i rawbytes;
-  __m128i high_nibbles;
-  __m128i carried_continuations;
-};
-
-static inline void count_nibbles(__m128i bytes,
-                                 struct processed_utf_bytes *answer) {
-  answer->rawbytes = bytes;
-  answer->high_nibbles =
-      _mm_and_si128(_mm_srli_epi16(bytes, 4), _mm_set1_epi8(0x0F));
-}
-
-// check whether the current bytes are valid UTF-8
-// at the end of the function, previous gets updated
-static struct processed_utf_bytes
-checkUTF8Bytes(__m128i current_bytes, struct processed_utf_bytes *previous,
-               __m128i *has_error) {
-  struct processed_utf_bytes pb;
-  count_nibbles(current_bytes, &pb);
-
-  checkSmallerThan0xF4(current_bytes, has_error);
-
-  __m128i initial_lengths = continuationLengths(pb.high_nibbles);
-
-  pb.carried_continuations =
-      carryContinuations(initial_lengths, previous->carried_continuations);
-
-  checkContinuations(initial_lengths, pb.carried_continuations, has_error);
-
-  __m128i off1_current_bytes =
-      _mm_alignr_epi8(pb.rawbytes, previous->rawbytes, 16 - 1);
-  checkFirstContinuationMax(current_bytes, off1_current_bytes, has_error);
-
-  checkOverlong(current_bytes, off1_current_bytes, pb.high_nibbles,
-                previous->high_nibbles, has_error);
-  return pb;
-}
-
-static inline bool validate_utf8_fast(const char *src, size_t len) {
-  size_t i = 0;
-  __m128i has_error = _mm_setzero_si128();
-  struct processed_utf_bytes previous = {.rawbytes = _mm_setzero_si128(),
-                                         .high_nibbles = _mm_setzero_si128(),
-                                         .carried_continuations =
-                                             _mm_setzero_si128()};
-  if (len >= 16) {
-    for (; i <= len - 16; i += 16) {
-      __m128i current_bytes = _mm_loadu_si128((const __m128i *)(src + i));
-      previous = checkUTF8Bytes(current_bytes, &previous, &has_error);
-    }
-  }
-
-  // last part
-  if (i < len) {
-    char buffer[16];
-    memset(buffer, 0, 16);
-    memcpy(buffer, src + i, len - i);
-    __m128i current_bytes = _mm_loadu_si128((const __m128i *)(buffer));
-    previous = checkUTF8Bytes(current_bytes, &previous, &has_error);
-  } else {
-    has_error =
-        _mm_or_si128(_mm_cmpgt_epi8(previous.carried_continuations,
-                                    _mm_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-                                                  9, 9, 9, 9, 9, 1)),
-                     has_error);
-  }
-
-  return _mm_testz_si128(has_error, has_error);
-}

 #ifdef __AVX2__

@@ -349,109 +188,7 @@ avxcheckUTF8Bytes(__m256i current_bytes,
  return pb;
 }

-static inline bool validate_utf8_fast_avx(const char *src, size_t len) {
-  size_t i = 0;
-  __m256i has_error = _mm256_setzero_si256();
-  struct avx_processed_utf_bytes previous = {
-      .rawbytes = _mm256_setzero_si256(),
-      .high_nibbles = _mm256_setzero_si256(),
-      .carried_continuations = _mm256_setzero_si256()};
-  if (len >= 32) {
-    for (; i <= len - 32; i += 32) {
-      __m256i current_bytes = _mm256_loadu_si256((const __m256i *)(src + i));
-      previous = avxcheckUTF8Bytes(current_bytes, &previous, &has_error);
-    }
-  }
-
-  // last part
-  if (i < len) {
-    char buffer[32];
-    memset(buffer, 0, 32);
-    memcpy(buffer, src + i, len - i);
-    __m256i current_bytes = _mm256_loadu_si256((const __m256i *)(buffer));
-    previous = avxcheckUTF8Bytes(current_bytes, &previous, &has_error);
-  } else {
-    has_error = _mm256_or_si256(
-        _mm256_cmpgt_epi8(previous.carried_continuations,
-                          _mm256_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-                                           9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-                                           9, 9, 9, 9, 9, 9, 9, 1)),
-        has_error);
-  }
-
-  return _mm256_testz_si256(has_error, has_error);
-}
-
-
-// check whether the current bytes are valid UTF-8
-// at the end of the function, previous gets updated
-static struct avx_processed_utf_bytes
-avxcheckUTF8Bytes_asciipath(__m256i current_bytes,
-                  struct avx_processed_utf_bytes *previous,
-                  __m256i *has_error) {
-  if(_mm256_testz_si256(current_bytes,_mm256_set1_epi8(0x80))) { // fast ascii path
-    *has_error = _mm256_or_si256(
-        _mm256_cmpgt_epi8(previous->carried_continuations,
-                          _mm256_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-                                           9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-                                           9, 9, 9, 9, 9, 9, 9, 1)),*has_error);
-    return *previous;
-  }
-  struct avx_processed_utf_bytes pb;
-  avx_count_nibbles(current_bytes, &pb);
-
-  avxcheckSmallerThan0xF4(current_bytes, has_error);
-
-  __m256i initial_lengths = avxcontinuationLengths(pb.high_nibbles);
-
-  pb.carried_continuations =
-      avxcarryContinuations(initial_lengths, previous->carried_continuations);
-
-  avxcheckContinuations(initial_lengths, pb.carried_continuations, has_error);
-
-  __m256i off1_current_bytes =
-      push_last_byte_of_a_to_b(previous->rawbytes, pb.rawbytes);
-  avxcheckFirstContinuationMax(current_bytes, off1_current_bytes, has_error);
-
-  avxcheckOverlong(current_bytes, off1_current_bytes, pb.high_nibbles,
-                   previous->high_nibbles, has_error);
-  return pb;
-}
-
-static inline bool validate_utf8_fast_avx_asciipath(const char *src, size_t len) {
-  size_t i = 0;
-  __m256i has_error = _mm256_setzero_si256();
-  struct avx_processed_utf_bytes previous = {
-      .rawbytes = _mm256_setzero_si256(),
-      .high_nibbles = _mm256_setzero_si256(),
-      .carried_continuations = _mm256_setzero_si256()};
-  if (len >= 32) {
-    for (; i <= len - 32; i += 32) {
-      __m256i current_bytes = _mm256_loadu_si256((const __m256i *)(src + i));
-      previous = avxcheckUTF8Bytes_asciipath(current_bytes, &previous, &has_error);
-    }
-  }
-
-  // last part
-  if (i < len) {
-    char buffer[32];
-    memset(buffer, 0, 32);
-    memcpy(buffer, src + i, len - i);
-    __m256i current_bytes = _mm256_loadu_si256((const __m256i *)(buffer));
-    previous = avxcheckUTF8Bytes(current_bytes, &previous, &has_error);
-  } else {
-    has_error = _mm256_or_si256(
-        _mm256_cmpgt_epi8(previous.carried_continuations,
-                          _mm256_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-                                           9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-                                           9, 9, 9, 9, 9, 9, 9, 1)),
-        has_error);
-  }
-
-  return _mm256_testz_si256(has_error, has_error);
-}
-
-
-
+#else // __AVX2__
+#warning "We require AVX2 support!"
 #endif // __AVX2__
 #endif
--- a/src/jsonminifier.cpp
+++ b/src/jsonminifier.cpp
@@ -205,10 +205,10 @@ size_t jsonminify(const uint8_t *buf, size_t len, uint8_t *out) {
    uint64_t odd_starts = start_edges & ~even_start_mask;
    uint64_t even_carries = bs_bits + even_starts;
    uint64_t odd_carries;
-    bool iter_ends_odd_backslash = __builtin_uaddll_overflow(
-        bs_bits, odd_starts, (unsigned long long *)&odd_carries);
+    //bool iter_ends_odd_backslash = 
+    __builtin_uaddll_overflow( bs_bits, odd_starts, (unsigned long long *)&odd_carries);
    odd_carries |= prev_iter_ends_odd_backslash;
-    prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL;
+    //prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL; // we never use it
    uint64_t even_carry_ends = even_carries & ~bs_bits;
    uint64_t odd_carry_ends = odd_carries & ~bs_bits;
    uint64_t even_start_odd_end = even_carry_ends & odd_bits;
--- a/src/stage2_flatten.cpp
+++ b/src/stage2_flatten.cpp
@@ -54,7 +54,7 @@ bool flatten_indexes(size_t len, ParsedJson &pj) {
  u32 *base_ptr = pj.structural_indexes;
  u32 base = 0;
 #ifdef BUILDHISTOGRAM
-  uint32_t counters[65];
+  uint32_t counters[66];
  uint32_t total = 0;
  for (int k = 0; k < 66; k++)
    counters[k] = 0;