This updates the minifier. (#446)

2020-01-15 13:45:32 -05:00 · 2020-01-15 13:45:32 -05:00 · f611b65bc0
parent 2dc61fbdc4
commit f611b65bc0
4 changed files with 3522 additions and 4294 deletions
--- a/singleheader/amalgamation_demo.cpp
+++ b/singleheader/amalgamation_demo.cpp
@ -1,4 +1,4 @@
-/* auto-generated on Wed Dec 18 14:39:04 UTC 2019. Do not edit! */
+/* auto-generated on Wed Jan 15 13:09:01 EST 2020. Do not edit! */
 #include <iostream>
 #include "simdjson.h"
--- a/singleheader/simdjson.cpp
+++ b/singleheader/simdjson.cpp
--- a/singleheader/simdjson.h
+++ b/singleheader/simdjson.h
@ -1,4 +1,4 @@
-/* auto-generated on Wed Dec 18 14:39:04 UTC 2019. Do not edit! */
+/* auto-generated on Wed Jan 15 13:09:01 EST 2020. Do not edit! */
 /* begin file include/simdjson/simdjson_version.h */
 // /include/simdjson/simdjson_version.h automatically generated by release.py,
 // do not change by hand
@ -18,10 +18,10 @@ enum {
 #ifndef SIMDJSON_PORTABILITY_H
 #define SIMDJSON_PORTABILITY_H
 #include <cstdint>
 #include <cstddef>
-#include <cstdlib>
+#include <cstdint>
 #include <cstdio>
 #include <cstdlib>
 #ifdef _MSC_VER
 #include <iso646.h>
 #endif
@ -34,7 +34,10 @@ enum {
 #endif
 // this is almost standard?
-#define STRINGIFY(a) #a
+#undef STRINGIFY_IMPLEMENTATION_
 #undef STRINGIFY
 #define STRINGIFY_IMPLEMENTATION_(a) #a
 #define STRINGIFY(a) STRINGIFY_IMPLEMENTATION_(a)
 // we are going to use runtime dispatch
 #ifdef IS_X86_64
@ -54,7 +57,7 @@ enum {
 #define UNTARGET_REGION _Pragma("GCC pop_options")
 #endif // clang then gcc
-#endif  // x86
+#endif // x86
 // Default target region macros don't do anything.
 #ifndef TARGET_REGION
@ -67,9 +70,11 @@ enum {
 #define TARGET_WESTMERE TARGET_REGION("sse4.2,pclmul")
 #define TARGET_ARM64
 // Threading is disabled
 #undef SIMDJSON_THREADS_ENABLED
 // Is threading enabled?
 #if defined(BOOST_HAS_THREADS) || defined(_REENTRANT) || defined(_MT)
-#define SIMDJSON_THREADS_ENABLED 1
+#define SIMDJSON_THREADS_ENABLED
 #endif
 #if defined(__clang__)
@ -84,7 +89,6 @@ enum {
 #include <intrin.h> // visual studio
 #endif
 #ifdef _MSC_VER
 #define simdjson_strcasecmp _stricmp
 #else
@ -493,6 +497,22 @@ static inline void print_with_escapes(const char *src, std::ostream &os,
 #ifndef SIMDJSON_SIMDJSON_H
 #define SIMDJSON_SIMDJSON_H
 #ifndef __cplusplus
 #error simdjson requires a C++ compiler
 #endif
 #ifndef SIMDJSON_CPLUSPLUS
 #if defined(_MSVC_LANG) && !defined(__clang__)
 #define SIMDJSON_CPLUSPLUS (_MSC_VER == 1900 ? 201103L : _MSVC_LANG)
 #else
 #define SIMDJSON_CPLUSPLUS __cplusplus
 #endif
 #endif
 #if (SIMDJSON_CPLUSPLUS < 201703L)
 #error simdjson requires a compiler compliant with the C++17 standard
 #endif
 #include <string>
 namespace simdjson {
@ -529,8 +549,8 @@ enum ErrorValues {
  N_ATOM_ERROR,    // Problem while parsing an atom starting with the letter 'n'
  NUMBER_ERROR,    // Problem while parsing a number
  UTF8_ERROR,      // the input is not valid UTF-8
-  UNITIALIZED,     // unknown error, or uninitialized document
+  UNINITIALIZED,     // unknown error, or uninitialized document
-  EMPTY,           // no structural document found
+  EMPTY,           // no structural element found
  UNESCAPED_CHARS, // found unescaped characters in a string.
  UNCLOSED_STRING, // missing quote at the end
  UNEXPECTED_ERROR // indicative of a bug in simdjson
@ -623,39 +643,68 @@ const std::string &error_message(const int);
 #include <memory>
 #include <string>
 namespace simdjson {
 // low-level function to allocate memory with padding so we can read past the
 // "length" bytes safely. If you must provide a pointer to some data, create it
 // with this function: length is the max. size in bytes of the string. The
 // caller is responsible for freeing the memory (free(...)).
-char *allocate_padded_buffer(size_t length);
+inline char *allocate_padded_buffer(size_t length) noexcept {
  // we could do a simple malloc
  // return (char *) malloc(length + SIMDJSON_PADDING);
  // However, we might as well align to cache lines...
  size_t totalpaddedlength = length + SIMDJSON_PADDING;
  char *padded_buffer = aligned_malloc_char(64, totalpaddedlength);
 #ifndef NDEBUG
  if (padded_buffer == nullptr) {
    errno = EINVAL;
    perror("simdjson::allocate_padded_buffer() aligned_malloc_char() failed");
    return nullptr;
  }
 #endif // NDEBUG
  memset(padded_buffer + length, 0, totalpaddedlength - length);
  return padded_buffer;
 } // allocate_padded_buffer
 // Simple string with padded allocation.
 // We deliberately forbid copies, users should rely on swap or move
 // constructors.
-class padded_string {
+struct padded_string final {
-public:
+
  explicit padded_string() noexcept : viable_size(0), data_ptr(nullptr) {}
  explicit padded_string(size_t length) noexcept
      : viable_size(length), data_ptr(allocate_padded_buffer(length)) {
    if (data_ptr != nullptr)
      data_ptr[length] = '\0'; // easier when you need a c_str
  }
  explicit padded_string(char *data, size_t length) noexcept
      : viable_size(length), data_ptr(allocate_padded_buffer(length)) {
-    if (data_ptr != nullptr) {
+    if ((data != nullptr) and (data_ptr != nullptr)) {
      memcpy(data_ptr, data, length);
      data_ptr[length] = '\0'; // easier when you need a c_str
    }
  }
-  padded_string(std::string s) noexcept
+
-      : viable_size(s.size()), data_ptr(allocate_padded_buffer(s.size())) {
+  // note: do not pass std::string arguments by value
  padded_string(const std::string & str_ ) noexcept
      : viable_size(str_.size()), data_ptr(allocate_padded_buffer(str_.size())) {
    if (data_ptr != nullptr) {
-      memcpy(data_ptr, s.data(), s.size());
+      memcpy(data_ptr, str_.data(), str_.size());
-      data_ptr[s.size()] = '\0'; // easier when you need a c_str
+      data_ptr[str_.size()] = '\0'; // easier when you need a c_str
    }
  }
  // note: do pass std::string_view arguments by value
  padded_string(std::string_view sv_) noexcept
      : viable_size(sv_.size()), data_ptr(allocate_padded_buffer(sv_.size())) {
    if (data_ptr != nullptr) {
      memcpy(data_ptr, sv_.data(), sv_.size());
      data_ptr[sv_.size()] = '\0'; // easier when you need a c_str
    }
  }
  padded_string(padded_string &&o) noexcept
      : viable_size(o.viable_size), data_ptr(o.data_ptr) {
    o.data_ptr = nullptr; // we take ownership
@ -678,21 +727,25 @@ public:
    o.viable_size = tmp_viable_size;
  }
-  ~padded_string() { aligned_free_char(data_ptr); }
+  ~padded_string() {
      aligned_free_char(data_ptr);
  }
-  size_t size() const { return viable_size; }
+  size_t size() const  { return viable_size; }
-  size_t length() const { return viable_size; }
+  size_t length() const  { return viable_size; }
-  char *data() const { return data_ptr; }
+  char *data() const  { return data_ptr; }
 private:
  padded_string &operator=(const padded_string &o) = delete;
  padded_string(const padded_string &o) = delete;
  size_t viable_size;
-  char *data_ptr;
+  char *data_ptr{nullptr};
-};
+
 }; // padded_string
 } // namespace simdjson
 #endif
@ -766,6 +819,7 @@ static inline size_t json_minify(const padded_string &p, char *out) {
 #include <cstring>
 #include <iostream>
 #include <memory>
 #define JSON_VALUE_MASK 0xFFFFFFFFFFFFFF
@ -782,10 +836,14 @@ class ParsedJson {
 public:
  // create a ParsedJson container with zero capacity, call allocate_capacity to
  // allocate memory
-  ParsedJson();
+  ParsedJson()=default;
-  ~ParsedJson();
+  ~ParsedJson()=default;
-  ParsedJson(ParsedJson &&p);
+
-  ParsedJson &operator=(ParsedJson &&o);
+  // this is a move only class
  ParsedJson(ParsedJson &&p) = default;
  ParsedJson(const ParsedJson &p) = delete;
  ParsedJson &operator=(ParsedJson &&o) = default;
  ParsedJson &operator=(const ParsedJson &o) = delete;
  // if needed, allocate memory so that the object is able to process JSON
  // documents having up to len bytes and max_depth "depth"
@ -838,7 +896,8 @@ public:
  really_inline void write_tape_s64(int64_t i) {
    write_tape(0, 'l');
-    tape[current_loc++] = *(reinterpret_cast<uint64_t *>(&i));
+    std::memcpy(&tape[current_loc], &i, sizeof(i));
    ++current_loc;
  }
  really_inline void write_tape_u64(uint64_t i) {
@ -874,27 +933,22 @@ public:
  uint32_t current_loc{0};
  uint32_t n_structural_indexes{0};
-  uint32_t *structural_indexes;
+  std::unique_ptr<uint32_t[]> structural_indexes;
  std::unique_ptr<uint64_t[]> tape;
  std::unique_ptr<uint32_t[]> containing_scope_offset;
  uint64_t *tape;
  uint32_t *containing_scope_offset;
 #ifdef SIMDJSON_USE_COMPUTED_GOTO
-  void **ret_address;
+  std::unique_ptr<void*[]> ret_address;
 #else
-  char *ret_address;
+  std::unique_ptr<char[]> ret_address;
 #endif
-  uint8_t *string_buf; // should be at least byte_capacity
+  std::unique_ptr<uint8_t[]> string_buf;// should be at least byte_capacity
  uint8_t *current_string_buf_loc;
  bool valid{false};
-  int error_code{simdjson::UNITIALIZED};
+  int error_code{simdjson::UNINITIALIZED};
 private:
  // we don't want the default constructor to be called
  ParsedJson(const ParsedJson &p) =
      delete; // we don't want the default constructor to be called
  // we don't want the assignment to be called
  ParsedJson &operator=(const ParsedJson &o) = delete;
 };
 // dump bits low to high
@ -979,14 +1033,14 @@ public:
  // within the string: get_string_length determines the true string length.
  inline const char *get_string() const {
    return reinterpret_cast<const char *>(
-        pj->string_buf + (current_val & JSON_VALUE_MASK) + sizeof(uint32_t));
+        pj->string_buf.get() + (current_val & JSON_VALUE_MASK) + sizeof(uint32_t));
  }
  // return the length of the string in bytes
  inline uint32_t get_string_length() const {
    uint32_t answer;
    memcpy(&answer,
-           reinterpret_cast<const char *>(pj->string_buf +
+           reinterpret_cast<const char *>(pj->string_buf.get() +
                                          (current_val & JSON_VALUE_MASK)),
           sizeof(uint32_t));
    return answer;
@ -1665,22 +1719,32 @@ bool ParsedJson::BasicIterator<max_depth>::relative_move_to(const char *pointer,
 namespace simdjson {
 // Setting the streaming parameter to true allows the find_structural_bits to tolerate unclosed strings.
 // The caller should still ensure that the input is valid UTF-8. If you are processing substrings,
 // you may want to call on a function like trimmed_length_safe_utf8.
 // A function like find_last_json_buf_idx may also prove useful.
 template <Architecture T = Architecture::NATIVE>
 int find_structural_bits(const uint8_t *buf, size_t len, simdjson::ParsedJson &pj, bool streaming);
 // Setting the streaming parameter to true allows the find_structural_bits to tolerate unclosed strings.
 // The caller should still ensure that the input is valid UTF-8. If you are processing substrings,
 // you may want to call on a function like trimmed_length_safe_utf8.
 // A function like find_last_json_buf_idx may also prove useful.
 template <Architecture T = Architecture::NATIVE>
 int find_structural_bits(const char *buf, size_t len, simdjson::ParsedJson &pj, bool streaming) {
  return find_structural_bits<T>((const uint8_t *)buf, len, pj, streaming);
 }
 template <Architecture T = Architecture::NATIVE>
-int find_structural_bits(const uint8_t *buf, size_t len, simdjson::ParsedJson &pj){
+int find_structural_bits(const uint8_t *buf, size_t len, simdjson::ParsedJson &pj) {
-    return find_structural_bits<T>((const uint8_t *)buf, len, pj, false);
+     return find_structural_bits<T>(buf, len, pj, false);
 }
 template <Architecture T = Architecture::NATIVE>
 int find_structural_bits(const char *buf, size_t len, simdjson::ParsedJson &pj) {
-    return find_structural_bits<T>((const uint8_t *)buf, len, pj, false);
+  return find_structural_bits<T>((const uint8_t *)buf, len, pj);
 }
 }; // namespace simdjson
@ -1701,7 +1765,8 @@ WARN_UNUSED int
 unified_machine(const uint8_t *buf, size_t len, ParsedJson &pj);
 template <Architecture T = Architecture::NATIVE>
-int unified_machine(const char *buf, size_t len, ParsedJson &pj) {
+WARN_UNUSED int
 unified_machine(const char *buf, size_t len, ParsedJson &pj) {
  return unified_machine<T>(reinterpret_cast<const uint8_t *>(buf), len, pj);
 }
@ -2003,6 +2068,8 @@ namespace simdjson {
         * */
        JsonStream(const std::string &s, size_t batch_size = 1000000) : JsonStream(s.data(), s.size(), batch_size) {};
        ~JsonStream();
        /* Parse the next document found in the buffer previously given to JsonStream.
         * The content should be a valid JSON document encoded as UTF-8. If there is a
@ -2034,12 +2101,14 @@ namespace simdjson {
        /* Sets a new buffer for this JsonStream.  Will also reinitialize all the variables,
         * which acts as a reset.  A new JsonStream without initializing again.
         * */
-        void set_new_buffer(const char *buf, size_t len);
+        // todo: implement and test this function, note that _batch_size is mutable
        // void set_new_buffer(const char *buf, size_t len);
        /* Sets a new buffer for this JsonStream.  Will also reinitialize all the variables,
         * which is basically a reset.  A new JsonStream without initializing again.
         * */
-        void set_new_buffer(const std::string &s) { set_new_buffer(s.data(), s.size()); }
+        // todo: implement and test this function, note that _batch_size is mutable
        // void set_new_buffer(const std::string &s) { set_new_buffer(s.data(), s.size()); }
        /* Returns the location (index) of where the next document should be in the buffer.
         * Can be used for debugging, it tells the user the position of the end of the last
@ -2059,43 +2128,89 @@ namespace simdjson {
        size_t _len;
        size_t _batch_size;
        size_t next_json{0};
        bool error_on_last_attempt{false};
        bool load_next_batch{true};
        size_t current_buffer_loc{0};
        size_t last_json_buffer_loc{0};
        size_t n_parsed_docs{0};
        size_t n_bytes_parsed{0};
-
+#ifdef SIMDJSON_THREADS_ENABLED
        int stage1_is_ok_thread{0};
        std::thread stage_1_thread;
        simdjson::ParsedJson pj_thread;
 #ifdef SIMDJSON_THREADS_ENABLED
        /* This algorithm is used to quickly identify the buffer position of
         * the last JSON document inside the current batch.
         *
         * It does its work by finding the last pair of structural characters
         * that represent the end followed by the start of a document.
         *
         * Simply put, we iterate over the structural characters, starting from
         * the end. We consider that we found the end of a JSON document when the
         * first element of the pair is NOT one of these characters: '{' '[' ':' ','
         * and when the second element is NOT one of these characters: '}' ']' ':' ','.
         *
         * This simple comparison works most of the time, but it does not cover cases
         * where the batch's structural indexes contain a perfect amount of documents.
         * In such a case, we do not have access to the structural index which follows
         * the last document, therefore, we do not have access to the second element in
         * the pair, and means that we cannot identify the last document. To fix this
         * issue, we keep a count of the open and closed curly/square braces we found
         * while searching for the pair. When we find a pair AND the count of open and
         * closed curly/square braces is the same, we know that we just passed a complete
         * document, therefore the last json buffer location is the end of the batch
         * */
        size_t find_last_json_buf_loc(const ParsedJson &pj);
 #endif
    };
 /* This algorithm is used to quickly identify the buffer position of
 * the last JSON document inside the current batch.
 *
 * It does its work by finding the last pair of structural characters
 * that represent the end followed by the start of a document.
 *
 * Simply put, we iterate over the structural characters, starting from
 * the end. We consider that we found the end of a JSON document when the
 * first element of the pair is NOT one of these characters: '{' '[' ':' ','
 * and when the second element is NOT one of these characters: '}' ']' ':' ','.
 *
 * This simple comparison works most of the time, but it does not cover cases
 * where the batch's structural indexes contain a perfect amount of documents.
 * In such a case, we do not have access to the structural index which follows
 * the last document, therefore, we do not have access to the second element in
 * the pair, and means that we cannot identify the last document. To fix this
 * issue, we keep a count of the open and closed curly/square braces we found
 * while searching for the pair. When we find a pair AND the count of open and
 * closed curly/square braces is the same, we know that we just passed a complete
 * document, therefore the last json buffer location is the end of the batch
 * */
 // Returns a position within pj.structural_indexes (not a byte offset into
 // buf), or 0 when there are no structural indexes or no boundary is found.
 //   buf  - bytes that the entries of pj.structural_indexes index into.
 //   size - compared against the last structural index to detect and skip a
 //          trailing entry (NOTE(review): presumably the length of buf - confirm).
 //   pj   - supplies n_structural_indexes and structural_indexes.
 inline size_t find_last_json_buf_idx(const char * buf, size_t size, const ParsedJson &pj) {
    // this function can be generally useful
    if(pj.n_structural_indexes == 0) return 0;
    auto last_i = pj.n_structural_indexes - 1;
    // If the final structural index equals size, step back one entry so the
    // scan starts on the entry before it.
    if (pj.structural_indexes[last_i] == size) {
        if(last_i == 0) return 0;
        last_i = pj.n_structural_indexes - 2;
    }
    // Running balance of brackets/braces seen while scanning backwards:
    // a closer decrements, an opener increments.
    auto arr_cnt = 0;
    auto obj_cnt = 0;
    // Walk pairs (i - 1, i) from the end; stopping at i > 0 keeps i - 1 valid.
    for (auto i = last_i; i > 0; i--) {
        // Second element of the pair.
        auto idxb = pj.structural_indexes[i];
        switch (buf[idxb]) {
            case ':':
            case ',':
                continue;
            case '}':
                obj_cnt--;
                continue;
            case ']':
                arr_cnt--;
                continue;
            case '{':
                obj_cnt++;
                break;
            case '[':
                arr_cnt++;
                break;
        }
        // First element of the pair: an opener or separator here means the
        // pair is not a boundary, so keep scanning.
        auto idxa = pj.structural_indexes[i - 1];
        switch (buf[idxa]) {
            case '{':
            case '[':
            case ':':
            case ',':
                continue;
        }
        // Candidate boundary found. With both counters at zero the result is
        // one past the starting entry; otherwise it is the current position i.
        if (!arr_cnt && !obj_cnt) {
            return last_i+1;
        }
        return i;
    }
    return 0;
 }
 }
 #endif //SIMDJSON_JSONSTREAM_H
 /* end file include/simdjson/jsonstream.h */
--- a/src/jsonminifier.cpp
+++ b/src/jsonminifier.cpp
@ -59,8 +59,14 @@ size_t json_minify(const unsigned char *bytes, size_t how_many,
 }
 } // namespace simdjson
 #else
 //
 // This fast code is disabled in the context of runtime dispatching.
 // See issue https://github.com/lemire/simdjson/issues/384
 //
 #include "simdprune_tables.h"
 #include <cstring>
 #include <x86intrin.h> // currently, there is no runtime dispatch for the minifier
 namespace simdjson {
@ -363,18 +369,18 @@ size_t oldjson_minify(const uint8_t *buf, size_t len, uint8_t *out) {
      int pop2 = hamming((~whitespace) & UINT64_C(0xFFFFFFFF));
      int pop3 = hamming((~whitespace) & UINT64_C(0xFFFFFFFFFFFF));
      int pop4 = hamming((~whitespace));
-      __m256i vmask1 = _mm256_loadu2_m128i(
+      __m128i x1 = _mm256_extracti128_si256(input_lo, 0);
-          reinterpret_cast<const __m128i *>(mask128_epi8) + (mask2 & 0x7FFF),
+      __m128i x2 = _mm256_extracti128_si256(input_lo, 1);
-          reinterpret_cast<const __m128i *>(mask128_epi8) + (mask1 & 0x7FFF));
+      __m128i x3 = _mm256_extracti128_si256(input_hi, 0);
-      __m256i vmask2 = _mm256_loadu2_m128i(
+      __m128i x4 = _mm256_extracti128_si256(input_hi, 1);
-          reinterpret_cast<const __m128i *>(mask128_epi8) + (mask4 & 0x7FFF),
+      x1 = skinnycleanm128(x1, mask1);
-          reinterpret_cast<const __m128i *>(mask128_epi8) + (mask3 & 0x7FFF));
+      x2 = skinnycleanm128(x2, mask2);
-      __m256i result1 = _mm256_shuffle_epi8(input_lo, vmask1);
+      x3 = skinnycleanm128(x3, mask3);
-      __m256i result2 = _mm256_shuffle_epi8(input_hi, vmask2);
+      x4 = skinnycleanm128(x4, mask4);
-      _mm256_storeu2_m128i(reinterpret_cast<__m128i *>(out + pop1),
+      _mm_storeu_si128(reinterpret_cast<__m128i *>(out), x1);
-                           reinterpret_cast<__m128i *>(out), result1);
+      _mm_storeu_si128(reinterpret_cast<__m128i *>(out + pop1), x2);
-      _mm256_storeu2_m128i(reinterpret_cast<__m128i *>(out + pop3),
+      _mm_storeu_si128(reinterpret_cast<__m128i *>(out + pop2), x3);
-                           reinterpret_cast<__m128i *>(out + pop2), result2);
+      _mm_storeu_si128(reinterpret_cast<__m128i *>(out + pop3), x4);
      out += pop4;
    }
  }
@ -447,23 +453,24 @@ size_t oldjson_minify(const uint8_t *buf, size_t len, uint8_t *out) {
    int pop2 = hamming((~whitespace) & UINT64_C(0xFFFFFFFF));
    int pop3 = hamming((~whitespace) & UINT64_C(0xFFFFFFFFFFFF));
    int pop4 = hamming((~whitespace));
-    __m256i vmask1 = _mm256_loadu2_m128i(
+    __m128i x1 = _mm256_extracti128_si256(input_lo, 0);
-        reinterpret_cast<const __m128i *>(mask128_epi8) + (mask2 & 0x7FFF),
+    __m128i x2 = _mm256_extracti128_si256(input_lo, 1);
-        reinterpret_cast<const __m128i *>(mask128_epi8) + (mask1 & 0x7FFF));
+    __m128i x3 = _mm256_extracti128_si256(input_hi, 0);
-    __m256i vmask2 = _mm256_loadu2_m128i(
+    __m128i x4 = _mm256_extracti128_si256(input_hi, 1);
-        reinterpret_cast<const __m128i *>(mask128_epi8) + (mask4 & 0x7FFF),
+    x1 = skinnycleanm128(x1, mask1);
-        reinterpret_cast<const __m128i *>(mask128_epi8) + (mask3 & 0x7FFF));
+    x2 = skinnycleanm128(x2, mask2);
-    __m256i result1 = _mm256_shuffle_epi8(input_lo, vmask1);
+    x3 = skinnycleanm128(x3, mask3);
-    __m256i result2 = _mm256_shuffle_epi8(input_hi, vmask2);
+    x4 = skinnycleanm128(x4, mask4);
-    _mm256_storeu2_m128i(reinterpret_cast<__m128i *>(buffer + pop1),
+    _mm_storeu_si128(reinterpret_cast<__m128i *>(buffer), x1);
-                         reinterpret_cast<__m128i *>(buffer), result1);
+    _mm_storeu_si128(reinterpret_cast<__m128i *>(buffer + pop1), x2);
-    _mm256_storeu2_m128i(reinterpret_cast<__m128i *>(buffer + pop3),
+    _mm_storeu_si128(reinterpret_cast<__m128i *>(buffer + pop2), x3);
-                         reinterpret_cast<__m128i *>(buffer + pop2), result2);
+    _mm_storeu_si128(reinterpret_cast<__m128i *>(buffer + pop3), x4);
    memcpy(out, buffer, pop4);
    out += pop4;
  }
  *out = '\0'; // NULL termination
  return out - initout;
 }
 } // namespace simdjson
 #endif