This removes the crazy alignment requirements. (#1073)

* This removes the crazy alignment requirements.
Daniel Lemire 2020-07-27 16:19:01 -04:00 committed by GitHub
parent dcb5d47ee6
commit f80668e87f
7 changed files with 66 additions and 111 deletions

View File

@@ -225,8 +225,19 @@ struct progress_bar {
   }
 };
+/**
+ * The speed at which we can allocate memory is strictly system-specific.
+ * It depends on the OS and the runtime library, and it is subject to various
+ * system-specific knobs. It is not something that we can reasonably
+ * benchmark with crude timings.
+ * If someone wants to optimize how simdjson allocates memory, it will
+ * almost surely require a distinct benchmarking tool. What is meant by
+ * "memory allocation" also requires a definition: doing "new char[size]" can
+ * do many different things depending on the system.
+ */
 enum class BenchmarkStage {
-  ALL,
+  ALL, // This excludes allocation
   ALLOCATE,
   STAGE1,
   STAGE2
@@ -234,7 +245,7 @@ enum class BenchmarkStage {
 const char* benchmark_stage_name(BenchmarkStage stage) {
   switch (stage) {
-    case BenchmarkStage::ALL: return "All";
+    case BenchmarkStage::ALL: return "All (Without Allocation)";
     case BenchmarkStage::ALLOCATE: return "Allocate";
     case BenchmarkStage::STAGE1: return "Stage 1";
     case BenchmarkStage::STAGE2: return "Stage 2";
@@ -253,8 +264,8 @@ struct benchmarker {
   // Statistics about the JSON file independent of its speed (amount of utf-8, structurals, etc.).
   // Loaded on first parse.
   json_stats* stats;
-  // Speed and event summary for full parse (including allocation, stage 1 and stage 2)
-  event_aggregate all_stages{};
+  // Speed and event summary for full parse (stage 1 and stage 2, but *excluding* allocation)
+  event_aggregate all_stages_without_allocation{};
   // Speed and event summary for stage 1
   event_aggregate stage1{};
   // Speed and event summary for stage 2
@@ -285,23 +296,24 @@ struct benchmarker {
   const event_aggregate& operator[](BenchmarkStage stage) const {
     switch (stage) {
-      case BenchmarkStage::ALL: return this->all_stages;
+      case BenchmarkStage::ALL: return this->all_stages_without_allocation;
       case BenchmarkStage::STAGE1: return this->stage1;
       case BenchmarkStage::STAGE2: return this->stage2;
       case BenchmarkStage::ALLOCATE: return this->allocate_stage;
-      default: exit_error("Unknown stage"); return this->all_stages;
+      default: exit_error("Unknown stage"); return this->all_stages_without_allocation;
     }
   }
   int iterations() const {
-    return all_stages.iterations;
+    return all_stages_without_allocation.iterations;
   }
   really_inline void run_iteration(bool stage1_only, bool hotbuffers=false) {
     // Allocate dom::parser
     collector.start();
     dom::parser parser;
-    error_code error = parser.allocate(json.size());
+    // We always allocate at least 64KB. Smaller allocations may actually be slower on some systems.
+    error_code error = parser.allocate(json.size() < 65536 ? 65536 : json.size());
     if (error) {
       exit_error(string("Unable to allocate_stage ") + to_string(json.size()) + " bytes for the JSON result: " + error_message(error));
     }
@@ -329,7 +341,7 @@ struct benchmarker {
     // Stage 2 (unified machine) and the rest
     if (stage1_only) {
-      all_stages << stage1_count;
+      all_stages_without_allocation << stage1_count;
     } else {
       event_count stage2_count;
       collector.start();
@@ -339,7 +351,7 @@ struct benchmarker {
       }
       stage2_count = collector.end();
       stage2 << stage2_count;
-      all_stages << allocate_count + stage1_count + stage2_count;
+      all_stages_without_allocation << stage1_count + stage2_count;
     }
     // Calculate stats the first time we parse
     if (stats == NULL) {
@@ -386,7 +398,7 @@ struct benchmarker {
       prefix,
       "Speed",
       stage.elapsed_ns() / static_cast<double>(stats->blocks), // per block
-      percent(stage.elapsed_sec(), all_stages.elapsed_sec()), // %
+      percent(stage.elapsed_sec(), all_stages_without_allocation.elapsed_sec()), // %
       stage.elapsed_ns() / static_cast<double>(stats->bytes), // per byte
       stage.elapsed_ns() / static_cast<double>(stats->structurals), // per structural
       (static_cast<double>(json.size()) / 1000000000.0) / stage.elapsed_sec() // GB/s
@@ -397,7 +409,7 @@ struct benchmarker {
       prefix,
       "Cycles",
       stage.cycles() / static_cast<double>(stats->blocks),
-      percent(stage.cycles(), all_stages.cycles()),
+      percent(stage.cycles(), all_stages_without_allocation.cycles()),
       stage.cycles() / static_cast<double>(stats->bytes),
       stage.cycles() / static_cast<double>(stats->structurals),
       (stage.cycles() / stage.elapsed_sec()) / 1000000000.0
@@ -406,7 +418,7 @@ struct benchmarker {
       prefix,
       "Instructions",
       stage.instructions() / static_cast<double>(stats->blocks),
-      percent(stage.instructions(), all_stages.instructions()),
+      percent(stage.instructions(), all_stages_without_allocation.instructions()),
       stage.instructions() / static_cast<double>(stats->bytes),
       stage.instructions() / static_cast<double>(stats->structurals),
       stage.instructions() / static_cast<double>(stage.cycles())
@@ -417,9 +429,9 @@ struct benchmarker {
       prefix,
       "Misses",
       stage.branch_misses(),
-      percent(stage.branch_misses(), all_stages.branch_misses()),
+      percent(stage.branch_misses(), all_stages_without_allocation.branch_misses()),
       stage.cache_misses(),
-      percent(stage.cache_misses(), all_stages.cache_misses()),
+      percent(stage.cache_misses(), all_stages_without_allocation.cache_misses()),
       stage.cache_references()
     );
   }
@@ -456,14 +468,14 @@ struct benchmarker {
       allocate_stage.best.cycles() / static_cast<double>(json.size()),
       stage1.best.cycles() / static_cast<double>(json.size()),
       stage2.best.cycles() / static_cast<double>(json.size()),
-      all_stages.best.cycles() / static_cast<double>(json.size()),
-      gb / all_stages.best.elapsed_sec(),
+      all_stages_without_allocation.best.cycles() / static_cast<double>(json.size()),
+      gb / all_stages_without_allocation.best.elapsed_sec(),
       gb / stage1.best.elapsed_sec(),
       gb / stage2.best.elapsed_sec());
   } else {
     printf("\"%s\"\t\t\t\t\t%f\t%f\t%f\n",
       base,
-      gb / all_stages.best.elapsed_sec(),
+      gb / all_stages_without_allocation.best.elapsed_sec(),
       gb / stage1.best.elapsed_sec(),
       gb / stage2.best.elapsed_sec());
   }
@@ -490,10 +502,10 @@ struct benchmarker {
       stats->blocks_with_16_structurals_flipped, percent(stats->blocks_with_16_structurals_flipped, stats->blocks));
     }
     printf("\n");
-    printf("All Stages\n");
-    print_aggregate("| " , all_stages.best);
+    printf("All Stages (excluding allocation)\n");
+    print_aggregate("| " , all_stages_without_allocation.best);
     // frequently, allocation is a tiny fraction of the running time so we omit it
-    if(allocate_stage.best.elapsed_sec() > 0.01 * all_stages.best.elapsed_sec()) {
+    if(allocate_stage.best.elapsed_sec() > 0.01 * all_stages_without_allocation.best.elapsed_sec()) {
       printf("|- Allocation\n");
       print_aggregate("| ", allocate_stage.best);
     }
@@ -504,17 +516,16 @@ struct benchmarker {
     if (collector.has_events()) {
       double freq1 = (stage1.best.cycles() / stage1.best.elapsed_sec()) / 1000000000.0;
       double freq2 = (stage2.best.cycles() / stage2.best.elapsed_sec()) / 1000000000.0;
-      double freqall = (all_stages.best.cycles() / all_stages.best.elapsed_sec()) / 1000000000.0;
+      double freqall = (all_stages_without_allocation.best.cycles() / all_stages_without_allocation.best.elapsed_sec()) / 1000000000.0;
       double freqmin = min(freq1, freq2);
       double freqmax = max(freq1, freq2);
       if((freqall < 0.95 * freqmin) or (freqall > 1.05 * freqmax)) {
         printf("\nWarning: The processor frequency fluctuates in an unexpected way!!!\n"
                "Expect the overall speed not to match stage 1 and stage 2 speeds.\n"
                "Range for stage 1 and stage 2 : [%.3f GHz, %.3f GHz], overall: %.3f GHz.\n",
                freqmin, freqmax, freqall);
       }
     }
-    printf("\n%.1f documents parsed per second (best)\n", 1.0/static_cast<double>(all_stages.best.elapsed_sec()));
+    printf("\n%.1f documents parsed per second (best)\n", 1.0/static_cast<double>(all_stages_without_allocation.best.elapsed_sec()));
   }
 }
};
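
The net effect of the renaming is easier to see outside diff form. Below is a minimal, self-contained sketch of the measurement pattern, using std::chrono instead of the repo's collector/event_aggregate machinery (the aggregate struct and seconds_since helper are simplified stand-ins, not simdjson code): allocation is timed as its own stage, and the headline aggregate now sums only stage 1 and stage 2, with the 64KB allocation floor applied up front.

#include <chrono>
#include <cstddef>
#include <cstdio>
#include <vector>

// Simplified stand-in for event_aggregate: only accumulates seconds.
struct aggregate {
  double total_sec = 0;
  int iterations = 0;
  void operator<<(double sec) { total_sec += sec; iterations++; }
};

static double seconds_since(std::chrono::steady_clock::time_point t0) {
  return std::chrono::duration<double>(std::chrono::steady_clock::now() - t0).count();
}

int main() {
  aggregate allocate_stage, stage1, stage2, all_stages_without_allocation;
  std::size_t json_size = 1024;
  for (int i = 0; i < 10; i++) {
    auto t0 = std::chrono::steady_clock::now();
    // Stand-in for parser.allocate(): apply the 64KB floor from this patch.
    std::vector<char> buffers(json_size < 65536 ? 65536 : json_size);
    (void)buffers;
    double alloc_sec = seconds_since(t0);

    auto t1 = std::chrono::steady_clock::now();
    // ... stage 1 (structural indexing) would run here ...
    double stage1_sec = seconds_since(t1);

    auto t2 = std::chrono::steady_clock::now();
    // ... stage 2 (tape construction) would run here ...
    double stage2_sec = seconds_since(t2);

    allocate_stage << alloc_sec;
    stage1 << stage1_sec;
    stage2 << stage2_sec;
    // The key change: the overall figure no longer includes allocation.
    all_stages_without_allocation << (stage1_sec + stage2_sec);
  }
  printf("overall (without allocation): %f s over %d iterations\n",
         all_stages_without_allocation.total_sec,
         all_stages_without_allocation.iterations);
}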

View File

@@ -93,6 +93,10 @@ int main(int argc, char *argv[]) {
     std::cout << std::endl;
   }
   char *buffer = simdjson::internal::allocate_padded_buffer(p.size() + 1);
+  if(buffer == nullptr) {
+    std::cerr << "Out of memory!" << std::endl;
+    abort();
+  }
   memcpy(buffer, p.data(), p.size());
   buffer[p.size()] = '\0';
@@ -139,6 +143,10 @@ int main(int argc, char *argv[]) {
                     !just_data);
   char *mini_buffer = simdjson::internal::allocate_padded_buffer(p.size() + 1);
+  if(mini_buffer == nullptr) {
+    std::cerr << "Out of memory" << std::endl;
+    abort();
+  }
   size_t minisize;
   auto minierror = minify(p.data(), p.size(), mini_buffer, minisize);
   if (minierror) { std::cerr << minierror << std::endl; exit(1); }
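
allocate_padded_buffer can now fail by returning nullptr (it uses new (std::nothrow) internally), so call sites like the two above must check the result before writing into the buffer. A self-contained sketch of the pattern, with a stand-in allocator and padding constant rather than the simdjson internals:

#include <cstdlib>
#include <cstring>
#include <iostream>
#include <new>

constexpr std::size_t kPadding = 32; // stand-in for SIMDJSON_PADDING

// Same contract as internal::allocate_padded_buffer after this commit:
// returns nullptr on failure; the caller releases the memory with delete[].
char *allocate_padded(std::size_t length) noexcept {
  return new (std::nothrow) char[length + kPadding];
}

int main() {
  const char json[] = "{\"key\":1}";
  char *buffer = allocate_padded(sizeof(json));
  if (buffer == nullptr) {  // the check this commit adds at each call site
    std::cerr << "Out of memory!" << std::endl;
    abort();
  }
  std::memcpy(buffer, json, sizeof(json));
  // ... parse or minify from buffer ...
  delete[] buffer;  // pairs with new[]; free() would be wrong here
  return 0;
}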

View File

@@ -391,12 +391,7 @@ private:
   /**
    * The loaded buffer (reused each time load() is called)
    */
-#if defined(_MSC_VER) && _MSC_VER < 1910
-  // older versions of Visual Studio lack proper support for unique_ptr.
   std::unique_ptr<char[]> loaded_bytes;
-#else
-  std::unique_ptr<char[], decltype(&aligned_free_char)> loaded_bytes;
-#endif
   /** Capacity of loaded_bytes buffer. */
   size_t _loaded_bytes_capacity{0};
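
With aligned allocation gone, loaded_bytes no longer needs to carry a custom deleter in its type, which is also what made the old MSVC workaround necessary. A sketch contrasting the two declarations (aligned_free_char is a declaration here purely for illustration):

#include <memory>

void aligned_free_char(char *p); // declaration only, for illustration

// Before: the deleter is part of the unique_ptr's type, so every declaration
// and constructor had to mention it (and old MSVC choked on this form).
using old_loaded_bytes_t = std::unique_ptr<char[], decltype(&aligned_free_char)>;

// After: the default deleter for char[] is delete[]; no extra argument needed.
using new_loaded_bytes_t = std::unique_ptr<char[]>;

int main() {
  new_loaded_bytes_t loaded_bytes(new char[128]); // delete[] runs automatically
  return loaded_bytes ? 0 : 1;
}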

View File

@@ -12,26 +12,21 @@
 namespace simdjson {
 namespace internal {
-// low-level function to allocate memory with padding so we can read past the
-// "length" bytes safely. if you must provide a pointer to some data, create it
-// with this function: length is the max. size in bytes of the string caller is
-// responsible to free the memory (free(...))
+// The allocate_padded_buffer function is a low-level function to allocate memory
+// with padding so we can read past the "length" bytes safely. It is used by
+// the padded_string class automatically. It returns nullptr in case
+// of error: the caller should check for a null pointer.
+// The length parameter is the maximum size in bytes of the string.
+// The caller is responsible for freeing the memory (with delete[]).
 inline char *allocate_padded_buffer(size_t length) noexcept {
-  // we could do a simple malloc
-  // return (char *) malloc(length + SIMDJSON_PADDING);
-  // However, we might as well align to cache lines...
   size_t totalpaddedlength = length + SIMDJSON_PADDING;
-#if defined(_MSC_VER) && _MSC_VER < 1910
-  // For legacy Visual Studio 2015 since it does not have proper C++11 support
-  char *padded_buffer = new char[totalpaddedlength];
-#else
-  char *padded_buffer = aligned_malloc_char(64, totalpaddedlength);
-#endif
-#ifndef NDEBUG
+  char *padded_buffer = new (std::nothrow) char[totalpaddedlength];
   if (padded_buffer == nullptr) {
     return nullptr;
   }
-#endif // NDEBUG
+  // We write zeroes in the padded region to avoid having uninitialized
+  // garbage. If nothing else, garbage getting read might trigger a
+  // warning from a memory checker.
   memset(padded_buffer + length, 0, totalpaddedlength - length);
   return padded_buffer;
 } // allocate_padded_buffer()
@@ -74,7 +69,7 @@ inline padded_string::padded_string(padded_string &&o) noexcept
 }
 inline padded_string &padded_string::operator=(padded_string &&o) noexcept {
-  aligned_free_char(data_ptr);
+  delete[] data_ptr;
   data_ptr = o.data_ptr;
   viable_size = o.viable_size;
   o.data_ptr = nullptr; // we take ownership
@@ -92,7 +87,7 @@ inline void padded_string::swap(padded_string &o) noexcept {
 }
 inline padded_string::~padded_string() noexcept {
-  aligned_free_char(data_ptr);
+  delete[] data_ptr;
 }
 inline size_t padded_string::size() const noexcept { return viable_size; }
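
These destructor and move-assignment changes keep the deallocator matched to the allocator: the buffer now comes from new (std::nothrow) char[], so it must be released with delete[]; freeing it with free() or an aligned-free routine would be undefined behavior. A compact, self-contained sketch of the invariant (the padding constant is illustrative, not the real SIMDJSON_PADDING, and padded_str is a toy stand-in for padded_string):

#include <cstring>
#include <new>

constexpr std::size_t kPadding = 32; // illustrative, not the real SIMDJSON_PADDING

// Sketch of the new allocation strategy: plain new[], zeroed padding.
char *allocate_padded_buffer(std::size_t length) noexcept {
  std::size_t total = length + kPadding;
  char *p = new (std::nothrow) char[total];
  if (p == nullptr) { return nullptr; }
  std::memset(p + length, 0, total - length); // padding holds defined bytes
  return p;
}

// Sketch of padded_string's ownership: the destructor must use delete[]
// because the buffer came from new[] (an aligned free would now be UB).
struct padded_str {
  char *data_ptr = nullptr;
  ~padded_str() { delete[] data_ptr; }
};

int main() {
  padded_str s;
  s.data_ptr = allocate_padded_buffer(16);
  return s.data_ptr ? 0 : 1;
}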

View File

@@ -15,18 +15,10 @@ namespace dom {
 //
 // parser inline implementation
 //
-#if defined(_MSC_VER) && _MSC_VER < 1910
-// older versions of Visual Studio lack proper support for unique_ptr.
 really_inline parser::parser(size_t max_capacity) noexcept
   : _max_capacity{max_capacity},
     loaded_bytes(nullptr) {
 }
-#else
-really_inline parser::parser(size_t max_capacity) noexcept
-  : _max_capacity{max_capacity},
-    loaded_bytes(nullptr, &aligned_free_char) {
-}
-#endif
 really_inline parser::parser(parser &&other) noexcept = default;
 really_inline parser &parser::operator=(parser &&other) noexcept = default;
@@ -101,19 +93,14 @@ inline simdjson_result<document_stream> parser::load_many(const std::string &path
 inline simdjson_result<element> parser::parse(const uint8_t *buf, size_t len, bool realloc_if_needed) & noexcept {
   error_code _error = ensure_capacity(len);
   if (_error) { return _error; }
+  std::unique_ptr<uint8_t[]> tmp_buf;
   if (realloc_if_needed) {
-    const uint8_t *tmp_buf = buf;
-    buf = (uint8_t *)internal::allocate_padded_buffer(len);
-    if (buf == nullptr)
-      return MEMALLOC;
-    memcpy((void *)buf, tmp_buf, len);
-  }
-  _error = implementation->parse(buf, len, doc);
-  if (realloc_if_needed) {
-    aligned_free((void *)buf); // must free before we exit
+    tmp_buf.reset((uint8_t *)internal::allocate_padded_buffer(len));
+    if (tmp_buf.get() == nullptr) { return MEMALLOC; }
+    memcpy((void *)tmp_buf.get(), buf, len);
   }
+  _error = implementation->parse(realloc_if_needed ? tmp_buf.get() : buf, len, doc);
   if (_error) { return _error; }
   return doc.root();
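
The rewritten parse() replaces the manual allocate/parse/free sequence with a std::unique_ptr that owns the temporary padded copy, so the copy is released on every exit path, including the early MEMALLOC return and parse errors. A reduced sketch of that control flow (parse_impl, the error codes, and the padding amount below are stand-ins, not simdjson's):

#include <cstddef>
#include <cstdint>
#include <cstring>
#include <memory>
#include <new>

enum error_code { SUCCESS = 0, MEMALLOC, PARSE_ERROR }; // stand-in codes

error_code parse_impl(const uint8_t *, std::size_t) { return SUCCESS; } // stand-in

error_code parse(const uint8_t *buf, std::size_t len, bool realloc_if_needed) {
  std::unique_ptr<uint8_t[]> tmp_buf; // empty unless a padded copy is needed
  if (realloc_if_needed) {
    tmp_buf.reset(new (std::nothrow) uint8_t[len + 32]); // 32 = stand-in padding
    if (tmp_buf.get() == nullptr) { return MEMALLOC; }   // nothing to free here
    std::memcpy(tmp_buf.get(), buf, len);
  }
  // tmp_buf's destructor runs on every path out of this function, so there is
  // no manual free to forget before a return.
  return parse_impl(realloc_if_needed ? tmp_buf.get() : buf, len);
}

int main() {
  const uint8_t doc[] = {'{', '}'};
  return parse(doc, 2, true) == SUCCESS ? 0 : 1;
}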

View File

@@ -144,10 +144,12 @@ inline simdjson::padded_string operator "" _padded(const char *str, size_t len)
 namespace simdjson {
 namespace internal {
-// low-level function to allocate memory with padding so we can read past the
-// "length" bytes safely. if you must provide a pointer to some data, create it
-// with this function: length is the max. size in bytes of the string caller is
-// responsible to free the memory (free(...))
+// The allocate_padded_buffer function is a low-level function to allocate memory
+// with padding so we can read past the "length" bytes safely. It is used by
+// the padded_string class automatically. It returns nullptr in case
+// of error: the caller should check for a null pointer.
+// The length parameter is the maximum size in bytes of the string.
+// The caller is responsible for freeing the memory (with delete[]).
 inline char *allocate_padded_buffer(size_t length) noexcept;
 } // namespace internal
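
As the new comment says, most users never call allocate_padded_buffer directly: the padded_string class, together with the _padded literal declared earlier in this header, handles the padded allocation and its release automatically. A minimal usage sketch (assuming the single-header simdjson.h and the exception-throwing DOM interface):

#include "simdjson.h"
#include <iostream>

int main() {
  // padded_string over-allocates by SIMDJSON_PADDING bytes and frees the
  // buffer in its destructor; no manual allocation or delete[] is involved.
  simdjson::padded_string json = "{\"answer\":42}"_padded;
  simdjson::dom::parser parser;
  simdjson::dom::element doc = parser.parse(json); // throws on error
  std::cout << doc << std::endl; // prints the parsed document
  return 0;
}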

View File

@@ -174,48 +174,6 @@ use a 64-bit target such as x64 or 64-bit ARM.")
 #define simdjson_strncasecmp strncasecmp
 #endif
 namespace simdjson {
-/** @private portable version of posix_memalign */
-static inline void *aligned_malloc(size_t alignment, size_t size) {
-  void *p;
-#ifdef SIMDJSON_VISUAL_STUDIO
-  p = _aligned_malloc(size, alignment);
-#elif defined(__MINGW32__) || defined(__MINGW64__)
-  p = __mingw_aligned_malloc(size, alignment);
-#else
-  // somehow, if this is used before including "x86intrin.h", it creates an
-  // implicit defined warning.
-  if (posix_memalign(&p, alignment, size) != 0) {
-    return nullptr;
-  }
-#endif
-  return p;
-}
-/** @private */
-static inline char *aligned_malloc_char(size_t alignment, size_t size) {
-  return (char *)aligned_malloc(alignment, size);
-}
-/** @private */
-static inline void aligned_free(void *mem_block) {
-  if (mem_block == nullptr) {
-    return;
-  }
-#ifdef SIMDJSON_VISUAL_STUDIO
-  _aligned_free(mem_block);
-#elif defined(__MINGW32__) || defined(__MINGW64__)
-  __mingw_aligned_free(mem_block);
-#else
-  free(mem_block);
-#endif
-}
-/** @private */
-static inline void aligned_free_char(char *mem_block) {
-  aligned_free((void *)mem_block);
-}
 #ifdef NDEBUG
 #ifdef SIMDJSON_VISUAL_STUDIO
@@ -233,5 +191,4 @@ static inline void aligned_free_char(char *mem_block) {
 #endif
 } // namespace simdjson
 #endif // SIMDJSON_PORTABILITY_H
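
For reference, if cache-line alignment were ever reintroduced, these hand-rolled wrappers would not be needed: C++17 (a newer standard than simdjson required at the time) provides aligned-allocation forms of operator new and operator delete that cover MSVC, MinGW, and POSIX alike. Note that std::aligned_alloc, the C11 alternative, is notably absent on MSVC, which is exactly why _aligned_malloc appears above. A sketch:

#include <new>

int main() {
  constexpr std::align_val_t cache_line{64};
  // C++17 aligned allocation: no platform-specific wrappers required.
  void *p = ::operator new(4096, cache_line);
  // ... use the 64-byte-aligned block ...
  ::operator delete(p, cache_line); // must pass the matching alignment
  return 0;
}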