This removes the crazy alignment requirements. (#1073)

* This removes the crazy alignment requirements.
Daniel Lemire 2020-07-27 16:19:01 -04:00 committed by GitHub
parent dcb5d47ee6
commit f80668e87f
7 changed files with 66 additions and 111 deletions

View File

@@ -225,8 +225,19 @@ struct progress_bar {
   }
 };
+/**
+ * The speed at which we can allocate memory is strictly system-specific.
+ * It depends on the OS and the runtime library, and it is subject to various
+ * system-specific knobs. It is not something that we can reasonably
+ * benchmark with crude timings.
+ * If someone wants to optimize how simdjson allocates memory, it will
+ * almost surely require a distinct benchmarking tool. What is meant by
+ * "memory allocation" also requires a definition: doing "new char[size]" can
+ * do many different things depending on the system.
+ */
 enum class BenchmarkStage {
-  ALL,
+  ALL, // This excludes allocation
   ALLOCATE,
   STAGE1,
   STAGE2
@@ -234,7 +245,7 @@ enum class BenchmarkStage {
 const char* benchmark_stage_name(BenchmarkStage stage) {
   switch (stage) {
-    case BenchmarkStage::ALL: return "All";
+    case BenchmarkStage::ALL: return "All (Without Allocation)";
     case BenchmarkStage::ALLOCATE: return "Allocate";
     case BenchmarkStage::STAGE1: return "Stage 1";
     case BenchmarkStage::STAGE2: return "Stage 2";
@@ -253,8 +264,8 @@ struct benchmarker {
   // Statistics about the JSON file independent of its speed (amount of utf-8, structurals, etc.).
   // Loaded on first parse.
   json_stats* stats;
-  // Speed and event summary for full parse (including allocation, stage 1 and stage 2)
-  event_aggregate all_stages{};
+  // Speed and event summary for full parse (stage 1 and stage 2, but *excluding* allocation)
+  event_aggregate all_stages_without_allocation{};
   // Speed and event summary for stage 1
   event_aggregate stage1{};
   // Speed and event summary for stage 2
@@ -285,23 +296,24 @@ struct benchmarker {
   const event_aggregate& operator[](BenchmarkStage stage) const {
     switch (stage) {
-      case BenchmarkStage::ALL: return this->all_stages;
+      case BenchmarkStage::ALL: return this->all_stages_without_allocation;
       case BenchmarkStage::STAGE1: return this->stage1;
       case BenchmarkStage::STAGE2: return this->stage2;
       case BenchmarkStage::ALLOCATE: return this->allocate_stage;
-      default: exit_error("Unknown stage"); return this->all_stages;
+      default: exit_error("Unknown stage"); return this->all_stages_without_allocation;
     }
   }
   int iterations() const {
-    return all_stages.iterations;
+    return all_stages_without_allocation.iterations;
   }
   really_inline void run_iteration(bool stage1_only, bool hotbuffers=false) {
     // Allocate dom::parser
     collector.start();
     dom::parser parser;
-    error_code error = parser.allocate(json.size());
+    // We always allocate at least 64KB. Smaller allocations may actually be slower on some systems.
+    error_code error = parser.allocate(json.size() < 65536 ? 65536 : json.size());
     if (error) {
       exit_error(string("Unable to allocate_stage ") + to_string(json.size()) + " bytes for the JSON result: " + error_message(error));
     }
@@ -329,7 +341,7 @@ struct benchmarker {
     // Stage 2 (unified machine) and the rest
     if (stage1_only) {
-      all_stages << stage1_count;
+      all_stages_without_allocation << stage1_count;
     } else {
       event_count stage2_count;
       collector.start();
@@ -339,7 +351,7 @@ struct benchmarker {
       }
       stage2_count = collector.end();
       stage2 << stage2_count;
-      all_stages << allocate_count + stage1_count + stage2_count;
+      all_stages_without_allocation << stage1_count + stage2_count;
     }
     // Calculate stats the first time we parse
     if (stats == NULL) {
@@ -386,7 +398,7 @@ struct benchmarker {
       prefix,
       "Speed",
       stage.elapsed_ns() / static_cast<double>(stats->blocks), // per block
-      percent(stage.elapsed_sec(), all_stages.elapsed_sec()), // %
+      percent(stage.elapsed_sec(), all_stages_without_allocation.elapsed_sec()), // %
       stage.elapsed_ns() / static_cast<double>(stats->bytes), // per byte
       stage.elapsed_ns() / static_cast<double>(stats->structurals), // per structural
       (static_cast<double>(json.size()) / 1000000000.0) / stage.elapsed_sec() // GB/s
@@ -397,7 +409,7 @@ struct benchmarker {
       prefix,
       "Cycles",
       stage.cycles() / static_cast<double>(stats->blocks),
-      percent(stage.cycles(), all_stages.cycles()),
+      percent(stage.cycles(), all_stages_without_allocation.cycles()),
       stage.cycles() / static_cast<double>(stats->bytes),
       stage.cycles() / static_cast<double>(stats->structurals),
       (stage.cycles() / stage.elapsed_sec()) / 1000000000.0
@@ -406,7 +418,7 @@ struct benchmarker {
       prefix,
       "Instructions",
       stage.instructions() / static_cast<double>(stats->blocks),
-      percent(stage.instructions(), all_stages.instructions()),
+      percent(stage.instructions(), all_stages_without_allocation.instructions()),
       stage.instructions() / static_cast<double>(stats->bytes),
       stage.instructions() / static_cast<double>(stats->structurals),
       stage.instructions() / static_cast<double>(stage.cycles())
@@ -417,9 +429,9 @@ struct benchmarker {
       prefix,
       "Misses",
       stage.branch_misses(),
-      percent(stage.branch_misses(), all_stages.branch_misses()),
+      percent(stage.branch_misses(), all_stages_without_allocation.branch_misses()),
       stage.cache_misses(),
-      percent(stage.cache_misses(), all_stages.cache_misses()),
+      percent(stage.cache_misses(), all_stages_without_allocation.cache_misses()),
       stage.cache_references()
     );
   }
@@ -456,14 +468,14 @@ struct benchmarker {
       allocate_stage.best.cycles() / static_cast<double>(json.size()),
       stage1.best.cycles() / static_cast<double>(json.size()),
       stage2.best.cycles() / static_cast<double>(json.size()),
-      all_stages.best.cycles() / static_cast<double>(json.size()),
-      gb / all_stages.best.elapsed_sec(),
+      all_stages_without_allocation.best.cycles() / static_cast<double>(json.size()),
+      gb / all_stages_without_allocation.best.elapsed_sec(),
       gb / stage1.best.elapsed_sec(),
       gb / stage2.best.elapsed_sec());
   } else {
     printf("\"%s\"\t\t\t\t\t%f\t%f\t%f\n",
       base,
-      gb / all_stages.best.elapsed_sec(),
+      gb / all_stages_without_allocation.best.elapsed_sec(),
       gb / stage1.best.elapsed_sec(),
       gb / stage2.best.elapsed_sec());
   }
@@ -490,10 +502,10 @@ struct benchmarker {
       stats->blocks_with_16_structurals_flipped, percent(stats->blocks_with_16_structurals_flipped, stats->blocks));
     }
     printf("\n");
-    printf("All Stages\n");
-    print_aggregate("| " , all_stages.best);
+    printf("All Stages (excluding allocation)\n");
+    print_aggregate("| " , all_stages_without_allocation.best);
     // frequently, allocation is a tiny fraction of the running time so we omit it
-    if(allocate_stage.best.elapsed_sec() > 0.01 * all_stages.best.elapsed_sec()) {
+    if(allocate_stage.best.elapsed_sec() > 0.01 * all_stages_without_allocation.best.elapsed_sec()) {
       printf("|- Allocation\n");
       print_aggregate("| ", allocate_stage.best);
     }
@@ -504,17 +516,16 @@ struct benchmarker {
     if (collector.has_events()) {
       double freq1 = (stage1.best.cycles() / stage1.best.elapsed_sec()) / 1000000000.0;
       double freq2 = (stage2.best.cycles() / stage2.best.elapsed_sec()) / 1000000000.0;
-      double freqall = (all_stages.best.cycles() / all_stages.best.elapsed_sec()) / 1000000000.0;
+      double freqall = (all_stages_without_allocation.best.cycles() / all_stages_without_allocation.best.elapsed_sec()) / 1000000000.0;
       double freqmin = min(freq1, freq2);
       double freqmax = max(freq1, freq2);
       if((freqall < 0.95 * freqmin) or (freqall > 1.05 * freqmax)) {
         printf("\nWarning: The processor frequency fluctuates in an unexpected way!!!\n"
                "Expect the overall speed not to match stage 1 and stage 2 speeds.\n"
                "Range for stage 1 and stage 2 : [%.3f GHz, %.3f GHz], overall: %.3f GHz.\n",
                freqmin, freqmax, freqall);
       }
     }
-    printf("\n%.1f documents parsed per second (best)\n", 1.0/static_cast<double>(all_stages.best.elapsed_sec()));
+    printf("\n%.1f documents parsed per second (best)\n", 1.0/static_cast<double>(all_stages_without_allocation.best.elapsed_sec()));
   }
 }
};
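
The net effect of the renaming is easier to see outside diff form. Below is a minimal, self-contained sketch of the measurement pattern, using std::chrono instead of the repo's collector/event_aggregate machinery (the aggregate struct and seconds_since helper are simplified stand-ins, not simdjson code): allocation is timed as its own stage, and the headline aggregate now sums only stage 1 and stage 2, with the 64KB allocation floor applied up front.

#include <chrono>
#include <cstddef>
#include <cstdio>
#include <vector>

// Simplified stand-in for event_aggregate: only accumulates seconds.
struct aggregate {
  double total_sec = 0;
  int iterations = 0;
  void operator<<(double sec) { total_sec += sec; iterations++; }
};

static double seconds_since(std::chrono::steady_clock::time_point t0) {
  return std::chrono::duration<double>(std::chrono::steady_clock::now() - t0).count();
}

int main() {
  aggregate allocate_stage, stage1, stage2, all_stages_without_allocation;
  std::size_t json_size = 1024;
  for (int i = 0; i < 10; i++) {
    auto t0 = std::chrono::steady_clock::now();
    // Stand-in for parser.allocate(): apply the 64KB floor from this patch.
    std::vector<char> buffers(json_size < 65536 ? 65536 : json_size);
    (void)buffers;
    double alloc_sec = seconds_since(t0);

    auto t1 = std::chrono::steady_clock::now();
    // ... stage 1 (structural indexing) would run here ...
    double stage1_sec = seconds_since(t1);

    auto t2 = std::chrono::steady_clock::now();
    // ... stage 2 (tape construction) would run here ...
    double stage2_sec = seconds_since(t2);

    allocate_stage << alloc_sec;
    stage1 << stage1_sec;
    stage2 << stage2_sec;
    // The key change: the overall figure no longer includes allocation.
    all_stages_without_allocation << (stage1_sec + stage2_sec);
  }
  printf("overall (without allocation): %f s over %d iterations\n",
         all_stages_without_allocation.total_sec,
         all_stages_without_allocation.iterations);
}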

View File

@@ -93,6 +93,10 @@ int main(int argc, char *argv[]) {
     std::cout << std::endl;
   }
   char *buffer = simdjson::internal::allocate_padded_buffer(p.size() + 1);
+  if(buffer == nullptr) {
+    std::cerr << "Out of memory!" << std::endl;
+    abort();
+  }
   memcpy(buffer, p.data(), p.size());
   buffer[p.size()] = '\0';
@@ -139,6 +143,10 @@ int main(int argc, char *argv[]) {
                     !just_data);
   char *mini_buffer = simdjson::internal::allocate_padded_buffer(p.size() + 1);
+  if(mini_buffer == nullptr) {
+    std::cerr << "Out of memory" << std::endl;
+    abort();
+  }
   size_t minisize;
   auto minierror = minify(p.data(), p.size(), mini_buffer, minisize);
   if (minierror) { std::cerr << minierror << std::endl; exit(1); }
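
allocate_padded_buffer can now fail by returning nullptr (it uses new (std::nothrow) internally), so call sites like the two above must check the result before writing into the buffer. A self-contained sketch of the pattern, with a stand-in allocator and padding constant rather than the simdjson internals:

#include <cstdlib>
#include <cstring>
#include <iostream>
#include <new>

constexpr std::size_t kPadding = 32; // stand-in for SIMDJSON_PADDING

// Same contract as internal::allocate_padded_buffer after this commit:
// returns nullptr on failure; the caller releases the memory with delete[].
char *allocate_padded(std::size_t length) noexcept {
  return new (std::nothrow) char[length + kPadding];
}

int main() {
  const char json[] = "{\"key\":1}";
  char *buffer = allocate_padded(sizeof(json));
  if (buffer == nullptr) {  // the check this commit adds at each call site
    std::cerr << "Out of memory!" << std::endl;
    abort();
  }
  std::memcpy(buffer, json, sizeof(json));
  // ... parse or minify from buffer ...
  delete[] buffer;  // pairs with new[]; free() would be wrong here
  return 0;
}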

View File

@@ -391,12 +391,7 @@ private:
   /**
    * The loaded buffer (reused each time load() is called)
    */
-#if defined(_MSC_VER) && _MSC_VER < 1910
-  // older versions of Visual Studio lack proper support for unique_ptr.
   std::unique_ptr<char[]> loaded_bytes;
-#else
-  std::unique_ptr<char[], decltype(&aligned_free_char)> loaded_bytes;
-#endif
   /** Capacity of loaded_bytes buffer. */
   size_t _loaded_bytes_capacity{0};
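
With aligned allocation gone, loaded_bytes no longer needs to carry a custom deleter in its type, which is also what made the old MSVC workaround necessary. A sketch contrasting the two declarations (aligned_free_char is a declaration here purely for illustration):

#include <memory>

void aligned_free_char(char *p); // declaration only, for illustration

// Before: the deleter is part of the unique_ptr's type, so every declaration
// and constructor had to mention it (and old MSVC choked on this form).
using old_loaded_bytes_t = std::unique_ptr<char[], decltype(&aligned_free_char)>;

// After: the default deleter for char[] is delete[]; no extra argument needed.
using new_loaded_bytes_t = std::unique_ptr<char[]>;

int main() {
  new_loaded_bytes_t loaded_bytes(new char[128]); // delete[] runs automatically
  return loaded_bytes ? 0 : 1;
}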

View File

@@ -12,26 +12,21 @@
 namespace simdjson {
 namespace internal {
-// low-level function to allocate memory with padding so we can read past the
-// "length" bytes safely. if you must provide a pointer to some data, create it
-// with this function: length is the max. size in bytes of the string caller is
-// responsible to free the memory (free(...))
+// The allocate_padded_buffer function is a low-level function to allocate memory
+// with padding so we can read past the "length" bytes safely. It is used by
+// the padded_string class automatically. It returns nullptr in case
+// of error: the caller should check for a null pointer.
+// The length parameter is the maximum size in bytes of the string.
+// The caller is responsible for freeing the memory (with delete[]).
 inline char *allocate_padded_buffer(size_t length) noexcept {
-  // we could do a simple malloc
-  // return (char *) malloc(length + SIMDJSON_PADDING);
-  // However, we might as well align to cache lines...
   size_t totalpaddedlength = length + SIMDJSON_PADDING;
-#if defined(_MSC_VER) && _MSC_VER < 1910
-  // For legacy Visual Studio 2015 since it does not have proper C++11 support
-  char *padded_buffer = new char[totalpaddedlength];
-#else
-  char *padded_buffer = aligned_malloc_char(64, totalpaddedlength);
-#endif
-#ifndef NDEBUG
+  char *padded_buffer = new (std::nothrow) char[totalpaddedlength];
   if (padded_buffer == nullptr) {
     return nullptr;
   }
-#endif // NDEBUG
+  // We write zeroes in the padded region to avoid having uninitialized
+  // garbage. If nothing else, garbage getting read might trigger a
+  // warning from a memory checker.
   memset(padded_buffer + length, 0, totalpaddedlength - length);
   return padded_buffer;
 } // allocate_padded_buffer()
@@ -74,7 +69,7 @@ inline padded_string::padded_string(padded_string &&o) noexcept
 }
 inline padded_string &padded_string::operator=(padded_string &&o) noexcept {
-  aligned_free_char(data_ptr);
+  delete[] data_ptr;
   data_ptr = o.data_ptr;
   viable_size = o.viable_size;
   o.data_ptr = nullptr; // we take ownership
@@ -92,7 +87,7 @@ inline void padded_string::swap(padded_string &o) noexcept {
 }
 inline padded_string::~padded_string() noexcept {
-  aligned_free_char(data_ptr);
+  delete[] data_ptr;
 }
 inline size_t padded_string::size() const noexcept { return viable_size; }
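
These destructor and move-assignment changes keep the deallocator matched to the allocator: the buffer now comes from new (std::nothrow) char[], so it must be released with delete[]; freeing it with free() or an aligned-free routine would be undefined behavior. A compact, self-contained sketch of the invariant (the padding constant is illustrative, not the real SIMDJSON_PADDING, and padded_str is a toy stand-in for padded_string):

#include <cstring>
#include <new>

constexpr std::size_t kPadding = 32; // illustrative, not the real SIMDJSON_PADDING

// Sketch of the new allocation strategy: plain new[], zeroed padding.
char *allocate_padded_buffer(std::size_t length) noexcept {
  std::size_t total = length + kPadding;
  char *p = new (std::nothrow) char[total];
  if (p == nullptr) { return nullptr; }
  std::memset(p + length, 0, total - length); // padding holds defined bytes
  return p;
}

// Sketch of padded_string's ownership: the destructor must use delete[]
// because the buffer came from new[] (an aligned free would now be UB).
struct padded_str {
  char *data_ptr = nullptr;
  ~padded_str() { delete[] data_ptr; }
};

int main() {
  padded_str s;
  s.data_ptr = allocate_padded_buffer(16);
  return s.data_ptr ? 0 : 1;
}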

View File

@@ -15,18 +15,10 @@ namespace dom {
 //
 // parser inline implementation
 //
-#if defined(_MSC_VER) && _MSC_VER < 1910
-// older versions of Visual Studio lack proper support for unique_ptr.
 really_inline parser::parser(size_t max_capacity) noexcept
   : _max_capacity{max_capacity},
     loaded_bytes(nullptr) {
 }
-#else
-really_inline parser::parser(size_t max_capacity) noexcept
-  : _max_capacity{max_capacity},
-    loaded_bytes(nullptr, &aligned_free_char) {
-}
-#endif
 really_inline parser::parser(parser &&other) noexcept = default;
 really_inline parser &parser::operator=(parser &&other) noexcept = default;
@@ -101,19 +93,14 @@ inline simdjson_result<document_stream> parser::load_many(const std::string &path
 inline simdjson_result<element> parser::parse(const uint8_t *buf, size_t len, bool realloc_if_needed) & noexcept {
   error_code _error = ensure_capacity(len);
   if (_error) { return _error; }
+  std::unique_ptr<uint8_t[]> tmp_buf;
   if (realloc_if_needed) {
-    const uint8_t *tmp_buf = buf;
-    buf = (uint8_t *)internal::allocate_padded_buffer(len);
-    if (buf == nullptr)
-      return MEMALLOC;
-    memcpy((void *)buf, tmp_buf, len);
-  }
-  _error = implementation->parse(buf, len, doc);
-  if (realloc_if_needed) {
-    aligned_free((void *)buf); // must free before we exit
+    tmp_buf.reset((uint8_t *)internal::allocate_padded_buffer(len));
+    if (tmp_buf.get() == nullptr) { return MEMALLOC; }
+    memcpy((void *)tmp_buf.get(), buf, len);
   }
+  _error = implementation->parse(realloc_if_needed ? tmp_buf.get() : buf, len, doc);
   if (_error) { return _error; }
   return doc.root();
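
The rewritten parse() replaces the manual allocate/parse/free sequence with a std::unique_ptr that owns the temporary padded copy, so the copy is released on every exit path, including the early MEMALLOC return and parse errors. A reduced sketch of that control flow (parse_impl, the error codes, and the padding amount below are stand-ins, not simdjson's):

#include <cstddef>
#include <cstdint>
#include <cstring>
#include <memory>
#include <new>

enum error_code { SUCCESS = 0, MEMALLOC, PARSE_ERROR }; // stand-in codes

error_code parse_impl(const uint8_t *, std::size_t) { return SUCCESS; } // stand-in

error_code parse(const uint8_t *buf, std::size_t len, bool realloc_if_needed) {
  std::unique_ptr<uint8_t[]> tmp_buf; // empty unless a padded copy is needed
  if (realloc_if_needed) {
    tmp_buf.reset(new (std::nothrow) uint8_t[len + 32]); // 32 = stand-in padding
    if (tmp_buf.get() == nullptr) { return MEMALLOC; }   // nothing to free here
    std::memcpy(tmp_buf.get(), buf, len);
  }
  // tmp_buf's destructor runs on every path out of this function, so there is
  // no manual free to forget before a return.
  return parse_impl(realloc_if_needed ? tmp_buf.get() : buf, len);
}

int main() {
  const uint8_t doc[] = {'{', '}'};
  return parse(doc, 2, true) == SUCCESS ? 0 : 1;
}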

View File

@@ -144,10 +144,12 @@ inline simdjson::padded_string operator "" _padded(const char *str, size_t len)
 namespace simdjson {
 namespace internal {
-// low-level function to allocate memory with padding so we can read past the
-// "length" bytes safely. if you must provide a pointer to some data, create it
-// with this function: length is the max. size in bytes of the string caller is
-// responsible to free the memory (free(...))
+// The allocate_padded_buffer function is a low-level function to allocate memory
+// with padding so we can read past the "length" bytes safely. It is used by
+// the padded_string class automatically. It returns nullptr in case
+// of error: the caller should check for a null pointer.
+// The length parameter is the maximum size in bytes of the string.
+// The caller is responsible for freeing the memory (with delete[]).
 inline char *allocate_padded_buffer(size_t length) noexcept;
 } // namespace internal
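
As the new comment says, most users never call allocate_padded_buffer directly: the padded_string class, together with the _padded literal declared earlier in this header, handles the padded allocation and its release automatically. A minimal usage sketch (assuming the single-header simdjson.h and the exception-throwing DOM interface):

#include "simdjson.h"
#include <iostream>

int main() {
  // padded_string over-allocates by SIMDJSON_PADDING bytes and frees the
  // buffer in its destructor; no manual allocation or delete[] is involved.
  simdjson::padded_string json = "{\"answer\":42}"_padded;
  simdjson::dom::parser parser;
  simdjson::dom::element doc = parser.parse(json); // throws on error
  std::cout << doc << std::endl; // prints the parsed document
  return 0;
}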

View File

@@ -174,48 +174,6 @@ use a 64-bit target such as x64 or 64-bit ARM.")
 #define simdjson_strncasecmp strncasecmp
 #endif
 namespace simdjson {
-/** @private portable version of posix_memalign */
-static inline void *aligned_malloc(size_t alignment, size_t size) {
-  void *p;
-#ifdef SIMDJSON_VISUAL_STUDIO
-  p = _aligned_malloc(size, alignment);
-#elif defined(__MINGW32__) || defined(__MINGW64__)
-  p = __mingw_aligned_malloc(size, alignment);
-#else
-  // somehow, if this is used before including "x86intrin.h", it creates an
-  // implicit defined warning.
-  if (posix_memalign(&p, alignment, size) != 0) {
-    return nullptr;
-  }
-#endif
-  return p;
-}
-/** @private */
-static inline char *aligned_malloc_char(size_t alignment, size_t size) {
-  return (char *)aligned_malloc(alignment, size);
-}
-/** @private */
-static inline void aligned_free(void *mem_block) {
-  if (mem_block == nullptr) {
-    return;
-  }
-#ifdef SIMDJSON_VISUAL_STUDIO
-  _aligned_free(mem_block);
-#elif defined(__MINGW32__) || defined(__MINGW64__)
-  __mingw_aligned_free(mem_block);
-#else
-  free(mem_block);
-#endif
-}
-/** @private */
-static inline void aligned_free_char(char *mem_block) {
-  aligned_free((void *)mem_block);
-}
 #ifdef NDEBUG
 #ifdef SIMDJSON_VISUAL_STUDIO
@@ -233,5 +191,4 @@ static inline void aligned_free_char(char *mem_block) {
 #endif
 } // namespace simdjson
 #endif // SIMDJSON_PORTABILITY_H
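
For reference, if cache-line alignment were ever reintroduced, these hand-rolled wrappers would not be needed: C++17 (a newer standard than simdjson required at the time) provides aligned-allocation forms of operator new and operator delete that cover MSVC, MinGW, and POSIX alike. Note that std::aligned_alloc, the C11 alternative, is notably absent on MSVC, which is exactly why _aligned_malloc appears above. A sketch:

#include <new>

int main() {
  constexpr std::align_val_t cache_line{64};
  // C++17 aligned allocation: no platform-specific wrappers required.
  void *p = ::operator new(4096, cache_line);
  // ... use the 64-byte-aligned block ...
  ::operator delete(p, cache_line); // must pass the matching alignment
  return 0;
}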