This removes the crazy alignment requirements. (#1073)
* This removes the crazy alignment requirements.
This commit is contained in:
parent
dcb5d47ee6
commit
f80668e87f
|
@ -225,8 +225,19 @@ struct progress_bar {
|
|||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* The speed at which we can allocate memory is strictly system specific.
|
||||
* It depends on the OS and the runtime library. It is subject to various
|
||||
* system-specific knobs. It is not something that we can reasonably
|
||||
* benchmark with crude timings.
|
||||
* If someone wants to optimize how simdjson allocates memory, then it will
|
||||
* almost surely require a distinct benchmarking tool. What is meant by
|
||||
* "memory allocation" also requires a definition. Doing "new char[size]" can
|
||||
* do many different things depending on the system.
|
||||
*/
|
||||
|
||||
enum class BenchmarkStage {
|
||||
ALL,
|
||||
ALL, // This excludes allocation
|
||||
ALLOCATE,
|
||||
STAGE1,
|
||||
STAGE2
|
||||
|
@ -234,7 +245,7 @@ enum class BenchmarkStage {
|
|||
|
||||
const char* benchmark_stage_name(BenchmarkStage stage) {
|
||||
switch (stage) {
|
||||
case BenchmarkStage::ALL: return "All";
|
||||
case BenchmarkStage::ALL: return "All (Without Allocation)";
|
||||
case BenchmarkStage::ALLOCATE: return "Allocate";
|
||||
case BenchmarkStage::STAGE1: return "Stage 1";
|
||||
case BenchmarkStage::STAGE2: return "Stage 2";
|
||||
|
@ -253,8 +264,8 @@ struct benchmarker {
|
|||
// Statistics about the JSON file independent of its speed (amount of utf-8, structurals, etc.).
|
||||
// Loaded on first parse.
|
||||
json_stats* stats;
|
||||
// Speed and event summary for full parse (including allocation, stage 1 and stage 2)
|
||||
event_aggregate all_stages{};
|
||||
// Speed and event summary for full parse (stage 1 and stage 2, but *excluding* allocation)
|
||||
event_aggregate all_stages_without_allocation{};
|
||||
// Speed and event summary for stage 1
|
||||
event_aggregate stage1{};
|
||||
// Speed and event summary for stage 2
|
||||
|
@ -285,23 +296,24 @@ struct benchmarker {
|
|||
|
||||
const event_aggregate& operator[](BenchmarkStage stage) const {
|
||||
switch (stage) {
|
||||
case BenchmarkStage::ALL: return this->all_stages;
|
||||
case BenchmarkStage::ALL: return this->all_stages_without_allocation;
|
||||
case BenchmarkStage::STAGE1: return this->stage1;
|
||||
case BenchmarkStage::STAGE2: return this->stage2;
|
||||
case BenchmarkStage::ALLOCATE: return this->allocate_stage;
|
||||
default: exit_error("Unknown stage"); return this->all_stages;
|
||||
default: exit_error("Unknown stage"); return this->all_stages_without_allocation;
|
||||
}
|
||||
}
|
||||
|
||||
int iterations() const {
|
||||
return all_stages.iterations;
|
||||
return all_stages_without_allocation.iterations;
|
||||
}
|
||||
|
||||
really_inline void run_iteration(bool stage1_only, bool hotbuffers=false) {
|
||||
// Allocate dom::parser
|
||||
collector.start();
|
||||
dom::parser parser;
|
||||
error_code error = parser.allocate(json.size());
|
||||
// We always allocate at least 64KB. Smaller allocations may actually be slower under some systems.
|
||||
error_code error = parser.allocate(json.size() < 65536 ? 65536 : json.size());
|
||||
if (error) {
|
||||
exit_error(string("Unable to allocate_stage ") + to_string(json.size()) + " bytes for the JSON result: " + error_message(error));
|
||||
}
|
||||
|
@ -329,7 +341,7 @@ struct benchmarker {
|
|||
// Stage 2 (unified machine) and the rest
|
||||
|
||||
if (stage1_only) {
|
||||
all_stages << stage1_count;
|
||||
all_stages_without_allocation << stage1_count;
|
||||
} else {
|
||||
event_count stage2_count;
|
||||
collector.start();
|
||||
|
@ -339,7 +351,7 @@ struct benchmarker {
|
|||
}
|
||||
stage2_count = collector.end();
|
||||
stage2 << stage2_count;
|
||||
all_stages << allocate_count + stage1_count + stage2_count;
|
||||
all_stages_without_allocation << stage1_count + stage2_count;
|
||||
}
|
||||
// Calculate stats the first time we parse
|
||||
if (stats == NULL) {
|
||||
|
@ -386,7 +398,7 @@ struct benchmarker {
|
|||
prefix,
|
||||
"Speed",
|
||||
stage.elapsed_ns() / static_cast<double>(stats->blocks), // per block
|
||||
percent(stage.elapsed_sec(), all_stages.elapsed_sec()), // %
|
||||
percent(stage.elapsed_sec(), all_stages_without_allocation.elapsed_sec()), // %
|
||||
stage.elapsed_ns() / static_cast<double>(stats->bytes), // per byte
|
||||
stage.elapsed_ns() / static_cast<double>(stats->structurals), // per structural
|
||||
(static_cast<double>(json.size()) / 1000000000.0) / stage.elapsed_sec() // GB/s
|
||||
|
@ -397,7 +409,7 @@ struct benchmarker {
|
|||
prefix,
|
||||
"Cycles",
|
||||
stage.cycles() / static_cast<double>(stats->blocks),
|
||||
percent(stage.cycles(), all_stages.cycles()),
|
||||
percent(stage.cycles(), all_stages_without_allocation.cycles()),
|
||||
stage.cycles() / static_cast<double>(stats->bytes),
|
||||
stage.cycles() / static_cast<double>(stats->structurals),
|
||||
(stage.cycles() / stage.elapsed_sec()) / 1000000000.0
|
||||
|
@ -406,7 +418,7 @@ struct benchmarker {
|
|||
prefix,
|
||||
"Instructions",
|
||||
stage.instructions() / static_cast<double>(stats->blocks),
|
||||
percent(stage.instructions(), all_stages.instructions()),
|
||||
percent(stage.instructions(), all_stages_without_allocation.instructions()),
|
||||
stage.instructions() / static_cast<double>(stats->bytes),
|
||||
stage.instructions() / static_cast<double>(stats->structurals),
|
||||
stage.instructions() / static_cast<double>(stage.cycles())
|
||||
|
@ -417,9 +429,9 @@ struct benchmarker {
|
|||
prefix,
|
||||
"Misses",
|
||||
stage.branch_misses(),
|
||||
percent(stage.branch_misses(), all_stages.branch_misses()),
|
||||
percent(stage.branch_misses(), all_stages_without_allocation.branch_misses()),
|
||||
stage.cache_misses(),
|
||||
percent(stage.cache_misses(), all_stages.cache_misses()),
|
||||
percent(stage.cache_misses(), all_stages_without_allocation.cache_misses()),
|
||||
stage.cache_references()
|
||||
);
|
||||
}
|
||||
|
@ -456,14 +468,14 @@ struct benchmarker {
|
|||
allocate_stage.best.cycles() / static_cast<double>(json.size()),
|
||||
stage1.best.cycles() / static_cast<double>(json.size()),
|
||||
stage2.best.cycles() / static_cast<double>(json.size()),
|
||||
all_stages.best.cycles() / static_cast<double>(json.size()),
|
||||
gb / all_stages.best.elapsed_sec(),
|
||||
all_stages_without_allocation.best.cycles() / static_cast<double>(json.size()),
|
||||
gb / all_stages_without_allocation.best.elapsed_sec(),
|
||||
gb / stage1.best.elapsed_sec(),
|
||||
gb / stage2.best.elapsed_sec());
|
||||
} else {
|
||||
printf("\"%s\"\t\t\t\t\t%f\t%f\t%f\n",
|
||||
base,
|
||||
gb / all_stages.best.elapsed_sec(),
|
||||
gb / all_stages_without_allocation.best.elapsed_sec(),
|
||||
gb / stage1.best.elapsed_sec(),
|
||||
gb / stage2.best.elapsed_sec());
|
||||
}
|
||||
|
@ -490,10 +502,10 @@ struct benchmarker {
|
|||
stats->blocks_with_16_structurals_flipped, percent(stats->blocks_with_16_structurals_flipped, stats->blocks));
|
||||
}
|
||||
printf("\n");
|
||||
printf("All Stages\n");
|
||||
print_aggregate("| " , all_stages.best);
|
||||
printf("All Stages (excluding allocation)\n");
|
||||
print_aggregate("| " , all_stages_without_allocation.best);
|
||||
// frequently, allocation is a tiny fraction of the running time so we omit it
|
||||
if(allocate_stage.best.elapsed_sec() > 0.01 * all_stages.best.elapsed_sec()) {
|
||||
if(allocate_stage.best.elapsed_sec() > 0.01 * all_stages_without_allocation.best.elapsed_sec()) {
|
||||
printf("|- Allocation\n");
|
||||
print_aggregate("| ", allocate_stage.best);
|
||||
}
|
||||
|
@ -504,17 +516,16 @@ struct benchmarker {
|
|||
if (collector.has_events()) {
|
||||
double freq1 = (stage1.best.cycles() / stage1.best.elapsed_sec()) / 1000000000.0;
|
||||
double freq2 = (stage2.best.cycles() / stage2.best.elapsed_sec()) / 1000000000.0;
|
||||
double freqall = (all_stages.best.cycles() / all_stages.best.elapsed_sec()) / 1000000000.0;
|
||||
double freqall = (all_stages_without_allocation.best.cycles() / all_stages_without_allocation.best.elapsed_sec()) / 1000000000.0;
|
||||
double freqmin = min(freq1, freq2);
|
||||
double freqmax = max(freq1, freq2);
|
||||
if((freqall < 0.95 * freqmin) or (freqall > 1.05 * freqmax)) {
|
||||
printf("\nWarning: The processor frequency fluctuates in an expected way!!!\n"
|
||||
"Expect the overall speed not to match stage 1 and stage 2 speeds.\n"
|
||||
"Range for stage 1 and stage 2 : [%.3f GHz, %.3f GHz], overall: %.3f GHz.\n",
|
||||
freqmin, freqmax, freqall);
|
||||
}
|
||||
}
|
||||
printf("\n%.1f documents parsed per second (best)\n", 1.0/static_cast<double>(all_stages.best.elapsed_sec()));
|
||||
printf("\n%.1f documents parsed per second (best)\n", 1.0/static_cast<double>(all_stages_without_allocation.best.elapsed_sec()));
|
||||
}
|
||||
}
|
||||
};
|
||||
|
|
|
@ -93,6 +93,10 @@ int main(int argc, char *argv[]) {
|
|||
std::cout << std::endl;
|
||||
}
|
||||
char *buffer = simdjson::internal::allocate_padded_buffer(p.size() + 1);
|
||||
if(buffer == nullptr) {
|
||||
std::cerr << "Out of memory!" << std::endl;
|
||||
abort();
|
||||
}
|
||||
memcpy(buffer, p.data(), p.size());
|
||||
buffer[p.size()] = '\0';
|
||||
|
||||
|
@ -139,6 +143,10 @@ int main(int argc, char *argv[]) {
|
|||
!just_data);
|
||||
|
||||
char *mini_buffer = simdjson::internal::allocate_padded_buffer(p.size() + 1);
|
||||
if(mini_buffer == nullptr) {
|
||||
std::cerr << "Out of memory" << std::endl;
|
||||
abort();
|
||||
}
|
||||
size_t minisize;
|
||||
auto minierror = minify(p.data(), p.size(),mini_buffer, minisize);
|
||||
if (!minierror) { std::cerr << minierror << std::endl; exit(1); }
|
||||
|
|
|
@ -391,12 +391,7 @@ private:
|
|||
/**
|
||||
* The loaded buffer (reused each time load() is called)
|
||||
*/
|
||||
#if defined(_MSC_VER) && _MSC_VER < 1910
|
||||
// older versions of Visual Studio lack proper support for unique_ptr.
|
||||
std::unique_ptr<char[]> loaded_bytes;
|
||||
#else
|
||||
std::unique_ptr<char[], decltype(&aligned_free_char)> loaded_bytes;
|
||||
#endif
|
||||
|
||||
/** Capacity of loaded_bytes buffer. */
|
||||
size_t _loaded_bytes_capacity{0};
|
||||
|
|
|
@ -12,26 +12,21 @@
|
|||
namespace simdjson {
|
||||
namespace internal {
|
||||
|
||||
// low-level function to allocate memory with padding so we can read past the
|
||||
// "length" bytes safely. if you must provide a pointer to some data, create it
|
||||
// with this function: length is the max. size in bytes of the string caller is
|
||||
// responsible to free the memory (free(...))
|
||||
// The allocate_padded_buffer function is a low-level function to allocate memory
|
||||
// with padding so we can read past the "length" bytes safely. It is used by
|
||||
// the padded_string class automatically. It returns nullptr in case
|
||||
// of error: the caller should check for a null pointer.
|
||||
// The length parameter is the maximum size in bytes of the string.
|
||||
// The caller is responsible for freeing the memory (e.g., delete[] (...)).
|
||||
inline char *allocate_padded_buffer(size_t length) noexcept {
|
||||
// we could do a simple malloc
|
||||
// return (char *) malloc(length + SIMDJSON_PADDING);
|
||||
// However, we might as well align to cache lines...
|
||||
size_t totalpaddedlength = length + SIMDJSON_PADDING;
|
||||
#if defined(_MSC_VER) && _MSC_VER < 1910
|
||||
// For legacy Visual Studio 2015 since it does not have proper C++11 support
|
||||
char *padded_buffer = new[totalpaddedlength];
|
||||
#else
|
||||
char *padded_buffer = aligned_malloc_char(64, totalpaddedlength);
|
||||
#endif
|
||||
#ifndef NDEBUG
|
||||
char *padded_buffer = new (std::nothrow) char[totalpaddedlength];
|
||||
if (padded_buffer == nullptr) {
|
||||
return nullptr;
|
||||
}
|
||||
#endif // NDEBUG
|
||||
// We write zeroes in the padded region to avoid having uninitialized
|
||||
// garbage. If nothing else, garbage getting read might trigger a
|
||||
// warning in a memory checker.
|
||||
memset(padded_buffer + length, 0, totalpaddedlength - length);
|
||||
return padded_buffer;
|
||||
} // allocate_padded_buffer()
|
||||
|
@ -74,7 +69,7 @@ inline padded_string::padded_string(padded_string &&o) noexcept
|
|||
}
|
||||
|
||||
inline padded_string &padded_string::operator=(padded_string &&o) noexcept {
|
||||
aligned_free_char(data_ptr);
|
||||
delete[] data_ptr;
|
||||
data_ptr = o.data_ptr;
|
||||
viable_size = o.viable_size;
|
||||
o.data_ptr = nullptr; // we take ownership
|
||||
|
@ -92,7 +87,7 @@ inline void padded_string::swap(padded_string &o) noexcept {
|
|||
}
|
||||
|
||||
inline padded_string::~padded_string() noexcept {
|
||||
aligned_free_char(data_ptr);
|
||||
delete[] data_ptr;
|
||||
}
|
||||
|
||||
inline size_t padded_string::size() const noexcept { return viable_size; }
|
||||
|
|
|
@ -15,18 +15,10 @@ namespace dom {
|
|||
//
|
||||
// parser inline implementation
|
||||
//
|
||||
#if defined(_MSC_VER) && _MSC_VER < 1910
|
||||
// older versions of Visual Studio lack proper support for unique_ptr.
|
||||
really_inline parser::parser(size_t max_capacity) noexcept
|
||||
: _max_capacity{max_capacity},
|
||||
loaded_bytes(nullptr) {
|
||||
}
|
||||
#else
|
||||
really_inline parser::parser(size_t max_capacity) noexcept
|
||||
: _max_capacity{max_capacity},
|
||||
loaded_bytes(nullptr, &aligned_free_char) {
|
||||
}
|
||||
#endif
|
||||
really_inline parser::parser(parser &&other) noexcept = default;
|
||||
really_inline parser &parser::operator=(parser &&other) noexcept = default;
|
||||
|
||||
|
@ -101,19 +93,14 @@ inline simdjson_result<document_stream> parser::load_many(const std::string &pat
|
|||
inline simdjson_result<element> parser::parse(const uint8_t *buf, size_t len, bool realloc_if_needed) & noexcept {
|
||||
error_code _error = ensure_capacity(len);
|
||||
if (_error) { return _error; }
|
||||
std::unique_ptr<uint8_t[]> tmp_buf;
|
||||
|
||||
if (realloc_if_needed) {
|
||||
const uint8_t *tmp_buf = buf;
|
||||
buf = (uint8_t *)internal::allocate_padded_buffer(len);
|
||||
if (buf == nullptr)
|
||||
return MEMALLOC;
|
||||
memcpy((void *)buf, tmp_buf, len);
|
||||
}
|
||||
|
||||
_error = implementation->parse(buf, len, doc);
|
||||
if (realloc_if_needed) {
|
||||
aligned_free((void *)buf); // must free before we exit
|
||||
tmp_buf.reset((uint8_t *)internal::allocate_padded_buffer(len));
|
||||
if (tmp_buf.get() == nullptr) { return MEMALLOC; }
|
||||
memcpy((void *)tmp_buf.get(), buf, len);
|
||||
}
|
||||
_error = implementation->parse(realloc_if_needed ? tmp_buf.get() : buf, len, doc);
|
||||
if (_error) { return _error; }
|
||||
|
||||
return doc.root();
|
||||
|
|
|
@ -144,10 +144,12 @@ inline simdjson::padded_string operator "" _padded(const char *str, size_t len)
|
|||
namespace simdjson {
|
||||
namespace internal {
|
||||
|
||||
// low-level function to allocate memory with padding so we can read past the
|
||||
// "length" bytes safely. if you must provide a pointer to some data, create it
|
||||
// with this function: length is the max. size in bytes of the string caller is
|
||||
// responsible to free the memory (free(...))
|
||||
// The allocate_padded_buffer function is a low-level function to allocate memory
|
||||
// with padding so we can read past the "length" bytes safely. It is used by
|
||||
// the padded_string class automatically. It returns nullptr in case
|
||||
// of error: the caller should check for a null pointer.
|
||||
// The length parameter is the maximum size in bytes of the string.
|
||||
// The caller is responsible for freeing the memory (e.g., delete[] (...)).
|
||||
inline char *allocate_padded_buffer(size_t length) noexcept;
|
||||
|
||||
} // namespace internal
|
||||
|
|
|
@ -174,48 +174,6 @@ use a 64-bit target such as x64 or 64-bit ARM.")
|
|||
#define simdjson_strncasecmp strncasecmp
|
||||
#endif
|
||||
|
||||
namespace simdjson {
|
||||
/** @private portable version of posix_memalign */
|
||||
static inline void *aligned_malloc(size_t alignment, size_t size) {
|
||||
void *p;
|
||||
#ifdef SIMDJSON_VISUAL_STUDIO
|
||||
p = _aligned_malloc(size, alignment);
|
||||
#elif defined(__MINGW32__) || defined(__MINGW64__)
|
||||
p = __mingw_aligned_malloc(size, alignment);
|
||||
#else
|
||||
// somehow, if this is used before including "x86intrin.h", it creates an
|
||||
// implicit defined warning.
|
||||
if (posix_memalign(&p, alignment, size) != 0) {
|
||||
return nullptr;
|
||||
}
|
||||
#endif
|
||||
return p;
|
||||
}
|
||||
|
||||
/** @private */
|
||||
static inline char *aligned_malloc_char(size_t alignment, size_t size) {
|
||||
return (char *)aligned_malloc(alignment, size);
|
||||
}
|
||||
|
||||
/** @private */
|
||||
static inline void aligned_free(void *mem_block) {
|
||||
if (mem_block == nullptr) {
|
||||
return;
|
||||
}
|
||||
#ifdef SIMDJSON_VISUAL_STUDIO
|
||||
_aligned_free(mem_block);
|
||||
#elif defined(__MINGW32__) || defined(__MINGW64__)
|
||||
__mingw_aligned_free(mem_block);
|
||||
#else
|
||||
free(mem_block);
|
||||
#endif
|
||||
}
|
||||
|
||||
/** @private */
|
||||
static inline void aligned_free_char(char *mem_block) {
|
||||
aligned_free((void *)mem_block);
|
||||
}
|
||||
|
||||
#ifdef NDEBUG
|
||||
|
||||
#ifdef SIMDJSON_VISUAL_STUDIO
|
||||
|
@ -233,5 +191,4 @@ static inline void aligned_free_char(char *mem_block) {
|
|||
|
||||
#endif
|
||||
|
||||
} // namespace simdjson
|
||||
#endif // SIMDJSON_PORTABILITY_H
|
||||
|
|
Loading…
Reference in New Issue