Merge branch 'master' of github.com:lemire/simdjson

This commit is contained in:
Daniel Lemire 2018-11-09 15:16:25 -05:00
commit 0e5b939568
6 changed files with 44 additions and 46 deletions

View File

@ -4,4 +4,4 @@ To simplify the engineering, we make some assumptions that can be lifted with so
- We assume AVX2 support which is available in all recent mainstream x86 processors produced by AMD and Intel. No support for non-x86 processors is included.
- We only support GNU GCC and LLVM Clang at this time. There is no support for Microsoft Visual Studio at this time.
- This library cannot parse JSON documents of size 16MB or more.
- We expect the input memory pointer to be 256-bit aligned and to be padded (e.g., with spaces) so that it can be read entirely in blocks of 256 bits.
- We expect the input memory pointer to be 256-bit aligned and to be padded (e.g., with spaces) so that it can be read entirely in blocks of 256 bits. In practice, this means that users should allocate the memory where the JSON bytes are located using the `allocate_aligned_buffer` function or the equivalent.

View File

@ -77,13 +77,13 @@ uint64_t global_rdtsc_overhead = (uint64_t)UINT64_MAX;
* test, repeat is the number of times we should repeat and size is the
* number of operations represented by test.
*/
#define BEST_TIME(test, expected, pre, repeat, size, verbose) \
#define BEST_TIME(name, test, expected, pre, repeat, size, verbose) \
do { \
if (global_rdtsc_overhead == UINT64_MAX) { \
RDTSC_SET_OVERHEAD(rdtsc_overhead_func(1), repeat); \
} \
if (verbose) \
printf("%-40s\t: ", #test); \
printf("%-40s\t: ", name); \
fflush(NULL); \
uint64_t cycles_start, cycles_final, cycles_diff; \
uint64_t min_diff = (uint64_t)-1; \
@ -117,13 +117,13 @@ uint64_t global_rdtsc_overhead = (uint64_t)UINT64_MAX;
} while (0)
// like BEST_TIME, but no check
#define BEST_TIME_NOCHECK(test, pre, repeat, size, verbose) \
#define BEST_TIME_NOCHECK(name, test, pre, repeat, size, verbose) \
do { \
if (global_rdtsc_overhead == UINT64_MAX) { \
RDTSC_SET_OVERHEAD(rdtsc_overhead_func(1), repeat); \
} \
if (verbose) \
printf("%-40s\t: ", #test); \
printf("%-40s\t: ", name); \
fflush(NULL); \
uint64_t cycles_start, cycles_final, cycles_diff; \
uint64_t min_diff = (uint64_t)-1; \
@ -160,7 +160,7 @@ uint64_t global_rdtsc_overhead = (uint64_t)UINT64_MAX;
RDTSC_SET_OVERHEAD(rdtsc_overhead_func(1), repeat); \
} \
if (verbose) \
printf("%-60s\t: ", #test); \
printf("%-60s\t:\n", #test); \
fflush(NULL); \
uint64_t cycles_start, cycles_final, cycles_diff; \
uint64_t min_diff = (uint64_t)-1; \

View File

@ -64,7 +64,7 @@ int main(int argc, char *argv[]) {
std::cout << p.second << " B ";
std::cout << std::endl;
}
char *buffer = (char *)malloc(p.second + 1);
char *buffer = allocate_aligned_buffer(p.second + 1);
memcpy(buffer, p.first, p.second);
buffer[p.second] = '\0';
@ -75,8 +75,8 @@ int main(int argc, char *argv[]) {
if (verbose)
std::cout << "input length is " << p.second << " stringified length is "
<< strlength << std::endl;
BEST_TIME_NOCHECK(rapidstringme((char *)p.first), , repeat, volume, true);
BEST_TIME_NOCHECK(rapidstringmeInsitu((char *)buffer),
BEST_TIME_NOCHECK("despacing with RapidJSON", rapidstringme((char *)p.first), , repeat, volume, true);
BEST_TIME_NOCHECK("despacing with RapidJSON Insitu", rapidstringmeInsitu((char *)buffer),
memcpy(buffer, p.first, p.second), repeat, volume, true);
memcpy(buffer, p.first, p.second);
@ -86,49 +86,42 @@ int main(int argc, char *argv[]) {
std::cout << "jsonminify length is " << outlength << std::endl;
uint8_t *cbuffer = (uint8_t *)buffer;
BEST_TIME(jsonminify(cbuffer, p.second, cbuffer), outlength,
BEST_TIME("jsonminify", jsonminify(cbuffer, p.second, cbuffer), outlength,
memcpy(buffer, p.first, p.second), repeat, volume, true);
printf("minisize = %zu, original size = %zu (minified down to %.2f percent of original) \n", outlength, p.second, outlength * 100.0 / p.second);
/***
* Is it worth it to minify before parsing?
***/
rapidjson::Document d;
printf("\n");
printf("parsing with RapidJSON before despacing:\n");
BEST_TIME(d.ParseInsitu(buffer).HasParseError(), false,
BEST_TIME("RapidJSON Insitu orig", d.ParseInsitu(buffer).HasParseError(), false,
memcpy(buffer, p.first, p.second), repeat, volume, true);
printf("parsing with RapidJSON after despacing:\n");
char *minibuffer = (char *)malloc(p.second + 1);
char *minibuffer = allocate_aligned_buffer(p.second + 1);
size_t minisize = jsonminify((const uint8_t *)p.first, p.second, (uint8_t*) minibuffer);
minibuffer[minisize] = '\0';
BEST_TIME(d.ParseInsitu(buffer).HasParseError(), false,
BEST_TIME("RapidJSON Insitu despaced", d.ParseInsitu(buffer).HasParseError(), false,
memcpy(buffer, minibuffer, p.second),
repeat, volume, true);
printf("\n");
size_t astbuffersize = p.second * 2;
size_t * ast_buffer = (size_t *) malloc(astbuffersize * sizeof(size_t));
printf("parsing with sajson before despacing:\n");
BEST_TIME(sajson::parse(sajson::bounded_allocation(ast_buffer, astbuffersize), sajson::mutable_string_view(p.second, buffer)).is_valid(), true, memcpy(buffer, p.first, p.second), repeat, volume, true);
BEST_TIME("sajson orig", sajson::parse(sajson::bounded_allocation(ast_buffer, astbuffersize), sajson::mutable_string_view(p.second, buffer)).is_valid(), true, memcpy(buffer, p.first, p.second), repeat, volume, true);
BEST_TIME("sajson despaced", sajson::parse(sajson::bounded_allocation(ast_buffer, astbuffersize), sajson::mutable_string_view(minisize, buffer)).is_valid(), true, memcpy(buffer, minibuffer, p.second), repeat, volume, true);
printf("parsing with sajson after despacing:\n");
BEST_TIME(sajson::parse(sajson::bounded_allocation(ast_buffer, astbuffersize), sajson::mutable_string_view(minisize, buffer)).is_valid(), true, memcpy(buffer, minibuffer, p.second), repeat, volume, true);
printf("parsing before despacing:\n");
ParsedJson *pj_ptr = allocate_ParsedJson(p.second);
ParsedJson &pj(*pj_ptr);
BEST_TIME(json_parse(p.first, p.second, pj), true, , repeat, volume, true);
BEST_TIME("json_parse orig", json_parse((const u8*)buffer, p.second, pj), true, memcpy(buffer, p.first, p.second), repeat, volume, true);
printf("parsing after despacing:\n");
ParsedJson *pj_ptr2 = allocate_ParsedJson(minisize);
ParsedJson *pj_ptr2 = allocate_ParsedJson(p.second);
ParsedJson &pj2(*pj_ptr2);
BEST_TIME(json_parse((const u8*)minibuffer, minisize, pj2), true, , repeat, volume, true);
BEST_TIME("json_parse despaced", json_parse((const u8*)buffer, minisize, pj2), true, memcpy(buffer, minibuffer, p.second), repeat, volume, true);
free(buffer);
free(p.first);

View File

@ -49,7 +49,7 @@ int main(int argc, char *argv[]) {
int repeat = 10;
int volume = p.second;
BEST_TIME(json_parse(p.first, p.second, pj), true, , repeat, volume, true);
BEST_TIME("json_parse", json_parse(p.first, p.second, pj), true, , repeat, volume, true);
rapidjson::Document d;
@ -57,22 +57,18 @@ int main(int argc, char *argv[]) {
memcpy(buffer, p.first, p.second);
buffer[p.second] = '\0';
BEST_TIME(
BEST_TIME("RapidJSON",
d.Parse<kParseValidateEncodingFlag>((const char *)buffer).HasParseError(),
false, memcpy(buffer, p.first, p.second), repeat, volume, true);
BEST_TIME(d.Parse((const char *)buffer).HasParseError(), false,
memcpy(buffer, p.first, p.second), repeat, volume, true);
BEST_TIME(d.ParseInsitu<kParseValidateEncodingFlag>(buffer).HasParseError(), false,
memcpy(buffer, p.first, p.second), repeat, volume, true);
BEST_TIME(d.ParseInsitu(buffer).HasParseError(), false,
BEST_TIME("RapidJSON Insitu", d.ParseInsitu<kParseValidateEncodingFlag>(buffer).HasParseError(), false,
memcpy(buffer, p.first, p.second), repeat, volume, true);
BEST_TIME(sajson::parse(sajson::dynamic_allocation(), sajson::mutable_string_view(p.second, buffer)).is_valid(), true, memcpy(buffer, p.first, p.second), repeat, volume, true);
BEST_TIME("sajson (dynamic mem)", sajson::parse(sajson::dynamic_allocation(), sajson::mutable_string_view(p.second, buffer)).is_valid(), true, memcpy(buffer, p.first, p.second), repeat, volume, true);
size_t astbuffersize = p.second;
size_t * ast_buffer = (size_t *) malloc(astbuffersize * sizeof(size_t));
BEST_TIME(sajson::parse(sajson::bounded_allocation(ast_buffer, astbuffersize), sajson::mutable_string_view(p.second, buffer)).is_valid(), true, memcpy(buffer, p.first, p.second), repeat, volume, true);
BEST_TIME("sajson (static alloc)", sajson::parse(sajson::bounded_allocation(ast_buffer, astbuffersize), sajson::mutable_string_view(p.second, buffer)).is_valid(), true, memcpy(buffer, p.first, p.second), repeat, volume, true);
free(buffer);
free(p.first);

View File

@ -9,6 +9,11 @@
#include "common_defs.h"
// if you must provide a pointer to some data, create it with this function:
// length is the max. size in bytes of the string
// the caller is responsible for freeing the memory (free(...))
char * allocate_aligned_buffer(size_t length);
// load a file in memory...
// get a corpus; pad out to cache line so we can always use SIMD
// throws exceptions in case of failure

View File

@ -3,12 +3,7 @@
#define AVXOVERALLOCATE
std::pair<u8 *, size_t> get_corpus(std::string filename) {
std::ifstream is(filename, std::ios::binary);
if (is) {
std::stringstream buffer;
buffer << is.rdbuf();
size_t length = buffer.str().size(); // +1 for null
char * allocate_aligned_buffer(size_t length) {
char *aligned_buffer;
size_t paddedlength = ROUNDUP_N(length, 64);
#ifdef AVXOVERALLOCATE
@ -22,10 +17,19 @@ std::pair<u8 *, size_t> get_corpus(std::string filename) {
throw std::runtime_error("Could not allocate sufficient memory");
};
#endif
//memset(aligned_buffer, 0x20, ROUNDUP_N(length + 1, 64));
memcpy(aligned_buffer, buffer.str().c_str(), length);
memset(aligned_buffer + length, 0x20, paddedlength - length);
aligned_buffer[paddedlength] = '\0';
memset(aligned_buffer + length, 0x20, paddedlength - length);
return aligned_buffer;
}
std::pair<u8 *, size_t> get_corpus(std::string filename) {
std::ifstream is(filename, std::ios::binary);
if (is) {
std::stringstream buffer;
buffer << is.rdbuf();
size_t length = buffer.str().size(); // +1 for null
u8* aligned_buffer = (u8 *)allocate_aligned_buffer(length);
memcpy(aligned_buffer, buffer.str().c_str(), length);
is.close();
return std::make_pair((u8 *)aligned_buffer, length);
}