More cleaning.
This commit is contained in:
parent
0e4804137c
commit
c11eefca32
|
@ -12,11 +12,11 @@ Goal: Speed up the parsing of JSON per se.
|
||||||
/...
|
/...
|
||||||
|
|
||||||
const char * filename = ... //
|
const char * filename = ... //
|
||||||
simdjsonstring p = get_corpus(filename);
|
std::string_view p = get_corpus(filename);
|
||||||
ParsedJson pj;
|
ParsedJson pj;
|
||||||
size_t maxdepth = 1024; // support documents have nesting "depth" up to 1024
|
size_t maxdepth = 1024; // support documents have nesting "depth" up to 1024
|
||||||
pj.allocateCapacity(p.size(), maxdepth); // allocate memory for parsing up to p.size() bytes
|
pj.allocateCapacity(p.size(), maxdepth); // allocate memory for parsing up to p.size() bytes
|
||||||
bool is_ok = json_parse(p.first, p.second, pj); // do the parsing, return false on error
|
bool is_ok = json_parse(p, pj); // do the parsing, return false on error
|
||||||
// parsing is done!
|
// parsing is done!
|
||||||
// pj can be reused with other json_parse calls.
|
// pj can be reused with other json_parse calls.
|
||||||
```
|
```
|
||||||
|
|
|
@ -61,7 +61,7 @@ int main(int argc, char *argv[]) {
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
const char * filename = argv[optind];
|
const char * filename = argv[optind];
|
||||||
simdjsonstring p;
|
std::string_view p;
|
||||||
try {
|
try {
|
||||||
p = get_corpus(filename);
|
p = get_corpus(filename);
|
||||||
} catch (const std::exception& e) { // caught by reference to base
|
} catch (const std::exception& e) { // caught by reference to base
|
||||||
|
@ -79,20 +79,20 @@ int main(int argc, char *argv[]) {
|
||||||
std::cout << std::endl;
|
std::cout << std::endl;
|
||||||
}
|
}
|
||||||
char *buffer = allocate_aligned_buffer(p.size() + 1);
|
char *buffer = allocate_aligned_buffer(p.size() + 1);
|
||||||
memcpy(buffer, p.c_str(), p.size());
|
memcpy(buffer, p.data(), p.size());
|
||||||
buffer[p.size()] = '\0';
|
buffer[p.size()] = '\0';
|
||||||
|
|
||||||
int repeat = 10;
|
int repeat = 10;
|
||||||
int volume = p.size();
|
int volume = p.size();
|
||||||
|
|
||||||
size_t strlength = rapidstringme((char *)p.c_str()).size();
|
size_t strlength = rapidstringme((char *)p.data()).size();
|
||||||
if (verbose)
|
if (verbose)
|
||||||
std::cout << "input length is " << p.size() << " stringified length is "
|
std::cout << "input length is " << p.size() << " stringified length is "
|
||||||
<< strlength << std::endl;
|
<< strlength << std::endl;
|
||||||
BEST_TIME_NOCHECK("despacing with RapidJSON", rapidstringme((char *)p.c_str()), , repeat, volume, true);
|
BEST_TIME_NOCHECK("despacing with RapidJSON", rapidstringme((char *)p.data()), , repeat, volume, true);
|
||||||
BEST_TIME_NOCHECK("despacing with RapidJSON Insitu", rapidstringmeInsitu((char *)buffer),
|
BEST_TIME_NOCHECK("despacing with RapidJSON Insitu", rapidstringmeInsitu((char *)buffer),
|
||||||
memcpy(buffer, p.c_str(), p.size()), repeat, volume, true);
|
memcpy(buffer, p.data(), p.size()), repeat, volume, true);
|
||||||
memcpy(buffer, p.c_str(), p.size());
|
memcpy(buffer, p.data(), p.size());
|
||||||
|
|
||||||
size_t outlength =
|
size_t outlength =
|
||||||
jsonminify((const uint8_t *)buffer, p.size(), (uint8_t *)buffer);
|
jsonminify((const uint8_t *)buffer, p.size(), (uint8_t *)buffer);
|
||||||
|
@ -101,7 +101,7 @@ int main(int argc, char *argv[]) {
|
||||||
|
|
||||||
uint8_t *cbuffer = (uint8_t *)buffer;
|
uint8_t *cbuffer = (uint8_t *)buffer;
|
||||||
BEST_TIME("jsonminify", jsonminify(cbuffer, p.size(), cbuffer), outlength,
|
BEST_TIME("jsonminify", jsonminify(cbuffer, p.size(), cbuffer), outlength,
|
||||||
memcpy(buffer, p.c_str(), p.size()), repeat, volume, true);
|
memcpy(buffer, p.data(), p.size()), repeat, volume, true);
|
||||||
printf("minisize = %zu, original size = %zu (minified down to %.2f percent of original) \n", outlength, p.size(), outlength * 100.0 / p.size());
|
printf("minisize = %zu, original size = %zu (minified down to %.2f percent of original) \n", outlength, p.size(), outlength * 100.0 / p.size());
|
||||||
|
|
||||||
/***
|
/***
|
||||||
|
@ -109,10 +109,10 @@ int main(int argc, char *argv[]) {
|
||||||
***/
|
***/
|
||||||
rapidjson::Document d;
|
rapidjson::Document d;
|
||||||
BEST_TIME("RapidJSON Insitu orig", d.ParseInsitu(buffer).HasParseError(), false,
|
BEST_TIME("RapidJSON Insitu orig", d.ParseInsitu(buffer).HasParseError(), false,
|
||||||
memcpy(buffer, p.c_str(), p.size()), repeat, volume, true);
|
memcpy(buffer, p.data(), p.size()), repeat, volume, true);
|
||||||
|
|
||||||
char *minibuffer = allocate_aligned_buffer(p.size() + 1);
|
char *minibuffer = allocate_aligned_buffer(p.size() + 1);
|
||||||
size_t minisize = jsonminify((const uint8_t *)p.c_str(), p.size(), (uint8_t*) minibuffer);
|
size_t minisize = jsonminify((const uint8_t *)p.data(), p.size(), (uint8_t*) minibuffer);
|
||||||
minibuffer[minisize] = '\0';
|
minibuffer[minisize] = '\0';
|
||||||
|
|
||||||
BEST_TIME("RapidJSON Insitu despaced", d.ParseInsitu(buffer).HasParseError(), false,
|
BEST_TIME("RapidJSON Insitu despaced", d.ParseInsitu(buffer).HasParseError(), false,
|
||||||
|
@ -122,14 +122,14 @@ int main(int argc, char *argv[]) {
|
||||||
size_t astbuffersize = p.size() * 2;
|
size_t astbuffersize = p.size() * 2;
|
||||||
size_t * ast_buffer = (size_t *) malloc(astbuffersize * sizeof(size_t));
|
size_t * ast_buffer = (size_t *) malloc(astbuffersize * sizeof(size_t));
|
||||||
|
|
||||||
BEST_TIME("sajson orig", sajson::parse(sajson::bounded_allocation(ast_buffer, astbuffersize), sajson::mutable_string_view(p.size(), buffer)).is_valid(), true, memcpy(buffer, p.c_str(), p.size()), repeat, volume, true);
|
BEST_TIME("sajson orig", sajson::parse(sajson::bounded_allocation(ast_buffer, astbuffersize), sajson::mutable_string_view(p.size(), buffer)).is_valid(), true, memcpy(buffer, p.data(), p.size()), repeat, volume, true);
|
||||||
|
|
||||||
|
|
||||||
BEST_TIME("sajson despaced", sajson::parse(sajson::bounded_allocation(ast_buffer, astbuffersize), sajson::mutable_string_view(minisize, buffer)).is_valid(), true, memcpy(buffer, minibuffer, p.size()), repeat, volume, true);
|
BEST_TIME("sajson despaced", sajson::parse(sajson::bounded_allocation(ast_buffer, astbuffersize), sajson::mutable_string_view(minisize, buffer)).is_valid(), true, memcpy(buffer, minibuffer, p.size()), repeat, volume, true);
|
||||||
|
|
||||||
ParsedJson pj;
|
ParsedJson pj;
|
||||||
pj.allocateCapacity(p.size(), 1024);
|
pj.allocateCapacity(p.size(), 1024);
|
||||||
BEST_TIME("json_parse orig", json_parse((const u8*)buffer, p.size(), pj), true, memcpy(buffer, p.c_str(), p.size()), repeat, volume, true);
|
BEST_TIME("json_parse orig", json_parse((const u8*)buffer, p.size(), pj), true, memcpy(buffer, p.data(), p.size()), repeat, volume, true);
|
||||||
|
|
||||||
ParsedJson pj2;
|
ParsedJson pj2;
|
||||||
pj2.allocateCapacity(p.size(), 1024);
|
pj2.allocateCapacity(p.size(), 1024);
|
||||||
|
|
|
@ -65,7 +65,7 @@ int main(int argc, char *argv[]) {
|
||||||
cerr << "warning: ignoring everything after " << argv[optind + 1] << endl;
|
cerr << "warning: ignoring everything after " << argv[optind + 1] << endl;
|
||||||
}
|
}
|
||||||
if(verbose) cout << "[verbose] loading " << filename << endl;
|
if(verbose) cout << "[verbose] loading " << filename << endl;
|
||||||
simdjsonstring p;
|
std::string_view p;
|
||||||
try {
|
try {
|
||||||
p = get_corpus(filename);
|
p = get_corpus(filename);
|
||||||
} catch (const std::exception& e) { // caught by reference to base
|
} catch (const std::exception& e) { // caught by reference to base
|
||||||
|
@ -118,7 +118,7 @@ int main(int argc, char *argv[]) {
|
||||||
#ifndef SQUASH_COUNTERS
|
#ifndef SQUASH_COUNTERS
|
||||||
unified.start();
|
unified.start();
|
||||||
#endif
|
#endif
|
||||||
isok = find_structural_bits(p.c_str(), p.size(), pj);
|
isok = find_structural_bits(p.data(), p.size(), pj);
|
||||||
#ifndef SQUASH_COUNTERS
|
#ifndef SQUASH_COUNTERS
|
||||||
unified.end(results);
|
unified.end(results);
|
||||||
cy1 += results[0];
|
cy1 += results[0];
|
||||||
|
@ -147,7 +147,7 @@ int main(int argc, char *argv[]) {
|
||||||
unified.start();
|
unified.start();
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
isok = isok && unified_machine(p.c_str(), p.size(), pj);
|
isok = isok && unified_machine(p.data(), p.size(), pj);
|
||||||
#ifndef SQUASH_COUNTERS
|
#ifndef SQUASH_COUNTERS
|
||||||
unified.end(results);
|
unified.end(results);
|
||||||
cy3 += results[0];
|
cy3 += results[0];
|
||||||
|
|
|
@ -61,7 +61,7 @@ int main(int argc, char *argv[]) {
|
||||||
if(optind + 1 < argc) {
|
if(optind + 1 < argc) {
|
||||||
cerr << "warning: ignoring everything after " << argv[optind + 1] << endl;
|
cerr << "warning: ignoring everything after " << argv[optind + 1] << endl;
|
||||||
}
|
}
|
||||||
simdjsonstring p;
|
std::string_view p;
|
||||||
try {
|
try {
|
||||||
p = get_corpus(filename);
|
p = get_corpus(filename);
|
||||||
} catch (const std::exception& e) { // caught by reference to base
|
} catch (const std::exception& e) { // caught by reference to base
|
||||||
|
@ -93,32 +93,32 @@ int main(int argc, char *argv[]) {
|
||||||
rapidjson::Document d;
|
rapidjson::Document d;
|
||||||
|
|
||||||
char *buffer = (char *)malloc(p.size() + 1);
|
char *buffer = (char *)malloc(p.size() + 1);
|
||||||
memcpy(buffer, p.c_str(), p.size());
|
memcpy(buffer, p.data(), p.size());
|
||||||
buffer[p.size()] = '\0';
|
buffer[p.size()] = '\0';
|
||||||
|
|
||||||
BEST_TIME("RapidJSON",
|
BEST_TIME("RapidJSON",
|
||||||
d.Parse<kParseValidateEncodingFlag>((const char *)buffer).HasParseError(),
|
d.Parse<kParseValidateEncodingFlag>((const char *)buffer).HasParseError(),
|
||||||
false, memcpy(buffer, p.c_str(), p.size()), repeat, volume, true);
|
false, memcpy(buffer, p.data(), p.size()), repeat, volume, true);
|
||||||
BEST_TIME("RapidJSON Insitu", d.ParseInsitu<kParseValidateEncodingFlag>(buffer).HasParseError(), false,
|
BEST_TIME("RapidJSON Insitu", d.ParseInsitu<kParseValidateEncodingFlag>(buffer).HasParseError(), false,
|
||||||
memcpy(buffer, p.c_str(), p.size()), repeat, volume, true);
|
memcpy(buffer, p.data(), p.size()), repeat, volume, true);
|
||||||
|
|
||||||
BEST_TIME("sajson (dynamic mem)", sajson::parse(sajson::dynamic_allocation(), sajson::mutable_string_view(p.size(), buffer)).is_valid(), true, memcpy(buffer, p.c_str(), p.size()), repeat, volume, true);
|
BEST_TIME("sajson (dynamic mem)", sajson::parse(sajson::dynamic_allocation(), sajson::mutable_string_view(p.size(), buffer)).is_valid(), true, memcpy(buffer, p.data(), p.size()), repeat, volume, true);
|
||||||
|
|
||||||
size_t astbuffersize = p.size();
|
size_t astbuffersize = p.size();
|
||||||
size_t * ast_buffer = (size_t *) malloc(astbuffersize * sizeof(size_t));
|
size_t * ast_buffer = (size_t *) malloc(astbuffersize * sizeof(size_t));
|
||||||
|
|
||||||
BEST_TIME("sajson (static alloc)", sajson::parse(sajson::bounded_allocation(ast_buffer, astbuffersize), sajson::mutable_string_view(p.size(), buffer)).is_valid(), true, memcpy(buffer, p.c_str(), p.size()), repeat, volume, true);
|
BEST_TIME("sajson (static alloc)", sajson::parse(sajson::bounded_allocation(ast_buffer, astbuffersize), sajson::mutable_string_view(p.size(), buffer)).is_valid(), true, memcpy(buffer, p.data(), p.size()), repeat, volume, true);
|
||||||
std::string json11err;
|
std::string json11err;
|
||||||
if(all) BEST_TIME("dropbox (json11) ", (( json11::Json::parse(buffer,json11err).is_null() ) || ( ! json11err.empty() )), false, memcpy(buffer, p.c_str(), p.size()), repeat, volume, true);
|
if(all) BEST_TIME("dropbox (json11) ", (( json11::Json::parse(buffer,json11err).is_null() ) || ( ! json11err.empty() )), false, memcpy(buffer, p.data(), p.size()), repeat, volume, true);
|
||||||
|
|
||||||
if(all) BEST_TIME("fastjson ", fastjson_parse(buffer), true, memcpy(buffer, p.c_str(), p.size()), repeat, volume, true);
|
if(all) BEST_TIME("fastjson ", fastjson_parse(buffer), true, memcpy(buffer, p.data(), p.size()), repeat, volume, true);
|
||||||
JsonValue value;
|
JsonValue value;
|
||||||
JsonAllocator allocator;
|
JsonAllocator allocator;
|
||||||
char *endptr;
|
char *endptr;
|
||||||
if(all) BEST_TIME("gason ", jsonParse(buffer, &endptr, &value, allocator), JSON_OK, memcpy(buffer, p.c_str(), p.size()), repeat, volume, true);
|
if(all) BEST_TIME("gason ", jsonParse(buffer, &endptr, &value, allocator), JSON_OK, memcpy(buffer, p.data(), p.size()), repeat, volume, true);
|
||||||
void *state;
|
void *state;
|
||||||
if(all) BEST_TIME("ultrajson ", (UJDecode(buffer, p.size(), NULL, &state) == NULL), false, memcpy(buffer, p.c_str(), p.size()), repeat, volume, true);
|
if(all) BEST_TIME("ultrajson ", (UJDecode(buffer, p.size(), NULL, &state) == NULL), false, memcpy(buffer, p.data(), p.size()), repeat, volume, true);
|
||||||
BEST_TIME("memcpy ", (memcpy(buffer, p.c_str(), p.size()) == buffer), true, , repeat, volume, true);
|
BEST_TIME("memcpy ", (memcpy(buffer, p.data(), p.size()) == buffer), true, , repeat, volume, true);
|
||||||
free(ast_buffer);
|
free(ast_buffer);
|
||||||
free(buffer);
|
free(buffer);
|
||||||
}
|
}
|
||||||
|
|
|
@ -14,6 +14,6 @@ static inline size_t jsonminify(const char *buf, size_t len, char *out) {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static inline size_t jsonminify(const simdjsonstring & p, char *out) {
|
static inline size_t jsonminify(const std::string_view & p, char *out) {
|
||||||
return jsonminify(p.c_str(), p.size(), out);
|
return jsonminify(p.data(), p.size(), out);
|
||||||
}
|
}
|
|
@ -10,10 +10,6 @@ char * allocate_aligned_buffer(size_t length) {
|
||||||
if (posix_memalign((void **)&aligned_buffer, 64, totalpaddedlength)) {
|
if (posix_memalign((void **)&aligned_buffer, 64, totalpaddedlength)) {
|
||||||
throw std::runtime_error("Could not allocate sufficient memory");
|
throw std::runtime_error("Could not allocate sufficient memory");
|
||||||
};
|
};
|
||||||
aligned_buffer[length] = '\0';
|
|
||||||
for(size_t i = length + 1; i < totalpaddedlength; i++) aligned_buffer[i] = 0x20;
|
|
||||||
//aligned_buffer[paddedlength] = '\0';
|
|
||||||
//memset(aligned_buffer + length, 0x20, paddedlength - length);
|
|
||||||
return aligned_buffer;
|
return aligned_buffer;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -29,6 +25,7 @@ std::string_view get_corpus(std::string filename) {
|
||||||
}
|
}
|
||||||
std::rewind(fp);
|
std::rewind(fp);
|
||||||
std::fread(buf, 1, len, fp);
|
std::fread(buf, 1, len, fp);
|
||||||
|
buf[len] = '\0';
|
||||||
std::fclose(fp);
|
std::fclose(fp);
|
||||||
return std::string_view(buf,len);
|
return std::string_view(buf,len);
|
||||||
}
|
}
|
||||||
|
|
|
@ -11,9 +11,13 @@ bool json_parse(const u8 *buf, size_t len, ParsedJson &pj) {
|
||||||
bool isok = find_structural_bits(buf, len, pj);
|
bool isok = find_structural_bits(buf, len, pj);
|
||||||
if (isok) {
|
if (isok) {
|
||||||
isok = flatten_indexes(len, pj);
|
isok = flatten_indexes(len, pj);
|
||||||
|
} else {
|
||||||
|
return false;
|
||||||
}
|
}
|
||||||
if (isok) {
|
if (isok) {
|
||||||
isok = unified_machine(buf, len, pj);
|
isok = unified_machine(buf, len, pj);
|
||||||
|
} else {
|
||||||
|
return false;
|
||||||
}
|
}
|
||||||
return isok;
|
return isok;
|
||||||
}
|
}
|
||||||
|
|
|
@ -61,8 +61,9 @@ WARN_UNUSED
|
||||||
// effectively the very first char is considered to follow "whitespace" for the
|
// effectively the very first char is considered to follow "whitespace" for the
|
||||||
// purposes of pseudo-structural character detection
|
// purposes of pseudo-structural character detection
|
||||||
u64 prev_iter_ends_pseudo_pred = 1ULL;
|
u64 prev_iter_ends_pseudo_pred = 1ULL;
|
||||||
|
size_t lenminus64 = len + 1 < 64 ? 0 : len + 1 - 64; // len + 1 because of the NULL termination
|
||||||
for (size_t idx = 0; idx < len; idx += 64) {
|
size_t idx = 0;
|
||||||
|
for (; idx < lenminus64; idx += 64) {
|
||||||
__builtin_prefetch(buf + idx + 128);
|
__builtin_prefetch(buf + idx + 128);
|
||||||
#ifdef DEBUG
|
#ifdef DEBUG
|
||||||
cout << "Idx is " << idx << "\n";
|
cout << "Idx is " << idx << "\n";
|
||||||
|
@ -249,21 +250,163 @@ WARN_UNUSED
|
||||||
"final structurals and pseudo structurals after close quote removal");
|
"final structurals and pseudo structurals after close quote removal");
|
||||||
*(u64 *)(pj.structurals + idx / 8) = structurals;
|
*(u64 *)(pj.structurals + idx / 8) = structurals;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
////////////////
|
||||||
|
/// we use a giant copy-paste which is ugly.
|
||||||
|
/// but otherwise the string needs to be properly padded or else we
|
||||||
|
/// risk invalidating the UTF-8 checks.
|
||||||
|
////////////
|
||||||
|
if (idx < len + 1) { // +1 due to NULL termination
|
||||||
|
u8 tmpbuf[64];
|
||||||
|
memset(tmpbuf,0x20,64);
|
||||||
|
memcpy(tmpbuf,buf+idx,len - idx + 1);// +1 due to NULL termination
|
||||||
|
m256 input_lo = _mm256_loadu_si256((const m256 *)(tmpbuf + 0));
|
||||||
|
m256 input_hi = _mm256_loadu_si256((const m256 *)(tmpbuf + 32));
|
||||||
|
#ifdef UTF8VALIDATE
|
||||||
|
m256 highbit = _mm256_set1_epi8(0x80);
|
||||||
|
if((_mm256_testz_si256(_mm256_or_si256(input_lo, input_hi),highbit)) == 1) {
|
||||||
|
// it is ascii, we just check continuation
|
||||||
|
has_error = _mm256_or_si256(
|
||||||
|
_mm256_cmpgt_epi8(previous.carried_continuations,
|
||||||
|
_mm256_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
|
||||||
|
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
|
||||||
|
9, 9, 9, 9, 9, 9, 9, 1)),has_error);
|
||||||
|
|
||||||
|
} else {
|
||||||
|
// it is not ascii so we have to do heavy work
|
||||||
|
previous = avxcheckUTF8Bytes(input_lo, &previous, &has_error);
|
||||||
|
previous = avxcheckUTF8Bytes(input_hi, &previous, &has_error);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Step 1: detect odd sequences of backslashes
|
||||||
|
////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
u64 bs_bits =
|
||||||
|
cmp_mask_against_input(input_lo, input_hi, _mm256_set1_epi8('\\'));
|
||||||
|
u64 start_edges = bs_bits & ~(bs_bits << 1);
|
||||||
|
// flip lowest if we have an odd-length run at the end of the prior
|
||||||
|
// iteration
|
||||||
|
u64 even_start_mask = even_bits ^ prev_iter_ends_odd_backslash;
|
||||||
|
u64 even_starts = start_edges & even_start_mask;
|
||||||
|
u64 odd_starts = start_edges & ~even_start_mask;
|
||||||
|
u64 even_carries = bs_bits + even_starts;
|
||||||
|
|
||||||
|
u64 odd_carries;
|
||||||
|
// must record the carry-out of our odd-carries out of bit 63; this
|
||||||
|
// indicates whether the sense of any edge going to the next iteration
|
||||||
|
// should be flipped
|
||||||
|
bool iter_ends_odd_backslash =
|
||||||
|
__builtin_uaddll_overflow(bs_bits, odd_starts, &odd_carries);
|
||||||
|
|
||||||
|
odd_carries |=
|
||||||
|
prev_iter_ends_odd_backslash; // push in bit zero as a potential end
|
||||||
|
// if we had an odd-numbered run at the
|
||||||
|
// end of the previous iteration
|
||||||
|
prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL;
|
||||||
|
u64 even_carry_ends = even_carries & ~bs_bits;
|
||||||
|
u64 odd_carry_ends = odd_carries & ~bs_bits;
|
||||||
|
u64 even_start_odd_end = even_carry_ends & odd_bits;
|
||||||
|
u64 odd_start_even_end = odd_carry_ends & even_bits;
|
||||||
|
u64 odd_ends = even_start_odd_end | odd_start_even_end;
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Step 2: detect insides of quote pairs
|
||||||
|
////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
u64 quote_bits =
|
||||||
|
cmp_mask_against_input(input_lo, input_hi, _mm256_set1_epi8('"'));
|
||||||
|
quote_bits = quote_bits & ~odd_ends;
|
||||||
|
u64 quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128(
|
||||||
|
_mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFF), 0));
|
||||||
|
quote_mask ^= prev_iter_inside_quote;
|
||||||
|
prev_iter_inside_quote = (u64)((s64)quote_mask >> 63); // right shift of a signed value expected to be well-defined and standard compliant as of C++20
|
||||||
|
|
||||||
|
// How do we build up a user traversable data structure
|
||||||
|
// first, do a 'shufti' to detect structural JSON characters
|
||||||
|
// they are { 0x7b } 0x7d : 0x3a [ 0x5b ] 0x5d , 0x2c
|
||||||
|
// these go into the first 3 buckets of the comparison (1/2/4)
|
||||||
|
|
||||||
|
// we are also interested in the four whitespace characters
|
||||||
|
// space 0x20, linefeed 0x0a, horizontal tab 0x09 and carriage return 0x0d
|
||||||
|
// these go into the next 2 buckets of the comparison (8/16)
|
||||||
|
const m256 low_nibble_mask = _mm256_setr_epi8(
|
||||||
|
// 0 9 a b c d
|
||||||
|
16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0, 16, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, 8, 12, 1, 2, 9, 0, 0);
|
||||||
|
const m256 high_nibble_mask = _mm256_setr_epi8(
|
||||||
|
// 0 2 3 5 7
|
||||||
|
8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0, 8, 0, 18, 4, 0, 1, 0,
|
||||||
|
1, 0, 0, 0, 3, 2, 1, 0, 0);
|
||||||
|
|
||||||
|
m256 structural_shufti_mask = _mm256_set1_epi8(0x7);
|
||||||
|
m256 whitespace_shufti_mask = _mm256_set1_epi8(0x18);
|
||||||
|
|
||||||
|
m256 v_lo = _mm256_and_si256(
|
||||||
|
_mm256_shuffle_epi8(low_nibble_mask, input_lo),
|
||||||
|
_mm256_shuffle_epi8(high_nibble_mask,
|
||||||
|
_mm256_and_si256(_mm256_srli_epi32(input_lo, 4),
|
||||||
|
_mm256_set1_epi8(0x7f))));
|
||||||
|
|
||||||
|
m256 v_hi = _mm256_and_si256(
|
||||||
|
_mm256_shuffle_epi8(low_nibble_mask, input_hi),
|
||||||
|
_mm256_shuffle_epi8(high_nibble_mask,
|
||||||
|
_mm256_and_si256(_mm256_srli_epi32(input_hi, 4),
|
||||||
|
_mm256_set1_epi8(0x7f))));
|
||||||
|
m256 tmp_lo = _mm256_cmpeq_epi8(
|
||||||
|
_mm256_and_si256(v_lo, structural_shufti_mask), _mm256_set1_epi8(0));
|
||||||
|
m256 tmp_hi = _mm256_cmpeq_epi8(
|
||||||
|
_mm256_and_si256(v_hi, structural_shufti_mask), _mm256_set1_epi8(0));
|
||||||
|
|
||||||
|
u64 structural_res_0 = (u32)_mm256_movemask_epi8(tmp_lo);
|
||||||
|
u64 structural_res_1 = _mm256_movemask_epi8(tmp_hi);
|
||||||
|
u64 structurals = ~(structural_res_0 | (structural_res_1 << 32));
|
||||||
|
|
||||||
|
// this additional mask and transfer is non-trivially expensive,
|
||||||
|
// unfortunately
|
||||||
|
m256 tmp_ws_lo = _mm256_cmpeq_epi8(
|
||||||
|
_mm256_and_si256(v_lo, whitespace_shufti_mask), _mm256_set1_epi8(0));
|
||||||
|
m256 tmp_ws_hi = _mm256_cmpeq_epi8(
|
||||||
|
_mm256_and_si256(v_hi, whitespace_shufti_mask), _mm256_set1_epi8(0));
|
||||||
|
|
||||||
|
u64 ws_res_0 = (u32)_mm256_movemask_epi8(tmp_ws_lo);
|
||||||
|
u64 ws_res_1 = _mm256_movemask_epi8(tmp_ws_hi);
|
||||||
|
u64 whitespace = ~(ws_res_0 | (ws_res_1 << 32));
|
||||||
|
|
||||||
|
|
||||||
|
// mask off anything inside quotes
|
||||||
|
structurals &= ~quote_mask;
|
||||||
|
|
||||||
|
// add the real quote bits back into our bitmask as well, so we can
|
||||||
|
// quickly traverse the strings we've spent all this trouble gathering
|
||||||
|
structurals |= quote_bits;
|
||||||
|
|
||||||
|
// Now, establish "pseudo-structural characters". These are non-whitespace
|
||||||
|
// characters that are (a) outside quotes and (b) have a predecessor that's
|
||||||
|
// either whitespace or a structural character. This means that subsequent
|
||||||
|
// passes will get a chance to encounter the first character of every string
|
||||||
|
// of non-whitespace and, if we're parsing an atom like true/false/null or a
|
||||||
|
// number we can stop at the first whitespace or structural character
|
||||||
|
// following it.
|
||||||
|
|
||||||
|
// a qualified predecessor is something that can happen 1 position before an
|
||||||
|
// pseudo-structural character
|
||||||
|
u64 pseudo_pred = structurals | whitespace;
|
||||||
|
u64 shifted_pseudo_pred = (pseudo_pred << 1) | prev_iter_ends_pseudo_pred;
|
||||||
|
prev_iter_ends_pseudo_pred = pseudo_pred >> 63;
|
||||||
|
u64 pseudo_structurals =
|
||||||
|
shifted_pseudo_pred & (~whitespace) & (~quote_mask);
|
||||||
|
structurals |= pseudo_structurals;
|
||||||
|
|
||||||
|
// now, we've used our close quotes all we need to. So let's switch them off
|
||||||
|
// they will be off in the quote mask and on in quote bits.
|
||||||
|
structurals &= ~(quote_bits & ~quote_mask);
|
||||||
|
*(u64 *)(pj.structurals + idx / 8) = structurals;
|
||||||
|
}
|
||||||
if(buf[len] != '\0') {
|
if(buf[len] != '\0') {
|
||||||
std::cerr << "Your string should be NULL terminated." << std::endl;
|
std::cerr << "Your string should be NULL terminated." << std::endl;
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
// we are going to zero out everything after len:
|
|
||||||
size_t count_last_64bits = len % 64;
|
|
||||||
if(count_last_64bits != 0) { // we have a "final" word where only count_last_64bits matter
|
|
||||||
u64 lastword = *(u64 *)(pj.structurals + len / 8);
|
|
||||||
printf("last word %zu \n", lastword);
|
|
||||||
printf("count_last_64bits%zu \n", count_last_64bits);
|
|
||||||
lastword &= ( UINT64_C(1) << count_last_64bits) - 1;
|
|
||||||
*(u64 *)(pj.structurals + len / 8) = lastword;
|
|
||||||
}
|
|
||||||
|
|
||||||
//pj.structural_indexes[pj.n_structural_indexes++] = len; // the final NULL is used as a pseudo-structural character
|
|
||||||
#ifdef UTF8VALIDATE
|
#ifdef UTF8VALIDATE
|
||||||
return _mm256_testz_si256(has_error, has_error);
|
return _mm256_testz_si256(has_error, has_error);
|
||||||
#else
|
#else
|
||||||
|
|
|
@ -119,7 +119,7 @@ bool flatten_indexes(size_t len, ParsedJson &pj) {
|
||||||
}
|
}
|
||||||
pj.n_structural_indexes = base;
|
pj.n_structural_indexes = base;
|
||||||
if(len != base_ptr[pj.n_structural_indexes-1]) {
|
if(len != base_ptr[pj.n_structural_indexes-1]) {
|
||||||
printf("last structural should be pointing at the end of the string\n");
|
// can happen with malformed JSON such as unclosed quotes (["this is an unclosed string ])
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
base_ptr[pj.n_structural_indexes] = 0; // make it safe to dereference one beyond this array
|
base_ptr[pj.n_structural_indexes] = 0; // make it safe to dereference one beyond this array
|
||||||
|
|
|
@ -58,7 +58,7 @@ int main(int argc, char *argv[]) {
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
const char * filename = argv[optind];
|
const char * filename = argv[optind];
|
||||||
simdjsonstring p;
|
std::string_view p;
|
||||||
try {
|
try {
|
||||||
p = get_corpus(filename);
|
p = get_corpus(filename);
|
||||||
} catch (const std::exception& e) { // caught by reference to base
|
} catch (const std::exception& e) { // caught by reference to base
|
||||||
|
@ -86,7 +86,7 @@ int main(int argc, char *argv[]) {
|
||||||
rapidjson::Document d;
|
rapidjson::Document d;
|
||||||
|
|
||||||
char *buffer = (char *)malloc(p.size() + 1);
|
char *buffer = (char *)malloc(p.size() + 1);
|
||||||
memcpy(buffer, p.c_str(), p.size());
|
memcpy(buffer, p.data(), p.size());
|
||||||
buffer[p.size()] = '\0';
|
buffer[p.size()] = '\0';
|
||||||
bool rapid_correct = (d.Parse((const char *)buffer).HasParseError() == false);
|
bool rapid_correct = (d.Parse((const char *)buffer).HasParseError() == false);
|
||||||
bool rapid_correct_checkencoding = (d.Parse<kParseValidateEncodingFlag>((const char *)buffer).HasParseError() == false);
|
bool rapid_correct_checkencoding = (d.Parse<kParseValidateEncodingFlag>((const char *)buffer).HasParseError() == false);
|
||||||
|
|
|
@ -49,7 +49,8 @@ bool validate(const char *dirname) {
|
||||||
for (int i = 0; i < c; i++) {
|
for (int i = 0; i < c; i++) {
|
||||||
const char *name = entry_list[i]->d_name;
|
const char *name = entry_list[i]->d_name;
|
||||||
if (hasExtension(name, extension)) {
|
if (hasExtension(name, extension)) {
|
||||||
//printf("validating: file %s \n", name);
|
printf("validating: file %s ", name);
|
||||||
|
fflush(NULL);
|
||||||
size_t filelen = strlen(name);
|
size_t filelen = strlen(name);
|
||||||
char *fullpath = (char *)malloc(dirlen + filelen + 1 + 1);
|
char *fullpath = (char *)malloc(dirlen + filelen + 1 + 1);
|
||||||
strcpy(fullpath, dirname);
|
strcpy(fullpath, dirname);
|
||||||
|
@ -74,6 +75,7 @@ bool validate(const char *dirname) {
|
||||||
}
|
}
|
||||||
++howmany;
|
++howmany;
|
||||||
bool isok = json_parse(p, pj);
|
bool isok = json_parse(p, pj);
|
||||||
|
printf("%s\n", isok ? "ok" : "invalid");
|
||||||
if(contains("EXCLUDE",name)) {
|
if(contains("EXCLUDE",name)) {
|
||||||
// skipping
|
// skipping
|
||||||
howmany--;
|
howmany--;
|
||||||
|
@ -89,9 +91,6 @@ bool validate(const char *dirname) {
|
||||||
printf("warning: file %s should fail but it passes.\n", name);
|
printf("warning: file %s should fail but it passes.\n", name);
|
||||||
everythingfine = false;
|
everythingfine = false;
|
||||||
}
|
}
|
||||||
} else {
|
|
||||||
printf("File %s %s.\n", name,
|
|
||||||
isok ? " is valid JSON " : " is not valid JSON");
|
|
||||||
}
|
}
|
||||||
free(fullpath);
|
free(fullpath);
|
||||||
}
|
}
|
||||||
|
|
|
@ -8,7 +8,7 @@ int main(int argc, char *argv[]) {
|
||||||
std::cerr << "Usage: " << argv[0] << " <jsonfile>\n";
|
std::cerr << "Usage: " << argv[0] << " <jsonfile>\n";
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
simdjsonstring p;
|
std::string_view p;
|
||||||
std::string filename = argv[argc - 1];
|
std::string filename = argv[argc - 1];
|
||||||
try{
|
try{
|
||||||
p = get_corpus(filename);
|
p = get_corpus(filename);
|
||||||
|
@ -16,6 +16,6 @@ int main(int argc, char *argv[]) {
|
||||||
std::cout << "Could not load the file " << filename << std::endl;
|
std::cout << "Could not load the file " << filename << std::endl;
|
||||||
return EXIT_FAILURE;
|
return EXIT_FAILURE;
|
||||||
}
|
}
|
||||||
jsonminify(p, &p[0]);
|
jsonminify(p, (char *)p.data());
|
||||||
printf("%s",p.data());
|
printf("%s",p.data());
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue