More cleaning.

This commit is contained in:
Daniel Lemire 2018-11-30 21:31:05 -05:00
parent 0e4804137c
commit c11eefca32
12 changed files with 199 additions and 56 deletions

View File

@ -12,11 +12,11 @@ Goal: Speed up the parsing of JSON per se.
/... /...
const char * filename = ... // const char * filename = ... //
simdjsonstring p = get_corpus(filename); std::string_view p = get_corpus(filename);
ParsedJson pj; ParsedJson pj;
size_t maxdepth = 1024; // support documents have nesting "depth" up to 1024 size_t maxdepth = 1024; // support documents have nesting "depth" up to 1024
pj.allocateCapacity(p.size(), maxdepth); // allocate memory for parsing up to p.size() bytes pj.allocateCapacity(p.size(), maxdepth); // allocate memory for parsing up to p.size() bytes
bool is_ok = json_parse(p.first, p.second, pj); // do the parsing, return false on error bool is_ok = json_parse(p, pj); // do the parsing, return false on error
// parsing is done! // parsing is done!
// pj can be reused with other json_parse calls. // pj can be reused with other json_parse calls.
``` ```

View File

@ -61,7 +61,7 @@ int main(int argc, char *argv[]) {
exit(1); exit(1);
} }
const char * filename = argv[optind]; const char * filename = argv[optind];
simdjsonstring p; std::string_view p;
try { try {
p = get_corpus(filename); p = get_corpus(filename);
} catch (const std::exception& e) { // caught by reference to base } catch (const std::exception& e) { // caught by reference to base
@ -79,20 +79,20 @@ int main(int argc, char *argv[]) {
std::cout << std::endl; std::cout << std::endl;
} }
char *buffer = allocate_aligned_buffer(p.size() + 1); char *buffer = allocate_aligned_buffer(p.size() + 1);
memcpy(buffer, p.c_str(), p.size()); memcpy(buffer, p.data(), p.size());
buffer[p.size()] = '\0'; buffer[p.size()] = '\0';
int repeat = 10; int repeat = 10;
int volume = p.size(); int volume = p.size();
size_t strlength = rapidstringme((char *)p.c_str()).size(); size_t strlength = rapidstringme((char *)p.data()).size();
if (verbose) if (verbose)
std::cout << "input length is " << p.size() << " stringified length is " std::cout << "input length is " << p.size() << " stringified length is "
<< strlength << std::endl; << strlength << std::endl;
BEST_TIME_NOCHECK("despacing with RapidJSON", rapidstringme((char *)p.c_str()), , repeat, volume, true); BEST_TIME_NOCHECK("despacing with RapidJSON", rapidstringme((char *)p.data()), , repeat, volume, true);
BEST_TIME_NOCHECK("despacing with RapidJSON Insitu", rapidstringmeInsitu((char *)buffer), BEST_TIME_NOCHECK("despacing with RapidJSON Insitu", rapidstringmeInsitu((char *)buffer),
memcpy(buffer, p.c_str(), p.size()), repeat, volume, true); memcpy(buffer, p.data(), p.size()), repeat, volume, true);
memcpy(buffer, p.c_str(), p.size()); memcpy(buffer, p.data(), p.size());
size_t outlength = size_t outlength =
jsonminify((const uint8_t *)buffer, p.size(), (uint8_t *)buffer); jsonminify((const uint8_t *)buffer, p.size(), (uint8_t *)buffer);
@ -101,7 +101,7 @@ int main(int argc, char *argv[]) {
uint8_t *cbuffer = (uint8_t *)buffer; uint8_t *cbuffer = (uint8_t *)buffer;
BEST_TIME("jsonminify", jsonminify(cbuffer, p.size(), cbuffer), outlength, BEST_TIME("jsonminify", jsonminify(cbuffer, p.size(), cbuffer), outlength,
memcpy(buffer, p.c_str(), p.size()), repeat, volume, true); memcpy(buffer, p.data(), p.size()), repeat, volume, true);
printf("minisize = %zu, original size = %zu (minified down to %.2f percent of original) \n", outlength, p.size(), outlength * 100.0 / p.size()); printf("minisize = %zu, original size = %zu (minified down to %.2f percent of original) \n", outlength, p.size(), outlength * 100.0 / p.size());
/*** /***
@ -109,10 +109,10 @@ int main(int argc, char *argv[]) {
***/ ***/
rapidjson::Document d; rapidjson::Document d;
BEST_TIME("RapidJSON Insitu orig", d.ParseInsitu(buffer).HasParseError(), false, BEST_TIME("RapidJSON Insitu orig", d.ParseInsitu(buffer).HasParseError(), false,
memcpy(buffer, p.c_str(), p.size()), repeat, volume, true); memcpy(buffer, p.data(), p.size()), repeat, volume, true);
char *minibuffer = allocate_aligned_buffer(p.size() + 1); char *minibuffer = allocate_aligned_buffer(p.size() + 1);
size_t minisize = jsonminify((const uint8_t *)p.c_str(), p.size(), (uint8_t*) minibuffer); size_t minisize = jsonminify((const uint8_t *)p.data(), p.size(), (uint8_t*) minibuffer);
minibuffer[minisize] = '\0'; minibuffer[minisize] = '\0';
BEST_TIME("RapidJSON Insitu despaced", d.ParseInsitu(buffer).HasParseError(), false, BEST_TIME("RapidJSON Insitu despaced", d.ParseInsitu(buffer).HasParseError(), false,
@ -122,14 +122,14 @@ int main(int argc, char *argv[]) {
size_t astbuffersize = p.size() * 2; size_t astbuffersize = p.size() * 2;
size_t * ast_buffer = (size_t *) malloc(astbuffersize * sizeof(size_t)); size_t * ast_buffer = (size_t *) malloc(astbuffersize * sizeof(size_t));
BEST_TIME("sajson orig", sajson::parse(sajson::bounded_allocation(ast_buffer, astbuffersize), sajson::mutable_string_view(p.size(), buffer)).is_valid(), true, memcpy(buffer, p.c_str(), p.size()), repeat, volume, true); BEST_TIME("sajson orig", sajson::parse(sajson::bounded_allocation(ast_buffer, astbuffersize), sajson::mutable_string_view(p.size(), buffer)).is_valid(), true, memcpy(buffer, p.data(), p.size()), repeat, volume, true);
BEST_TIME("sajson despaced", sajson::parse(sajson::bounded_allocation(ast_buffer, astbuffersize), sajson::mutable_string_view(minisize, buffer)).is_valid(), true, memcpy(buffer, minibuffer, p.size()), repeat, volume, true); BEST_TIME("sajson despaced", sajson::parse(sajson::bounded_allocation(ast_buffer, astbuffersize), sajson::mutable_string_view(minisize, buffer)).is_valid(), true, memcpy(buffer, minibuffer, p.size()), repeat, volume, true);
ParsedJson pj; ParsedJson pj;
pj.allocateCapacity(p.size(), 1024); pj.allocateCapacity(p.size(), 1024);
BEST_TIME("json_parse orig", json_parse((const u8*)buffer, p.size(), pj), true, memcpy(buffer, p.c_str(), p.size()), repeat, volume, true); BEST_TIME("json_parse orig", json_parse((const u8*)buffer, p.size(), pj), true, memcpy(buffer, p.data(), p.size()), repeat, volume, true);
ParsedJson pj2; ParsedJson pj2;
pj2.allocateCapacity(p.size(), 1024); pj2.allocateCapacity(p.size(), 1024);

View File

@ -65,7 +65,7 @@ int main(int argc, char *argv[]) {
cerr << "warning: ignoring everything after " << argv[optind + 1] << endl; cerr << "warning: ignoring everything after " << argv[optind + 1] << endl;
} }
if(verbose) cout << "[verbose] loading " << filename << endl; if(verbose) cout << "[verbose] loading " << filename << endl;
simdjsonstring p; std::string_view p;
try { try {
p = get_corpus(filename); p = get_corpus(filename);
} catch (const std::exception& e) { // caught by reference to base } catch (const std::exception& e) { // caught by reference to base
@ -118,7 +118,7 @@ int main(int argc, char *argv[]) {
#ifndef SQUASH_COUNTERS #ifndef SQUASH_COUNTERS
unified.start(); unified.start();
#endif #endif
isok = find_structural_bits(p.c_str(), p.size(), pj); isok = find_structural_bits(p.data(), p.size(), pj);
#ifndef SQUASH_COUNTERS #ifndef SQUASH_COUNTERS
unified.end(results); unified.end(results);
cy1 += results[0]; cy1 += results[0];
@ -147,7 +147,7 @@ int main(int argc, char *argv[]) {
unified.start(); unified.start();
#endif #endif
isok = isok && unified_machine(p.c_str(), p.size(), pj); isok = isok && unified_machine(p.data(), p.size(), pj);
#ifndef SQUASH_COUNTERS #ifndef SQUASH_COUNTERS
unified.end(results); unified.end(results);
cy3 += results[0]; cy3 += results[0];

View File

@ -61,7 +61,7 @@ int main(int argc, char *argv[]) {
if(optind + 1 < argc) { if(optind + 1 < argc) {
cerr << "warning: ignoring everything after " << argv[optind + 1] << endl; cerr << "warning: ignoring everything after " << argv[optind + 1] << endl;
} }
simdjsonstring p; std::string_view p;
try { try {
p = get_corpus(filename); p = get_corpus(filename);
} catch (const std::exception& e) { // caught by reference to base } catch (const std::exception& e) { // caught by reference to base
@ -93,32 +93,32 @@ int main(int argc, char *argv[]) {
rapidjson::Document d; rapidjson::Document d;
char *buffer = (char *)malloc(p.size() + 1); char *buffer = (char *)malloc(p.size() + 1);
memcpy(buffer, p.c_str(), p.size()); memcpy(buffer, p.data(), p.size());
buffer[p.size()] = '\0'; buffer[p.size()] = '\0';
BEST_TIME("RapidJSON", BEST_TIME("RapidJSON",
d.Parse<kParseValidateEncodingFlag>((const char *)buffer).HasParseError(), d.Parse<kParseValidateEncodingFlag>((const char *)buffer).HasParseError(),
false, memcpy(buffer, p.c_str(), p.size()), repeat, volume, true); false, memcpy(buffer, p.data(), p.size()), repeat, volume, true);
BEST_TIME("RapidJSON Insitu", d.ParseInsitu<kParseValidateEncodingFlag>(buffer).HasParseError(), false, BEST_TIME("RapidJSON Insitu", d.ParseInsitu<kParseValidateEncodingFlag>(buffer).HasParseError(), false,
memcpy(buffer, p.c_str(), p.size()), repeat, volume, true); memcpy(buffer, p.data(), p.size()), repeat, volume, true);
BEST_TIME("sajson (dynamic mem)", sajson::parse(sajson::dynamic_allocation(), sajson::mutable_string_view(p.size(), buffer)).is_valid(), true, memcpy(buffer, p.c_str(), p.size()), repeat, volume, true); BEST_TIME("sajson (dynamic mem)", sajson::parse(sajson::dynamic_allocation(), sajson::mutable_string_view(p.size(), buffer)).is_valid(), true, memcpy(buffer, p.data(), p.size()), repeat, volume, true);
size_t astbuffersize = p.size(); size_t astbuffersize = p.size();
size_t * ast_buffer = (size_t *) malloc(astbuffersize * sizeof(size_t)); size_t * ast_buffer = (size_t *) malloc(astbuffersize * sizeof(size_t));
BEST_TIME("sajson (static alloc)", sajson::parse(sajson::bounded_allocation(ast_buffer, astbuffersize), sajson::mutable_string_view(p.size(), buffer)).is_valid(), true, memcpy(buffer, p.c_str(), p.size()), repeat, volume, true); BEST_TIME("sajson (static alloc)", sajson::parse(sajson::bounded_allocation(ast_buffer, astbuffersize), sajson::mutable_string_view(p.size(), buffer)).is_valid(), true, memcpy(buffer, p.data(), p.size()), repeat, volume, true);
std::string json11err; std::string json11err;
if(all) BEST_TIME("dropbox (json11) ", (( json11::Json::parse(buffer,json11err).is_null() ) || ( ! json11err.empty() )), false, memcpy(buffer, p.c_str(), p.size()), repeat, volume, true); if(all) BEST_TIME("dropbox (json11) ", (( json11::Json::parse(buffer,json11err).is_null() ) || ( ! json11err.empty() )), false, memcpy(buffer, p.data(), p.size()), repeat, volume, true);
if(all) BEST_TIME("fastjson ", fastjson_parse(buffer), true, memcpy(buffer, p.c_str(), p.size()), repeat, volume, true); if(all) BEST_TIME("fastjson ", fastjson_parse(buffer), true, memcpy(buffer, p.data(), p.size()), repeat, volume, true);
JsonValue value; JsonValue value;
JsonAllocator allocator; JsonAllocator allocator;
char *endptr; char *endptr;
if(all) BEST_TIME("gason ", jsonParse(buffer, &endptr, &value, allocator), JSON_OK, memcpy(buffer, p.c_str(), p.size()), repeat, volume, true); if(all) BEST_TIME("gason ", jsonParse(buffer, &endptr, &value, allocator), JSON_OK, memcpy(buffer, p.data(), p.size()), repeat, volume, true);
void *state; void *state;
if(all) BEST_TIME("ultrajson ", (UJDecode(buffer, p.size(), NULL, &state) == NULL), false, memcpy(buffer, p.c_str(), p.size()), repeat, volume, true); if(all) BEST_TIME("ultrajson ", (UJDecode(buffer, p.size(), NULL, &state) == NULL), false, memcpy(buffer, p.data(), p.size()), repeat, volume, true);
BEST_TIME("memcpy ", (memcpy(buffer, p.c_str(), p.size()) == buffer), true, , repeat, volume, true); BEST_TIME("memcpy ", (memcpy(buffer, p.data(), p.size()) == buffer), true, , repeat, volume, true);
free(ast_buffer); free(ast_buffer);
free(buffer); free(buffer);
} }

View File

@ -14,6 +14,6 @@ static inline size_t jsonminify(const char *buf, size_t len, char *out) {
} }
static inline size_t jsonminify(const simdjsonstring & p, char *out) { static inline size_t jsonminify(const std::string_view & p, char *out) {
return jsonminify(p.c_str(), p.size(), out); return jsonminify(p.data(), p.size(), out);
} }

View File

@ -10,10 +10,6 @@ char * allocate_aligned_buffer(size_t length) {
if (posix_memalign((void **)&aligned_buffer, 64, totalpaddedlength)) { if (posix_memalign((void **)&aligned_buffer, 64, totalpaddedlength)) {
throw std::runtime_error("Could not allocate sufficient memory"); throw std::runtime_error("Could not allocate sufficient memory");
}; };
aligned_buffer[length] = '\0';
for(size_t i = length + 1; i < totalpaddedlength; i++) aligned_buffer[i] = 0x20;
//aligned_buffer[paddedlength] = '\0';
//memset(aligned_buffer + length, 0x20, paddedlength - length);
return aligned_buffer; return aligned_buffer;
} }
@ -29,6 +25,7 @@ std::string_view get_corpus(std::string filename) {
} }
std::rewind(fp); std::rewind(fp);
std::fread(buf, 1, len, fp); std::fread(buf, 1, len, fp);
buf[len] = '\0';
std::fclose(fp); std::fclose(fp);
return std::string_view(buf,len); return std::string_view(buf,len);
} }

View File

@ -11,9 +11,13 @@ bool json_parse(const u8 *buf, size_t len, ParsedJson &pj) {
bool isok = find_structural_bits(buf, len, pj); bool isok = find_structural_bits(buf, len, pj);
if (isok) { if (isok) {
isok = flatten_indexes(len, pj); isok = flatten_indexes(len, pj);
} else {
return false;
} }
if (isok) { if (isok) {
isok = unified_machine(buf, len, pj); isok = unified_machine(buf, len, pj);
} else {
return false;
} }
return isok; return isok;
} }

View File

@ -61,8 +61,9 @@ WARN_UNUSED
// effectively the very first char is considered to follow "whitespace" for the // effectively the very first char is considered to follow "whitespace" for the
// purposes of pseudo-structural character detection // purposes of pseudo-structural character detection
u64 prev_iter_ends_pseudo_pred = 1ULL; u64 prev_iter_ends_pseudo_pred = 1ULL;
size_t lenminus64 = len + 1 < 64 ? 0 : len + 1 - 64; // len + 1 because of the NULL termination
for (size_t idx = 0; idx < len; idx += 64) { size_t idx = 0;
for (; idx < lenminus64; idx += 64) {
__builtin_prefetch(buf + idx + 128); __builtin_prefetch(buf + idx + 128);
#ifdef DEBUG #ifdef DEBUG
cout << "Idx is " << idx << "\n"; cout << "Idx is " << idx << "\n";
@ -249,21 +250,163 @@ WARN_UNUSED
"final structurals and pseudo structurals after close quote removal"); "final structurals and pseudo structurals after close quote removal");
*(u64 *)(pj.structurals + idx / 8) = structurals; *(u64 *)(pj.structurals + idx / 8) = structurals;
} }
////////////////
/// we use a giant copy-paste which is ugly.
/// but otherwise the string needs to be properly padded or else we
/// risk invalidating the UTF-8 checks.
////////////
if (idx < len + 1) { // +1 due to NULL termination
u8 tmpbuf[64];
memset(tmpbuf,0x20,64);
memcpy(tmpbuf,buf+idx,len - idx + 1);// +1 due to NULL termination
m256 input_lo = _mm256_loadu_si256((const m256 *)(tmpbuf + 0));
m256 input_hi = _mm256_loadu_si256((const m256 *)(tmpbuf + 32));
#ifdef UTF8VALIDATE
m256 highbit = _mm256_set1_epi8(0x80);
if((_mm256_testz_si256(_mm256_or_si256(input_lo, input_hi),highbit)) == 1) {
// it is ascii, we just check continuation
has_error = _mm256_or_si256(
_mm256_cmpgt_epi8(previous.carried_continuations,
_mm256_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 1)),has_error);
} else {
// it is not ascii so we have to do heavy work
previous = avxcheckUTF8Bytes(input_lo, &previous, &has_error);
previous = avxcheckUTF8Bytes(input_hi, &previous, &has_error);
}
#endif
////////////////////////////////////////////////////////////////////////////////////////////
// Step 1: detect odd sequences of backslashes
////////////////////////////////////////////////////////////////////////////////////////////
u64 bs_bits =
cmp_mask_against_input(input_lo, input_hi, _mm256_set1_epi8('\\'));
u64 start_edges = bs_bits & ~(bs_bits << 1);
// flip lowest if we have an odd-length run at the end of the prior
// iteration
u64 even_start_mask = even_bits ^ prev_iter_ends_odd_backslash;
u64 even_starts = start_edges & even_start_mask;
u64 odd_starts = start_edges & ~even_start_mask;
u64 even_carries = bs_bits + even_starts;
u64 odd_carries;
// must record the carry-out of our odd-carries out of bit 63; this
// indicates whether the sense of any edge going to the next iteration
// should be flipped
bool iter_ends_odd_backslash =
__builtin_uaddll_overflow(bs_bits, odd_starts, &odd_carries);
odd_carries |=
prev_iter_ends_odd_backslash; // push in bit zero as a potential end
// if we had an odd-numbered run at the
// end of the previous iteration
prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL;
u64 even_carry_ends = even_carries & ~bs_bits;
u64 odd_carry_ends = odd_carries & ~bs_bits;
u64 even_start_odd_end = even_carry_ends & odd_bits;
u64 odd_start_even_end = odd_carry_ends & even_bits;
u64 odd_ends = even_start_odd_end | odd_start_even_end;
////////////////////////////////////////////////////////////////////////////////////////////
// Step 2: detect insides of quote pairs
////////////////////////////////////////////////////////////////////////////////////////////
u64 quote_bits =
cmp_mask_against_input(input_lo, input_hi, _mm256_set1_epi8('"'));
quote_bits = quote_bits & ~odd_ends;
u64 quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128(
_mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFF), 0));
quote_mask ^= prev_iter_inside_quote;
prev_iter_inside_quote = (u64)((s64)quote_mask >> 63); // right shift of a signed value expected to be well-defined and standard compliant as of C++20
// How do we build up a user traversable data structure
// first, do a 'shufti' to detect structural JSON characters
// they are { 0x7b } 0x7d : 0x3a [ 0x5b ] 0x5d , 0x2c
// these go into the first 3 buckets of the comparison (1/2/4)
// we are also interested in the four whitespace characters
// space 0x20, linefeed 0x0a, horizontal tab 0x09 and carriage return 0x0d
// these go into the next 2 buckets of the comparison (8/16)
const m256 low_nibble_mask = _mm256_setr_epi8(
// 0 9 a b c d
16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0, 16, 0, 0, 0, 0, 0, 0,
0, 0, 8, 12, 1, 2, 9, 0, 0);
const m256 high_nibble_mask = _mm256_setr_epi8(
// 0 2 3 5 7
8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0, 8, 0, 18, 4, 0, 1, 0,
1, 0, 0, 0, 3, 2, 1, 0, 0);
m256 structural_shufti_mask = _mm256_set1_epi8(0x7);
m256 whitespace_shufti_mask = _mm256_set1_epi8(0x18);
m256 v_lo = _mm256_and_si256(
_mm256_shuffle_epi8(low_nibble_mask, input_lo),
_mm256_shuffle_epi8(high_nibble_mask,
_mm256_and_si256(_mm256_srli_epi32(input_lo, 4),
_mm256_set1_epi8(0x7f))));
m256 v_hi = _mm256_and_si256(
_mm256_shuffle_epi8(low_nibble_mask, input_hi),
_mm256_shuffle_epi8(high_nibble_mask,
_mm256_and_si256(_mm256_srli_epi32(input_hi, 4),
_mm256_set1_epi8(0x7f))));
m256 tmp_lo = _mm256_cmpeq_epi8(
_mm256_and_si256(v_lo, structural_shufti_mask), _mm256_set1_epi8(0));
m256 tmp_hi = _mm256_cmpeq_epi8(
_mm256_and_si256(v_hi, structural_shufti_mask), _mm256_set1_epi8(0));
u64 structural_res_0 = (u32)_mm256_movemask_epi8(tmp_lo);
u64 structural_res_1 = _mm256_movemask_epi8(tmp_hi);
u64 structurals = ~(structural_res_0 | (structural_res_1 << 32));
// this additional mask and transfer is non-trivially expensive,
// unfortunately
m256 tmp_ws_lo = _mm256_cmpeq_epi8(
_mm256_and_si256(v_lo, whitespace_shufti_mask), _mm256_set1_epi8(0));
m256 tmp_ws_hi = _mm256_cmpeq_epi8(
_mm256_and_si256(v_hi, whitespace_shufti_mask), _mm256_set1_epi8(0));
u64 ws_res_0 = (u32)_mm256_movemask_epi8(tmp_ws_lo);
u64 ws_res_1 = _mm256_movemask_epi8(tmp_ws_hi);
u64 whitespace = ~(ws_res_0 | (ws_res_1 << 32));
// mask off anything inside quotes
structurals &= ~quote_mask;
// add the real quote bits back into our bitmask as well, so we can
// quickly traverse the strings we've spent all this trouble gathering
structurals |= quote_bits;
// Now, establish "pseudo-structural characters". These are non-whitespace
// characters that are (a) outside quotes and (b) have a predecessor that's
// either whitespace or a structural character. This means that subsequent
// passes will get a chance to encounter the first character of every string
// of non-whitespace and, if we're parsing an atom like true/false/null or a
// number we can stop at the first whitespace or structural character
// following it.
// a qualified predecessor is something that can happen 1 position before a
// pseudo-structural character
u64 pseudo_pred = structurals | whitespace;
u64 shifted_pseudo_pred = (pseudo_pred << 1) | prev_iter_ends_pseudo_pred;
prev_iter_ends_pseudo_pred = pseudo_pred >> 63;
u64 pseudo_structurals =
shifted_pseudo_pred & (~whitespace) & (~quote_mask);
structurals |= pseudo_structurals;
// now, we've used our close quotes all we need to. So let's switch them off
// they will be off in the quote mask and on in quote bits.
structurals &= ~(quote_bits & ~quote_mask);
*(u64 *)(pj.structurals + idx / 8) = structurals;
}
if(buf[len] != '\0') { if(buf[len] != '\0') {
std::cerr << "Your string should be NULL terminated." << std::endl; std::cerr << "Your string should be NULL terminated." << std::endl;
return false; return false;
} }
// we are going to zero out everything after len:
size_t count_last_64bits = len % 64;
if(count_last_64bits != 0) { // we have a "final" word where only count_last_64bits matter
u64 lastword = *(u64 *)(pj.structurals + len / 8);
printf("last word %zu \n", lastword);
printf("count_last_64bits%zu \n", count_last_64bits);
lastword &= ( UINT64_C(1) << count_last_64bits) - 1;
*(u64 *)(pj.structurals + len / 8) = lastword;
}
//pj.structural_indexes[pj.n_structural_indexes++] = len; // the final NULL is used as a pseudo-structural character
#ifdef UTF8VALIDATE #ifdef UTF8VALIDATE
return _mm256_testz_si256(has_error, has_error); return _mm256_testz_si256(has_error, has_error);
#else #else

View File

@ -119,7 +119,7 @@ bool flatten_indexes(size_t len, ParsedJson &pj) {
} }
pj.n_structural_indexes = base; pj.n_structural_indexes = base;
if(len != base_ptr[pj.n_structural_indexes-1]) { if(len != base_ptr[pj.n_structural_indexes-1]) {
printf("last structural should be pointing at the end of the string\n"); // can happen with malformed JSON such as unclosed quotes (["this is an unclosed string ])
return false; return false;
} }
base_ptr[pj.n_structural_indexes] = 0; // make it safe to dereference one beyond this array base_ptr[pj.n_structural_indexes] = 0; // make it safe to dereference one beyond this array

View File

@ -58,7 +58,7 @@ int main(int argc, char *argv[]) {
exit(1); exit(1);
} }
const char * filename = argv[optind]; const char * filename = argv[optind];
simdjsonstring p; std::string_view p;
try { try {
p = get_corpus(filename); p = get_corpus(filename);
} catch (const std::exception& e) { // caught by reference to base } catch (const std::exception& e) { // caught by reference to base
@ -86,7 +86,7 @@ int main(int argc, char *argv[]) {
rapidjson::Document d; rapidjson::Document d;
char *buffer = (char *)malloc(p.size() + 1); char *buffer = (char *)malloc(p.size() + 1);
memcpy(buffer, p.c_str(), p.size()); memcpy(buffer, p.data(), p.size());
buffer[p.size()] = '\0'; buffer[p.size()] = '\0';
bool rapid_correct = (d.Parse((const char *)buffer).HasParseError() == false); bool rapid_correct = (d.Parse((const char *)buffer).HasParseError() == false);
bool rapid_correct_checkencoding = (d.Parse<kParseValidateEncodingFlag>((const char *)buffer).HasParseError() == false); bool rapid_correct_checkencoding = (d.Parse<kParseValidateEncodingFlag>((const char *)buffer).HasParseError() == false);

View File

@ -49,7 +49,8 @@ bool validate(const char *dirname) {
for (int i = 0; i < c; i++) { for (int i = 0; i < c; i++) {
const char *name = entry_list[i]->d_name; const char *name = entry_list[i]->d_name;
if (hasExtension(name, extension)) { if (hasExtension(name, extension)) {
//printf("validating: file %s \n", name); printf("validating: file %s ", name);
fflush(NULL);
size_t filelen = strlen(name); size_t filelen = strlen(name);
char *fullpath = (char *)malloc(dirlen + filelen + 1 + 1); char *fullpath = (char *)malloc(dirlen + filelen + 1 + 1);
strcpy(fullpath, dirname); strcpy(fullpath, dirname);
@ -74,6 +75,7 @@ bool validate(const char *dirname) {
} }
++howmany; ++howmany;
bool isok = json_parse(p, pj); bool isok = json_parse(p, pj);
printf("%s\n", isok ? "ok" : "invalid");
if(contains("EXCLUDE",name)) { if(contains("EXCLUDE",name)) {
// skipping // skipping
howmany--; howmany--;
@ -89,10 +91,7 @@ bool validate(const char *dirname) {
printf("warning: file %s should fail but it passes.\n", name); printf("warning: file %s should fail but it passes.\n", name);
everythingfine = false; everythingfine = false;
} }
} else { }
printf("File %s %s.\n", name,
isok ? " is valid JSON " : " is not valid JSON");
}
free(fullpath); free(fullpath);
} }
} }

View File

@ -8,7 +8,7 @@ int main(int argc, char *argv[]) {
std::cerr << "Usage: " << argv[0] << " <jsonfile>\n"; std::cerr << "Usage: " << argv[0] << " <jsonfile>\n";
exit(1); exit(1);
} }
simdjsonstring p; std::string_view p;
std::string filename = argv[argc - 1]; std::string filename = argv[argc - 1];
try{ try{
p = get_corpus(filename); p = get_corpus(filename);
@ -16,6 +16,6 @@ int main(int argc, char *argv[]) {
std::cout << "Could not load the file " << filename << std::endl; std::cout << "Could not load the file " << filename << std::endl;
return EXIT_FAILURE; return EXIT_FAILURE;
} }
jsonminify(p, &p[0]); jsonminify(p, (char *)p.data());
printf("%s",p.data()); printf("%s",p.data());
} }