Cleaning.

This commit is contained in:
Daniel Lemire 2018-12-10 16:47:02 -05:00
parent 7fda77d51a
commit 05636f3a1d
8 changed files with 47 additions and 22 deletions

View File

@ -71,7 +71,6 @@ To simplify the engineering, we make some assumptions.
- We assume AVX2 support which is available in all recent mainstream x86 processors produced by AMD and Intel. No support for non-x86 processors is included.
- We only support GNU GCC and LLVM Clang at this time. There is no support for Microsoft Visual Studio, though it should not be difficult.
- We expect the input memory pointer to be padded (e.g., with spaces) so that it can be read entirely in blocks of 512 bits (a cache line). In practice, this means that users should allocate the memory where the JSON bytes are located using the `allocate_aligned_buffer` function or the equivalent. Of course, the data you want to process may be in a buffer that does not have this padding. However, copying the data is relatively cheap (much cheaper than parsing JSON), and we can eventually remove this constraint.
- The input string should be NULL terminated.
## Features

View File

@ -101,15 +101,15 @@ int main(int argc, char *argv[]) {
BEST_TIME("RapidJSON",
d.Parse<kParseValidateEncodingFlag>((const char *)buffer).HasParseError(),
false, memcpy(buffer, p.data(), p.size()), repeat, volume, true);
BEST_TIME("RapidJSON Insitu", d.ParseInsitu<kParseValidateEncodingFlag>(buffer).HasParseError(), false,
BEST_TIME("RapidJSON (insitu)", d.ParseInsitu<kParseValidateEncodingFlag>(buffer).HasParseError(), false,
memcpy(buffer, p.data(), p.size()), repeat, volume, true);
BEST_TIME("sajson (dynamic mem)", sajson::parse(sajson::dynamic_allocation(), sajson::mutable_string_view(p.size(), buffer)).is_valid(), true, memcpy(buffer, p.data(), p.size()), repeat, volume, true);
BEST_TIME("sajson (dynamic mem, insitu)", sajson::parse(sajson::dynamic_allocation(), sajson::mutable_string_view(p.size(), buffer)).is_valid(), true, memcpy(buffer, p.data(), p.size()), repeat, volume, true);
size_t astbuffersize = p.size();
size_t * ast_buffer = (size_t *) malloc(astbuffersize * sizeof(size_t));
BEST_TIME("sajson (static alloc)", sajson::parse(sajson::bounded_allocation(ast_buffer, astbuffersize), sajson::mutable_string_view(p.size(), buffer)).is_valid(), true, memcpy(buffer, p.data(), p.size()), repeat, volume, true);
BEST_TIME("sajson (static alloc, insitu)", sajson::parse(sajson::bounded_allocation(ast_buffer, astbuffersize), sajson::mutable_string_view(p.size(), buffer)).is_valid(), true, memcpy(buffer, p.data(), p.size()), repeat, volume, true);
std::string json11err;
if(all) BEST_TIME("dropbox (json11) ", (( json11::Json::parse(buffer,json11err).is_null() ) || ( ! json11err.empty() )), false, memcpy(buffer, p.data(), p.size()), repeat, volume, true);

View File

@ -7,6 +7,14 @@
static inline void print_with_escapes(const unsigned char *src) {
while (*src) {
switch (*src) {
case '\b':
putchar('\\');
putchar('b');
break;
case '\f':
putchar('\\');
putchar('f');
break;
case '\n':
putchar('\\');
putchar('n');
@ -40,6 +48,14 @@ static inline void print_with_escapes(const unsigned char *src) {
static inline void print_with_escapes(const unsigned char *src, std::ostream &os) {
while (*src) {
switch (*src) {
case '\b':
os << '\\';
os << 'b';
break;
case '\f':
os << '\\';
os << 'f';
break;
case '\n':
os << '\\';
os << 'n';

View File

@ -10,7 +10,6 @@
// Parse a document found in buf, need to preallocate ParsedJson.
// Return false in case of a failure. You can also check validity
// by calling pj.isValid(). The same ParsedJson can be reused.
// The string should be NULL terminated.
WARN_UNUSED
bool json_parse(const u8 *buf, size_t len, ParsedJson &pj);

View File

@ -132,20 +132,21 @@ public:
size_t *inobjectidx = new size_t[depthcapacity];
int depth = 1; // only root at level 0
inobjectidx[depth] = 0;
inobject[depth] = false;
for (; tapeidx < howmany; tapeidx++) {
tape_val = tape[tapeidx];
u64 payload = tape_val & JSONVALUEMASK;
type = (tape_val >> 56);
if (!inobject[depth]) {
if ((inobjectidx[depth] > 0) && (type != ']'))
os << ", ";
os << ",";
inobjectidx[depth]++;
} else { // if (inobject) {
if ((inobjectidx[depth] > 0) && ((inobjectidx[depth] & 1) == 0) &&
(type != '}'))
os << ", ";
os << ",";
if (((inobjectidx[depth] & 1) == 1))
os << " : ";
os << ":";
inobjectidx[depth]++;
}
switch (type) {
@ -176,7 +177,6 @@ public:
os << "false";
break;
case '{': // we have an object
os << '\n';
os << '{';
depth++;
inobject[depth] = true;
@ -187,7 +187,6 @@ public:
os << '}';
break;
case '[': // we start an array
os << '\n';
os << '[';
depth++;
inobject[depth] = false;
@ -237,6 +236,7 @@ public:
os << "string \"";
print_with_escapes((const unsigned char *)(string_buf + payload));
os << '"';
os << '\n';
break;
case 'l': // we have a long int
if (tapeidx + 1 >= howmany)

View File

@ -25,7 +25,7 @@ std::string_view get_corpus(std::string filename) {
}
std::rewind(fp);
std::fread(buf, 1, len, fp);
buf[len] = '\0';
//buf[len] = '\0';
std::fclose(fp);
return std::string_view(buf,len);
}

View File

@ -13,8 +13,7 @@
#define UTF8VALIDATE
// It seems that many parsers do UTF-8 validation.
// RapidJSON does not do it by default, but a flag
// allows it. It appears that sajson might do utf-8
// validation
// allows it.
#ifdef UTF8VALIDATE
#include "simdjson/simdutf8check.h"
#endif
@ -61,7 +60,7 @@ WARN_UNUSED
// effectively the very first char is considered to follow "whitespace" for the
// purposes of pseudo-structural character detection
u64 prev_iter_ends_pseudo_pred = 1ULL;
size_t lenminus64 = len + 1 < 64 ? 0 : len + 1 - 64; // len + 1 because of the NULL termination
size_t lenminus64 = len < 64 ? 0 : len - 64;
size_t idx = 0;
for (; idx < lenminus64; idx += 64) {
__builtin_prefetch(buf + idx + 128);
@ -256,10 +255,10 @@ WARN_UNUSED
/// but otherwise the string needs to be properly padded or else we
/// risk invalidating the UTF-8 checks.
////////////
if (idx < len + 1) { // +1 due to NULL termination
if (idx < len) {
u8 tmpbuf[64];
memset(tmpbuf,0x20,64);
memcpy(tmpbuf,buf+idx,len - idx + 1);// +1 due to NULL termination
memcpy(tmpbuf,buf+idx,len - idx);
m256 input_lo = _mm256_loadu_si256((const m256 *)(tmpbuf + 0));
m256 input_hi = _mm256_loadu_si256((const m256 *)(tmpbuf + 32));
#ifdef UTF8VALIDATE
@ -403,10 +402,6 @@ WARN_UNUSED
structurals &= ~(quote_bits & ~quote_mask);
*(u64 *)(pj.structurals + idx / 8) = structurals;
}
if(buf[len] != '\0') {
std::cerr << "Your string should be NULL terminated." << std::endl;
return false;
}
#ifdef UTF8VALIDATE
return _mm256_testz_si256(has_error, has_error);
#else

View File

@ -153,15 +153,31 @@ bool unified_machine(const u8 *buf, size_t len, ParsedJson &pj) {
case '7':
case '8':
case '9': {
if (!parse_number(buf, pj, idx, false)) {
// we need to make a copy to make sure that the string is NULL terminated.
// this is done only for JSON documents made of a sole number
char * copy = (char *) malloc(len + 1);
memcpy(copy, buf, len);
copy[len] = '\0';
if(copy == NULL) goto fail;
if (!parse_number((const u8 *)copy, pj, idx, false)) {
free(copy);
goto fail;
}
free(copy);
break;
}
case '-': {
if (!parse_number(buf, pj, idx, true)) {
// we need to make a copy to make sure that the string is NULL terminated.
// this is done only for JSON documents made of a sole number
char * copy = (char *) malloc(len + 1);
memcpy(copy, buf, len);
copy[len] = '\0';
if(copy == NULL) goto fail;
if (!parse_number((const u8 *)copy, pj, idx, true)) {
free(copy);
goto fail;
}
free(copy);
break;
}
#endif // ALLOWANYTHINGINROOT