Fixing...

This commit is contained in:
Daniel Lemire 2018-12-10 17:39:19 -05:00
parent 05636f3a1d
commit 7296d4d48b
6 changed files with 26 additions and 18 deletions

View File

@ -70,7 +70,7 @@ To simplify the engineering, we make some assumptions.
- We support UTF-8 (and thus ASCII), nothing else (no Latin, no UTF-16).
- We assume AVX2 support which is available in all recent mainstream x86 processors produced by AMD and Intel. No support for non-x86 processors is included.
- We only support GNU GCC and LLVM Clang at this time. There is no support for Microsoft Visual Studio, though it should not be difficult.
- We expect the input memory pointer to be padded (e.g., with spaces) so that it can be read entirely in blocks of 512 bits (a cache line). In practice, this means that users should allocate the memory where the JSON bytes are located using the `allocate_aligned_buffer` function or the equivalent. Of course, the data you may want to process could be in a buffer that does not have this padding. However, copying the data is relatively cheap (much cheaper than parsing JSON), and we can eventually remove this constraint.
- We expect the input memory pointer to be padded (e.g., with spaces) so that it can be read entirely in blocks of 512 bits (a cache line). In practice, this means that users may allocate the memory where the JSON bytes are located using the `allocate_padded_buffer` function or the equivalent. Of course, the data you may want to process could be in a buffer that does not have this padding. However, copying the data is relatively cheap (much cheaper than parsing JSON), and we can eventually remove this constraint.
## Features
@ -78,6 +78,7 @@ To simplify the engineering, we make some assumptions.
- We do full UTF-8 validation as part of the parsing. (Parsers like fastjson, gason and dropbox json11 do not do UTF-8 validation.)
- We fully validate the numbers. (Parsers like gason and ultrajson will accept `[0e+]` as valid JSON.)
- We validate string content for unescaped characters. (Parsers like fastjson and ultrajson accept unescaped line breaks and tabs in strings.)
- The input string is unmodified.
## Architecture

View File

@ -10,11 +10,12 @@
#include "simdjson/common_defs.h"
// low-level function
// low-level function to allocate memory with padding so we can read past the "length" bytes
// safely.
// if you must provide a pointer to some data, create it with this function:
// length is the max. size in bytes of the string
// caller is responsible for freeing the memory (free(...))
char * allocate_aligned_buffer(size_t length);
char * allocate_padded_buffer(size_t length);

View File

@ -2,7 +2,7 @@
#include <cstring>
char * allocate_aligned_buffer(size_t length) {
char * allocate_padded_buffer(size_t length) {
char *aligned_buffer;
size_t paddedlength = ROUNDUP_N(length, 64);
// allocate an extra sizeof(__m256i) just so we can always use AVX safely
@ -18,14 +18,14 @@ std::string_view get_corpus(std::string filename) {
if (fp) {
std::fseek(fp, 0, SEEK_END);
size_t len = std::ftell(fp);
char * buf = allocate_aligned_buffer(len);
char * buf = allocate_padded_buffer(len);
if(buf == NULL) {
std::fclose(fp);
throw std::runtime_error("could not allocate memory");
}
std::rewind(fp);
std::fread(buf, 1, len, fp);
//buf[len] = '\0';
//buf[len] = '\0';// no need
std::fclose(fp);
return std::string_view(buf,len);
}

View File

@ -118,7 +118,12 @@ bool flatten_indexes(size_t len, ParsedJson &pj) {
#endif
}
pj.n_structural_indexes = base;
if(base_ptr[pj.n_structural_indexes-1] > len) {
printf("Internal bug\n");
return false;
}
if(len != base_ptr[pj.n_structural_indexes-1]) {
// the string might not be NULL terminated, but we add a virtual NULL ending character.
base_ptr[pj.n_structural_indexes++] = len;
}
base_ptr[pj.n_structural_indexes] = 0; // make it safe to dereference one beyond this array

View File

@ -155,10 +155,11 @@ bool unified_machine(const u8 *buf, size_t len, ParsedJson &pj) {
case '9': {
// we need to make a copy to make sure that the string is NULL terminated.
// this is done only for JSON documents made of a sole number
char * copy = (char *) malloc(len + 1);
char * copy = (char *) malloc(len + 1 + 64);
if(copy == NULL) goto fail;
//memset(copy, 0, len + 1 + 64);
memcpy(copy, buf, len);
copy[len] = '\0';
if(copy == NULL) goto fail;
if (!parse_number((const u8 *)copy, pj, idx, false)) {
free(copy);
goto fail;
@ -169,10 +170,11 @@ bool unified_machine(const u8 *buf, size_t len, ParsedJson &pj) {
case '-': {
// we need to make a copy to make sure that the string is NULL terminated.
// this is done only for JSON documents made of a sole number
char * copy = (char *) malloc(len + 1);
char * copy = (char *) malloc(len + 1 + 64);
if(copy == NULL) goto fail;
//memset(copy, 0, len + 1 + 64);
memcpy(copy, buf, len);
copy[len] = '\0';
if(copy == NULL) goto fail;
if (!parse_number((const u8 *)copy, pj, idx, true)) {
free(copy);
goto fail;
@ -189,17 +191,14 @@ bool unified_machine(const u8 *buf, size_t len, ParsedJson &pj) {
pj.annotate_previousloc(pj.containing_scope_offset[depth],
pj.get_current_loc());
#endif // ALLOWANYTHINGINROOT
start_continue:
DEBUG_PRINTF("in start_object_close\n");
UPDATE_CHAR();
switch (c) {
case 0:
// the string might not be NULL terminated.
if(i + 1 == pj.n_structural_indexes) {
goto succeed;
default:
} else {
goto fail;
}
////////////////////////////// OBJECT STATES /////////////////////////////
object_begin:

View File

@ -25,10 +25,12 @@ size_t invalid_count;
const char *really_bad[] = {"013}", "0x14", "0e]", "0e+]", "0e+-1]"};
bool startsWith(const char *pre, const char *str) {
size_t lenpre = strlen(pre), lenstr = strlen(str);
return lenstr < lenpre ? false : strncmp(pre, str, lenpre) == 0;
size_t lenpre = strlen(pre);
return strncmp(pre, str, lenpre) == 0;
}
bool is_in_bad_list(const char *buf) {
if(buf[0] != '0') return false;
for (size_t i = 0; i < sizeof(really_bad) / sizeof(really_bad[0]); i++)
if (startsWith(really_bad[i], buf))
return true;