Updating the amalgamation.

This commit is contained in:
Daniel Lemire 2019-07-09 15:21:07 -04:00
parent 7369339c88
commit a1ea37c336
3 changed files with 1208 additions and 757 deletions

View File

@ -1,4 +1,4 @@
/* auto-generated on Tue 02 Jul 2019 04:34:44 PM EDT. Do not edit! */ /* auto-generated on Tue 9 Jul 2019 15:20:54 EDT. Do not edit! */
#include <iostream> #include <iostream>
#include "simdjson.h" #include "simdjson.h"

View File

@ -1,4 +1,4 @@
/* auto-generated on Tue 02 Jul 2019 04:34:44 PM EDT. Do not edit! */ /* auto-generated on Tue 9 Jul 2019 15:20:54 EDT. Do not edit! */
#include "simdjson.h" #include "simdjson.h"
/* used for http://dmalloc.com/ Dmalloc - Debug Malloc Library */ /* used for http://dmalloc.com/ Dmalloc - Debug Malloc Library */
@ -335,10 +335,10 @@ int json_parse_dispatch(const uint8_t *buf, size_t len, ParsedJson &pj, bool rea
#ifdef __AVX2__ #ifdef __AVX2__
json_parse_functype* avx_implementation = &json_parse_implementation<instruction_set::avx2>; json_parse_functype* avx_implementation = &json_parse_implementation<instruction_set::avx2>;
#endif #endif
#ifdef __SSE4_2__ #if defined(__SSE4_2__) || (defined(_MSC_VER) && defined(_M_AMD64))
// json_parse_functype* sse4_2_implementation = &json_parse_implementation<instruction_set::sse4_2>; // not implemented yet json_parse_functype* sse4_2_implementation = &json_parse_implementation<instruction_set::sse4_2>;
#endif #endif
#ifdef __ARM_NEON #if defined(__ARM_NEON) || (defined(_MSC_VER) && defined(_M_ARM64))
json_parse_functype* neon_implementation = &json_parse_implementation<instruction_set::neon>; json_parse_functype* neon_implementation = &json_parse_implementation<instruction_set::neon>;
#endif #endif
@ -346,9 +346,9 @@ int json_parse_dispatch(const uint8_t *buf, size_t len, ParsedJson &pj, bool rea
// Should be done at runtime. Does not make any sense on preprocessor. // Should be done at runtime. Does not make any sense on preprocessor.
#ifdef __AVX2__ #ifdef __AVX2__
instruction_set best_implementation = instruction_set::avx2; instruction_set best_implementation = instruction_set::avx2;
#elif defined (__SSE4_2__) #elif defined (__SSE4_2__) || (defined(_MSC_VER) && defined(_M_AMD64))
instruction_set best_implementation = instruction_set::sse4_2; instruction_set best_implementation = instruction_set::sse4_2;
#elif defined (__ARM_NEON) #elif defined (__ARM_NEON) || (defined(_MSC_VER) && defined(_M_ARM64))
instruction_set best_implementation = instruction_set::neon; instruction_set best_implementation = instruction_set::neon;
#else #else
instruction_set best_implementation = instruction_set::none; instruction_set best_implementation = instruction_set::none;
@ -360,11 +360,13 @@ int json_parse_dispatch(const uint8_t *buf, size_t len, ParsedJson &pj, bool rea
case instruction_set::avx2 : case instruction_set::avx2 :
json_parse_ptr = avx_implementation; json_parse_ptr = avx_implementation;
break; break;
#elif defined (__SSE4_2__) #endif
/*case instruction_set::sse4_2 : #if defined(__SSE4_2__) || (defined(_MSC_VER) && defined(_M_AMD64))
case instruction_set::sse4_2 :
json_parse_ptr = sse4_2_implementation; json_parse_ptr = sse4_2_implementation;
break;*/ break;
#elif defined (__ARM_NEON) #endif
#if defined(__ARM_NEON) || (defined(_MSC_VER) && defined(_M_ARM64))
case instruction_set::neon : case instruction_set::neon :
json_parse_ptr = neon_implementation; json_parse_ptr = neon_implementation;
break; break;
@ -390,585 +392,13 @@ ParsedJson build_parsed_json(const uint8_t *buf, size_t len, bool reallocifneede
} }
return pj; return pj;
} }
}/* end file src/jsonparser.cpp */ }
/* end file src/jsonparser.cpp */
/* begin file src/stage1_find_marks.cpp */ /* begin file src/stage1_find_marks.cpp */
// File kept in case we want to reuse it soon. (many configuration files to edit) // File kept in case we want to reuse it soon. (many configuration files to edit)
/* end file src/stage1_find_marks.cpp */ /* end file src/stage1_find_marks.cpp */
/* begin file src/stage2_build_tape.cpp */ /* begin file src/stage2_build_tape.cpp */
#include <cassert> // File kept in case we want to reuse it soon. (many configuration files to edit)
#include <cstring>
#include <iostream>
#define PATH_SEP '/'
namespace simdjson {
WARN_UNUSED
really_inline bool is_valid_true_atom(const uint8_t *loc) {
uint64_t tv = *reinterpret_cast<const uint64_t *>("true ");
uint64_t mask4 = 0x00000000ffffffff;
uint32_t error = 0;
uint64_t locval; // we want to avoid unaligned 64-bit loads (undefined in C/C++)
// this can read up to 7 bytes beyond the buffer size, but we require
// SIMDJSON_PADDING of padding
static_assert(sizeof(uint64_t) - 1 <= SIMDJSON_PADDING);
std::memcpy(&locval, loc, sizeof(uint64_t));
error = (locval & mask4) ^ tv;
error |= is_not_structural_or_whitespace(loc[4]);
return error == 0;
}
WARN_UNUSED
really_inline bool is_valid_false_atom(const uint8_t *loc) {
// We have to use an integer constant because the space in the cast
// below would lead to values illegally being qualified
// uint64_t fv = *reinterpret_cast<const uint64_t *>("false ");
// using this constant (that is the same false) but nulls out the
// unused bits solves that
uint64_t fv = 0x00000065736c6166; // takes into account endianness
uint64_t mask5 = 0x000000ffffffffff;
// we can't use the 32 bit value for checking for errors otherwise
// the last character of false (it being 5 byte long!) would be
// ignored
uint64_t error = 0;
uint64_t locval; // we want to avoid unaligned 64-bit loads (undefined in C/C++)
// this can read up to 7 bytes beyond the buffer size, but we require
// SIMDJSON_PADDING of padding
static_assert(sizeof(uint64_t) - 1 <= SIMDJSON_PADDING);
std::memcpy(&locval, loc, sizeof(uint64_t));
error = (locval & mask5) ^ fv;
error |= is_not_structural_or_whitespace(loc[5]);
return error == 0;
}
WARN_UNUSED
really_inline bool is_valid_null_atom(const uint8_t *loc) {
uint64_t nv = *reinterpret_cast<const uint64_t *>("null ");
uint64_t mask4 = 0x00000000ffffffff;
uint32_t error = 0;
uint64_t locval; // we want to avoid unaligned 64-bit loads (undefined in C/C++)
// this can read up to 7 bytes beyond the buffer size, but we require
// SIMDJSON_PADDING of padding
static_assert(sizeof(uint64_t) - 1 <= SIMDJSON_PADDING);
std::memcpy(&locval, loc, sizeof(uint64_t));
error = (locval & mask4) ^ nv;
error |= is_not_structural_or_whitespace(loc[4]);
return error == 0;
}
/************
* The JSON is parsed to a tape, see the accompanying tape.md file
* for documentation.
***********/
WARN_UNUSED ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER LENIENT_MEM_SANITIZER
int unified_machine(const uint8_t *buf, size_t len, ParsedJson &pj) {
#ifndef ALLOW_SAME_PAGE_BUFFER_OVERRUN
memset((uint8_t*)buf + len, 0, SIMDJSON_PADDING); // to please valgrind
#endif
uint32_t i = 0; // index of the structural character (0,1,2,3...)
uint32_t idx; // location of the structural character in the input (buf)
uint8_t c; // used to track the (structural) character we are looking at, updated
// by UPDATE_CHAR macro
uint32_t depth = 0; // could have an arbitrary starting depth
pj.init(); // sets isvalid to false
if(pj.bytecapacity < len) {
pj.errorcode = CAPACITY;
return pj.errorcode;
}
// this macro reads the next structural character, updating idx, i and c.
#define UPDATE_CHAR() \
{ \
idx = pj.structural_indexes[i++]; \
c = buf[idx]; \
}
////////////////////////////// START STATE /////////////////////////////
#ifdef SIMDJSON_USE_COMPUTED_GOTO
pj.ret_address[depth] = &&start_continue;
#else
pj.ret_address[depth] = 's';
#endif
pj.containing_scope_offset[depth] = pj.get_current_loc();
pj.write_tape(0, 'r'); // r for root, 0 is going to get overwritten
// the root is used, if nothing else, to capture the size of the tape
depth++; // everything starts at depth = 1, depth = 0 is just for the root, the root may contain an object, an array or something else.
if (depth >= pj.depthcapacity) {
goto fail;
}
UPDATE_CHAR();
switch (c) {
case '{':
pj.containing_scope_offset[depth] = pj.get_current_loc();
#ifdef SIMDJSON_USE_COMPUTED_GOTO
pj.ret_address[depth] = &&start_continue;
#else
pj.ret_address[depth] = 's';
#endif
depth++;
if (depth >= pj.depthcapacity) {
goto fail;
}
pj.write_tape(0, c); // strangely, moving this to object_begin slows things down
goto object_begin;
case '[':
pj.containing_scope_offset[depth] = pj.get_current_loc();
#ifdef SIMDJSON_USE_COMPUTED_GOTO
pj.ret_address[depth] = &&start_continue;
#else
pj.ret_address[depth] = 's';
#endif
depth++;
if (depth >= pj.depthcapacity) {
goto fail;
}
pj.write_tape(0, c);
goto array_begin;
#define SIMDJSON_ALLOWANYTHINGINROOT
// A JSON text is a serialized value. Note that certain previous
// specifications of JSON constrained a JSON text to be an object or an
// array. Implementations that generate only objects or arrays where a
// JSON text is called for will be interoperable in the sense that all
// implementations will accept these as conforming JSON texts.
// https://tools.ietf.org/html/rfc8259
#ifdef SIMDJSON_ALLOWANYTHINGINROOT
case '"': {
if (!parse_string(buf, len, pj, depth, idx)) {
goto fail;
}
break;
}
case 't': {
// we need to make a copy to make sure that the string is space terminated.
// this only applies to the JSON document made solely of the true value.
// this will almost never be called in practice
char * copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING));
if(copy == nullptr) {
goto fail;
}
memcpy(copy, buf, len);
copy[len] = ' ';
if (!is_valid_true_atom(reinterpret_cast<const uint8_t *>(copy) + idx)) {
free(copy);
goto fail;
}
free(copy);
pj.write_tape(0, c);
break;
}
case 'f': {
// we need to make a copy to make sure that the string is space terminated.
// this only applies to the JSON document made solely of the false value.
// this will almost never be called in practice
char * copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING));
if(copy == nullptr) {
goto fail;
}
memcpy(copy, buf, len);
copy[len] = ' ';
if (!is_valid_false_atom(reinterpret_cast<const uint8_t *>(copy) + idx)) {
free(copy);
goto fail;
}
free(copy);
pj.write_tape(0, c);
break;
}
case 'n': {
// we need to make a copy to make sure that the string is space terminated.
// this only applies to the JSON document made solely of the null value.
// this will almost never be called in practice
char * copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING));
if(copy == nullptr) {
goto fail;
}
memcpy(copy, buf, len);
copy[len] = ' ';
if (!is_valid_null_atom(reinterpret_cast<const uint8_t *>(copy) + idx)) {
free(copy);
goto fail;
}
free(copy);
pj.write_tape(0, c);
break;
}
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9': {
// we need to make a copy to make sure that the string is space terminated.
// this is done only for JSON documents made of a sole number
// this will almost never be called in practice. We terminate with a space
// because we do not want to allow NULLs in the middle of a number (whereas a
// space in the middle of a number would be identified in stage 1).
char * copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING));
if(copy == nullptr) {
goto fail;
}
memcpy(copy, buf, len);
copy[len] = ' ';
if (!parse_number(reinterpret_cast<const uint8_t *>(copy), pj, idx, false)) {
free(copy);
goto fail;
}
free(copy);
break;
}
case '-': {
// we need to make a copy to make sure that the string is NULL terminated.
// this is done only for JSON documents made of a sole number
// this will almost never be called in practice
char * copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING));
if(copy == nullptr) {
goto fail;
}
memcpy(copy, buf, len);
copy[len] = '\0';
if (!parse_number(reinterpret_cast<const uint8_t *>(copy), pj, idx, true)) {
free(copy);
goto fail;
}
free(copy);
break;
}
#endif // ALLOWANYTHINGINROOT
default:
goto fail;
}
start_continue:
// the string might not be NULL terminated.
if(i + 1 == pj.n_structural_indexes) {
goto succeed;
} else {
goto fail;
}
////////////////////////////// OBJECT STATES /////////////////////////////
object_begin:
UPDATE_CHAR();
switch (c) {
case '"': {
if (!parse_string(buf, len, pj, depth, idx)) {
goto fail;
}
goto object_key_state;
}
case '}':
goto scope_end; // could also go to object_continue
default:
goto fail;
}
object_key_state:
UPDATE_CHAR();
if (c != ':') {
goto fail;
}
UPDATE_CHAR();
switch (c) {
case '"': {
if (!parse_string(buf, len, pj, depth, idx)) {
goto fail;
}
break;
}
case 't':
if (!is_valid_true_atom(buf + idx)) {
goto fail;
}
pj.write_tape(0, c);
break;
case 'f':
if (!is_valid_false_atom(buf + idx)) {
goto fail;
}
pj.write_tape(0, c);
break;
case 'n':
if (!is_valid_null_atom(buf + idx)) {
goto fail;
}
pj.write_tape(0, c);
break;
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9': {
if (!parse_number(buf, pj, idx, false)) {
goto fail;
}
break;
}
case '-': {
if (!parse_number(buf, pj, idx, true)) {
goto fail;
}
break;
}
case '{': {
pj.containing_scope_offset[depth] = pj.get_current_loc();
pj.write_tape(0, c); // here the compilers knows what c is so this gets optimized
// we have not yet encountered } so we need to come back for it
#ifdef SIMDJSON_USE_COMPUTED_GOTO
pj.ret_address[depth] = &&object_continue;
#else
pj.ret_address[depth] = 'o';
#endif
// we found an object inside an object, so we need to increment the depth
depth++;
if (depth >= pj.depthcapacity) {
goto fail;
}
goto object_begin;
}
case '[': {
pj.containing_scope_offset[depth] = pj.get_current_loc();
pj.write_tape(0, c); // here the compilers knows what c is so this gets optimized
// we have not yet encountered } so we need to come back for it
#ifdef SIMDJSON_USE_COMPUTED_GOTO
pj.ret_address[depth] = &&object_continue;
#else
pj.ret_address[depth] = 'o';
#endif
// we found an array inside an object, so we need to increment the depth
depth++;
if (depth >= pj.depthcapacity) {
goto fail;
}
goto array_begin;
}
default:
goto fail;
}
object_continue:
UPDATE_CHAR();
switch (c) {
case ',':
UPDATE_CHAR();
if (c != '"') {
goto fail;
} else {
if (!parse_string(buf, len, pj, depth, idx)) {
goto fail;
}
goto object_key_state;
}
case '}':
goto scope_end;
default:
goto fail;
}
////////////////////////////// COMMON STATE /////////////////////////////
scope_end:
// write our tape location to the header scope
depth--;
pj.write_tape(pj.containing_scope_offset[depth], c);
pj.annotate_previousloc(pj.containing_scope_offset[depth],
pj.get_current_loc());
// goto saved_state
#ifdef SIMDJSON_USE_COMPUTED_GOTO
goto *pj.ret_address[depth];
#else
if(pj.ret_address[depth] == 'a') {
goto array_continue;
} else if (pj.ret_address[depth] == 'o') {
goto object_continue;
} else goto start_continue;
#endif
////////////////////////////// ARRAY STATES /////////////////////////////
array_begin:
UPDATE_CHAR();
if (c == ']') {
goto scope_end; // could also go to array_continue
}
main_array_switch:
// we call update char on all paths in, so we can peek at c on the
// on paths that can accept a close square brace (post-, and at start)
switch (c) {
case '"': {
if (!parse_string(buf, len, pj, depth, idx)) {
goto fail;
}
break;
}
case 't':
if (!is_valid_true_atom(buf + idx)) {
goto fail;
}
pj.write_tape(0, c);
break;
case 'f':
if (!is_valid_false_atom(buf + idx)) {
goto fail;
}
pj.write_tape(0, c);
break;
case 'n':
if (!is_valid_null_atom(buf + idx)) {
goto fail;
}
pj.write_tape(0, c);
break; // goto array_continue;
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9': {
if (!parse_number(buf, pj, idx, false)) {
goto fail;
}
break; // goto array_continue;
}
case '-': {
if (!parse_number(buf, pj, idx, true)) {
goto fail;
}
break; // goto array_continue;
}
case '{': {
// we have not yet encountered ] so we need to come back for it
pj.containing_scope_offset[depth] = pj.get_current_loc();
pj.write_tape(0, c); // here the compilers knows what c is so this gets optimized
#ifdef SIMDJSON_USE_COMPUTED_GOTO
pj.ret_address[depth] = &&array_continue;
#else
pj.ret_address[depth] = 'a';
#endif
// we found an object inside an array, so we need to increment the depth
depth++;
if (depth >= pj.depthcapacity) {
goto fail;
}
goto object_begin;
}
case '[': {
// we have not yet encountered ] so we need to come back for it
pj.containing_scope_offset[depth] = pj.get_current_loc();
pj.write_tape(0, c); // here the compilers knows what c is so this gets optimized
#ifdef SIMDJSON_USE_COMPUTED_GOTO
pj.ret_address[depth] = &&array_continue;
#else
pj.ret_address[depth] = 'a';
#endif
// we found an array inside an array, so we need to increment the depth
depth++;
if (depth >= pj.depthcapacity) {
goto fail;
}
goto array_begin;
}
default:
goto fail;
}
array_continue:
UPDATE_CHAR();
switch (c) {
case ',':
UPDATE_CHAR();
goto main_array_switch;
case ']':
goto scope_end;
default:
goto fail;
}
////////////////////////////// FINAL STATES /////////////////////////////
succeed:
depth --;
if(depth != 0) {
fprintf(stderr, "internal bug\n");
abort();
}
if(pj.containing_scope_offset[depth] != 0) {
fprintf(stderr, "internal bug\n");
abort();
}
pj.annotate_previousloc(pj.containing_scope_offset[depth],
pj.get_current_loc());
pj.write_tape(pj.containing_scope_offset[depth], 'r'); // r is root
pj.isvalid = true;
pj.errorcode = SUCCESS;
return pj.errorcode;
fail:
// we do not need the next line because this is done by pj.init(), pessimistically.
// pj.isvalid = false;
// At this point in the code, we have all the time in the world.
// Note that we know exactly where we are in the document so we could,
// without any overhead on the processing code, report a specific location.
// We could even trigger special code paths to assess what happened carefully,
// all without any added cost.
if (depth >= pj.depthcapacity) {
pj.errorcode = DEPTH_ERROR;
return pj.errorcode;
}
switch(c) {
case '"':
pj.errorcode = STRING_ERROR;
return pj.errorcode;
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
case '-':
pj.errorcode = NUMBER_ERROR;
return pj.errorcode;
case 't':
pj.errorcode = T_ATOM_ERROR;
return pj.errorcode;
case 'n':
pj.errorcode = N_ATOM_ERROR;
return pj.errorcode;
case 'f':
pj.errorcode = F_ATOM_ERROR;
return pj.errorcode;
default:
break;
}
pj.errorcode = TAPE_ERROR;
return pj.errorcode;
}
int unified_machine(const char *buf, size_t len, ParsedJson &pj) {
return unified_machine(reinterpret_cast<const uint8_t*>(buf), len, pj);
}
}
/* end file src/stage2_build_tape.cpp */ /* end file src/stage2_build_tape.cpp */
/* begin file src/parsedjson.cpp */ /* begin file src/parsedjson.cpp */

File diff suppressed because it is too large Load Diff