Stage2 refactored to simplify multiple implementations
This commit is contained in:
parent
aa78b70d69
commit
3f24879157
|
@ -157,7 +157,8 @@ int main(int argc, char *argv[]) {
|
|||
break;
|
||||
}
|
||||
unified.start();
|
||||
isok = isok && (simdjson::SUCCESS == unified_machine(p.data(), p.size(), pj));
|
||||
// The default template is simdjson::instruction_set::native.
|
||||
isok = isok && (simdjson::SUCCESS == unified_machine<>(p.data(), p.size(), pj));
|
||||
unified.end(results);
|
||||
cy2 += results[0];
|
||||
cl2 += results[1];
|
||||
|
@ -188,7 +189,7 @@ int main(int argc, char *argv[]) {
|
|||
auto start = std::chrono::steady_clock::now();
|
||||
// The default template is simdjson::instruction_set::native.
|
||||
isok = (find_structural_bits<>(p.data(), p.size(), pj) == simdjson::SUCCESS);
|
||||
isok = isok && (simdjson::SUCCESS == unified_machine(p.data(), p.size(), pj));
|
||||
isok = isok && (simdjson::SUCCESS == unified_machine<>(p.data(), p.size(), pj));
|
||||
auto end = std::chrono::steady_clock::now();
|
||||
std::chrono::duration<double> secs = end - start;
|
||||
res[i] = secs.count();
|
||||
|
|
|
@ -25,7 +25,6 @@ using json_parse_functype = int (const uint8_t *buf, size_t len, ParsedJson &pj,
|
|||
// Pointer that holds the json_parse implementation corresponding to the available SIMD instruction set
|
||||
extern json_parse_functype *json_parse_ptr;
|
||||
|
||||
|
||||
// json_parse_implementation is the generic function, it is specialized for various
|
||||
// SIMD instruction sets, e.g., as json_parse_implementation<simdjson::instruction_set::avx2>
|
||||
// or json_parse_implementation<simdjson::instruction_set::neon>
|
||||
|
@ -68,7 +67,7 @@ int json_parse_implementation(const uint8_t *buf, size_t len, ParsedJson &pj, bo
|
|||
pj.errorcode = stage1_is_ok;
|
||||
return pj.errorcode;
|
||||
}
|
||||
int res = unified_machine(buf, len, pj);
|
||||
int res = unified_machine<T>(buf, len, pj);
|
||||
if(reallocated) { aligned_free((void*)buf);}
|
||||
return res;
|
||||
}
|
||||
|
|
|
@ -6,6 +6,13 @@
|
|||
#include "simdjson/parsedjson.h"
|
||||
#include "simdjson/portability.h"
|
||||
|
||||
#ifdef JSON_TEST_NUMBERS // for unit testing
|
||||
void foundInvalidNumber(const uint8_t *buf);
|
||||
void foundInteger(int64_t result, const uint8_t *buf);
|
||||
void foundFloat(double result, const uint8_t *buf);
|
||||
#endif
|
||||
|
||||
|
||||
// Allowable floating-point values range from std::numeric_limits<double>::lowest()
|
||||
// to std::numeric_limits<double>::max(), so from
|
||||
// -1.7976e308 all the way to 1.7976e308 in binary64. The lowest non-zero
|
||||
|
@ -375,9 +382,6 @@ static never_inline bool parse_large_integer(const uint8_t *const buf,
|
|||
return is_structural_or_whitespace(*p);
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
// parse the number at buf + offset
|
||||
// define JSON_TEST_NUMBERS for unit testing
|
||||
//
|
||||
|
|
|
@ -1,16 +1,586 @@
|
|||
#ifndef SIMDJSON_STAGE2_BUILD_TAPE_H
|
||||
#define SIMDJSON_STAGE2_BUILD_TAPE_H
|
||||
|
||||
#include "simdjson/common_defs.h"
|
||||
#include <cassert>
|
||||
#include <cstring>
|
||||
#include <iostream>
|
||||
|
||||
struct ParsedJson;
|
||||
#include "simdjson/common_defs.h"
|
||||
#include "simdjson/jsoncharutils.h"
|
||||
#include "simdjson/numberparsing.h"
|
||||
#include "simdjson/parsedjson.h"
|
||||
#include "simdjson/stringparsing.h"
|
||||
#include "simdjson/simdjson.h"
|
||||
|
||||
#define PATH_SEP '/'
|
||||
|
||||
void init_state_machine();
|
||||
|
||||
WARN_UNUSED
|
||||
int unified_machine(const uint8_t *buf, size_t len, ParsedJson &pj);
|
||||
// Returns true when the four bytes at loc spell the JSON literal "true" and
// is_not_structural_or_whitespace(loc[4]) reports zero, i.e. the byte right
// after the atom is an acceptable terminator.
//
// The expected pattern is copied out of the literal with memcpy instead of
// dereferencing the literal through a uint64_t pointer: that load was
// misaligned/aliasing-unsafe and required the literal to be at least eight
// bytes long. Comparing the bytes in memory order also keeps the check
// independent of the host byte order.
//
// Reads loc[0..4]; those bytes must be readable, which the SIMDJSON_PADDING
// requirement on the input guarantees even when the atom ends the document.
really_inline bool is_valid_true_atom(const uint8_t *loc) {
  // at most four bytes past the last document byte are touched
  static_assert(sizeof(uint32_t) <= SIMDJSON_PADDING);
  uint32_t expected;
  std::memcpy(&expected, "true", sizeof(expected));
  uint32_t actual; // loaded with memcpy because loc may be unaligned
  std::memcpy(&actual, loc, sizeof(actual));
  uint32_t error = actual ^ expected; // non-zero on any differing byte
  error |= is_not_structural_or_whitespace(loc[4]);
  return error == 0;
}
|
||||
|
||||
WARN_UNUSED
|
||||
int unified_machine(const char *buf, size_t len, ParsedJson &pj);
|
||||
// Returns true when the five bytes at loc spell the JSON literal "false" and
// is_not_structural_or_whitespace(loc[5]) reports zero, i.e. the byte right
// after the atom is an acceptable terminator.
//
// The expected value and the five-byte mask are built in memory order with
// memcpy/memset rather than written as little-endian integer constants; on a
// little-endian host they evaluate to the same 0x00000065736c6166 and
// 0x000000ffffffffff as before, and the check stays correct on other byte
// orders. A 64-bit accumulator is needed because "false" is five bytes long:
// a 32-bit one would ignore its last character.
really_inline bool is_valid_false_atom(const uint8_t *loc) {
  // this can read up to 7 bytes beyond the buffer size, but we require
  // SIMDJSON_PADDING of padding
  static_assert(sizeof(uint64_t) - 1 <= SIMDJSON_PADDING);
  constexpr size_t atom_length = 5; // strlen("false")
  uint64_t expected = 0;
  std::memcpy(&expected, "false", atom_length);
  uint64_t mask = 0;
  std::memset(&mask, 0xff, atom_length); // keeps only the atom's five bytes
  uint64_t locval; // we want to avoid unaligned 64-bit loads (undefined in C/C++)
  std::memcpy(&locval, loc, sizeof(uint64_t));
  uint64_t error = (locval & mask) ^ expected;
  error |= is_not_structural_or_whitespace(loc[5]);
  return error == 0;
}
|
||||
|
||||
WARN_UNUSED
|
||||
// Returns true when the four bytes at loc spell the JSON literal "null" and
// is_not_structural_or_whitespace(loc[4]) reports zero, i.e. the byte right
// after the atom is an acceptable terminator.
//
// The expected pattern is copied out of the literal with memcpy instead of
// dereferencing the literal through a uint64_t pointer: that load was
// misaligned/aliasing-unsafe and required the literal to be at least eight
// bytes long. Comparing the bytes in memory order also keeps the check
// independent of the host byte order.
//
// Reads loc[0..4]; those bytes must be readable, which the SIMDJSON_PADDING
// requirement on the input guarantees even when the atom ends the document.
really_inline bool is_valid_null_atom(const uint8_t *loc) {
  // at most four bytes past the last document byte are touched
  static_assert(sizeof(uint32_t) <= SIMDJSON_PADDING);
  uint32_t expected;
  std::memcpy(&expected, "null", sizeof(expected));
  uint32_t actual; // loaded with memcpy because loc may be unaligned
  std::memcpy(&actual, loc, sizeof(actual));
  uint32_t error = actual ^ expected; // non-zero on any differing byte
  error |= is_not_structural_or_whitespace(loc[4]);
  return error == 0;
}
|
||||
|
||||
|
||||
/************
|
||||
* The JSON is parsed to a tape, see the accompanying tape.md file
|
||||
* for documentation.
|
||||
***********/
|
||||
// Stage 2: walks the structural characters recorded in pj.structural_indexes
// (in order, via UPDATE_CHAR) and writes the tape into pj.
//
// buf - the JSON text; every structural index is used as an offset into it.
// len - length of the document in bytes; must not exceed pj.bytecapacity.
// pj  - receives the tape, the string buffer contents and the error code.
// T   - instruction set, forwarded to parse_string<T>.
//
// Returns simdjson::SUCCESS, or one of CAPACITY, DEPTH_ERROR, STRING_ERROR,
// NUMBER_ERROR, T_ATOM_ERROR, N_ATOM_ERROR, F_ATOM_ERROR, TAPE_ERROR. The same
// value is stored in pj.errorcode. pj.isvalid is set to true only on success.
//
// The function is a goto-driven state machine. pj.ret_address[depth] records
// where to resume once the scope opened at that depth closes: a label address
// when SIMDJSON_USE_COMPUTED_GOTO is defined, otherwise one of the characters
// 's' (start_continue), 'o' (object_continue) or 'a' (array_continue).
// pj.containing_scope_offset[depth] records the tape location of the entry
// that opened the scope so that it can be annotated when the scope closes.
//
// Side effects worth knowing about:
//  - unless ALLOW_SAME_PAGE_BUFFER_OVERRUN is defined, SIMDJSON_PADDING bytes
//    starting at buf + len are zeroed through a cast that drops const, so the
//    caller's buffer must really have that many writable bytes after len;
//  - the macros UPDATE_CHAR and SIMDJSON_ALLOWANYTHINGINROOT are defined inside
//    the body and are not #undef'd afterwards.
// NOTE(review): malloc/free/fprintf/abort are used below; confirm that
// <cstdlib> and <cstdio> are reachable through this header's includes.
template<simdjson::instruction_set T = simdjson::instruction_set::native>
|
||||
WARN_UNUSED ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER LENIENT_MEM_SANITIZER
|
||||
int unified_machine(const uint8_t *buf, size_t len, ParsedJson &pj) {
|
||||
#ifndef ALLOW_SAME_PAGE_BUFFER_OVERRUN
|
||||
memset((uint8_t*)buf + len, 0, SIMDJSON_PADDING); // to please valgrind
|
||||
#endif
|
||||
uint32_t i = 0; // index of the structural character (0,1,2,3...)
|
||||
uint32_t idx; // location of the structural character in the input (buf)
|
||||
uint8_t c; // used to track the (structural) character we are looking at, updated
|
||||
// by UPDATE_CHAR macro
|
||||
uint32_t depth = 0; // could have an arbitrary starting depth
|
||||
pj.init(); // sets isvalid to false
|
||||
if(pj.bytecapacity < len) {
|
||||
pj.errorcode = simdjson::CAPACITY;
|
||||
return pj.errorcode;
|
||||
}
|
||||
// this macro reads the next structural character, updating idx, i and c.
|
||||
#define UPDATE_CHAR() \
|
||||
{ \
|
||||
idx = pj.structural_indexes[i++]; \
|
||||
c = buf[idx]; \
|
||||
}
|
||||
|
||||
|
||||
////////////////////////////// START STATE /////////////////////////////
|
||||
#ifdef SIMDJSON_USE_COMPUTED_GOTO
|
||||
pj.ret_address[depth] = &&start_continue;
|
||||
#else
|
||||
pj.ret_address[depth] = 's';
|
||||
#endif
|
||||
pj.containing_scope_offset[depth] = pj.get_current_loc();
|
||||
pj.write_tape(0, 'r'); // r for root, 0 is going to get overwritten
|
||||
// the root is used, if nothing else, to capture the size of the tape
|
||||
depth++; // everything starts at depth = 1, depth = 0 is just for the root, the root may contain an object, an array or something else.
|
||||
if (depth >= pj.depthcapacity) {
|
||||
goto fail;
|
||||
}
|
||||
|
||||
UPDATE_CHAR();
|
||||
switch (c) {
|
||||
case '{':
|
||||
pj.containing_scope_offset[depth] = pj.get_current_loc();
|
||||
#ifdef SIMDJSON_USE_COMPUTED_GOTO
|
||||
pj.ret_address[depth] = &&start_continue;
|
||||
#else
|
||||
pj.ret_address[depth] = 's';
|
||||
#endif
|
||||
depth++;
|
||||
if (depth >= pj.depthcapacity) {
|
||||
goto fail;
|
||||
}
|
||||
pj.write_tape(0, c); // strangely, moving this to object_begin slows things down
|
||||
goto object_begin;
|
||||
case '[':
|
||||
pj.containing_scope_offset[depth] = pj.get_current_loc();
|
||||
#ifdef SIMDJSON_USE_COMPUTED_GOTO
|
||||
pj.ret_address[depth] = &&start_continue;
|
||||
#else
|
||||
pj.ret_address[depth] = 's';
|
||||
#endif
|
||||
depth++;
|
||||
if (depth >= pj.depthcapacity) {
|
||||
goto fail;
|
||||
}
|
||||
pj.write_tape(0, c);
|
||||
goto array_begin;
|
||||
#define SIMDJSON_ALLOWANYTHINGINROOT
|
||||
// A JSON text is a serialized value. Note that certain previous
|
||||
// specifications of JSON constrained a JSON text to be an object or an
|
||||
// array. Implementations that generate only objects or arrays where a
|
||||
// JSON text is called for will be interoperable in the sense that all
|
||||
// implementations will accept these as conforming JSON texts.
|
||||
// https://tools.ietf.org/html/rfc8259
|
||||
#ifdef SIMDJSON_ALLOWANYTHINGINROOT
|
||||
case '"': {
|
||||
if (!parse_string<T>(buf, len, pj, depth, idx)) {
|
||||
goto fail;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case 't': {
|
||||
// we need to make a copy to make sure that the string is space terminated.
|
||||
// this only applies to the JSON document made solely of the true value.
|
||||
// this will almost never be called in practice
|
||||
char * copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING));
|
||||
if(copy == nullptr) {
|
||||
goto fail;
|
||||
}
|
||||
memcpy(copy, buf, len);
|
||||
copy[len] = ' ';
|
||||
if (!is_valid_true_atom(reinterpret_cast<const uint8_t *>(copy) + idx)) {
|
||||
free(copy);
|
||||
goto fail;
|
||||
}
|
||||
free(copy);
|
||||
pj.write_tape(0, c);
|
||||
break;
|
||||
}
|
||||
case 'f': {
|
||||
// we need to make a copy to make sure that the string is space terminated.
|
||||
// this only applies to the JSON document made solely of the false value.
|
||||
// this will almost never be called in practice
|
||||
char * copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING));
|
||||
if(copy == nullptr) {
|
||||
goto fail;
|
||||
}
|
||||
memcpy(copy, buf, len);
|
||||
copy[len] = ' ';
|
||||
if (!is_valid_false_atom(reinterpret_cast<const uint8_t *>(copy) + idx)) {
|
||||
free(copy);
|
||||
goto fail;
|
||||
}
|
||||
free(copy);
|
||||
pj.write_tape(0, c);
|
||||
break;
|
||||
}
|
||||
case 'n': {
|
||||
// we need to make a copy to make sure that the string is space terminated.
|
||||
// this only applies to the JSON document made solely of the null value.
|
||||
// this will almost never be called in practice
|
||||
char * copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING));
|
||||
if(copy == nullptr) {
|
||||
goto fail;
|
||||
}
|
||||
memcpy(copy, buf, len);
|
||||
copy[len] = ' ';
|
||||
if (!is_valid_null_atom(reinterpret_cast<const uint8_t *>(copy) + idx)) {
|
||||
free(copy);
|
||||
goto fail;
|
||||
}
|
||||
free(copy);
|
||||
pj.write_tape(0, c);
|
||||
break;
|
||||
}
|
||||
case '0':
|
||||
case '1':
|
||||
case '2':
|
||||
case '3':
|
||||
case '4':
|
||||
case '5':
|
||||
case '6':
|
||||
case '7':
|
||||
case '8':
|
||||
case '9': {
|
||||
// we need to make a copy to make sure that the string is space terminated.
|
||||
// this is done only for JSON documents made of a sole number
|
||||
// this will almost never be called in practice. We terminate with a space
|
||||
// because we do not want to allow NULLs in the middle of a number (whereas a
|
||||
// space in the middle of a number would be identified in stage 1).
|
||||
char * copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING));
|
||||
if(copy == nullptr) {
|
||||
goto fail;
|
||||
}
|
||||
memcpy(copy, buf, len);
|
||||
copy[len] = ' ';
|
||||
if (!parse_number(reinterpret_cast<const uint8_t *>(copy), pj, idx, false)) {
|
||||
free(copy);
|
||||
goto fail;
|
||||
}
|
||||
free(copy);
|
||||
break;
|
||||
}
|
||||
case '-': {
|
||||
// we need to make a copy to make sure that the string is NULL terminated.
|
||||
// this is done only for JSON documents made of a sole number
|
||||
// this will almost never be called in practice
|
||||
// NOTE(review): unlike the digit case above, this copy is terminated with
|
||||
// '\0' rather than a space - confirm the difference is intended.
char * copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING));
|
||||
if(copy == nullptr) {
|
||||
goto fail;
|
||||
}
|
||||
memcpy(copy, buf, len);
|
||||
copy[len] = '\0';
|
||||
if (!parse_number(reinterpret_cast<const uint8_t *>(copy), pj, idx, true)) {
|
||||
free(copy);
|
||||
goto fail;
|
||||
}
|
||||
free(copy);
|
||||
break;
|
||||
}
|
||||
#endif // ALLOWANYTHINGINROOT
|
||||
default:
|
||||
goto fail;
|
||||
}
|
||||
start_continue:
|
||||
// the string might not be NULL terminated.
|
||||
// the root value is complete: succeed only if exactly one structural index
|
||||
// is left unread, anything else is reported as a failure.
if(i + 1 == pj.n_structural_indexes) {
|
||||
goto succeed;
|
||||
} else {
|
||||
goto fail;
|
||||
}
|
||||
////////////////////////////// OBJECT STATES /////////////////////////////
|
||||
|
||||
object_begin:
|
||||
UPDATE_CHAR();
|
||||
switch (c) {
|
||||
case '"': {
|
||||
if (!parse_string<T>(buf, len, pj, depth, idx)) {
|
||||
goto fail;
|
||||
}
|
||||
goto object_key_state;
|
||||
}
|
||||
case '}':
|
||||
goto scope_end; // could also go to object_continue
|
||||
default:
|
||||
goto fail;
|
||||
}
|
||||
|
||||
object_key_state:
|
||||
UPDATE_CHAR();
|
||||
if (c != ':') {
|
||||
goto fail;
|
||||
}
|
||||
UPDATE_CHAR();
|
||||
switch (c) {
|
||||
case '"': {
|
||||
if (!parse_string<T>(buf, len, pj, depth, idx)) {
|
||||
goto fail;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case 't':
|
||||
if (!is_valid_true_atom(buf + idx)) {
|
||||
goto fail;
|
||||
}
|
||||
pj.write_tape(0, c);
|
||||
break;
|
||||
case 'f':
|
||||
if (!is_valid_false_atom(buf + idx)) {
|
||||
goto fail;
|
||||
}
|
||||
pj.write_tape(0, c);
|
||||
break;
|
||||
case 'n':
|
||||
if (!is_valid_null_atom(buf + idx)) {
|
||||
goto fail;
|
||||
}
|
||||
pj.write_tape(0, c);
|
||||
break;
|
||||
case '0':
|
||||
case '1':
|
||||
case '2':
|
||||
case '3':
|
||||
case '4':
|
||||
case '5':
|
||||
case '6':
|
||||
case '7':
|
||||
case '8':
|
||||
case '9': {
|
||||
if (!parse_number(buf, pj, idx, false)) {
|
||||
goto fail;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case '-': {
|
||||
if (!parse_number(buf, pj, idx, true)) {
|
||||
goto fail;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case '{': {
|
||||
pj.containing_scope_offset[depth] = pj.get_current_loc();
|
||||
pj.write_tape(0, c); // here the compiler knows what c is so this gets optimized
|
||||
// we have not yet encountered } so we need to come back for it
|
||||
#ifdef SIMDJSON_USE_COMPUTED_GOTO
|
||||
pj.ret_address[depth] = &&object_continue;
|
||||
#else
|
||||
pj.ret_address[depth] = 'o';
|
||||
#endif
|
||||
// we found an object inside an object, so we need to increment the depth
|
||||
depth++;
|
||||
if (depth >= pj.depthcapacity) {
|
||||
goto fail;
|
||||
}
|
||||
|
||||
goto object_begin;
|
||||
}
|
||||
case '[': {
|
||||
pj.containing_scope_offset[depth] = pj.get_current_loc();
|
||||
pj.write_tape(0, c); // here the compiler knows what c is so this gets optimized
|
||||
// we have not yet encountered } so we need to come back for it
|
||||
#ifdef SIMDJSON_USE_COMPUTED_GOTO
|
||||
pj.ret_address[depth] = &&object_continue;
|
||||
#else
|
||||
pj.ret_address[depth] = 'o';
|
||||
#endif
|
||||
// we found an array inside an object, so we need to increment the depth
|
||||
depth++;
|
||||
if (depth >= pj.depthcapacity) {
|
||||
goto fail;
|
||||
}
|
||||
goto array_begin;
|
||||
}
|
||||
default:
|
||||
goto fail;
|
||||
}
|
||||
|
||||
object_continue:
|
||||
UPDATE_CHAR();
|
||||
switch (c) {
|
||||
case ',':
|
||||
UPDATE_CHAR();
|
||||
if (c != '"') {
|
||||
goto fail;
|
||||
} else {
|
||||
if (!parse_string<T>(buf, len, pj, depth, idx)) {
|
||||
goto fail;
|
||||
}
|
||||
goto object_key_state;
|
||||
}
|
||||
case '}':
|
||||
goto scope_end;
|
||||
default:
|
||||
goto fail;
|
||||
}
|
||||
|
||||
////////////////////////////// COMMON STATE /////////////////////////////
|
||||
|
||||
scope_end:
|
||||
// write our tape location to the header scope
|
||||
depth--;
|
||||
pj.write_tape(pj.containing_scope_offset[depth], c);
|
||||
pj.annotate_previousloc(pj.containing_scope_offset[depth],
|
||||
pj.get_current_loc());
|
||||
// goto saved_state
|
||||
#ifdef SIMDJSON_USE_COMPUTED_GOTO
|
||||
goto *pj.ret_address[depth];
|
||||
#else
|
||||
if(pj.ret_address[depth] == 'a') {
|
||||
goto array_continue;
|
||||
} else if (pj.ret_address[depth] == 'o') {
|
||||
goto object_continue;
|
||||
} else goto start_continue;
|
||||
#endif
|
||||
|
||||
////////////////////////////// ARRAY STATES /////////////////////////////
|
||||
array_begin:
|
||||
UPDATE_CHAR();
|
||||
if (c == ']') {
|
||||
goto scope_end; // could also go to array_continue
|
||||
}
|
||||
|
||||
main_array_switch:
|
||||
// we call UPDATE_CHAR on all paths in, so we can peek at c on the
|
||||
// paths that can accept a close square brace (post-, and at start)
|
||||
switch (c) {
|
||||
case '"': {
|
||||
if (!parse_string<T>(buf, len, pj, depth, idx)) {
|
||||
goto fail;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case 't':
|
||||
if (!is_valid_true_atom(buf + idx)) {
|
||||
goto fail;
|
||||
}
|
||||
pj.write_tape(0, c);
|
||||
break;
|
||||
case 'f':
|
||||
if (!is_valid_false_atom(buf + idx)) {
|
||||
goto fail;
|
||||
}
|
||||
pj.write_tape(0, c);
|
||||
break;
|
||||
case 'n':
|
||||
if (!is_valid_null_atom(buf + idx)) {
|
||||
goto fail;
|
||||
}
|
||||
pj.write_tape(0, c);
|
||||
break; // goto array_continue;
|
||||
|
||||
case '0':
|
||||
case '1':
|
||||
case '2':
|
||||
case '3':
|
||||
case '4':
|
||||
case '5':
|
||||
case '6':
|
||||
case '7':
|
||||
case '8':
|
||||
case '9': {
|
||||
if (!parse_number(buf, pj, idx, false)) {
|
||||
goto fail;
|
||||
}
|
||||
break; // goto array_continue;
|
||||
}
|
||||
case '-': {
|
||||
if (!parse_number(buf, pj, idx, true)) {
|
||||
goto fail;
|
||||
}
|
||||
break; // goto array_continue;
|
||||
}
|
||||
case '{': {
|
||||
// we have not yet encountered ] so we need to come back for it
|
||||
pj.containing_scope_offset[depth] = pj.get_current_loc();
|
||||
pj.write_tape(0, c); // here the compiler knows what c is so this gets optimized
|
||||
#ifdef SIMDJSON_USE_COMPUTED_GOTO
|
||||
pj.ret_address[depth] = &&array_continue;
|
||||
#else
|
||||
pj.ret_address[depth] = 'a';
|
||||
#endif
|
||||
// we found an object inside an array, so we need to increment the depth
|
||||
depth++;
|
||||
if (depth >= pj.depthcapacity) {
|
||||
goto fail;
|
||||
}
|
||||
|
||||
goto object_begin;
|
||||
}
|
||||
case '[': {
|
||||
// we have not yet encountered ] so we need to come back for it
|
||||
pj.containing_scope_offset[depth] = pj.get_current_loc();
|
||||
pj.write_tape(0, c); // here the compiler knows what c is so this gets optimized
|
||||
#ifdef SIMDJSON_USE_COMPUTED_GOTO
|
||||
pj.ret_address[depth] = &&array_continue;
|
||||
#else
|
||||
pj.ret_address[depth] = 'a';
|
||||
#endif
|
||||
// we found an array inside an array, so we need to increment the depth
|
||||
depth++;
|
||||
if (depth >= pj.depthcapacity) {
|
||||
goto fail;
|
||||
}
|
||||
goto array_begin;
|
||||
}
|
||||
default:
|
||||
goto fail;
|
||||
}
|
||||
|
||||
array_continue:
|
||||
UPDATE_CHAR();
|
||||
switch (c) {
|
||||
case ',':
|
||||
UPDATE_CHAR();
|
||||
goto main_array_switch;
|
||||
case ']':
|
||||
goto scope_end;
|
||||
default:
|
||||
goto fail;
|
||||
}
|
||||
|
||||
////////////////////////////// FINAL STATES /////////////////////////////
|
||||
|
||||
succeed:
|
||||
depth --;
|
||||
// both checks below guard internal invariants and abort the process when
|
||||
// they do not hold.
if(depth != 0) {
|
||||
fprintf(stderr, "internal bug\n");
|
||||
abort();
|
||||
}
|
||||
if(pj.containing_scope_offset[depth] != 0) {
|
||||
fprintf(stderr, "internal bug\n");
|
||||
abort();
|
||||
}
|
||||
pj.annotate_previousloc(pj.containing_scope_offset[depth],
|
||||
pj.get_current_loc());
|
||||
pj.write_tape(pj.containing_scope_offset[depth], 'r'); // r is root
|
||||
|
||||
pj.isvalid = true;
|
||||
pj.errorcode = simdjson::SUCCESS;
|
||||
return pj.errorcode;
|
||||
fail:
|
||||
// we do not need the next line because this is done by pj.init(), pessimistically.
|
||||
// pj.isvalid = false;
|
||||
// At this point in the code, we have all the time in the world.
|
||||
// Note that we know exactly where we are in the document so we could,
|
||||
// without any overhead on the processing code, report a specific location.
|
||||
// We could even trigger special code paths to assess what happened carefully,
|
||||
// all without any added cost.
|
||||
// the depth test comes first; otherwise the error code is chosen from the
|
||||
// last structural character that was read into c.
if (depth >= pj.depthcapacity) {
|
||||
pj.errorcode = simdjson::DEPTH_ERROR;
|
||||
return pj.errorcode;
|
||||
}
|
||||
switch(c) {
|
||||
case '"':
|
||||
pj.errorcode = simdjson::STRING_ERROR;
|
||||
return pj.errorcode;
|
||||
case '0':
|
||||
case '1':
|
||||
case '2':
|
||||
case '3':
|
||||
case '4':
|
||||
case '5':
|
||||
case '6':
|
||||
case '7':
|
||||
case '8':
|
||||
case '9':
|
||||
case '-':
|
||||
pj.errorcode = simdjson::NUMBER_ERROR;
|
||||
return pj.errorcode;
|
||||
case 't':
|
||||
pj.errorcode = simdjson::T_ATOM_ERROR;
|
||||
return pj.errorcode;
|
||||
case 'n':
|
||||
pj.errorcode = simdjson::N_ATOM_ERROR;
|
||||
return pj.errorcode;
|
||||
case 'f':
|
||||
pj.errorcode = simdjson::F_ATOM_ERROR;
|
||||
return pj.errorcode;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
pj.errorcode = simdjson::TAPE_ERROR;
|
||||
return pj.errorcode;
|
||||
}
|
||||
|
||||
template<simdjson::instruction_set T = simdjson::instruction_set::native>
|
||||
int unified_machine(const char *buf, size_t len, ParsedJson &pj) {
|
||||
return unified_machine<T>(reinterpret_cast<const uint8_t*>(buf), len, pj);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
|
|
@ -5,6 +5,11 @@
|
|||
#include "simdjson/jsoncharutils.h"
|
||||
#include "simdjson/parsedjson.h"
|
||||
|
||||
#ifdef JSON_TEST_STRINGS
|
||||
void foundString(const uint8_t *buf, const uint8_t *parsed_begin, const uint8_t *parsed_end);
|
||||
void foundBadString(const uint8_t *buf);
|
||||
#endif
|
||||
|
||||
|
||||
// begin copypasta
|
||||
// These chars yield themselves: " \ /
|
||||
|
@ -76,19 +81,19 @@ really_inline bool handle_unicode_codepoint(const uint8_t **src_ptr, uint8_t **d
|
|||
#include <arm_neon.h>
|
||||
#endif
|
||||
|
||||
WARN_UNUSED ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER LENIENT_MEM_SANITIZER
|
||||
really_inline bool parse_string(UNUSED const uint8_t *buf, UNUSED size_t len,
|
||||
ParsedJson &pj, UNUSED const uint32_t depth, UNUSED uint32_t offset) {
|
||||
#ifdef SIMDJSON_SKIPSTRINGPARSING // for performance analysis, it is sometimes useful to skip parsing
|
||||
pj.write_tape(0, '"');// don't bother with the string parsing at all
|
||||
return true; // always succeeds
|
||||
#else
|
||||
pj.write_tape(pj.current_string_buf_loc - pj.string_buf, '"');
|
||||
const uint8_t *src = &buf[offset + 1]; // we know that buf at offset is a "
|
||||
uint8_t *dst = pj.current_string_buf_loc + sizeof(uint32_t);
|
||||
const uint8_t *const start_of_string = dst;
|
||||
while (1) {
|
||||
// Holds the locations of the backslashes and quotes found in one scanned block
// of string bytes, as two bitmasks. It is the return type of
// find_bs_bits_and_quote_bits. parse_string takes trailingzeroes() of a mask
// to get the distance to the first backslash or quote.
struct parse_string_helper {
|
||||
uint32_t bs_bits; // bits set where the scanned bytes compare equal to '\\'
|
||||
uint32_t quote_bits; // bits set where the scanned bytes compare equal to '"'
|
||||
};
|
||||
|
||||
// Finds where the backslashes and quotes are located.
|
||||
template<simdjson::instruction_set>
|
||||
parse_string_helper find_bs_bits_and_quote_bits(const uint8_t *src, uint8_t *dst);
|
||||
|
||||
#ifdef __AVX2__
|
||||
template<> really_inline
|
||||
parse_string_helper find_bs_bits_and_quote_bits<simdjson::instruction_set::avx2> (const uint8_t *src, uint8_t *dst) {
|
||||
// this can read up to 31 bytes beyond the buffer size, but we require
|
||||
// SIMDJSON_PADDING of padding
|
||||
static_assert(sizeof(__m256i) - 1 <= SIMDJSON_PADDING);
|
||||
|
@ -96,12 +101,22 @@ really_inline bool parse_string(UNUSED const uint8_t *buf, UNUSED size_t len,
|
|||
// store to dest unconditionally - we can overwrite the bits we don't like
|
||||
// later
|
||||
_mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), v);
|
||||
auto bs_bits =
|
||||
static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi8(v, _mm256_set1_epi8('\\'))));
|
||||
//auto bs_bits =
|
||||
// static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi8(v, _mm256_set1_epi8('\\'))));
|
||||
auto quote_mask = _mm256_cmpeq_epi8(v, _mm256_set1_epi8('"'));
|
||||
auto quote_bits =
|
||||
static_cast<uint32_t>(_mm256_movemask_epi8(quote_mask));
|
||||
#else
|
||||
//uint32_t quote_bits =
|
||||
// static_cast<uint32_t>(_mm256_movemask_epi8(quote_mask));
|
||||
|
||||
return {
|
||||
static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi8(v, _mm256_set1_epi8('\\')))), // bs_bits
|
||||
static_cast<uint32_t>(_mm256_movemask_epi8(quote_mask)) // quote_bits
|
||||
};
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef __ARM_NEON
|
||||
template<> really_inline
|
||||
parse_string_helper find_bs_bits_and_quote_bits<simdjson::instruction_set::neon> (const uint8_t *src, uint8_t *dst) {
|
||||
// this can read up to 31 bytes beyond the buffer size, but we require
|
||||
// SIMDJSON_PADDING of padding
|
||||
static_assert(2 * sizeof(uint8x16_t) - 1 <= SIMDJSON_PADDING);
|
||||
|
@ -128,14 +143,32 @@ really_inline bool parse_string(UNUSED const uint8_t *buf, UNUSED size_t len,
|
|||
uint8x16_t sum1 = vpaddq_u8(cmp_qt_0, cmp_qt_1);
|
||||
sum0 = vpaddq_u8(sum0, sum1);
|
||||
sum0 = vpaddq_u8(sum0, sum0);
|
||||
auto bs_bits = vgetq_lane_u32(vreinterpretq_u32_u8(sum0), 0);
|
||||
auto quote_bits = vgetq_lane_u32(vreinterpretq_u32_u8(sum0), 1);
|
||||
return {
|
||||
vgetq_lane_u32(vreinterpretq_u32_u8(sum0), 0), // bs_bits
|
||||
vgetq_lane_u32(vreinterpretq_u32_u8(sum0), 1) // quote_bits
|
||||
};
|
||||
}
|
||||
#endif
|
||||
if(((bs_bits - 1) & quote_bits) != 0 ) {
|
||||
|
||||
template<simdjson::instruction_set T>
|
||||
WARN_UNUSED ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER LENIENT_MEM_SANITIZER really_inline
|
||||
bool parse_string(UNUSED const uint8_t *buf, UNUSED size_t len,
|
||||
ParsedJson &pj, UNUSED const uint32_t depth, UNUSED uint32_t offset) {
|
||||
#ifdef SIMDJSON_SKIPSTRINGPARSING // for performance analysis, it is sometimes useful to skip parsing
|
||||
pj.write_tape(0, '"');// don't bother with the string parsing at all
|
||||
return true; // always succeeds
|
||||
#else
|
||||
pj.write_tape(pj.current_string_buf_loc - pj.string_buf, '"');
|
||||
const uint8_t *src = &buf[offset + 1]; // we know that buf at offset is a "
|
||||
uint8_t *dst = pj.current_string_buf_loc + sizeof(uint32_t);
|
||||
const uint8_t *const start_of_string = dst;
|
||||
while (1) {
|
||||
parse_string_helper helper = find_bs_bits_and_quote_bits<T>(src, dst);
|
||||
if(((helper.bs_bits - 1) & helper.quote_bits) != 0 ) {
|
||||
// we encountered quotes first. Move dst to point to quotes and exit
|
||||
|
||||
// find out where the quote is...
|
||||
uint32_t quote_dist = trailingzeroes(quote_bits);
|
||||
uint32_t quote_dist = trailingzeroes(helper.quote_bits);
|
||||
|
||||
// NULL termination is still handy if you expect all your strings to be NULL terminated?
|
||||
// It comes at a small cost
|
||||
|
@ -158,9 +191,9 @@ really_inline bool parse_string(UNUSED const uint8_t *buf, UNUSED size_t len,
|
|||
#endif // JSON_TEST_STRINGS
|
||||
return true;
|
||||
}
|
||||
if(((quote_bits - 1) & bs_bits ) != 0 ) {
|
||||
if(((helper.quote_bits - 1) & helper.bs_bits ) != 0 ) {
|
||||
// find out where the backslash is
|
||||
uint32_t bs_dist = trailingzeroes(bs_bits);
|
||||
uint32_t bs_dist = trailingzeroes(helper.bs_bits);
|
||||
uint8_t escape_char = src[bs_dist + 1];
|
||||
// we encountered backslash first. Handle backslash
|
||||
if (escape_char == 'u') {
|
||||
|
|
|
@ -1,578 +1 @@
|
|||
#include <cassert>
|
||||
#include <cstring>
|
||||
|
||||
#include "simdjson/common_defs.h"
|
||||
#include "simdjson/jsoncharutils.h"
|
||||
#include "simdjson/numberparsing.h"
|
||||
#include "simdjson/parsedjson.h"
|
||||
#include "simdjson/stringparsing.h"
|
||||
#include "simdjson/simdjson.h"
|
||||
|
||||
#include <iostream>
|
||||
#define PATH_SEP '/'
|
||||
|
||||
|
||||
WARN_UNUSED
|
||||
// Returns true when the four bytes at loc spell the JSON literal "true" and
// is_not_structural_or_whitespace(loc[4]) reports zero, i.e. the byte right
// after the atom is an acceptable terminator.
//
// The expected pattern is copied out of the literal with memcpy instead of
// dereferencing the literal through a uint64_t pointer: that load was
// misaligned/aliasing-unsafe and required the literal to be at least eight
// bytes long. Comparing the bytes in memory order also keeps the check
// independent of the host byte order.
//
// Reads loc[0..4]; those bytes must be readable, which the SIMDJSON_PADDING
// requirement on the input guarantees even when the atom ends the document.
really_inline bool is_valid_true_atom(const uint8_t *loc) {
  // at most four bytes past the last document byte are touched
  static_assert(sizeof(uint32_t) <= SIMDJSON_PADDING);
  uint32_t expected;
  std::memcpy(&expected, "true", sizeof(expected));
  uint32_t actual; // loaded with memcpy because loc may be unaligned
  std::memcpy(&actual, loc, sizeof(actual));
  uint32_t error = actual ^ expected; // non-zero on any differing byte
  error |= is_not_structural_or_whitespace(loc[4]);
  return error == 0;
}
|
||||
|
||||
WARN_UNUSED
|
||||
// Returns true when the five bytes at loc spell the JSON literal "false" and
// is_not_structural_or_whitespace(loc[5]) reports zero, i.e. the byte right
// after the atom is an acceptable terminator.
//
// The expected value and the five-byte mask are built in memory order with
// memcpy/memset rather than written as little-endian integer constants; on a
// little-endian host they evaluate to the same 0x00000065736c6166 and
// 0x000000ffffffffff as before, and the check stays correct on other byte
// orders. A 64-bit accumulator is needed because "false" is five bytes long:
// a 32-bit one would ignore its last character.
really_inline bool is_valid_false_atom(const uint8_t *loc) {
  // this can read up to 7 bytes beyond the buffer size, but we require
  // SIMDJSON_PADDING of padding
  static_assert(sizeof(uint64_t) - 1 <= SIMDJSON_PADDING);
  constexpr size_t atom_length = 5; // strlen("false")
  uint64_t expected = 0;
  std::memcpy(&expected, "false", atom_length);
  uint64_t mask = 0;
  std::memset(&mask, 0xff, atom_length); // keeps only the atom's five bytes
  uint64_t locval; // we want to avoid unaligned 64-bit loads (undefined in C/C++)
  std::memcpy(&locval, loc, sizeof(uint64_t));
  uint64_t error = (locval & mask) ^ expected;
  error |= is_not_structural_or_whitespace(loc[5]);
  return error == 0;
}
|
||||
|
||||
WARN_UNUSED
|
||||
// Returns true when the four bytes at loc spell the JSON literal "null" and
// is_not_structural_or_whitespace(loc[4]) reports zero, i.e. the byte right
// after the atom is an acceptable terminator.
//
// The expected pattern is copied out of the literal with memcpy instead of
// dereferencing the literal through a uint64_t pointer: that load was
// misaligned/aliasing-unsafe and required the literal to be at least eight
// bytes long. Comparing the bytes in memory order also keeps the check
// independent of the host byte order.
//
// Reads loc[0..4]; those bytes must be readable, which the SIMDJSON_PADDING
// requirement on the input guarantees even when the atom ends the document.
really_inline bool is_valid_null_atom(const uint8_t *loc) {
  // at most four bytes past the last document byte are touched
  static_assert(sizeof(uint32_t) <= SIMDJSON_PADDING);
  uint32_t expected;
  std::memcpy(&expected, "null", sizeof(expected));
  uint32_t actual; // loaded with memcpy because loc may be unaligned
  std::memcpy(&actual, loc, sizeof(actual));
  uint32_t error = actual ^ expected; // non-zero on any differing byte
  error |= is_not_structural_or_whitespace(loc[4]);
  return error == 0;
}
|
||||
|
||||
|
||||
/************
|
||||
* The JSON is parsed to a tape, see the accompanying tape.md file
|
||||
* for documentation.
|
||||
***********/
|
||||
WARN_UNUSED ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER LENIENT_MEM_SANITIZER
|
||||
int unified_machine(const uint8_t *buf, size_t len, ParsedJson &pj) {
|
||||
#ifndef ALLOW_SAME_PAGE_BUFFER_OVERRUN
|
||||
memset((uint8_t*)buf + len, 0, SIMDJSON_PADDING); // to please valgrind
|
||||
#endif
|
||||
uint32_t i = 0; // index of the structural character (0,1,2,3...)
|
||||
uint32_t idx; // location of the structural character in the input (buf)
|
||||
uint8_t c; // used to track the (structural) character we are looking at, updated
|
||||
// by UPDATE_CHAR macro
|
||||
uint32_t depth = 0; // could have an arbitrary starting depth
|
||||
pj.init(); // sets isvalid to false
|
||||
if(pj.bytecapacity < len) {
|
||||
pj.errorcode = simdjson::CAPACITY;
|
||||
return pj.errorcode;
|
||||
}
|
||||
// this macro reads the next structural character, updating idx, i and c.
|
||||
#define UPDATE_CHAR() \
|
||||
{ \
|
||||
idx = pj.structural_indexes[i++]; \
|
||||
c = buf[idx]; \
|
||||
}
|
||||
|
||||
|
||||
////////////////////////////// START STATE /////////////////////////////
|
||||
#ifdef SIMDJSON_USE_COMPUTED_GOTO
|
||||
pj.ret_address[depth] = &&start_continue;
|
||||
#else
|
||||
pj.ret_address[depth] = 's';
|
||||
#endif
|
||||
pj.containing_scope_offset[depth] = pj.get_current_loc();
|
||||
pj.write_tape(0, 'r'); // r for root, 0 is going to get overwritten
|
||||
// the root is used, if nothing else, to capture the size of the tape
|
||||
depth++; // everything starts at depth = 1, depth = 0 is just for the root, the root may contain an object, an array or something else.
|
||||
if (depth >= pj.depthcapacity) {
|
||||
goto fail;
|
||||
}
|
||||
|
||||
UPDATE_CHAR();
|
||||
switch (c) {
|
||||
case '{':
|
||||
pj.containing_scope_offset[depth] = pj.get_current_loc();
|
||||
#ifdef SIMDJSON_USE_COMPUTED_GOTO
|
||||
pj.ret_address[depth] = &&start_continue;
|
||||
#else
|
||||
pj.ret_address[depth] = 's';
|
||||
#endif
|
||||
depth++;
|
||||
if (depth >= pj.depthcapacity) {
|
||||
goto fail;
|
||||
}
|
||||
pj.write_tape(0, c); // strangely, moving this to object_begin slows things down
|
||||
goto object_begin;
|
||||
case '[':
|
||||
pj.containing_scope_offset[depth] = pj.get_current_loc();
|
||||
#ifdef SIMDJSON_USE_COMPUTED_GOTO
|
||||
pj.ret_address[depth] = &&start_continue;
|
||||
#else
|
||||
pj.ret_address[depth] = 's';
|
||||
#endif
|
||||
depth++;
|
||||
if (depth >= pj.depthcapacity) {
|
||||
goto fail;
|
||||
}
|
||||
pj.write_tape(0, c);
|
||||
goto array_begin;
|
||||
#define SIMDJSON_ALLOWANYTHINGINROOT
|
||||
// A JSON text is a serialized value. Note that certain previous
|
||||
// specifications of JSON constrained a JSON text to be an object or an
|
||||
// array. Implementations that generate only objects or arrays where a
|
||||
// JSON text is called for will be interoperable in the sense that all
|
||||
// implementations will accept these as conforming JSON texts.
|
||||
// https://tools.ietf.org/html/rfc8259
|
||||
#ifdef SIMDJSON_ALLOWANYTHINGINROOT
|
||||
case '"': {
|
||||
if (!parse_string(buf, len, pj, depth, idx)) {
|
||||
goto fail;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case 't': {
|
||||
// we need to make a copy to make sure that the string is space terminated.
|
||||
// this only applies to the JSON document made solely of the true value.
|
||||
// this will almost never be called in practice
|
||||
char * copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING));
|
||||
if(copy == nullptr) {
|
||||
goto fail;
|
||||
}
|
||||
memcpy(copy, buf, len);
|
||||
copy[len] = ' ';
|
||||
if (!is_valid_true_atom(reinterpret_cast<const uint8_t *>(copy) + idx)) {
|
||||
free(copy);
|
||||
goto fail;
|
||||
}
|
||||
free(copy);
|
||||
pj.write_tape(0, c);
|
||||
break;
|
||||
}
|
||||
case 'f': {
|
||||
// we need to make a copy to make sure that the string is space terminated.
|
||||
// this only applies to the JSON document made solely of the false value.
|
||||
// this will almost never be called in practice
|
||||
char * copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING));
|
||||
if(copy == nullptr) {
|
||||
goto fail;
|
||||
}
|
||||
memcpy(copy, buf, len);
|
||||
copy[len] = ' ';
|
||||
if (!is_valid_false_atom(reinterpret_cast<const uint8_t *>(copy) + idx)) {
|
||||
free(copy);
|
||||
goto fail;
|
||||
}
|
||||
free(copy);
|
||||
pj.write_tape(0, c);
|
||||
break;
|
||||
}
|
||||
case 'n': {
|
||||
// we need to make a copy to make sure that the string is space terminated.
|
||||
// this only applies to the JSON document made solely of the null value.
|
||||
// this will almost never be called in practice
|
||||
char * copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING));
|
||||
if(copy == nullptr) {
|
||||
goto fail;
|
||||
}
|
||||
memcpy(copy, buf, len);
|
||||
copy[len] = ' ';
|
||||
if (!is_valid_null_atom(reinterpret_cast<const uint8_t *>(copy) + idx)) {
|
||||
free(copy);
|
||||
goto fail;
|
||||
}
|
||||
free(copy);
|
||||
pj.write_tape(0, c);
|
||||
break;
|
||||
}
|
||||
case '0':
|
||||
case '1':
|
||||
case '2':
|
||||
case '3':
|
||||
case '4':
|
||||
case '5':
|
||||
case '6':
|
||||
case '7':
|
||||
case '8':
|
||||
case '9': {
|
||||
// we need to make a copy to make sure that the string is space terminated.
|
||||
// this is done only for JSON documents made of a sole number
|
||||
// this will almost never be called in practice. We terminate with a space
|
||||
// because we do not want to allow NULLs in the middle of a number (whereas a
|
||||
// space in the middle of a number would be identified in stage 1).
|
||||
char * copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING));
|
||||
if(copy == nullptr) {
|
||||
goto fail;
|
||||
}
|
||||
memcpy(copy, buf, len);
|
||||
copy[len] = ' ';
|
||||
if (!parse_number(reinterpret_cast<const uint8_t *>(copy), pj, idx, false)) {
|
||||
free(copy);
|
||||
goto fail;
|
||||
}
|
||||
free(copy);
|
||||
break;
|
||||
}
|
||||
case '-': {
|
||||
// we need to make a copy to make sure that the string is NULL terminated.
|
||||
// this is done only for JSON documents made of a sole number
|
||||
// this will almost never be called in practice
|
||||
char * copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING));
|
||||
if(copy == nullptr) {
|
||||
goto fail;
|
||||
}
|
||||
memcpy(copy, buf, len);
|
||||
copy[len] = '\0';
|
||||
if (!parse_number(reinterpret_cast<const uint8_t *>(copy), pj, idx, true)) {
|
||||
free(copy);
|
||||
goto fail;
|
||||
}
|
||||
free(copy);
|
||||
break;
|
||||
}
|
||||
#endif // ALLOWANYTHINGINROOT
|
||||
default:
|
||||
goto fail;
|
||||
}
|
||||
start_continue:
|
||||
// the string might not be NULL terminated.
|
||||
if(i + 1 == pj.n_structural_indexes) {
|
||||
goto succeed;
|
||||
} else {
|
||||
goto fail;
|
||||
}
|
||||
////////////////////////////// OBJECT STATES /////////////////////////////
|
||||
|
||||
object_begin:
|
||||
UPDATE_CHAR();
|
||||
switch (c) {
|
||||
case '"': {
|
||||
if (!parse_string(buf, len, pj, depth, idx)) {
|
||||
goto fail;
|
||||
}
|
||||
goto object_key_state;
|
||||
}
|
||||
case '}':
|
||||
goto scope_end; // could also go to object_continue
|
||||
default:
|
||||
goto fail;
|
||||
}
|
||||
|
||||
object_key_state:
|
||||
UPDATE_CHAR();
|
||||
if (c != ':') {
|
||||
goto fail;
|
||||
}
|
||||
UPDATE_CHAR();
|
||||
switch (c) {
|
||||
case '"': {
|
||||
if (!parse_string(buf, len, pj, depth, idx)) {
|
||||
goto fail;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case 't':
|
||||
if (!is_valid_true_atom(buf + idx)) {
|
||||
goto fail;
|
||||
}
|
||||
pj.write_tape(0, c);
|
||||
break;
|
||||
case 'f':
|
||||
if (!is_valid_false_atom(buf + idx)) {
|
||||
goto fail;
|
||||
}
|
||||
pj.write_tape(0, c);
|
||||
break;
|
||||
case 'n':
|
||||
if (!is_valid_null_atom(buf + idx)) {
|
||||
goto fail;
|
||||
}
|
||||
pj.write_tape(0, c);
|
||||
break;
|
||||
case '0':
|
||||
case '1':
|
||||
case '2':
|
||||
case '3':
|
||||
case '4':
|
||||
case '5':
|
||||
case '6':
|
||||
case '7':
|
||||
case '8':
|
||||
case '9': {
|
||||
if (!parse_number(buf, pj, idx, false)) {
|
||||
goto fail;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case '-': {
|
||||
if (!parse_number(buf, pj, idx, true)) {
|
||||
goto fail;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case '{': {
|
||||
pj.containing_scope_offset[depth] = pj.get_current_loc();
|
||||
pj.write_tape(0, c); // here the compilers knows what c is so this gets optimized
|
||||
// we have not yet encountered } so we need to come back for it
|
||||
#ifdef SIMDJSON_USE_COMPUTED_GOTO
|
||||
pj.ret_address[depth] = &&object_continue;
|
||||
#else
|
||||
pj.ret_address[depth] = 'o';
|
||||
#endif
|
||||
// we found an object inside an object, so we need to increment the depth
|
||||
depth++;
|
||||
if (depth >= pj.depthcapacity) {
|
||||
goto fail;
|
||||
}
|
||||
|
||||
goto object_begin;
|
||||
}
|
||||
case '[': {
|
||||
pj.containing_scope_offset[depth] = pj.get_current_loc();
|
||||
pj.write_tape(0, c); // here the compilers knows what c is so this gets optimized
|
||||
// we have not yet encountered } so we need to come back for it
|
||||
#ifdef SIMDJSON_USE_COMPUTED_GOTO
|
||||
pj.ret_address[depth] = &&object_continue;
|
||||
#else
|
||||
pj.ret_address[depth] = 'o';
|
||||
#endif
|
||||
// we found an array inside an object, so we need to increment the depth
|
||||
depth++;
|
||||
if (depth >= pj.depthcapacity) {
|
||||
goto fail;
|
||||
}
|
||||
goto array_begin;
|
||||
}
|
||||
default:
|
||||
goto fail;
|
||||
}
|
||||
|
||||
object_continue:
|
||||
UPDATE_CHAR();
|
||||
switch (c) {
|
||||
case ',':
|
||||
UPDATE_CHAR();
|
||||
if (c != '"') {
|
||||
goto fail;
|
||||
} else {
|
||||
if (!parse_string(buf, len, pj, depth, idx)) {
|
||||
goto fail;
|
||||
}
|
||||
goto object_key_state;
|
||||
}
|
||||
case '}':
|
||||
goto scope_end;
|
||||
default:
|
||||
goto fail;
|
||||
}
|
||||
|
||||
////////////////////////////// COMMON STATE /////////////////////////////
|
||||
|
||||
scope_end:
|
||||
// write our tape location to the header scope
|
||||
depth--;
|
||||
pj.write_tape(pj.containing_scope_offset[depth], c);
|
||||
pj.annotate_previousloc(pj.containing_scope_offset[depth],
|
||||
pj.get_current_loc());
|
||||
// goto saved_state
|
||||
#ifdef SIMDJSON_USE_COMPUTED_GOTO
|
||||
goto *pj.ret_address[depth];
|
||||
#else
|
||||
if(pj.ret_address[depth] == 'a') {
|
||||
goto array_continue;
|
||||
} else if (pj.ret_address[depth] == 'o') {
|
||||
goto object_continue;
|
||||
} else goto start_continue;
|
||||
#endif
|
||||
|
||||
////////////////////////////// ARRAY STATES /////////////////////////////
|
||||
array_begin:
|
||||
UPDATE_CHAR();
|
||||
if (c == ']') {
|
||||
goto scope_end; // could also go to array_continue
|
||||
}
|
||||
|
||||
main_array_switch:
|
||||
// we call update char on all paths in, so we can peek at c on the
|
||||
// on paths that can accept a close square brace (post-, and at start)
|
||||
switch (c) {
|
||||
case '"': {
|
||||
if (!parse_string(buf, len, pj, depth, idx)) {
|
||||
goto fail;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case 't':
|
||||
if (!is_valid_true_atom(buf + idx)) {
|
||||
goto fail;
|
||||
}
|
||||
pj.write_tape(0, c);
|
||||
break;
|
||||
case 'f':
|
||||
if (!is_valid_false_atom(buf + idx)) {
|
||||
goto fail;
|
||||
}
|
||||
pj.write_tape(0, c);
|
||||
break;
|
||||
case 'n':
|
||||
if (!is_valid_null_atom(buf + idx)) {
|
||||
goto fail;
|
||||
}
|
||||
pj.write_tape(0, c);
|
||||
break; // goto array_continue;
|
||||
|
||||
case '0':
|
||||
case '1':
|
||||
case '2':
|
||||
case '3':
|
||||
case '4':
|
||||
case '5':
|
||||
case '6':
|
||||
case '7':
|
||||
case '8':
|
||||
case '9': {
|
||||
if (!parse_number(buf, pj, idx, false)) {
|
||||
goto fail;
|
||||
}
|
||||
break; // goto array_continue;
|
||||
}
|
||||
case '-': {
|
||||
if (!parse_number(buf, pj, idx, true)) {
|
||||
goto fail;
|
||||
}
|
||||
break; // goto array_continue;
|
||||
}
|
||||
case '{': {
|
||||
// we have not yet encountered ] so we need to come back for it
|
||||
pj.containing_scope_offset[depth] = pj.get_current_loc();
|
||||
pj.write_tape(0, c); // here the compilers knows what c is so this gets optimized
|
||||
#ifdef SIMDJSON_USE_COMPUTED_GOTO
|
||||
pj.ret_address[depth] = &&array_continue;
|
||||
#else
|
||||
pj.ret_address[depth] = 'a';
|
||||
#endif
|
||||
// we found an object inside an array, so we need to increment the depth
|
||||
depth++;
|
||||
if (depth >= pj.depthcapacity) {
|
||||
goto fail;
|
||||
}
|
||||
|
||||
goto object_begin;
|
||||
}
|
||||
case '[': {
|
||||
// we have not yet encountered ] so we need to come back for it
|
||||
pj.containing_scope_offset[depth] = pj.get_current_loc();
|
||||
pj.write_tape(0, c); // here the compilers knows what c is so this gets optimized
|
||||
#ifdef SIMDJSON_USE_COMPUTED_GOTO
|
||||
pj.ret_address[depth] = &&array_continue;
|
||||
#else
|
||||
pj.ret_address[depth] = 'a';
|
||||
#endif
|
||||
// we found an array inside an array, so we need to increment the depth
|
||||
depth++;
|
||||
if (depth >= pj.depthcapacity) {
|
||||
goto fail;
|
||||
}
|
||||
goto array_begin;
|
||||
}
|
||||
default:
|
||||
goto fail;
|
||||
}
|
||||
|
||||
array_continue:
|
||||
UPDATE_CHAR();
|
||||
switch (c) {
|
||||
case ',':
|
||||
UPDATE_CHAR();
|
||||
goto main_array_switch;
|
||||
case ']':
|
||||
goto scope_end;
|
||||
default:
|
||||
goto fail;
|
||||
}
|
||||
|
||||
////////////////////////////// FINAL STATES /////////////////////////////
|
||||
|
||||
succeed:
|
||||
depth --;
|
||||
if(depth != 0) {
|
||||
fprintf(stderr, "internal bug\n");
|
||||
abort();
|
||||
}
|
||||
if(pj.containing_scope_offset[depth] != 0) {
|
||||
fprintf(stderr, "internal bug\n");
|
||||
abort();
|
||||
}
|
||||
pj.annotate_previousloc(pj.containing_scope_offset[depth],
|
||||
pj.get_current_loc());
|
||||
pj.write_tape(pj.containing_scope_offset[depth], 'r'); // r is root
|
||||
|
||||
pj.isvalid = true;
|
||||
pj.errorcode = simdjson::SUCCESS;
|
||||
return pj.errorcode;
|
||||
fail:
|
||||
// we do not need the next line because this is done by pj.init(), pessimistically.
|
||||
// pj.isvalid = false;
|
||||
// At this point in the code, we have all the time in the world.
|
||||
// Note that we know exactly where we are in the document so we could,
|
||||
// without any overhead on the processing code, report a specific location.
|
||||
// We could even trigger special code paths to assess what happened carefully,
|
||||
// all without any added cost.
|
||||
if (depth >= pj.depthcapacity) {
|
||||
pj.errorcode = simdjson::DEPTH_ERROR;
|
||||
return pj.errorcode;
|
||||
}
|
||||
switch(c) {
|
||||
case '"':
|
||||
pj.errorcode = simdjson::STRING_ERROR;
|
||||
return pj.errorcode;
|
||||
case '0':
|
||||
case '1':
|
||||
case '2':
|
||||
case '3':
|
||||
case '4':
|
||||
case '5':
|
||||
case '6':
|
||||
case '7':
|
||||
case '8':
|
||||
case '9':
|
||||
case '-':
|
||||
pj.errorcode = simdjson::NUMBER_ERROR;
|
||||
return pj.errorcode;
|
||||
case 't':
|
||||
pj.errorcode = simdjson::T_ATOM_ERROR;
|
||||
return pj.errorcode;
|
||||
case 'n':
|
||||
pj.errorcode = simdjson::N_ATOM_ERROR;
|
||||
return pj.errorcode;
|
||||
case 'f':
|
||||
pj.errorcode = simdjson::F_ATOM_ERROR;
|
||||
return pj.errorcode;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
pj.errorcode = simdjson::TAPE_ERROR;
|
||||
return pj.errorcode;
|
||||
}
|
||||
|
||||
int unified_machine(const char *buf, size_t len, ParsedJson &pj) {
|
||||
return unified_machine(reinterpret_cast<const uint8_t*>(buf), len, pj);
|
||||
}
|
||||
// File kept in case we want to reuse it soon. (many configuration files to edit)
|
||||
|
|
|
@ -38,7 +38,7 @@ bool is_in_bad_list(const char *buf) {
|
|||
return false;
|
||||
}
|
||||
|
||||
inline void foundInvalidNumber(const uint8_t *buf) {
|
||||
void foundInvalidNumber(const uint8_t *buf) {
|
||||
invalid_count++;
|
||||
char *endptr;
|
||||
double expected = strtod((const char *)buf, &endptr);
|
||||
|
@ -53,7 +53,7 @@ inline void foundInvalidNumber(const uint8_t *buf) {
|
|||
}
|
||||
}
|
||||
|
||||
inline void foundInteger(int64_t result, const uint8_t *buf) {
|
||||
void foundInteger(int64_t result, const uint8_t *buf) {
|
||||
int_count++;
|
||||
char *endptr;
|
||||
long long expected = strtoll((const char *)buf, &endptr, 10);
|
||||
|
@ -64,7 +64,7 @@ inline void foundInteger(int64_t result, const uint8_t *buf) {
|
|||
}
|
||||
}
|
||||
|
||||
inline void foundFloat(double result, const uint8_t *buf) {
|
||||
void foundFloat(double result, const uint8_t *buf) {
|
||||
char *endptr;
|
||||
float_count++;
|
||||
double expected = strtod((const char *)buf, &endptr);
|
||||
|
|
|
@ -203,7 +203,7 @@ static bool parse_string(const char *p, char *output, char **end) {
|
|||
// end of borrowed code
|
||||
char *bigbuffer; // global variable
|
||||
|
||||
inline void foundBadString(const uint8_t *buf) {
|
||||
void foundBadString(const uint8_t *buf) {
|
||||
bad_string++;
|
||||
char *end;
|
||||
if (parse_string((const char *)buf, bigbuffer, &end)) {
|
||||
|
@ -226,7 +226,7 @@ void print_cmp_hex(const char *s1, const char *s2, size_t len) {
|
|||
}
|
||||
}
|
||||
|
||||
inline void foundString(const uint8_t *buf, const uint8_t *parsed_begin,
|
||||
void foundString(const uint8_t *buf, const uint8_t *parsed_begin,
|
||||
const uint8_t *parsed_end) {
|
||||
size_t thislen = parsed_end - parsed_begin;
|
||||
total_string_length += thislen;
|
||||
|
|
Loading…
Reference in New Issue