Stage2 refactored to simplify multiple implementations

This commit is contained in:
ioioioio 2019-07-02 17:12:00 -04:00
parent aa78b70d69
commit 3f24879157
8 changed files with 647 additions and 617 deletions

View File

@ -157,7 +157,8 @@ int main(int argc, char *argv[]) {
break;
}
unified.start();
isok = isok && (simdjson::SUCCESS == unified_machine(p.data(), p.size(), pj));
// The default template is simdjson::instruction_set::native.
isok = isok && (simdjson::SUCCESS == unified_machine<>(p.data(), p.size(), pj));
unified.end(results);
cy2 += results[0];
cl2 += results[1];
@ -188,7 +189,7 @@ int main(int argc, char *argv[]) {
auto start = std::chrono::steady_clock::now();
// The default template is simdjson::instruction_set::native.
isok = (find_structural_bits<>(p.data(), p.size(), pj) == simdjson::SUCCESS);
isok = isok && (simdjson::SUCCESS == unified_machine(p.data(), p.size(), pj));
isok = isok && (simdjson::SUCCESS == unified_machine<>(p.data(), p.size(), pj));
auto end = std::chrono::steady_clock::now();
std::chrono::duration<double> secs = end - start;
res[i] = secs.count();

View File

@ -25,7 +25,6 @@ using json_parse_functype = int (const uint8_t *buf, size_t len, ParsedJson &pj,
// Pointer that holds the json_parse implementation corresponding to the available SIMD instruction set
extern json_parse_functype *json_parse_ptr;
// json_parse_implementation is the generic function, it is specialized for various
// SIMD instruction sets, e.g., as json_parse_implementation<simdjson::instruction_set::avx2>
// or json_parse_implementation<simdjson::instruction_set::neon>
@ -68,7 +67,7 @@ int json_parse_implementation(const uint8_t *buf, size_t len, ParsedJson &pj, bo
pj.errorcode = stage1_is_ok;
return pj.errorcode;
}
int res = unified_machine(buf, len, pj);
int res = unified_machine<T>(buf, len, pj);
if(reallocated) { aligned_free((void*)buf);}
return res;
}

View File

@ -6,6 +6,13 @@
#include "simdjson/parsedjson.h"
#include "simdjson/portability.h"
#ifdef JSON_TEST_NUMBERS // for unit testing
void foundInvalidNumber(const uint8_t *buf);
void foundInteger(int64_t result, const uint8_t *buf);
void foundFloat(double result, const uint8_t *buf);
#endif
// Allowable floating-point values range from std::numeric_limits<double>::lowest()
// to std::numeric_limits<double>::max(), so from
// -1.7976e308 all the way to 1.7975e308 in binary64. The lowest non-zero
@ -375,9 +382,6 @@ static never_inline bool parse_large_integer(const uint8_t *const buf,
return is_structural_or_whitespace(*p);
}
// parse the number at buf + offset
// define JSON_TEST_NUMBERS for unit testing
//

View File

@ -1,16 +1,586 @@
#ifndef SIMDJSON_STAGE2_BUILD_TAPE_H
#define SIMDJSON_STAGE2_BUILD_TAPE_H
#include "simdjson/common_defs.h"
#include <cassert>
#include <cstring>
#include <iostream>
struct ParsedJson;
#include "simdjson/common_defs.h"
#include "simdjson/jsoncharutils.h"
#include "simdjson/numberparsing.h"
#include "simdjson/parsedjson.h"
#include "simdjson/stringparsing.h"
#include "simdjson/simdjson.h"
#define PATH_SEP '/'
void init_state_machine();
WARN_UNUSED
int unified_machine(const uint8_t *buf, size_t len, ParsedJson &pj);
// Returns true when the four bytes starting at loc spell "true" and the byte
// that follows them (loc[4]) is a structural character or whitespace.
//
// loc: pointer to the first byte of the candidate atom. Five bytes are
//      inspected (loc[0] .. loc[4]), so the caller must guarantee that they
//      are readable; the parser relies on SIMDJSON_PADDING for that.
really_inline bool is_valid_true_atom(const uint8_t *loc) {
  // Build the reference pattern with memcpy instead of dereferencing the
  // string literal through a uint64_t pointer: that cast is an unaligned,
  // aliasing-violating load and reads more bytes than the literal holds.
  // Copying bytes also keeps the comparison independent of endianness.
  uint32_t tv;
  std::memcpy(&tv, "true", sizeof(uint32_t));
  // The lookahead byte loc[4] can sit up to 4 bytes past the last document
  // byte, which must be covered by the padding.
  static_assert(sizeof(uint32_t) <= SIMDJSON_PADDING,
                "SIMDJSON_PADDING must cover the atom lookahead");
  uint32_t locval; // memcpy avoids an unaligned 32-bit load (undefined in C/C++)
  std::memcpy(&locval, loc, sizeof(uint32_t));
  uint32_t error = locval ^ tv; // zero iff all four characters match
  error |= is_not_structural_or_whitespace(loc[4]);
  return error == 0;
}
WARN_UNUSED
int unified_machine(const char *buf, size_t len, ParsedJson &pj);
// Returns true when the five bytes starting at loc spell "false" and the byte
// that follows them (loc[5]) is a structural character or whitespace.
//
// loc: pointer to the first byte of the candidate atom. Six bytes are
//      inspected (loc[0] .. loc[5]), so the caller must guarantee that they
//      are readable; the parser relies on SIMDJSON_PADDING for that.
really_inline bool is_valid_false_atom(const uint8_t *loc) {
  // "false" is 5 bytes long, so a 32-bit comparison would ignore its last
  // character; both sides are widened to 64 bits with the unused bytes zeroed.
  constexpr size_t atom_len = 5;
  // The reference pattern is copied from the literal rather than spelled as a
  // hard-coded integer: the integer form is only correct on little-endian
  // machines, whereas a byte copy matches the input layout on any target.
  uint64_t fv = 0;
  std::memcpy(&fv, "false", atom_len);
  // The lookahead byte loc[5] can sit up to 5 bytes past the last document
  // byte, which must be covered by the padding.
  static_assert(atom_len <= SIMDJSON_PADDING,
                "SIMDJSON_PADDING must cover the atom lookahead");
  uint64_t locval = 0; // memcpy avoids an unaligned load (undefined in C/C++)
  std::memcpy(&locval, loc, atom_len);
  uint64_t error = locval ^ fv; // zero iff all five characters match
  error |= is_not_structural_or_whitespace(loc[5]);
  return error == 0;
}
WARN_UNUSED
// Returns true when the four bytes starting at loc spell "null" and the byte
// that follows them (loc[4]) is a structural character or whitespace.
//
// loc: pointer to the first byte of the candidate atom. Five bytes are
//      inspected (loc[0] .. loc[4]), so the caller must guarantee that they
//      are readable; the parser relies on SIMDJSON_PADDING for that.
really_inline bool is_valid_null_atom(const uint8_t *loc) {
  // Build the reference pattern with memcpy instead of dereferencing the
  // string literal through a uint64_t pointer: that cast is an unaligned,
  // aliasing-violating load and reads more bytes than the literal holds.
  // Copying bytes also keeps the comparison independent of endianness.
  uint32_t nv;
  std::memcpy(&nv, "null", sizeof(uint32_t));
  // The lookahead byte loc[4] can sit up to 4 bytes past the last document
  // byte, which must be covered by the padding.
  static_assert(sizeof(uint32_t) <= SIMDJSON_PADDING,
                "SIMDJSON_PADDING must cover the atom lookahead");
  uint32_t locval; // memcpy avoids an unaligned 32-bit load (undefined in C/C++)
  std::memcpy(&locval, loc, sizeof(uint32_t));
  uint32_t error = locval ^ nv; // zero iff all four characters match
  error |= is_not_structural_or_whitespace(loc[4]);
  return error == 0;
}
/************
* The JSON is parsed to a tape, see the accompanying tape.md file
* for documentation.
***********/
// Stage 2 of the parser: walks the structural characters listed in
// pj.structural_indexes, checks that they form a valid JSON document and
// records every value on the tape through pj.write_tape / parse_string<T> /
// parse_number.
//
// Template parameter:
//   T   - instruction set; only used to select the parse_string<T>
//         implementation.
// Parameters:
//   buf - the JSON text. Unless ALLOW_SAME_PAGE_BUFFER_OVERRUN is defined, the
//         SIMDJSON_PADDING bytes following buf + len are overwritten with
//         zeros (through a cast that drops const), so they must be writable.
//   len - number of bytes of JSON text in buf.
//   pj  - receives the tape; pj.init() is called first and pj.errorcode is
//         always set before returning.
// Returns pj.errorcode: simdjson::SUCCESS on success, simdjson::CAPACITY when
// pj.bytecapacity < len, simdjson::DEPTH_ERROR when the nesting reaches
// pj.depthcapacity, otherwise STRING_ERROR / NUMBER_ERROR / T_ATOM_ERROR /
// N_ATOM_ERROR / F_ATOM_ERROR chosen from the last structural character read,
// or TAPE_ERROR for any other failure.
//
// The function is a goto-driven state machine. When a nested scope is opened,
// the state to resume after it closes is saved in pj.ret_address[depth]: a
// label address when SIMDJSON_USE_COMPUTED_GOTO is defined, otherwise one of
// the characters 's' (start), 'o' (object) or 'a' (array).
template<simdjson::instruction_set T = simdjson::instruction_set::native>
WARN_UNUSED ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER LENIENT_MEM_SANITIZER
int unified_machine(const uint8_t *buf, size_t len, ParsedJson &pj) {
#ifndef ALLOW_SAME_PAGE_BUFFER_OVERRUN
memset((uint8_t*)buf + len, 0, SIMDJSON_PADDING); // to please valgrind
#endif
uint32_t i = 0; // index of the structural character (0,1,2,3...)
uint32_t idx; // location of the structural character in the input (buf)
uint8_t c; // used to track the (structural) character we are looking at, updated
// by UPDATE_CHAR macro
uint32_t depth = 0; // could have an arbitrary starting depth
pj.init(); // sets isvalid to false
if(pj.bytecapacity < len) {
pj.errorcode = simdjson::CAPACITY;
return pj.errorcode;
}
// this macro reads the next structural character, updating idx, i and c.
// Note that it performs no bounds check on i against pj.n_structural_indexes.
#define UPDATE_CHAR() \
{ \
idx = pj.structural_indexes[i++]; \
c = buf[idx]; \
}
////////////////////////////// START STATE /////////////////////////////
#ifdef SIMDJSON_USE_COMPUTED_GOTO
pj.ret_address[depth] = &&start_continue;
#else
pj.ret_address[depth] = 's';
#endif
pj.containing_scope_offset[depth] = pj.get_current_loc();
pj.write_tape(0, 'r'); // r for root, 0 is going to get overwritten
// the root is used, if nothing else, to capture the size of the tape
depth++; // everything starts at depth = 1, depth = 0 is just for the root, the root may contain an object, an array or something else.
// c has not been assigned yet on this path; that is safe because the fail
// handler tests the depth condition first and returns before looking at c.
if (depth >= pj.depthcapacity) {
goto fail;
}
UPDATE_CHAR();
// Dispatch on the first structural character of the document.
switch (c) {
case '{':
pj.containing_scope_offset[depth] = pj.get_current_loc();
#ifdef SIMDJSON_USE_COMPUTED_GOTO
pj.ret_address[depth] = &&start_continue;
#else
pj.ret_address[depth] = 's';
#endif
depth++;
if (depth >= pj.depthcapacity) {
goto fail;
}
pj.write_tape(0, c); // strangely, moving this to object_begin slows things down
goto object_begin;
case '[':
pj.containing_scope_offset[depth] = pj.get_current_loc();
#ifdef SIMDJSON_USE_COMPUTED_GOTO
pj.ret_address[depth] = &&start_continue;
#else
pj.ret_address[depth] = 's';
#endif
depth++;
if (depth >= pj.depthcapacity) {
goto fail;
}
pj.write_tape(0, c);
goto array_begin;
#define SIMDJSON_ALLOWANYTHINGINROOT
// A JSON text is a serialized value. Note that certain previous
// specifications of JSON constrained a JSON text to be an object or an
// array. Implementations that generate only objects or arrays where a
// JSON text is called for will be interoperable in the sense that all
// implementations will accept these as conforming JSON texts.
// https://tools.ietf.org/html/rfc8259
#ifdef SIMDJSON_ALLOWANYTHINGINROOT
case '"': {
if (!parse_string<T>(buf, len, pj, depth, idx)) {
goto fail;
}
break;
}
case 't': {
// we need to make a copy to make sure that the string is space terminated.
// this only applies to the JSON document made solely of the true value.
// this will almost never be called in practice
// NOTE(review): only copy[0..len] is initialized; the remaining padding
// bytes of the allocation keep indeterminate values.
char * copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING));
if(copy == nullptr) {
goto fail;
}
memcpy(copy, buf, len);
copy[len] = ' ';
if (!is_valid_true_atom(reinterpret_cast<const uint8_t *>(copy) + idx)) {
free(copy);
goto fail;
}
free(copy);
pj.write_tape(0, c);
break;
}
case 'f': {
// we need to make a copy to make sure that the string is space terminated.
// this only applies to the JSON document made solely of the false value.
// this will almost never be called in practice
char * copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING));
if(copy == nullptr) {
goto fail;
}
memcpy(copy, buf, len);
copy[len] = ' ';
if (!is_valid_false_atom(reinterpret_cast<const uint8_t *>(copy) + idx)) {
free(copy);
goto fail;
}
free(copy);
pj.write_tape(0, c);
break;
}
case 'n': {
// we need to make a copy to make sure that the string is space terminated.
// this only applies to the JSON document made solely of the null value.
// this will almost never be called in practice
char * copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING));
if(copy == nullptr) {
goto fail;
}
memcpy(copy, buf, len);
copy[len] = ' ';
if (!is_valid_null_atom(reinterpret_cast<const uint8_t *>(copy) + idx)) {
free(copy);
goto fail;
}
free(copy);
pj.write_tape(0, c);
break;
}
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9': {
// we need to make a copy to make sure that the string is space terminated.
// this is done only for JSON documents made of a sole number
// this will almost never be called in practice. We terminate with a space
// because we do not want to allow NULLs in the middle of a number (whereas a
// space in the middle of a number would be identified in stage 1).
char * copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING));
if(copy == nullptr) {
goto fail;
}
memcpy(copy, buf, len);
copy[len] = ' ';
if (!parse_number(reinterpret_cast<const uint8_t *>(copy), pj, idx, false)) {
free(copy);
goto fail;
}
free(copy);
break;
}
case '-': {
// we need to make a copy to make sure that the string is NULL terminated.
// this is done only for JSON documents made of a sole number
// this will almost never be called in practice
// NOTE(review): this branch terminates the copy with '\0' whereas the
// unsigned-number branch above uses ' ' - confirm the difference is intended.
char * copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING));
if(copy == nullptr) {
goto fail;
}
memcpy(copy, buf, len);
copy[len] = '\0';
if (!parse_number(reinterpret_cast<const uint8_t *>(copy), pj, idx, true)) {
free(copy);
goto fail;
}
free(copy);
break;
}
#endif // ALLOWANYTHINGINROOT
default:
goto fail;
}
// Reached after a root-level scalar, or when the outermost object/array has
// been closed (through pj.ret_address).
start_continue:
// the string might not be NULL terminated.
// The document is accepted only when exactly one structural index is left
// unread (presumably a trailing entry appended by stage 1 - TODO confirm).
if(i + 1 == pj.n_structural_indexes) {
goto succeed;
} else {
goto fail;
}
////////////////////////////// OBJECT STATES /////////////////////////////
// Just after '{': expects either a key string or an immediate '}'.
object_begin:
UPDATE_CHAR();
switch (c) {
case '"': {
if (!parse_string<T>(buf, len, pj, depth, idx)) {
goto fail;
}
goto object_key_state;
}
case '}':
goto scope_end; // could also go to object_continue
default:
goto fail;
}
// A key has just been parsed: expects ':' followed by a value.
object_key_state:
UPDATE_CHAR();
if (c != ':') {
goto fail;
}
UPDATE_CHAR();
switch (c) {
case '"': {
if (!parse_string<T>(buf, len, pj, depth, idx)) {
goto fail;
}
break;
}
case 't':
if (!is_valid_true_atom(buf + idx)) {
goto fail;
}
pj.write_tape(0, c);
break;
case 'f':
if (!is_valid_false_atom(buf + idx)) {
goto fail;
}
pj.write_tape(0, c);
break;
case 'n':
if (!is_valid_null_atom(buf + idx)) {
goto fail;
}
pj.write_tape(0, c);
break;
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9': {
if (!parse_number(buf, pj, idx, false)) {
goto fail;
}
break;
}
case '-': {
if (!parse_number(buf, pj, idx, true)) {
goto fail;
}
break;
}
case '{': {
pj.containing_scope_offset[depth] = pj.get_current_loc();
pj.write_tape(0, c); // here the compilers knows what c is so this gets optimized
// we have not yet encountered } so we need to come back for it
#ifdef SIMDJSON_USE_COMPUTED_GOTO
pj.ret_address[depth] = &&object_continue;
#else
pj.ret_address[depth] = 'o';
#endif
// we found an object inside an object, so we need to increment the depth
depth++;
if (depth >= pj.depthcapacity) {
goto fail;
}
goto object_begin;
}
case '[': {
pj.containing_scope_offset[depth] = pj.get_current_loc();
pj.write_tape(0, c); // here the compilers knows what c is so this gets optimized
// we have not yet encountered } so we need to come back for it
#ifdef SIMDJSON_USE_COMPUTED_GOTO
pj.ret_address[depth] = &&object_continue;
#else
pj.ret_address[depth] = 'o';
#endif
// we found an array inside an object, so we need to increment the depth
depth++;
if (depth >= pj.depthcapacity) {
goto fail;
}
goto array_begin;
}
default:
goto fail;
}
// A member value is complete (scalar values fall through from the switch
// above): expects ',' followed by the next key, or the closing '}'.
object_continue:
UPDATE_CHAR();
switch (c) {
case ',':
UPDATE_CHAR();
if (c != '"') {
goto fail;
} else {
if (!parse_string<T>(buf, len, pj, depth, idx)) {
goto fail;
}
goto object_key_state;
}
case '}':
goto scope_end;
default:
goto fail;
}
////////////////////////////// COMMON STATE /////////////////////////////
// Closes the current object or array: c still holds the closing character,
// which is written to the tape together with the offset of the matching
// opening entry; the opening entry is then annotated with the current tape
// location before control returns to the saved state of the parent scope.
scope_end:
// write our tape location to the header scope
depth--;
pj.write_tape(pj.containing_scope_offset[depth], c);
pj.annotate_previousloc(pj.containing_scope_offset[depth],
pj.get_current_loc());
// goto saved_state
#ifdef SIMDJSON_USE_COMPUTED_GOTO
goto *pj.ret_address[depth];
#else
if(pj.ret_address[depth] == 'a') {
goto array_continue;
} else if (pj.ret_address[depth] == 'o') {
goto object_continue;
} else goto start_continue;
#endif
////////////////////////////// ARRAY STATES /////////////////////////////
// Just after '[': an immediate ']' closes the (empty) array, anything else is
// handled as the first element.
array_begin:
UPDATE_CHAR();
if (c == ']') {
goto scope_end; // could also go to array_continue
}
main_array_switch:
// we call update char on all paths in, so we can peek at c on the
// on paths that can accept a close square brace (post-, and at start)
switch (c) {
case '"': {
if (!parse_string<T>(buf, len, pj, depth, idx)) {
goto fail;
}
break;
}
case 't':
if (!is_valid_true_atom(buf + idx)) {
goto fail;
}
pj.write_tape(0, c);
break;
case 'f':
if (!is_valid_false_atom(buf + idx)) {
goto fail;
}
pj.write_tape(0, c);
break;
case 'n':
if (!is_valid_null_atom(buf + idx)) {
goto fail;
}
pj.write_tape(0, c);
break; // goto array_continue;
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9': {
if (!parse_number(buf, pj, idx, false)) {
goto fail;
}
break; // goto array_continue;
}
case '-': {
if (!parse_number(buf, pj, idx, true)) {
goto fail;
}
break; // goto array_continue;
}
case '{': {
// we have not yet encountered ] so we need to come back for it
pj.containing_scope_offset[depth] = pj.get_current_loc();
pj.write_tape(0, c); // here the compilers knows what c is so this gets optimized
#ifdef SIMDJSON_USE_COMPUTED_GOTO
pj.ret_address[depth] = &&array_continue;
#else
pj.ret_address[depth] = 'a';
#endif
// we found an object inside an array, so we need to increment the depth
depth++;
if (depth >= pj.depthcapacity) {
goto fail;
}
goto object_begin;
}
case '[': {
// we have not yet encountered ] so we need to come back for it
pj.containing_scope_offset[depth] = pj.get_current_loc();
pj.write_tape(0, c); // here the compilers knows what c is so this gets optimized
#ifdef SIMDJSON_USE_COMPUTED_GOTO
pj.ret_address[depth] = &&array_continue;
#else
pj.ret_address[depth] = 'a';
#endif
// we found an array inside an array, so we need to increment the depth
depth++;
if (depth >= pj.depthcapacity) {
goto fail;
}
goto array_begin;
}
default:
goto fail;
}
// An element is complete (scalar values fall through from the switch above):
// expects ',' followed by another element, or the closing ']'.
array_continue:
UPDATE_CHAR();
switch (c) {
case ',':
UPDATE_CHAR();
goto main_array_switch;
case ']':
goto scope_end;
default:
goto fail;
}
////////////////////////////// FINAL STATES /////////////////////////////
// Success: sanity-checks that we are back at the root, links the root tape
// entries together and marks pj as valid.
succeed:
depth --;
if(depth != 0) {
fprintf(stderr, "internal bug\n");
abort();
}
if(pj.containing_scope_offset[depth] != 0) {
fprintf(stderr, "internal bug\n");
abort();
}
pj.annotate_previousloc(pj.containing_scope_offset[depth],
pj.get_current_loc());
pj.write_tape(pj.containing_scope_offset[depth], 'r'); // r is root
pj.isvalid = true;
pj.errorcode = simdjson::SUCCESS;
return pj.errorcode;
// Failure: derives an error code from the depth and from the structural
// character that was being processed when the error was detected.
fail:
// we do not need the next line because this is done by pj.init(), pessimistically.
// pj.isvalid = false;
// At this point in the code, we have all the time in the world.
// Note that we know exactly where we are in the document so we could,
// without any overhead on the processing code, report a specific location.
// We could even trigger special code paths to assess what happened carefully,
// all without any added cost.
if (depth >= pj.depthcapacity) {
pj.errorcode = simdjson::DEPTH_ERROR;
return pj.errorcode;
}
switch(c) {
case '"':
pj.errorcode = simdjson::STRING_ERROR;
return pj.errorcode;
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
case '-':
pj.errorcode = simdjson::NUMBER_ERROR;
return pj.errorcode;
case 't':
pj.errorcode = simdjson::T_ATOM_ERROR;
return pj.errorcode;
case 'n':
pj.errorcode = simdjson::N_ATOM_ERROR;
return pj.errorcode;
case 'f':
pj.errorcode = simdjson::F_ATOM_ERROR;
return pj.errorcode;
default:
break;
}
pj.errorcode = simdjson::TAPE_ERROR;
return pj.errorcode;
}
// Overload for callers that hold the JSON text as char data: the pointer is
// reinterpreted as bytes and the work is delegated to the uint8_t overload for
// the same instruction set T, whose result is returned unchanged.
template<simdjson::instruction_set T = simdjson::instruction_set::native>
int unified_machine(const char *buf, size_t len, ParsedJson &pj) {
  const uint8_t *const bytes = reinterpret_cast<const uint8_t *>(buf);
  return unified_machine<T>(bytes, len, pj);
}
#endif

View File

@ -5,6 +5,11 @@
#include "simdjson/jsoncharutils.h"
#include "simdjson/parsedjson.h"
#ifdef JSON_TEST_STRINGS
void foundString(const uint8_t *buf, const uint8_t *parsed_begin, const uint8_t *parsed_end);
void foundBadString(const uint8_t *buf);
#endif
// begin copypasta
// These chars yield themselves: " \ /
@ -76,19 +81,19 @@ really_inline bool handle_unicode_codepoint(const uint8_t **src_ptr, uint8_t **d
#include <arm_neon.h>
#endif
WARN_UNUSED ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER LENIENT_MEM_SANITIZER
really_inline bool parse_string(UNUSED const uint8_t *buf, UNUSED size_t len,
ParsedJson &pj, UNUSED const uint32_t depth, UNUSED uint32_t offset) {
#ifdef SIMDJSON_SKIPSTRINGPARSING // for performance analysis, it is sometimes useful to skip parsing
pj.write_tape(0, '"');// don't bother with the string parsing at all
return true; // always succeeds
#else
pj.write_tape(pj.current_string_buf_loc - pj.string_buf, '"');
const uint8_t *src = &buf[offset + 1]; // we know that buf at offset is a "
uint8_t *dst = pj.current_string_buf_loc + sizeof(uint32_t);
const uint8_t *const start_of_string = dst;
while (1) {
// Holds backslashes and quotes locations.
// Result type of find_bs_bits_and_quote_bits. Each member is a 32-bit mask
// built from a byte-wise comparison of the scanned input block; parse_string
// applies trailingzeroes() to a mask to get the distance from src to the first
// matching byte.
struct parse_string_helper {
uint32_t bs_bits; // mask of the bytes equal to '\\'
uint32_t quote_bits; // mask of the bytes equal to '"'
};
// Finds where the backslashes and quotes are located.
// Primary template, declared only: the definitions are the explicit
// specializations written for each instruction set (e.g. avx2 under __AVX2__,
// neon under __ARM_NEON).
//   src - input bytes to scan
//   dst - output buffer; the visible avx2 specialization also stores the
//         loaded block there unconditionally
// Returns the backslash and quote masks for the scanned block.
template<simdjson::instruction_set>
parse_string_helper find_bs_bits_and_quote_bits(const uint8_t *src, uint8_t *dst);
#ifdef __AVX2__
template<> really_inline
parse_string_helper find_bs_bits_and_quote_bits<simdjson::instruction_set::avx2> (const uint8_t *src, uint8_t *dst) {
// this can read up to 31 bytes beyond the buffer size, but we require
// SIMDJSON_PADDING of padding
static_assert(sizeof(__m256i) - 1 <= SIMDJSON_PADDING);
@ -96,12 +101,22 @@ really_inline bool parse_string(UNUSED const uint8_t *buf, UNUSED size_t len,
// store to dest unconditionally - we can overwrite the bits we don't like
// later
_mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), v);
auto bs_bits =
static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi8(v, _mm256_set1_epi8('\\'))));
//auto bs_bits =
// static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi8(v, _mm256_set1_epi8('\\'))));
auto quote_mask = _mm256_cmpeq_epi8(v, _mm256_set1_epi8('"'));
auto quote_bits =
static_cast<uint32_t>(_mm256_movemask_epi8(quote_mask));
#else
//uint32_t quote_bits =
// static_cast<uint32_t>(_mm256_movemask_epi8(quote_mask));
return {
static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi8(v, _mm256_set1_epi8('\\')))), // bs_bits
static_cast<uint32_t>(_mm256_movemask_epi8(quote_mask)) // quote_bits
};
}
#endif
#ifdef __ARM_NEON
template<> really_inline
parse_string_helper find_bs_bits_and_quote_bits<simdjson::instruction_set::neon> (const uint8_t *src, uint8_t *dst) {
// this can read up to 31 bytes beyond the buffer size, but we require
// SIMDJSON_PADDING of padding
static_assert(2 * sizeof(uint8x16_t) - 1 <= SIMDJSON_PADDING);
@ -128,14 +143,32 @@ really_inline bool parse_string(UNUSED const uint8_t *buf, UNUSED size_t len,
uint8x16_t sum1 = vpaddq_u8(cmp_qt_0, cmp_qt_1);
sum0 = vpaddq_u8(sum0, sum1);
sum0 = vpaddq_u8(sum0, sum0);
auto bs_bits = vgetq_lane_u32(vreinterpretq_u32_u8(sum0), 0);
auto quote_bits = vgetq_lane_u32(vreinterpretq_u32_u8(sum0), 1);
return {
vgetq_lane_u32(vreinterpretq_u32_u8(sum0), 0), // bs_bits
vgetq_lane_u32(vreinterpretq_u32_u8(sum0), 1) // quote_bits
};
}
#endif
if(((bs_bits - 1) & quote_bits) != 0 ) {
template<simdjson::instruction_set T>
WARN_UNUSED ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER LENIENT_MEM_SANITIZER really_inline
bool parse_string(UNUSED const uint8_t *buf, UNUSED size_t len,
ParsedJson &pj, UNUSED const uint32_t depth, UNUSED uint32_t offset) {
#ifdef SIMDJSON_SKIPSTRINGPARSING // for performance analysis, it is sometimes useful to skip parsing
pj.write_tape(0, '"');// don't bother with the string parsing at all
return true; // always succeeds
#else
pj.write_tape(pj.current_string_buf_loc - pj.string_buf, '"');
const uint8_t *src = &buf[offset + 1]; // we know that buf at offset is a "
uint8_t *dst = pj.current_string_buf_loc + sizeof(uint32_t);
const uint8_t *const start_of_string = dst;
while (1) {
parse_string_helper helper = find_bs_bits_and_quote_bits<T>(src, dst);
if(((helper.bs_bits - 1) & helper.quote_bits) != 0 ) {
// we encountered quotes first. Move dst to point to quotes and exit
// find out where the quote is...
uint32_t quote_dist = trailingzeroes(quote_bits);
uint32_t quote_dist = trailingzeroes(helper.quote_bits);
// NULL termination is still handy if you expect all your strings to be NULL terminated?
// It comes at a small cost
@ -158,9 +191,9 @@ really_inline bool parse_string(UNUSED const uint8_t *buf, UNUSED size_t len,
#endif // JSON_TEST_STRINGS
return true;
}
if(((quote_bits - 1) & bs_bits ) != 0 ) {
if(((helper.quote_bits - 1) & helper.bs_bits ) != 0 ) {
// find out where the backslash is
uint32_t bs_dist = trailingzeroes(bs_bits);
uint32_t bs_dist = trailingzeroes(helper.bs_bits);
uint8_t escape_char = src[bs_dist + 1];
// we encountered backslash first. Handle backslash
if (escape_char == 'u') {

View File

@ -1,578 +1 @@
#include <cassert>
#include <cstring>
#include "simdjson/common_defs.h"
#include "simdjson/jsoncharutils.h"
#include "simdjson/numberparsing.h"
#include "simdjson/parsedjson.h"
#include "simdjson/stringparsing.h"
#include "simdjson/simdjson.h"
#include <iostream>
#define PATH_SEP '/'
WARN_UNUSED
// Returns true when the four bytes starting at loc spell "true" and the byte
// that follows them (loc[4]) is a structural character or whitespace.
//
// loc: pointer to the first byte of the candidate atom. Five bytes are
//      inspected (loc[0] .. loc[4]), so the caller must guarantee that they
//      are readable; the parser relies on SIMDJSON_PADDING for that.
really_inline bool is_valid_true_atom(const uint8_t *loc) {
  // Build the reference pattern with memcpy instead of dereferencing the
  // string literal through a uint64_t pointer: that cast is an unaligned,
  // aliasing-violating load and reads more bytes than the literal holds.
  // Copying bytes also keeps the comparison independent of endianness.
  uint32_t tv;
  std::memcpy(&tv, "true", sizeof(uint32_t));
  // The lookahead byte loc[4] can sit up to 4 bytes past the last document
  // byte, which must be covered by the padding.
  static_assert(sizeof(uint32_t) <= SIMDJSON_PADDING,
                "SIMDJSON_PADDING must cover the atom lookahead");
  uint32_t locval; // memcpy avoids an unaligned 32-bit load (undefined in C/C++)
  std::memcpy(&locval, loc, sizeof(uint32_t));
  uint32_t error = locval ^ tv; // zero iff all four characters match
  error |= is_not_structural_or_whitespace(loc[4]);
  return error == 0;
}
WARN_UNUSED
// Returns true when the five bytes starting at loc spell "false" and the byte
// that follows them (loc[5]) is a structural character or whitespace.
//
// loc: pointer to the first byte of the candidate atom. Six bytes are
//      inspected (loc[0] .. loc[5]), so the caller must guarantee that they
//      are readable; the parser relies on SIMDJSON_PADDING for that.
really_inline bool is_valid_false_atom(const uint8_t *loc) {
  // "false" is 5 bytes long, so a 32-bit comparison would ignore its last
  // character; both sides are widened to 64 bits with the unused bytes zeroed.
  constexpr size_t atom_len = 5;
  // The reference pattern is copied from the literal rather than spelled as a
  // hard-coded integer: the integer form is only correct on little-endian
  // machines, whereas a byte copy matches the input layout on any target.
  uint64_t fv = 0;
  std::memcpy(&fv, "false", atom_len);
  // The lookahead byte loc[5] can sit up to 5 bytes past the last document
  // byte, which must be covered by the padding.
  static_assert(atom_len <= SIMDJSON_PADDING,
                "SIMDJSON_PADDING must cover the atom lookahead");
  uint64_t locval = 0; // memcpy avoids an unaligned load (undefined in C/C++)
  std::memcpy(&locval, loc, atom_len);
  uint64_t error = locval ^ fv; // zero iff all five characters match
  error |= is_not_structural_or_whitespace(loc[5]);
  return error == 0;
}
WARN_UNUSED
// Returns true when the four bytes starting at loc spell "null" and the byte
// that follows them (loc[4]) is a structural character or whitespace.
//
// loc: pointer to the first byte of the candidate atom. Five bytes are
//      inspected (loc[0] .. loc[4]), so the caller must guarantee that they
//      are readable; the parser relies on SIMDJSON_PADDING for that.
really_inline bool is_valid_null_atom(const uint8_t *loc) {
  // Build the reference pattern with memcpy instead of dereferencing the
  // string literal through a uint64_t pointer: that cast is an unaligned,
  // aliasing-violating load and reads more bytes than the literal holds.
  // Copying bytes also keeps the comparison independent of endianness.
  uint32_t nv;
  std::memcpy(&nv, "null", sizeof(uint32_t));
  // The lookahead byte loc[4] can sit up to 4 bytes past the last document
  // byte, which must be covered by the padding.
  static_assert(sizeof(uint32_t) <= SIMDJSON_PADDING,
                "SIMDJSON_PADDING must cover the atom lookahead");
  uint32_t locval; // memcpy avoids an unaligned 32-bit load (undefined in C/C++)
  std::memcpy(&locval, loc, sizeof(uint32_t));
  uint32_t error = locval ^ nv; // zero iff all four characters match
  error |= is_not_structural_or_whitespace(loc[4]);
  return error == 0;
}
/************
* The JSON is parsed to a tape, see the accompanying tape.md file
* for documentation.
***********/
WARN_UNUSED ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER LENIENT_MEM_SANITIZER
int unified_machine(const uint8_t *buf, size_t len, ParsedJson &pj) {
#ifndef ALLOW_SAME_PAGE_BUFFER_OVERRUN
memset((uint8_t*)buf + len, 0, SIMDJSON_PADDING); // to please valgrind
#endif
uint32_t i = 0; // index of the structural character (0,1,2,3...)
uint32_t idx; // location of the structural character in the input (buf)
uint8_t c; // used to track the (structural) character we are looking at, updated
// by UPDATE_CHAR macro
uint32_t depth = 0; // could have an arbitrary starting depth
pj.init(); // sets isvalid to false
if(pj.bytecapacity < len) {
pj.errorcode = simdjson::CAPACITY;
return pj.errorcode;
}
// this macro reads the next structural character, updating idx, i and c.
#define UPDATE_CHAR() \
{ \
idx = pj.structural_indexes[i++]; \
c = buf[idx]; \
}
////////////////////////////// START STATE /////////////////////////////
#ifdef SIMDJSON_USE_COMPUTED_GOTO
pj.ret_address[depth] = &&start_continue;
#else
pj.ret_address[depth] = 's';
#endif
pj.containing_scope_offset[depth] = pj.get_current_loc();
pj.write_tape(0, 'r'); // r for root, 0 is going to get overwritten
// the root is used, if nothing else, to capture the size of the tape
depth++; // everything starts at depth = 1, depth = 0 is just for the root, the root may contain an object, an array or something else.
if (depth >= pj.depthcapacity) {
goto fail;
}
UPDATE_CHAR();
switch (c) {
case '{':
pj.containing_scope_offset[depth] = pj.get_current_loc();
#ifdef SIMDJSON_USE_COMPUTED_GOTO
pj.ret_address[depth] = &&start_continue;
#else
pj.ret_address[depth] = 's';
#endif
depth++;
if (depth >= pj.depthcapacity) {
goto fail;
}
pj.write_tape(0, c); // strangely, moving this to object_begin slows things down
goto object_begin;
case '[':
pj.containing_scope_offset[depth] = pj.get_current_loc();
#ifdef SIMDJSON_USE_COMPUTED_GOTO
pj.ret_address[depth] = &&start_continue;
#else
pj.ret_address[depth] = 's';
#endif
depth++;
if (depth >= pj.depthcapacity) {
goto fail;
}
pj.write_tape(0, c);
goto array_begin;
#define SIMDJSON_ALLOWANYTHINGINROOT
// A JSON text is a serialized value. Note that certain previous
// specifications of JSON constrained a JSON text to be an object or an
// array. Implementations that generate only objects or arrays where a
// JSON text is called for will be interoperable in the sense that all
// implementations will accept these as conforming JSON texts.
// https://tools.ietf.org/html/rfc8259
#ifdef SIMDJSON_ALLOWANYTHINGINROOT
case '"': {
if (!parse_string(buf, len, pj, depth, idx)) {
goto fail;
}
break;
}
case 't': {
// we need to make a copy to make sure that the string is space terminated.
// this only applies to the JSON document made solely of the true value.
// this will almost never be called in practice
char * copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING));
if(copy == nullptr) {
goto fail;
}
memcpy(copy, buf, len);
copy[len] = ' ';
if (!is_valid_true_atom(reinterpret_cast<const uint8_t *>(copy) + idx)) {
free(copy);
goto fail;
}
free(copy);
pj.write_tape(0, c);
break;
}
case 'f': {
// we need to make a copy to make sure that the string is space terminated.
// this only applies to the JSON document made solely of the false value.
// this will almost never be called in practice
char * copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING));
if(copy == nullptr) {
goto fail;
}
memcpy(copy, buf, len);
copy[len] = ' ';
if (!is_valid_false_atom(reinterpret_cast<const uint8_t *>(copy) + idx)) {
free(copy);
goto fail;
}
free(copy);
pj.write_tape(0, c);
break;
}
case 'n': {
// we need to make a copy to make sure that the string is space terminated.
// this only applies to the JSON document made solely of the null value.
// this will almost never be called in practice
char * copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING));
if(copy == nullptr) {
goto fail;
}
memcpy(copy, buf, len);
copy[len] = ' ';
if (!is_valid_null_atom(reinterpret_cast<const uint8_t *>(copy) + idx)) {
free(copy);
goto fail;
}
free(copy);
pj.write_tape(0, c);
break;
}
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9': {
// we need to make a copy to make sure that the string is space terminated.
// this is done only for JSON documents made of a sole number
// this will almost never be called in practice. We terminate with a space
// because we do not want to allow NULLs in the middle of a number (whereas a
// space in the middle of a number would be identified in stage 1).
char * copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING));
if(copy == nullptr) {
goto fail;
}
memcpy(copy, buf, len);
copy[len] = ' ';
if (!parse_number(reinterpret_cast<const uint8_t *>(copy), pj, idx, false)) {
free(copy);
goto fail;
}
free(copy);
break;
}
case '-': {
// we need to make a copy to make sure that the string is NULL terminated.
// this is done only for JSON documents made of a sole number
// this will almost never be called in practice
char * copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING));
if(copy == nullptr) {
goto fail;
}
memcpy(copy, buf, len);
copy[len] = '\0';
if (!parse_number(reinterpret_cast<const uint8_t *>(copy), pj, idx, true)) {
free(copy);
goto fail;
}
free(copy);
break;
}
#endif // ALLOWANYTHINGINROOT
default:
goto fail;
}
start_continue:
// the string might not be NULL terminated.
if(i + 1 == pj.n_structural_indexes) {
goto succeed;
} else {
goto fail;
}
////////////////////////////// OBJECT STATES /////////////////////////////
object_begin:
UPDATE_CHAR();
switch (c) {
case '"': {
if (!parse_string(buf, len, pj, depth, idx)) {
goto fail;
}
goto object_key_state;
}
case '}':
goto scope_end; // could also go to object_continue
default:
goto fail;
}
object_key_state:
UPDATE_CHAR();
if (c != ':') {
goto fail;
}
UPDATE_CHAR();
switch (c) {
case '"': {
if (!parse_string(buf, len, pj, depth, idx)) {
goto fail;
}
break;
}
case 't':
if (!is_valid_true_atom(buf + idx)) {
goto fail;
}
pj.write_tape(0, c);
break;
case 'f':
if (!is_valid_false_atom(buf + idx)) {
goto fail;
}
pj.write_tape(0, c);
break;
case 'n':
if (!is_valid_null_atom(buf + idx)) {
goto fail;
}
pj.write_tape(0, c);
break;
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9': {
if (!parse_number(buf, pj, idx, false)) {
goto fail;
}
break;
}
case '-': {
if (!parse_number(buf, pj, idx, true)) {
goto fail;
}
break;
}
case '{': {
pj.containing_scope_offset[depth] = pj.get_current_loc();
pj.write_tape(0, c); // here the compilers knows what c is so this gets optimized
// we have not yet encountered } so we need to come back for it
#ifdef SIMDJSON_USE_COMPUTED_GOTO
pj.ret_address[depth] = &&object_continue;
#else
pj.ret_address[depth] = 'o';
#endif
// we found an object inside an object, so we need to increment the depth
depth++;
if (depth >= pj.depthcapacity) {
goto fail;
}
goto object_begin;
}
case '[': {
pj.containing_scope_offset[depth] = pj.get_current_loc();
pj.write_tape(0, c); // here the compilers knows what c is so this gets optimized
// we have not yet encountered } so we need to come back for it
#ifdef SIMDJSON_USE_COMPUTED_GOTO
pj.ret_address[depth] = &&object_continue;
#else
pj.ret_address[depth] = 'o';
#endif
// we found an array inside an object, so we need to increment the depth
depth++;
if (depth >= pj.depthcapacity) {
goto fail;
}
goto array_begin;
}
default:
goto fail;
}
object_continue:
UPDATE_CHAR();
switch (c) {
case ',':
UPDATE_CHAR();
if (c != '"') {
goto fail;
} else {
if (!parse_string(buf, len, pj, depth, idx)) {
goto fail;
}
goto object_key_state;
}
case '}':
goto scope_end;
default:
goto fail;
}
////////////////////////////// COMMON STATE /////////////////////////////
scope_end:
// write our tape location to the header scope
depth--;
pj.write_tape(pj.containing_scope_offset[depth], c);
pj.annotate_previousloc(pj.containing_scope_offset[depth],
pj.get_current_loc());
// goto saved_state
#ifdef SIMDJSON_USE_COMPUTED_GOTO
goto *pj.ret_address[depth];
#else
if(pj.ret_address[depth] == 'a') {
goto array_continue;
} else if (pj.ret_address[depth] == 'o') {
goto object_continue;
} else goto start_continue;
#endif
////////////////////////////// ARRAY STATES /////////////////////////////
array_begin:
UPDATE_CHAR();
if (c == ']') {
goto scope_end; // could also go to array_continue
}
main_array_switch:
// we call update char on all paths in, so we can peek at c on the
// paths that can accept a close square brace (post-, and at start)
switch (c) {
case '"': {
if (!parse_string(buf, len, pj, depth, idx)) {
goto fail;
}
break;
}
case 't':
if (!is_valid_true_atom(buf + idx)) {
goto fail;
}
pj.write_tape(0, c);
break;
case 'f':
if (!is_valid_false_atom(buf + idx)) {
goto fail;
}
pj.write_tape(0, c);
break;
case 'n':
if (!is_valid_null_atom(buf + idx)) {
goto fail;
}
pj.write_tape(0, c);
break; // goto array_continue;
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9': {
if (!parse_number(buf, pj, idx, false)) {
goto fail;
}
break; // goto array_continue;
}
case '-': {
if (!parse_number(buf, pj, idx, true)) {
goto fail;
}
break; // goto array_continue;
}
case '{': {
// we have not yet encountered ] so we need to come back for it
pj.containing_scope_offset[depth] = pj.get_current_loc();
pj.write_tape(0, c); // here the compilers knows what c is so this gets optimized
#ifdef SIMDJSON_USE_COMPUTED_GOTO
pj.ret_address[depth] = &&array_continue;
#else
pj.ret_address[depth] = 'a';
#endif
// we found an object inside an array, so we need to increment the depth
depth++;
if (depth >= pj.depthcapacity) {
goto fail;
}
goto object_begin;
}
case '[': {
// we have not yet encountered ] so we need to come back for it
pj.containing_scope_offset[depth] = pj.get_current_loc();
pj.write_tape(0, c); // here the compilers knows what c is so this gets optimized
#ifdef SIMDJSON_USE_COMPUTED_GOTO
pj.ret_address[depth] = &&array_continue;
#else
pj.ret_address[depth] = 'a';
#endif
// we found an array inside an array, so we need to increment the depth
depth++;
if (depth >= pj.depthcapacity) {
goto fail;
}
goto array_begin;
}
default:
goto fail;
}
array_continue:
UPDATE_CHAR();
switch (c) {
case ',':
UPDATE_CHAR();
goto main_array_switch;
case ']':
goto scope_end;
default:
goto fail;
}
////////////////////////////// FINAL STATES /////////////////////////////
succeed:
depth --;
if(depth != 0) {
fprintf(stderr, "internal bug\n");
abort();
}
if(pj.containing_scope_offset[depth] != 0) {
fprintf(stderr, "internal bug\n");
abort();
}
pj.annotate_previousloc(pj.containing_scope_offset[depth],
pj.get_current_loc());
pj.write_tape(pj.containing_scope_offset[depth], 'r'); // r is root
pj.isvalid = true;
pj.errorcode = simdjson::SUCCESS;
return pj.errorcode;
fail:
// we do not need the next line because this is done by pj.init(), pessimistically.
// pj.isvalid = false;
// At this point in the code, we have all the time in the world.
// Note that we know exactly where we are in the document so we could,
// without any overhead on the processing code, report a specific location.
// We could even trigger special code paths to assess what happened carefully,
// all without any added cost.
if (depth >= pj.depthcapacity) {
pj.errorcode = simdjson::DEPTH_ERROR;
return pj.errorcode;
}
switch(c) {
case '"':
pj.errorcode = simdjson::STRING_ERROR;
return pj.errorcode;
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
case '-':
pj.errorcode = simdjson::NUMBER_ERROR;
return pj.errorcode;
case 't':
pj.errorcode = simdjson::T_ATOM_ERROR;
return pj.errorcode;
case 'n':
pj.errorcode = simdjson::N_ATOM_ERROR;
return pj.errorcode;
case 'f':
pj.errorcode = simdjson::F_ATOM_ERROR;
return pj.errorcode;
default:
break;
}
pj.errorcode = simdjson::TAPE_ERROR;
return pj.errorcode;
}
// Convenience overload for callers that hold the JSON text as a char buffer.
// The buffer pointer is reinterpreted as a pointer to bytes and the call is
// forwarded, with len and pj unchanged, to unified_machine; the value produced
// by that call is returned as-is.
int unified_machine(const char *buf, size_t len, ParsedJson &pj) {
  const uint8_t *bytes = reinterpret_cast<const uint8_t *>(buf);
  return unified_machine(bytes, len, pj);
}
// File kept in case we want to reuse it soon. (many configuration files to edit)

View File

@ -38,7 +38,7 @@ bool is_in_bad_list(const char *buf) {
return false;
}
inline void foundInvalidNumber(const uint8_t *buf) {
void foundInvalidNumber(const uint8_t *buf) {
invalid_count++;
char *endptr;
double expected = strtod((const char *)buf, &endptr);
@ -53,7 +53,7 @@ inline void foundInvalidNumber(const uint8_t *buf) {
}
}
inline void foundInteger(int64_t result, const uint8_t *buf) {
void foundInteger(int64_t result, const uint8_t *buf) {
int_count++;
char *endptr;
long long expected = strtoll((const char *)buf, &endptr, 10);
@ -64,7 +64,7 @@ inline void foundInteger(int64_t result, const uint8_t *buf) {
}
}
inline void foundFloat(double result, const uint8_t *buf) {
void foundFloat(double result, const uint8_t *buf) {
char *endptr;
float_count++;
double expected = strtod((const char *)buf, &endptr);

View File

@ -203,7 +203,7 @@ static bool parse_string(const char *p, char *output, char **end) {
// end of borrowed code
char *bigbuffer; // global variable
inline void foundBadString(const uint8_t *buf) {
void foundBadString(const uint8_t *buf) {
bad_string++;
char *end;
if (parse_string((const char *)buf, bigbuffer, &end)) {
@ -226,7 +226,7 @@ void print_cmp_hex(const char *s1, const char *s2, size_t len) {
}
}
inline void foundString(const uint8_t *buf, const uint8_t *parsed_begin,
void foundString(const uint8_t *buf, const uint8_t *parsed_begin,
const uint8_t *parsed_end) {
size_t thislen = parsed_end - parsed_begin;
total_string_length += thislen;