Merge branch 'master' of github.com:lemire/simdjson

This commit is contained in:
Daniel Lemire 2020-01-02 15:27:00 -05:00
commit a2d05b21ff
7 changed files with 479 additions and 970 deletions

View File

@ -64,7 +64,8 @@ jobs:
done
- name: Run the other fuzzer variants for $fuzzer, with sanitizers etc
run: |
for fuzzer in $allfuzzers; do
set -x
for fuzzer in $allfuzzers; do
build-ossfuzz-withavx/fuzz/fuzz_$fuzzer out/$fuzzer -max_total_time=20 $artifactsprefix || touch failed
build-ossfuzz-noavx/fuzz/fuzz_$fuzzer out/$fuzzer -max_total_time=10 $artifactsprefix || touch failed
build-ossfuzz-noavx8/fuzz/fuzz_$fuzzer out/$fuzzer -max_total_time=10 $artifactsprefix || touch failed

View File

@ -14,7 +14,8 @@ WARN_UNUSED int
unified_machine(const uint8_t *buf, size_t len, ParsedJson &pj);
template <Architecture T = Architecture::NATIVE>
int unified_machine(const char *buf, size_t len, ParsedJson &pj) {
WARN_UNUSED int
unified_machine(const char *buf, size_t len, ParsedJson &pj) {
return unified_machine<T>(reinterpret_cast<const uint8_t *>(buf), len, pj);
}

View File

@ -21,13 +21,13 @@ namespace simdjson {
template <>
WARN_UNUSED int
unified_machine<Architecture::ARM64>(const uint8_t *buf, size_t len, ParsedJson &pj) {
return arm64::unified_machine(buf, len, pj);
return arm64::stage2::unified_machine(buf, len, pj);
}
template <>
WARN_UNUSED int
unified_machine<Architecture::ARM64>(const uint8_t *buf, size_t len, ParsedJson &pj, size_t &next_json) {
return arm64::unified_machine(buf, len, pj, next_json);
return arm64::stage2::unified_machine(buf, len, pj, next_json);
}
} // namespace simdjson

View File

@ -3,192 +3,86 @@
// We assume the file in which it is included already includes
// "simdjson/stage2_build_tape.h" (this simplifies amalgamation)
// this macro reads the next structural character, updating idx, i and c.
#define UPDATE_CHAR() \
{ \
idx = pj.structural_indexes[i++]; \
c = buf[idx]; \
}
namespace stage2 {
#ifdef SIMDJSON_USE_COMPUTED_GOTO
#define SET_GOTO_ARRAY_CONTINUE() pj.ret_address[depth] = &&array_continue;
#define SET_GOTO_OBJECT_CONTINUE() pj.ret_address[depth] = &&object_continue;
#define SET_GOTO_START_CONTINUE() pj.ret_address[depth] = &&start_continue;
#define GOTO_CONTINUE() goto *pj.ret_address[depth];
typedef void* ret_address;
#define INIT_ADDRESSES() { &&array_begin, &&array_continue, &&error, &&finish, &&object_begin, &&object_continue }
#define GOTO(address) { goto *(address); }
#define CONTINUE(address) { goto *(address); }
#else
#define SET_GOTO_ARRAY_CONTINUE() pj.ret_address[depth] = 'a';
#define SET_GOTO_OBJECT_CONTINUE() pj.ret_address[depth] = 'o';
#define SET_GOTO_START_CONTINUE() pj.ret_address[depth] = 's';
#define GOTO_CONTINUE() \
{ \
if (pj.ret_address[depth] == 'a') { \
goto array_continue; \
} else if (pj.ret_address[depth] == 'o') { \
goto object_continue; \
} else { \
goto start_continue; \
} \
typedef char ret_address;
// Character tags that stand in for label addresses when computed goto is not
// available. The order matches the fields of unified_machine_addresses
// (array_begin, array_continue, error, finish, object_begin, object_continue).
// Deliberately no trailing semicolon, so the macro expands to a plain
// initializer exactly like the computed-goto variant does.
#define INIT_ADDRESSES() { '[', 'a', 'e', 'f', '{', 'o' }
#define GOTO(address) \
{ \
switch(address) { \
case '[': goto array_begin; \
case 'a': goto array_continue; \
case 'e': goto error; \
case 'f': goto finish; \
case '{': goto object_begin; \
case 'o': goto object_continue; \
} \
}
// For the more constrained pop_scope() situation
#define CONTINUE(address) \
{ \
switch(address) { \
case 'a': goto array_continue; \
case 'o': goto object_continue; \
case 'f': goto finish; \
} \
}
#endif
/************
* The JSON is parsed to a tape, see the accompanying tape.md file
* for documentation.
***********/
WARN_UNUSED int
unified_machine(const uint8_t *buf, size_t len, ParsedJson &pj) {
uint32_t i = 0; /* index of the structural character (0,1,2,3...) */
uint32_t idx; /* location of the structural character in the input (buf) */
uint8_t c; /* used to track the (structural) character we are looking at,
updated */
/* by UPDATE_CHAR macro */
uint32_t depth = 0; /* could have an arbitrary starting depth */
pj.init(); /* sets is_valid to false */
if (pj.byte_capacity < len) {
pj.error_code = simdjson::CAPACITY;
return pj.error_code;
// Bundle of the six jump targets of the unified_machine() state machine, one
// ret_address per state label. It is aggregate-initialized from
// INIT_ADDRESSES(), so the field order below must stay in sync with that macro.
struct unified_machine_addresses {
// Target for the array_begin label.
ret_address array_begin;
// Target for the array_continue label.
ret_address array_continue;
// Target for the error label.
ret_address error;
// Target for the finish label.
ret_address finish;
// Target for the object_begin label.
ret_address object_begin;
// Target for the object_continue label.
ret_address object_continue;
};
#undef FAIL_IF
#define FAIL_IF(EXPR) { if (EXPR) { return addresses.error; } }
// Free-function forwarder to the non-member parse_string().
// This is just so we can call parse_string() from parser.parse_string() without conflict:
// inside structural_parser the member of the same name would otherwise be found first.
// The result of parse_string() is returned unchanged; the member wrapper
// negates it, i.e. a false result here is treated as a failure.
WARN_UNUSED really_inline bool
really_parse_string(const uint8_t *buf, size_t len, ParsedJson &pj, uint32_t depth, uint32_t idx) {
return parse_string(buf, len, pj, depth, idx);
}
// Free-function forwarder to the non-member parse_number(), for the same
// reason as really_parse_string(): structural_parser has members named
// parse_number() that would otherwise be found first.
// The result of parse_number() is returned unchanged; the member wrappers
// negate it, i.e. a false result here is treated as a failure.
WARN_UNUSED really_inline bool
really_parse_number(const uint8_t *const buf, ParsedJson &pj, const uint32_t offset, bool found_minus) {
return parse_number(buf, pj, offset, found_minus);
}
struct structural_parser {
const uint8_t* const buf;
const size_t len;
ParsedJson &pj;
uint32_t i; // next structural index
uint32_t idx; // location of the structural character in the input (buf)
uint8_t c; // used to track the (structural) character we are looking at
uint32_t depth = 0; // could have an arbitrary starting depth
really_inline structural_parser(const uint8_t *_buf, size_t _len, ParsedJson &_pj, uint32_t _i = 0) : buf{_buf}, len{_len}, pj{_pj}, i{_i} {}
// Records error_code in pj.error_code and returns the same value, so callers
// can write `return set_error_code(X);` to both store and report a status.
WARN_UNUSED really_inline int set_error_code(ErrorValues error_code) {
pj.error_code = error_code;
return error_code;
}
/*//////////////////////////// START STATE /////////////////////////////
*/
SET_GOTO_START_CONTINUE()
pj.containing_scope_offset[depth] = pj.get_current_loc();
pj.write_tape(0, 'r'); /* r for root, 0 is going to get overwritten */
/* the root is used, if nothing else, to capture the size of the tape */
depth++; /* everything starts at depth = 1, depth = 0 is just for the
root, the root may contain an object, an array or something
else. */
if (depth >= pj.depth_capacity) {
goto fail;
// Moves on to the next structural character: loads its input offset into idx,
// bumps the structural index i, caches the byte found at that offset in c and
// returns it.
really_inline char advance_char() {
  idx = pj.structural_indexes[i];
  i += 1;
  c = buf[idx];
  return c;
}
UPDATE_CHAR();
switch (c) {
case '{':
pj.containing_scope_offset[depth] = pj.get_current_loc();
SET_GOTO_START_CONTINUE();
depth++;
if (depth >= pj.depth_capacity) {
goto fail;
}
pj.write_tape(
0, c); /* strangely, moving this to object_begin slows things down */
goto object_begin;
case '[':
pj.containing_scope_offset[depth] = pj.get_current_loc();
SET_GOTO_START_CONTINUE();
depth++;
if (depth >= pj.depth_capacity) {
goto fail;
}
pj.write_tape(0, c);
goto array_begin;
template<typename F>
really_inline bool with_space_terminated_copy(const F& f) {
/**
* A JSON text is a serialized value. Note that certain previous
* specifications of JSON constrained a JSON text to be an object or an
* array. Implementations that generate only objects or arrays where a
* JSON text is called for will be interoperable in the sense that all
* implementations will accept these as conforming JSON texts.
* https://tools.ietf.org/html/rfc8259
**/
case '"': {
if (!parse_string(buf, len, pj, depth, idx)) {
goto fail;
}
break;
}
case 't': {
/* We need to make a copy to make sure that the string is space
* terminated.
* This only applies to the JSON document made solely of the true
* value.
* This is not about padding the input, which should already be padded up
* to len + SIMDJSON_PADDING. However, we have no control at this stage
* on how the padding was done. What if the input string was padded with nulls?
* It is quite common for an input string to have an extra null character (C string).
* This copy is relatively expensive, but it will almost never be called in
* practice unless you are in the strange scenario where you have many JSON
* documents made of single atoms.
*/
char *copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING));
if (copy == nullptr) {
goto fail;
}
memcpy(copy, buf, len);
memset(copy + len, ' ', sizeof(uint64_t));
if (!is_valid_true_atom(reinterpret_cast<const uint8_t *>(copy) + idx)) {
free(copy);
goto fail;
}
free(copy);
pj.write_tape(0, c);
break;
}
case 'f': {
/* We need to make a copy to make sure that the string is space
* terminated.
* This only applies to the JSON document made solely of the false
* value.
* This is not about padding the input, which should already be padded up
* to len + SIMDJSON_PADDING. However, we have no control at this stage
* on how the padding was done. What if the input string was padded with nulls?
* It is quite common for an input string to have an extra null character (C string).
* This copy is relatively expensive, but it will almost never be called in
* practice unless you are in the strange scenario where you have many JSON
* documents made of single atoms.
*/
char *copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING));
if (copy == nullptr) {
goto fail;
}
memcpy(copy, buf, len);
memset(copy + len, ' ', sizeof(uint64_t));
if (!is_valid_false_atom(reinterpret_cast<const uint8_t *>(copy) + idx)) {
free(copy);
goto fail;
}
free(copy);
pj.write_tape(0, c);
break;
}
case 'n': {
/* We need to make a copy to make sure that the string is space
* terminated.
* This is not about padding the input, which should already padded up
* to len + SIMDJSON_PADDING. However, we have no control at this stage
* on how the padding was done. What if the input string was padded with nulls?
* It is quite common for an input string to have an extra null character (C string).
* This only applies to the JSON document made solely of the null value.
* This copy is relatively expensive, but it will almost never be called in
* practice unless you are in the strange scenario where you have many JSON
* documents made of single atoms.
*/
char *copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING));
if (copy == nullptr) {
goto fail;
}
memcpy(copy, buf, len);
memset(copy + len, ' ', sizeof(uint64_t));
if (!is_valid_null_atom(reinterpret_cast<const uint8_t *>(copy) + idx)) {
free(copy);
goto fail;
}
free(copy);
pj.write_tape(0, c);
break;
}
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9': {
/**
* We need to make a copy to make sure that the input string is space terminated.
* We need to make a copy to make sure that the string is space terminated.
* This is not about padding the input, which should already be padded up
* to len + SIMDJSON_PADDING. However, we have no control at this stage
* on how the padding was done. What if the input string was padded with nulls?
@ -202,352 +96,301 @@ unified_machine(const uint8_t *buf, size_t len, ParsedJson &pj) {
*/
char *copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING));
if (copy == nullptr) {
goto fail;
return true;
}
memcpy(copy, buf, len);
memset(copy + len, ' ', SIMDJSON_PADDING);
if (!parse_number(reinterpret_cast<const uint8_t *>(copy), pj, idx,
false)) {
free(copy);
goto fail;
}
bool result = f(reinterpret_cast<const uint8_t*>(copy), idx);
free(copy);
break;
return result;
}
case '-': {
/**
* We need to make a copy to make sure that the input string is space terminated.
* This is not about padding the input, which should already padded up
* to len + SIMDJSON_PADDING. However, we have no control at this stage
* on how the padding was done. What if the input string was padded with nulls?
* It is quite common for an input string to have an extra null character (C string).
* We do not want to allow -9\0 (where \0 is the null character) inside a JSON
* document, but the string "-9\0" by itself is fine. So we make a copy and
* pad the input with spaces when we know that there is just one input element.
* This copy is relatively expensive, but it will almost never be called in
* practice unless you are in the strange scenario where you have many JSON
* documents made of single atoms.
*/
char *copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING));
if (copy == nullptr) {
goto fail;
}
memcpy(copy, buf, len);
memset(copy + len, ' ', SIMDJSON_PADDING);
if (!parse_number(reinterpret_cast<const uint8_t *>(copy), pj, idx, true)) {
free(copy);
goto fail;
}
free(copy);
break;
}
default:
goto fail;
}
start_continue:
/* the string might not be NULL terminated. */
if (i + 1 == pj.n_structural_indexes) {
goto succeed;
} else {
goto fail;
}
/*//////////////////////////// OBJECT STATES ///////////////////////////*/
object_begin:
UPDATE_CHAR();
switch (c) {
case '"': {
if (!parse_string(buf, len, pj, depth, idx)) {
goto fail;
// Opens a scope at the current depth: remembers the current tape location and
// the state to resume at when the scope closes, goes one level deeper, then
// writes a placeholder tape entry tagged with `type`.
// Returns true when the new depth has reached pj.depth_capacity (i.e. failure).
WARN_UNUSED really_inline bool push_start_scope(ret_address continue_state, char type) {
  const uint32_t opened_at = depth;
  pj.containing_scope_offset[opened_at] = pj.get_current_loc();
  pj.ret_address[opened_at] = continue_state;
  depth = opened_at + 1;
  pj.write_tape(0, type);
  const bool too_deep = depth >= pj.depth_capacity;
  return too_deep;
}
// Convenience overload: tags the scope with the current structural character c.
WARN_UNUSED really_inline bool push_start_scope(ret_address continue_state) {
  const char scope_type = c;
  return push_start_scope(continue_state, scope_type);
}
// Opens a nested scope for the current structural character c. Unlike
// push_start_scope(), the placeholder tape entry is written before the resume
// state is stored, so that the tape write happens as early as possible.
// Returns true when the new depth has reached pj.depth_capacity (i.e. failure).
WARN_UNUSED really_inline bool push_scope(ret_address continue_state) {
  const uint32_t opened_at = depth;
  pj.containing_scope_offset[opened_at] = pj.get_current_loc();
  pj.write_tape(0, c); // keep this ahead of the bookkeeping below
  pj.ret_address[opened_at] = continue_state;
  depth = opened_at + 1;
  const bool too_deep = depth >= pj.depth_capacity;
  return too_deep;
}
// Closes the innermost scope: steps back one depth level, writes a closing
// tape entry (tagged with c) that refers to the scope's opening location, then
// annotates the opening entry with the tape location reached after that write.
// Returns the state that was saved when the scope was opened.
WARN_UNUSED really_inline ret_address pop_scope() {
  depth -= 1;
  const auto &scope_start = pj.containing_scope_offset[depth];
  pj.write_tape(scope_start, c);
  pj.annotate_previous_loc(scope_start, pj.get_current_loc());
  return pj.ret_address[depth];
}
// Closes the root scope. The order differs from pop_scope(): the opening
// entry is annotated with the current tape location *before* the closing 'r'
// entry is written, so the root records the location of that closing entry.
really_inline void pop_root_scope() {
  depth -= 1;
  const auto &root_start = pj.containing_scope_offset[depth];
  pj.annotate_previous_loc(root_start, pj.get_current_loc());
  pj.write_tape(root_start, 'r');
}
// Parses the string at the current structural position (idx) through
// really_parse_string(). Note the inverted result: this returns true on
// FAILURE, so that it can be passed straight to FAIL_IF().
WARN_UNUSED really_inline bool parse_string() {
return !really_parse_string(buf, len, pj, depth, idx);
}
// Parses a number located at `offset` inside `copy` (which may be the input
// buffer or a separate copy of it) through really_parse_number().
// found_minus is forwarded unchanged. Returns true on FAILURE.
WARN_UNUSED really_inline bool parse_number(const uint8_t *copy, uint32_t offset, bool found_minus) {
return !really_parse_number(copy, pj, offset, found_minus);
}
// Same as above, reading directly from the input buffer at the current
// structural position (idx). Returns true on FAILURE.
WARN_UNUSED really_inline bool parse_number(bool found_minus) {
return parse_number(buf, idx, found_minus);
}
// Validates the atom selected by the current character c ('t', 'f' or 'n') at
// copy + offset and, when it is valid, writes a tape entry tagged with c.
// Returns true on FAILURE (the atom did not validate) and false otherwise.
// If c is none of 't', 'f', 'n', nothing is written and false is returned.
WARN_UNUSED really_inline bool parse_atom(const uint8_t *copy, uint32_t offset) {
  const uint8_t *const atom = copy + offset;
  bool atom_ok;
  switch (c) {
  case 't': atom_ok = is_valid_true_atom(atom);  break;
  case 'f': atom_ok = is_valid_false_atom(atom); break;
  case 'n': atom_ok = is_valid_null_atom(atom);  break;
  default:
    return false;
  }
  if (!atom_ok) {
    return true;
  }
  pj.write_tape(0, c);
  return false;
}
// Same as above, reading directly from the input buffer at the current
// structural position (idx).
WARN_UNUSED really_inline bool parse_atom() {
  return parse_atom(buf, idx);
}
// Handles the value that starts at the current structural character c and
// returns the state the caller should jump to next:
//  - scalars (string, atom, number): continue_state on success;
//  - '{' / '[': a new scope is pushed (resuming at continue_state once it
//    closes) and addresses.object_begin / addresses.array_begin is returned;
//  - any parse failure, depth overflow or unexpected character: addresses.error.
WARN_UNUSED really_inline ret_address parse_value(const unified_machine_addresses &addresses, ret_address continue_state) {
  switch (c) {
  case '"':
    if (parse_string()) { return addresses.error; }
    return continue_state;

  case 't': case 'f': case 'n':
    if (parse_atom()) { return addresses.error; }
    return continue_state;

  case '0': case '1': case '2': case '3': case '4':
  case '5': case '6': case '7': case '8': case '9':
    if (parse_number(false)) { return addresses.error; }
    return continue_state;

  case '-':
    if (parse_number(true)) { return addresses.error; }
    return continue_state;

  case '{':
    if (push_scope(continue_state)) { return addresses.error; }
    return addresses.object_begin;

  case '[':
    if (push_scope(continue_state)) { return addresses.error; }
    return addresses.array_begin;

  default:
    break;
  }
  return addresses.error;
}
// Final state of a successful parse. Verifies that exactly one structural
// index remains (the string might not be NULL terminated), closes the root
// scope, and checks that this brought us back to depth 0 with the root scope
// starting at tape location 0. Any violation is reported as TAPE_ERROR;
// otherwise the document is marked valid and SUCCESS is recorded and returned.
WARN_UNUSED really_inline int finish() {
  const bool consumed_everything = (i + 1 == pj.n_structural_indexes);
  if (!consumed_everything) {
    return set_error_code(TAPE_ERROR);
  }
  pop_root_scope();
  if (depth != 0 || pj.containing_scope_offset[depth] != 0) {
    return set_error_code(TAPE_ERROR);
  }
  pj.valid = true;
  return set_error_code(SUCCESS);
}
// Error state: classifies the failure, stores it in pj.error_code and returns it.
//
// The document does not have to be flagged invalid here because pj.init()
// already did so pessimistically. This path is not performance sensitive, and
// since we know exactly where we are in the document, a more precise
// diagnostic (e.g. a location) could be produced here at no cost to the
// regular parsing path.
WARN_UNUSED really_inline int error() {
  // Running out of depth takes precedence over whatever character we stopped on.
  if (depth >= pj.depth_capacity) {
    return set_error_code(DEPTH_ERROR);
  }
  // Otherwise the character under the cursor tells which kind of value failed.
  ErrorValues code = TAPE_ERROR;
  switch (c) {
  case '"':
    code = STRING_ERROR;
    break;
  case '-':
  case '0': case '1': case '2': case '3': case '4':
  case '5': case '6': case '7': case '8': case '9':
    code = NUMBER_ERROR;
    break;
  case 't':
    code = T_ATOM_ERROR;
    break;
  case 'n':
    code = N_ATOM_ERROR;
    break;
  case 'f':
    code = F_ATOM_ERROR;
    break;
  default:
    break;
  }
  return set_error_code(code);
}
// Prepares the parser for a new document.
//
// Resets pj, rejects inputs longer than pj.byte_capacity, reads the first
// structural character and opens the root scope, which resumes at
// finish_state once it closes.
//
// Returns SUCCESS (0) when parsing can proceed. On failure the error code
// (CAPACITY or DEPTH_ERROR) is both returned and stored in pj.error_code via
// set_error_code(), so that pj.error_code agrees with the returned status the
// same way it does on every other exit path (finish() and error()).
WARN_UNUSED really_inline int start(ret_address finish_state) {
  pj.init(); // sets is_valid to false
  if (len > pj.byte_capacity) {
    return set_error_code(CAPACITY);
  }
  // Advance to the first character as soon as possible
  advance_char();
  // Push the root scope (there is always at least one scope)
  if (push_start_scope(finish_state, 'r')) {
    return set_error_code(DEPTH_ERROR);
  }
  return SUCCESS;
}
};
// Redefine FAIL_IF to use goto since it'll be used inside the function now
#undef FAIL_IF
#define FAIL_IF(EXPR) { if (EXPR) { goto error; } }
/************
* The JSON is parsed to a tape, see the accompanying tape.md file
* for documentation.
***********/
WARN_UNUSED int
unified_machine(const uint8_t *buf, size_t len, ParsedJson &pj) {
static constexpr unified_machine_addresses addresses = INIT_ADDRESSES();
structural_parser parser(buf, len, pj);
int result = parser.start(addresses.finish);
if (result) { return result; }
//
// Read first value
//
switch (parser.c) {
case '{':
FAIL_IF( parser.push_start_scope(addresses.finish) );
goto object_begin;
case '[':
FAIL_IF( parser.push_start_scope(addresses.finish) );
goto array_begin;
case '"':
FAIL_IF( parser.parse_string() );
goto finish;
case 't': case 'f': case 'n':
FAIL_IF(
parser.with_space_terminated_copy([&](auto copy, auto idx) {
return parser.parse_atom(copy, idx);
})
);
goto finish;
case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':
FAIL_IF(
parser.with_space_terminated_copy([&](auto copy, auto idx) {
return parser.parse_number(copy, idx, false);
})
);
goto finish;
case '-':
FAIL_IF(
parser.with_space_terminated_copy([&](auto copy, auto idx) {
return parser.parse_number(copy, idx, true);
})
);
goto finish;
default:
goto error;
}
//
// Object parser states
//
object_begin:
parser.advance_char();
switch (parser.c) {
case '"': {
FAIL_IF( parser.parse_string() );
goto object_key_state;
}
case '}':
goto scope_end; /* could also go to object_continue */
goto scope_end; // could also go to object_continue
default:
goto fail;
goto error;
}
object_key_state:
UPDATE_CHAR();
if (c != ':') {
goto fail;
}
UPDATE_CHAR();
switch (c) {
case '"': {
if (!parse_string(buf, len, pj, depth, idx)) {
goto fail;
}
break;
}
case 't':
if (!is_valid_true_atom(buf + idx)) {
goto fail;
}
pj.write_tape(0, c);
break;
case 'f':
if (!is_valid_false_atom(buf + idx)) {
goto fail;
}
pj.write_tape(0, c);
break;
case 'n':
if (!is_valid_null_atom(buf + idx)) {
goto fail;
}
pj.write_tape(0, c);
break;
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9': {
if (!parse_number(buf, pj, idx, false)) {
goto fail;
}
break;
}
case '-': {
if (!parse_number(buf, pj, idx, true)) {
goto fail;
}
break;
}
case '{': {
pj.containing_scope_offset[depth] = pj.get_current_loc();
pj.write_tape(0, c); /* here the compilers knows what c is so this gets
optimized */
/* we have not yet encountered } so we need to come back for it */
SET_GOTO_OBJECT_CONTINUE()
/* we found an object inside an object, so we need to increment the
* depth */
depth++;
if (depth >= pj.depth_capacity) {
goto fail;
}
FAIL_IF( parser.advance_char() != ':' );
goto object_begin;
}
case '[': {
pj.containing_scope_offset[depth] = pj.get_current_loc();
pj.write_tape(0, c); /* here the compilers knows what c is so this gets
optimized */
/* we have not yet encountered } so we need to come back for it */
SET_GOTO_OBJECT_CONTINUE()
/* we found an array inside an object, so we need to increment the depth
*/
depth++;
if (depth >= pj.depth_capacity) {
goto fail;
}
goto array_begin;
}
default:
goto fail;
}
parser.advance_char();
GOTO( parser.parse_value(addresses, addresses.object_continue) );
object_continue:
UPDATE_CHAR();
switch (c) {
switch (parser.advance_char()) {
case ',':
UPDATE_CHAR();
if (c != '"') {
goto fail;
} else {
if (!parse_string(buf, len, pj, depth, idx)) {
goto fail;
}
goto object_key_state;
}
FAIL_IF( parser.advance_char() != '"' );
FAIL_IF( parser.parse_string() );
goto object_key_state;
case '}':
goto scope_end;
default:
goto fail;
goto error;
}
/*//////////////////////////// COMMON STATE ///////////////////////////*/
scope_end:
/* write our tape location to the header scope */
depth--;
pj.write_tape(pj.containing_scope_offset[depth], c);
pj.annotate_previous_loc(pj.containing_scope_offset[depth],
pj.get_current_loc());
/* goto saved_state */
GOTO_CONTINUE()
CONTINUE( parser.pop_scope() );
/*//////////////////////////// ARRAY STATES ///////////////////////////*/
//
// Array parser states
//
array_begin:
UPDATE_CHAR();
if (c == ']') {
goto scope_end; /* could also go to array_continue */
if (parser.advance_char() == ']') {
goto scope_end; // could also go to array_continue
}
main_array_switch:
/* we call update char on all paths in, so we can peek at c on the
/* we call update char on all paths in, so we can peek at parser.c on the
* on paths that can accept a close square brace (post-, and at start) */
switch (c) {
case '"': {
if (!parse_string(buf, len, pj, depth, idx)) {
goto fail;
}
break;
}
case 't':
if (!is_valid_true_atom(buf + idx)) {
goto fail;
}
pj.write_tape(0, c);
break;
case 'f':
if (!is_valid_false_atom(buf + idx)) {
goto fail;
}
pj.write_tape(0, c);
break;
case 'n':
if (!is_valid_null_atom(buf + idx)) {
goto fail;
}
pj.write_tape(0, c);
break; /* goto array_continue; */
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9': {
if (!parse_number(buf, pj, idx, false)) {
goto fail;
}
break; /* goto array_continue; */
}
case '-': {
if (!parse_number(buf, pj, idx, true)) {
goto fail;
}
break; /* goto array_continue; */
}
case '{': {
/* we have not yet encountered ] so we need to come back for it */
pj.containing_scope_offset[depth] = pj.get_current_loc();
pj.write_tape(0, c); /* here the compilers knows what c is so this gets
optimized */
SET_GOTO_ARRAY_CONTINUE()
/* we found an object inside an array, so we need to increment the depth
*/
depth++;
if (depth >= pj.depth_capacity) {
goto fail;
}
goto object_begin;
}
case '[': {
/* we have not yet encountered ] so we need to come back for it */
pj.containing_scope_offset[depth] = pj.get_current_loc();
pj.write_tape(0, c); /* here the compilers knows what c is so this gets
optimized */
SET_GOTO_ARRAY_CONTINUE()
/* we found an array inside an array, so we need to increment the depth
*/
depth++;
if (depth >= pj.depth_capacity) {
goto fail;
}
goto array_begin;
}
default:
goto fail;
}
GOTO( parser.parse_value(addresses, addresses.array_continue) );
array_continue:
UPDATE_CHAR();
switch (c) {
switch (parser.advance_char()) {
case ',':
UPDATE_CHAR();
parser.advance_char();
goto main_array_switch;
case ']':
goto scope_end;
default:
goto fail;
goto error;
}
/*//////////////////////////// FINAL STATES ///////////////////////////*/
finish:
return parser.finish();
succeed:
depth--;
if (depth != 0) {
fprintf(stderr, "internal bug\n");
abort();
}
if (pj.containing_scope_offset[depth] != 0) {
fprintf(stderr, "internal bug\n");
abort();
}
pj.annotate_previous_loc(pj.containing_scope_offset[depth],
pj.get_current_loc());
pj.write_tape(pj.containing_scope_offset[depth], 'r'); /* r is root */
pj.valid = true;
pj.error_code = simdjson::SUCCESS;
return pj.error_code;
fail:
/* we do not need the next line because this is done by pj.init(),
* pessimistically.
* pj.is_valid = false;
* At this point in the code, we have all the time in the world.
* Note that we know exactly where we are in the document so we could,
* without any overhead on the processing code, report a specific
* location.
* We could even trigger special code paths to assess what happened
* carefully,
* all without any added cost. */
if (depth >= pj.depth_capacity) {
pj.error_code = simdjson::DEPTH_ERROR;
return pj.error_code;
}
switch (c) {
case '"':
pj.error_code = simdjson::STRING_ERROR;
return pj.error_code;
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
case '-':
pj.error_code = simdjson::NUMBER_ERROR;
return pj.error_code;
case 't':
pj.error_code = simdjson::T_ATOM_ERROR;
return pj.error_code;
case 'n':
pj.error_code = simdjson::N_ATOM_ERROR;
return pj.error_code;
case 'f':
pj.error_code = simdjson::F_ATOM_ERROR;
return pj.error_code;
default:
break;
}
pj.error_code = simdjson::TAPE_ERROR;
return pj.error_code;
error:
return parser.error();
}
} // namespace stage2

View File

@ -1,497 +1,161 @@
namespace stage2 {
// structural_parser variant used when several documents are parsed out of one
// buffer. It starts at a caller-supplied structural index and replaces
// start() and finish() (by name hiding, these are not virtual) with versions
// that skip the capacity check and can report SUCCESS_AND_HAS_MORE.
struct streaming_structural_parser: structural_parser {
  // _i is the structural index at which this document begins.
  really_inline streaming_structural_parser(const uint8_t *_buf, size_t _len, ParsedJson &_pj, size_t _i) : structural_parser(_buf, _len, _pj, _i) {}

  // override to add streaming
  //
  // Resets pj, reads the first structural character and opens the root scope,
  // which resumes at finish_parser once it closes. Returns SUCCESS (0) when
  // parsing can proceed. On depth overflow DEPTH_ERROR is returned and also
  // stored in pj.error_code via set_error_code(), so that pj.error_code agrees
  // with the returned status as it does on the other exit paths.
  WARN_UNUSED really_inline int start(ret_address finish_parser) {
    pj.init(); // sets is_valid to false
    // Capacity ain't no thang for streaming, so we don't check it.
    // Advance to the first character as soon as possible
    advance_char();
    // Push the root scope (there is always at least one scope)
    if (push_start_scope(finish_parser, 'r')) {
      return set_error_code(DEPTH_ERROR);
    }
    return SUCCESS;
  }

  // override to add streaming
  //
  // Final state of a successfully parsed document. Returns SUCCESS when this
  // was the last document (exactly one structural index remains),
  // SUCCESS_AND_HAS_MORE when further structural indexes follow, and
  // TAPE_ERROR on any inconsistency. The result is also stored in
  // pj.error_code.
  WARN_UNUSED really_inline int finish() {
    /* the string might not be NULL terminated. */
    if ( i + 1 > pj.n_structural_indexes ) {
      return set_error_code(TAPE_ERROR);
    }
    bool finished = i + 1 == pj.n_structural_indexes;
    // For the last document, require a NUL two bytes past the last structural
    // character we looked at.
    // NOTE(review): the +2 offset is kept as-is; confirm it matches how the
    // streaming caller terminates/pads the buffer.
    if (finished && buf[idx+2] != '\0') {
      return set_error_code(TAPE_ERROR);
    }
    pop_root_scope();
    // Closing the root must bring us back to depth 0 ...
    if (depth != 0) {
      return set_error_code(TAPE_ERROR);
    }
    // ... and the root scope must have started at tape location 0.
    if (pj.containing_scope_offset[depth] != 0) {
      return set_error_code(TAPE_ERROR);
    }
    pj.valid = true;
    return set_error_code(finished ? SUCCESS : SUCCESS_AND_HAS_MORE);
  }
};
/************
* The JSON is parsed to a tape, see the accompanying tape.md file
* for documentation.
***********/
WARN_UNUSED int
unified_machine(const uint8_t *buf, size_t len, ParsedJson &pj, size_t &next_json) {
size_t i{next_json}; /* index of the structural character (0,1,2,3...) */
size_t idx; /* location of the structural character in the input (buf) */
uint8_t c; /* used to track the (structural) character we are looking at,
updated */
/* by UPDATE_CHAR macro */
size_t depth = 0; /* could have an arbitrary starting depth */
pj.init(); /* sets is_valid to false */
/*//////////////////////////// START STATE /////////////////////////////
*/
SET_GOTO_START_CONTINUE()
pj.containing_scope_offset[depth] = pj.get_current_loc();
pj.write_tape(0, 'r'); /* r for root, 0 is going to get overwritten */
/* the root is used, if nothing else, to capture the size of the tape */
depth++; /* everything starts at depth = 1, depth = 0 is just for the
root, the root may contain an object, an array or something
else. */
if (depth >= pj.depth_capacity) {
goto fail;
}
static constexpr unified_machine_addresses addresses = INIT_ADDRESSES();
streaming_structural_parser parser(buf, len, pj, next_json);
int result = parser.start(addresses.finish);
if (result) { return result; }
UPDATE_CHAR();
switch (c) {
case '{':
pj.containing_scope_offset[depth] = pj.get_current_loc();
SET_GOTO_START_CONTINUE();
depth++;
if (depth >= pj.depth_capacity) {
goto fail;
}
pj.write_tape(
0, c); /* strangely, moving this to object_begin slows things down */
goto object_begin;
case '[':
pj.containing_scope_offset[depth] = pj.get_current_loc();
SET_GOTO_START_CONTINUE();
depth++;
if (depth >= pj.depth_capacity) {
goto fail;
}
pj.write_tape(0, c);
goto array_begin;
/* #define SIMDJSON_ALLOWANYTHINGINROOT
* A JSON text is a serialized value. Note that certain previous
* specifications of JSON constrained a JSON text to be an object or an
* array. Implementations that generate only objects or arrays where a
* JSON text is called for will be interoperable in the sense that all
* implementations will accept these as conforming JSON texts.
* https://tools.ietf.org/html/rfc8259
* #ifdef SIMDJSON_ALLOWANYTHINGINROOT */
case '"': {
if (!parse_string(buf, len, pj, depth, idx)) {
goto fail;
}
break;
}
case 't': {
/* we need to make a copy to make sure that the string is space
* terminated.
* this only applies to the JSON document made solely of the true value.
* this will almost never be called in practice */
char *copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING));
if (copy == nullptr) {
goto fail;
}
memcpy(copy, buf, len);
copy[len] = ' ';
if (!is_valid_true_atom(reinterpret_cast<const uint8_t *>(copy) + idx)) {
free(copy);
goto fail;
}
free(copy);
pj.write_tape(0, c);
break;
}
case 'f': {
/* we need to make a copy to make sure that the string is space
* terminated.
* this only applies to the JSON document made solely of the false
* value.
* this will almost never be called in practice */
char *copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING));
if (copy == nullptr) {
goto fail;
}
memcpy(copy, buf, len);
copy[len] = ' ';
if (!is_valid_false_atom(reinterpret_cast<const uint8_t *>(copy) + idx)) {
free(copy);
goto fail;
}
free(copy);
pj.write_tape(0, c);
break;
}
case 'n': {
/* we need to make a copy to make sure that the string is space
* terminated.
* this only applies to the JSON document made solely of the null value.
* this will almost never be called in practice */
char *copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING));
if (copy == nullptr) {
goto fail;
}
memcpy(copy, buf, len);
copy[len] = ' ';
if (!is_valid_null_atom(reinterpret_cast<const uint8_t *>(copy) + idx)) {
free(copy);
goto fail;
}
free(copy);
pj.write_tape(0, c);
break;
}
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9': {
/* we need to make a copy to make sure that the string is space
* terminated.
* this is done only for JSON documents made of a sole number
* this will almost never be called in practice. We terminate with a
* space
* because we do not want to allow NULLs in the middle of a number
* (whereas a
* space in the middle of a number would be identified in stage 1). */
char *copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING));
if (copy == nullptr) {
goto fail;
}
memcpy(copy, buf, len);
copy[len] = ' ';
if (!parse_number(reinterpret_cast<const uint8_t *>(copy), pj, idx,
false)) {
free(copy);
goto fail;
}
free(copy);
break;
}
case '-': {
/* we need to make a copy to make sure that the string is NULL
* terminated.
* this is done only for JSON documents made of a sole number
* this will almost never be called in practice */
char *copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING));
if (copy == nullptr) {
goto fail;
}
memcpy(copy, buf, len);
copy[len] = ' ';
if (!parse_number(reinterpret_cast<const uint8_t *>(copy), pj, idx, true)) {
free(copy);
goto fail;
}
free(copy);
break;
}
default:
goto fail;
}
start_continue:
/* the string might not be NULL terminated. */
if (i + 1 == pj.n_structural_indexes && buf[idx+2] == '\0') {
goto succeed;
} else if(depth == 1 && i<=pj.n_structural_indexes) {
goto succeedAndHasMore;
} else {
goto fail;
}
/*//////////////////////////// OBJECT STATES ///////////////////////////*/
//
// Read first value
//
switch (parser.c) {
case '{':
FAIL_IF( parser.push_start_scope(addresses.finish) );
goto object_begin;
case '[':
FAIL_IF( parser.push_start_scope(addresses.finish) );
goto array_begin;
case '"':
FAIL_IF( parser.parse_string() );
goto finish;
case 't': case 'f': case 'n':
FAIL_IF(
parser.with_space_terminated_copy([&](auto copy, auto idx) {
return parser.parse_atom(copy, idx);
})
);
goto finish;
case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':
FAIL_IF(
parser.with_space_terminated_copy([&](auto copy, auto idx) {
return parser.parse_number(copy, idx, false);
})
);
goto finish;
case '-':
FAIL_IF(
parser.with_space_terminated_copy([&](auto copy, auto idx) {
return parser.parse_number(copy, idx, true);
})
);
goto finish;
default:
goto error;
}
object_begin:
UPDATE_CHAR();
switch (c) {
case '"': {
if (!parse_string(buf, len, pj, depth, idx)) {
goto fail;
}
goto object_key_state;
}
case '}':
goto scope_end; /* could also go to object_continue */
default:
goto fail;
}
//
// Object parser parsers
//
object_begin:
parser.advance_char();
switch (parser.c) {
case '"': {
FAIL_IF( parser.parse_string() );
goto object_key_parser;
}
case '}':
goto scope_end; // could also go to object_continue
default:
goto error;
}
object_key_state:
UPDATE_CHAR();
if (c != ':') {
goto fail;
}
UPDATE_CHAR();
switch (c) {
case '"': {
if (!parse_string(buf, len, pj, depth, idx)) {
goto fail;
}
break;
}
case 't':
if (!is_valid_true_atom(buf + idx)) {
goto fail;
}
pj.write_tape(0, c);
break;
case 'f':
if (!is_valid_false_atom(buf + idx)) {
goto fail;
}
pj.write_tape(0, c);
break;
case 'n':
if (!is_valid_null_atom(buf + idx)) {
goto fail;
}
pj.write_tape(0, c);
break;
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9': {
if (!parse_number(buf, pj, idx, false)) {
goto fail;
}
break;
}
case '-': {
if (!parse_number(buf, pj, idx, true)) {
goto fail;
}
break;
}
case '{': {
pj.containing_scope_offset[depth] = pj.get_current_loc();
pj.write_tape(0, c); /* here the compilers knows what c is so this gets
optimized */
/* we have not yet encountered } so we need to come back for it */
SET_GOTO_OBJECT_CONTINUE()
/* we found an object inside an object, so we need to increment the
* depth */
depth++;
if (depth >= pj.depth_capacity) {
goto fail;
}
object_key_parser:
FAIL_IF( parser.advance_char() != ':' );
goto object_begin;
}
case '[': {
pj.containing_scope_offset[depth] = pj.get_current_loc();
pj.write_tape(0, c); /* here the compilers knows what c is so this gets
optimized */
/* we have not yet encountered } so we need to come back for it */
SET_GOTO_OBJECT_CONTINUE()
/* we found an array inside an object, so we need to increment the depth
*/
depth++;
if (depth >= pj.depth_capacity) {
goto fail;
}
goto array_begin;
}
default:
goto fail;
}
parser.advance_char();
GOTO( parser.parse_value(addresses, addresses.object_continue) );
object_continue:
UPDATE_CHAR();
switch (c) {
case ',':
UPDATE_CHAR();
if (c != '"') {
goto fail;
} else {
if (!parse_string(buf, len, pj, depth, idx)) {
goto fail;
}
goto object_key_state;
}
case '}':
goto scope_end;
default:
goto fail;
}
object_continue:
switch (parser.advance_char()) {
case ',':
FAIL_IF( parser.advance_char() != '"' );
FAIL_IF( parser.parse_string() );
goto object_key_parser;
case '}':
goto scope_end;
default:
goto error;
}
/*//////////////////////////// COMMON STATE ///////////////////////////*/
scope_end:
CONTINUE( parser.pop_scope() );
scope_end:
/* write our tape location to the header scope */
depth--;
pj.write_tape(pj.containing_scope_offset[depth], c);
pj.annotate_previous_loc(pj.containing_scope_offset[depth],
pj.get_current_loc());
/* goto saved_state */
GOTO_CONTINUE()
//
// Array parser parsers
//
array_begin:
if (parser.advance_char() == ']') {
goto scope_end; // could also go to array_continue
}
/*//////////////////////////// ARRAY STATES ///////////////////////////*/
array_begin:
UPDATE_CHAR();
if (c == ']') {
goto scope_end; /* could also go to array_continue */
}
main_array_switch:
/* we call update char on all paths in, so we can peek at parser.c on the
* on paths that can accept a close square brace (post-, and at start) */
GOTO( parser.parse_value(addresses, addresses.array_continue) );
main_array_switch:
/* we call update char on all paths in, so we can peek at c on the
* on paths that can accept a close square brace (post-, and at start) */
switch (c) {
case '"': {
if (!parse_string(buf, len, pj, depth, idx)) {
goto fail;
}
break;
}
case 't':
if (!is_valid_true_atom(buf + idx)) {
goto fail;
}
pj.write_tape(0, c);
break;
case 'f':
if (!is_valid_false_atom(buf + idx)) {
goto fail;
}
pj.write_tape(0, c);
break;
case 'n':
if (!is_valid_null_atom(buf + idx)) {
goto fail;
}
pj.write_tape(0, c);
break; /* goto array_continue; */
array_continue:
switch (parser.advance_char()) {
case ',':
parser.advance_char();
goto main_array_switch;
case ']':
goto scope_end;
default:
goto error;
}
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9': {
if (!parse_number(buf, pj, idx, false)) {
goto fail;
}
break; /* goto array_continue; */
}
case '-': {
if (!parse_number(buf, pj, idx, true)) {
goto fail;
}
break; /* goto array_continue; */
}
case '{': {
/* we have not yet encountered ] so we need to come back for it */
pj.containing_scope_offset[depth] = pj.get_current_loc();
pj.write_tape(0, c); /* here the compilers knows what c is so this gets
optimized */
SET_GOTO_ARRAY_CONTINUE()
/* we found an object inside an array, so we need to increment the depth
*/
depth++;
if (depth >= pj.depth_capacity) {
goto fail;
}
finish:
next_json = parser.i;
return parser.finish();
goto object_begin;
}
case '[': {
/* we have not yet encountered ] so we need to come back for it */
pj.containing_scope_offset[depth] = pj.get_current_loc();
pj.write_tape(0, c); /* here the compilers knows what c is so this gets
optimized */
SET_GOTO_ARRAY_CONTINUE()
/* we found an array inside an array, so we need to increment the depth
*/
depth++;
if (depth >= pj.depth_capacity) {
goto fail;
}
goto array_begin;
}
default:
goto fail;
}
array_continue:
UPDATE_CHAR();
switch (c) {
case ',':
UPDATE_CHAR();
goto main_array_switch;
case ']':
goto scope_end;
default:
goto fail;
}
/*//////////////////////////// FINAL STATES ///////////////////////////*/
succeedAndHasMore:
depth--;
if (pj.containing_scope_offset[depth] != 0) {
fprintf(stderr, "internal bug\n");
abort();
}
pj.annotate_previous_loc(pj.containing_scope_offset[depth],
pj.get_current_loc());
pj.write_tape(pj.containing_scope_offset[depth], 'r'); /* r is root */
next_json = i;
pj.valid = true;
pj.error_code = simdjson::SUCCESS_AND_HAS_MORE;
return pj.error_code;
succeed:
depth--;
if (depth != 0) {
fprintf(stderr, "internal bug\n");
abort();
}
if (pj.containing_scope_offset[depth] != 0) {
fprintf(stderr, "internal bug\n");
abort();
}
pj.annotate_previous_loc(pj.containing_scope_offset[depth],
pj.get_current_loc());
pj.write_tape(pj.containing_scope_offset[depth], 'r'); /* r is root */
pj.valid = true;
pj.error_code = simdjson::SUCCESS;
return pj.error_code;
fail:
/* we do not need the next line because this is done by pj.init(),
* pessimistically.
* pj.is_valid = false;
* At this point in the code, we have all the time in the world.
* Note that we know exactly where we are in the document so we could,
* without any overhead on the processing code, report a specific
* location.
* We could even trigger special code paths to assess what happened
* carefully,
* all without any added cost. */
if (depth >= pj.depth_capacity) {
pj.error_code = simdjson::DEPTH_ERROR;
return pj.error_code;
}
switch (c) {
case '"':
pj.error_code = simdjson::STRING_ERROR;
return pj.error_code;
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
case '-':
pj.error_code = simdjson::NUMBER_ERROR;
return pj.error_code;
case 't':
pj.error_code = simdjson::T_ATOM_ERROR;
return pj.error_code;
case 'n':
pj.error_code = simdjson::N_ATOM_ERROR;
return pj.error_code;
case 'f':
pj.error_code = simdjson::F_ATOM_ERROR;
return pj.error_code;
default:
break;
}
pj.error_code = simdjson::TAPE_ERROR;
return pj.error_code;
error:
return parser.error();
}
} // namespace stage2

View File

@ -24,13 +24,13 @@ namespace simdjson {
// Haswell specialization of the stage-2 entry point.
//
// This is a thin forwarder: it passes the input buffer, its length and the
// ParsedJson object straight to the Haswell state machine, which lives in the
// nested `haswell::stage2` namespace, and returns that function's int result
// unchanged.
template <>
WARN_UNUSED int
unified_machine<Architecture::HASWELL>(const uint8_t *buf, size_t len, ParsedJson &pj) {
  return haswell::stage2::unified_machine(buf, len, pj);
}
// Haswell specialization of the stage-2 entry point, streaming overload.
//
// Forwards every argument, including `next_json` by reference, to the Haswell
// state machine in the nested `haswell::stage2` namespace and returns its int
// result unchanged. `next_json` is not read or written here; any update to it
// happens inside the forwarded call.
template <>
WARN_UNUSED int
unified_machine<Architecture::HASWELL>(const uint8_t *buf, size_t len, ParsedJson &pj, size_t &next_json) {
  return haswell::stage2::unified_machine(buf, len, pj, next_json);
}
} // namespace simdjson

View File

@ -24,13 +24,13 @@ namespace simdjson {
// Westmere specialization of the stage-2 entry point.
//
// This is a thin forwarder: it passes the input buffer, its length and the
// ParsedJson object straight to the Westmere state machine, which lives in
// the nested `westmere::stage2` namespace, and returns that function's int
// result unchanged.
template <>
WARN_UNUSED int
unified_machine<Architecture::WESTMERE>(const uint8_t *buf, size_t len, ParsedJson &pj) {
  return westmere::stage2::unified_machine(buf, len, pj);
}
// Westmere specialization of the stage-2 entry point, streaming overload.
//
// Forwards every argument, including `next_json` by reference, to the
// Westmere state machine in the nested `westmere::stage2` namespace and
// returns its int result unchanged. `next_json` is not read or written here;
// any update to it happens inside the forwarded call.
template <>
WARN_UNUSED int
unified_machine<Architecture::WESTMERE>(const uint8_t *buf, size_t len, ParsedJson &pj, size_t &next_json) {
  return westmere::stage2::unified_machine(buf, len, pj, next_json);
}