Merge pull request #201 from lemire/Multiple_implementation_refactoring_stage2
Stage2 refactored to simplify multiple implementations
This commit is contained in:
commit
477b058f74
|
@ -157,7 +157,8 @@ int main(int argc, char *argv[]) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
unified.start();
|
unified.start();
|
||||||
isok = isok && (simdjson::SUCCESS == unified_machine(p.data(), p.size(), pj));
|
// The default template is simdjson::instruction_set::native.
|
||||||
|
isok = isok && (simdjson::SUCCESS == simdjson::unified_machine<>(p.data(), p.size(), pj));
|
||||||
unified.end(results);
|
unified.end(results);
|
||||||
cy2 += results[0];
|
cy2 += results[0];
|
||||||
cl2 += results[1];
|
cl2 += results[1];
|
||||||
|
@ -188,7 +189,7 @@ int main(int argc, char *argv[]) {
|
||||||
auto start = std::chrono::steady_clock::now();
|
auto start = std::chrono::steady_clock::now();
|
||||||
// The default template is simdjson::instruction_set::native.
|
// The default template is simdjson::instruction_set::native.
|
||||||
isok = (simdjson::find_structural_bits<>(p.data(), p.size(), pj) == simdjson::SUCCESS);
|
isok = (simdjson::find_structural_bits<>(p.data(), p.size(), pj) == simdjson::SUCCESS);
|
||||||
isok = isok && (simdjson::SUCCESS == unified_machine(p.data(), p.size(), pj));
|
isok = isok && (simdjson::SUCCESS == simdjson::unified_machine<>(p.data(), p.size(), pj));
|
||||||
auto end = std::chrono::steady_clock::now();
|
auto end = std::chrono::steady_clock::now();
|
||||||
std::chrono::duration<double> secs = end - start;
|
std::chrono::duration<double> secs = end - start;
|
||||||
res[i] = secs.count();
|
res[i] = secs.count();
|
||||||
|
|
|
@ -26,7 +26,6 @@ using json_parse_functype = int (const uint8_t *buf, size_t len, ParsedJson &pj,
|
||||||
// Pointer that holds the json_parse implementation corresponding to the available SIMD instruction set
|
// Pointer that holds the json_parse implementation corresponding to the available SIMD instruction set
|
||||||
extern json_parse_functype *json_parse_ptr;
|
extern json_parse_functype *json_parse_ptr;
|
||||||
|
|
||||||
|
|
||||||
// json_parse_implementation is the generic function, it is specialized for various
|
// json_parse_implementation is the generic function, it is specialized for various
|
||||||
// SIMD instruction sets, e.g., as json_parse_implementation<instruction_set::avx2>
|
// SIMD instruction sets, e.g., as json_parse_implementation<instruction_set::avx2>
|
||||||
// or json_parse_implementation<instruction_set::neon>
|
// or json_parse_implementation<instruction_set::neon>
|
||||||
|
@ -69,7 +68,7 @@ int json_parse_implementation(const uint8_t *buf, size_t len, ParsedJson &pj, bo
|
||||||
pj.errorcode = stage1_is_ok;
|
pj.errorcode = stage1_is_ok;
|
||||||
return pj.errorcode;
|
return pj.errorcode;
|
||||||
}
|
}
|
||||||
int res = unified_machine(buf, len, pj);
|
int res = unified_machine<T>(buf, len, pj);
|
||||||
if(reallocated) { aligned_free((void*)buf);}
|
if(reallocated) { aligned_free((void*)buf);}
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
|
@ -6,6 +6,12 @@
|
||||||
#include "simdjson/parsedjson.h"
|
#include "simdjson/parsedjson.h"
|
||||||
#include "simdjson/portability.h"
|
#include "simdjson/portability.h"
|
||||||
|
|
||||||
|
#ifdef JSON_TEST_NUMBERS // for unit testing
|
||||||
|
void foundInvalidNumber(const uint8_t *buf);
|
||||||
|
void foundInteger(int64_t result, const uint8_t *buf);
|
||||||
|
void foundFloat(double result, const uint8_t *buf);
|
||||||
|
#endif
|
||||||
|
|
||||||
namespace simdjson {
|
namespace simdjson {
|
||||||
// Allowable floating-point values range from std::numeric_limits<double>::lowest()
|
// Allowable floating-point values range from std::numeric_limits<double>::lowest()
|
||||||
// to std::numeric_limits<double>::max(), so from
|
// to std::numeric_limits<double>::max(), so from
|
||||||
|
@ -376,9 +382,6 @@ static never_inline bool parse_large_integer(const uint8_t *const buf,
|
||||||
return is_structural_or_whitespace(*p);
|
return is_structural_or_whitespace(*p);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
// parse the number at buf + offset
|
// parse the number at buf + offset
|
||||||
// define JSON_TEST_NUMBERS for unit testing
|
// define JSON_TEST_NUMBERS for unit testing
|
||||||
//
|
//
|
||||||
|
|
|
@ -33,11 +33,11 @@
|
||||||
#define TRANSPOSE
|
#define TRANSPOSE
|
||||||
|
|
||||||
namespace simdjson {
|
namespace simdjson {
|
||||||
template<simdjson::instruction_set>
|
template<instruction_set>
|
||||||
struct simd_input;
|
struct simd_input;
|
||||||
#ifdef __AVX2__
|
#ifdef __AVX2__
|
||||||
template<>
|
template<>
|
||||||
struct simd_input<simdjson::instruction_set::avx2>
|
struct simd_input<instruction_set::avx2>
|
||||||
{
|
{
|
||||||
__m256i lo;
|
__m256i lo;
|
||||||
__m256i hi;
|
__m256i hi;
|
||||||
|
@ -45,7 +45,7 @@ struct simd_input<simdjson::instruction_set::avx2>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef __ARM_NEON
|
#ifdef __ARM_NEON
|
||||||
template<> struct simd_input<simdjson::instruction_set::neon>
|
template<> struct simd_input<instruction_set::neon>
|
||||||
{
|
{
|
||||||
#ifndef TRANSPOSE
|
#ifndef TRANSPOSE
|
||||||
uint8x16_t i0;
|
uint8x16_t i0;
|
||||||
|
@ -111,7 +111,7 @@ uint64_t neonmovemask_bulk(uint8x16_t p0, uint8x16_t p1, uint8x16_t p2, uint8x16
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
template<simdjson::instruction_set T>
|
template<instruction_set T>
|
||||||
uint64_t compute_quote_mask(uint64_t quote_bits);
|
uint64_t compute_quote_mask(uint64_t quote_bits);
|
||||||
|
|
||||||
// In practice, if you have NEON or __PCLMUL__, you would
|
// In practice, if you have NEON or __PCLMUL__, you would
|
||||||
|
@ -121,7 +121,7 @@ uint64_t compute_quote_mask(uint64_t quote_bits);
|
||||||
// Also: we don't know of an instance where AVX2 is supported but
|
// Also: we don't know of an instance where AVX2 is supported but
|
||||||
// where clmul is not supported, so check for both, to be sure.
|
// where clmul is not supported, so check for both, to be sure.
|
||||||
#ifdef SIMDJSON_AVOID_CLMUL
|
#ifdef SIMDJSON_AVOID_CLMUL
|
||||||
template<simdjson::instruction_set T> really_inline
|
template<instruction_set T> really_inline
|
||||||
uint64_t compute_quote_mask(uint64_t quote_bits)
|
uint64_t compute_quote_mask(uint64_t quote_bits)
|
||||||
{
|
{
|
||||||
uint64_t quote_mask = quote_bits ^ (quote_bits << 1);
|
uint64_t quote_mask = quote_bits ^ (quote_bits << 1);
|
||||||
|
@ -133,12 +133,12 @@ uint64_t compute_quote_mask(uint64_t quote_bits)
|
||||||
return quote_mask;
|
return quote_mask;
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
template<simdjson::instruction_set>
|
template<instruction_set>
|
||||||
uint64_t compute_quote_mask(uint64_t quote_bits);
|
uint64_t compute_quote_mask(uint64_t quote_bits);
|
||||||
|
|
||||||
#ifdef __AVX2__
|
#ifdef __AVX2__
|
||||||
template<> really_inline
|
template<> really_inline
|
||||||
uint64_t compute_quote_mask<simdjson::instruction_set::avx2>(uint64_t quote_bits) {
|
uint64_t compute_quote_mask<instruction_set::avx2>(uint64_t quote_bits) {
|
||||||
uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128(
|
uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128(
|
||||||
_mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFF), 0));
|
_mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFF), 0));
|
||||||
return quote_mask;
|
return quote_mask;
|
||||||
|
@ -147,7 +147,7 @@ uint64_t compute_quote_mask<simdjson::instruction_set::avx2>(uint64_t quote_bits
|
||||||
|
|
||||||
#ifdef __ARM_NEON
|
#ifdef __ARM_NEON
|
||||||
template<> really_inline
|
template<> really_inline
|
||||||
uint64_t compute_quote_mask<simdjson::instruction_set::neon>(uint64_t quote_bits) {
|
uint64_t compute_quote_mask<instruction_set::neon>(uint64_t quote_bits) {
|
||||||
#ifdef __PCLMUL__ // Might cause problems on runtime dispatch
|
#ifdef __PCLMUL__ // Might cause problems on runtime dispatch
|
||||||
uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128(
|
uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128(
|
||||||
_mm_set_epi64x(0ULL, quote_bits),
|
_mm_set_epi64x(0ULL, quote_bits),
|
||||||
|
@ -161,7 +161,7 @@ uint64_t compute_quote_mask<simdjson::instruction_set::neon>(uint64_t quote_bits
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef SIMDJSON_UTF8VALIDATE
|
#ifdef SIMDJSON_UTF8VALIDATE
|
||||||
template<simdjson::instruction_set T>really_inline
|
template<instruction_set T>really_inline
|
||||||
void check_utf8(simd_input<T> in,
|
void check_utf8(simd_input<T> in,
|
||||||
__m256i &has_error,
|
__m256i &has_error,
|
||||||
struct avx_processed_utf_bytes &previous) {
|
struct avx_processed_utf_bytes &previous) {
|
||||||
|
@ -182,13 +182,13 @@ void check_utf8(simd_input<T> in,
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
template<simdjson::instruction_set T>
|
template<instruction_set T>
|
||||||
simd_input<T> fill_input(const uint8_t * ptr);
|
simd_input<T> fill_input(const uint8_t * ptr);
|
||||||
|
|
||||||
#ifdef __AVX2__
|
#ifdef __AVX2__
|
||||||
template<> really_inline
|
template<> really_inline
|
||||||
simd_input<simdjson::instruction_set::avx2> fill_input<simdjson::instruction_set::avx2>(const uint8_t * ptr) {
|
simd_input<instruction_set::avx2> fill_input<instruction_set::avx2>(const uint8_t * ptr) {
|
||||||
struct simd_input<simdjson::instruction_set::avx2> in;
|
struct simd_input<instruction_set::avx2> in;
|
||||||
in.lo = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr + 0));
|
in.lo = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr + 0));
|
||||||
in.hi = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr + 32));
|
in.hi = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr + 32));
|
||||||
return in;
|
return in;
|
||||||
|
@ -197,8 +197,8 @@ simd_input<simdjson::instruction_set::avx2> fill_input<simdjson::instruction_set
|
||||||
|
|
||||||
#ifdef __ARM_NEON
|
#ifdef __ARM_NEON
|
||||||
template<> really_inline
|
template<> really_inline
|
||||||
simd_input<simdjson::instruction_set::neon> fill_input<simdjson::instruction_set::neon>(const uint8_t * ptr) {
|
simd_input<instruction_set::neon> fill_input<instruction_set::neon>(const uint8_t * ptr) {
|
||||||
struct simd_input<simdjson::instruction_set::neon> in;
|
struct simd_input<instruction_set::neon> in;
|
||||||
#ifndef TRANSPOSE
|
#ifndef TRANSPOSE
|
||||||
in.i0 = vld1q_u8(ptr + 0);
|
in.i0 = vld1q_u8(ptr + 0);
|
||||||
in.i1 = vld1q_u8(ptr + 16);
|
in.i1 = vld1q_u8(ptr + 16);
|
||||||
|
@ -213,12 +213,12 @@ simd_input<simdjson::instruction_set::neon> fill_input<simdjson::instruction_set
|
||||||
|
|
||||||
// a straightforward comparison of a mask against input. 5 uops; would be
|
// a straightforward comparison of a mask against input. 5 uops; would be
|
||||||
// cheaper in AVX512.
|
// cheaper in AVX512.
|
||||||
template<simdjson::instruction_set T>
|
template<instruction_set T>
|
||||||
uint64_t cmp_mask_against_input(simd_input<T> in, uint8_t m);
|
uint64_t cmp_mask_against_input(simd_input<T> in, uint8_t m);
|
||||||
|
|
||||||
#ifdef __AVX2__
|
#ifdef __AVX2__
|
||||||
template<> really_inline
|
template<> really_inline
|
||||||
uint64_t cmp_mask_against_input<simdjson::instruction_set::avx2>(simd_input<simdjson::instruction_set::avx2> in, uint8_t m) {
|
uint64_t cmp_mask_against_input<instruction_set::avx2>(simd_input<instruction_set::avx2> in, uint8_t m) {
|
||||||
|
|
||||||
const __m256i mask = _mm256_set1_epi8(m);
|
const __m256i mask = _mm256_set1_epi8(m);
|
||||||
__m256i cmp_res_0 = _mm256_cmpeq_epi8(in.lo, mask);
|
__m256i cmp_res_0 = _mm256_cmpeq_epi8(in.lo, mask);
|
||||||
|
@ -231,7 +231,7 @@ uint64_t cmp_mask_against_input<simdjson::instruction_set::avx2>(simd_input<simd
|
||||||
|
|
||||||
#ifdef __ARM_NEON
|
#ifdef __ARM_NEON
|
||||||
template<> really_inline
|
template<> really_inline
|
||||||
uint64_t cmp_mask_against_input<simdjson::instruction_set::neon>(simd_input<simdjson::instruction_set::neon> in, uint8_t m) {
|
uint64_t cmp_mask_against_input<instruction_set::neon>(simd_input<instruction_set::neon> in, uint8_t m) {
|
||||||
const uint8x16_t mask = vmovq_n_u8(m);
|
const uint8x16_t mask = vmovq_n_u8(m);
|
||||||
uint8x16_t cmp_res_0 = vceqq_u8(in.i.val[0], mask);
|
uint8x16_t cmp_res_0 = vceqq_u8(in.i.val[0], mask);
|
||||||
uint8x16_t cmp_res_1 = vceqq_u8(in.i.val[1], mask);
|
uint8x16_t cmp_res_1 = vceqq_u8(in.i.val[1], mask);
|
||||||
|
@ -242,12 +242,12 @@ uint64_t cmp_mask_against_input<simdjson::instruction_set::neon>(simd_input<simd
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// find all values less than or equal to the content of maxval (using unsigned arithmetic)
|
// find all values less than or equal to the content of maxval (using unsigned arithmetic)
|
||||||
template<simdjson::instruction_set T>
|
template<instruction_set T>
|
||||||
uint64_t unsigned_lteq_against_input(simd_input<T> in, uint8_t m);
|
uint64_t unsigned_lteq_against_input(simd_input<T> in, uint8_t m);
|
||||||
|
|
||||||
#ifdef __AVX2__
|
#ifdef __AVX2__
|
||||||
template<> really_inline
|
template<> really_inline
|
||||||
uint64_t unsigned_lteq_against_input<simdjson::instruction_set::avx2>(simd_input<simdjson::instruction_set::avx2> in, uint8_t m) {
|
uint64_t unsigned_lteq_against_input<instruction_set::avx2>(simd_input<instruction_set::avx2> in, uint8_t m) {
|
||||||
const __m256i maxval = _mm256_set1_epi8(m);
|
const __m256i maxval = _mm256_set1_epi8(m);
|
||||||
__m256i cmp_res_0 = _mm256_cmpeq_epi8(_mm256_max_epu8(maxval,in.lo),maxval);
|
__m256i cmp_res_0 = _mm256_cmpeq_epi8(_mm256_max_epu8(maxval,in.lo),maxval);
|
||||||
uint64_t res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(cmp_res_0));
|
uint64_t res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(cmp_res_0));
|
||||||
|
@ -259,7 +259,7 @@ uint64_t unsigned_lteq_against_input<simdjson::instruction_set::avx2>(simd_input
|
||||||
|
|
||||||
#ifdef __ARM_NEON
|
#ifdef __ARM_NEON
|
||||||
template<> really_inline
|
template<> really_inline
|
||||||
uint64_t unsigned_lteq_against_input<simdjson::instruction_set::neon>(simd_input<simdjson::instruction_set::neon> in, uint8_t m) {
|
uint64_t unsigned_lteq_against_input<instruction_set::neon>(simd_input<instruction_set::neon> in, uint8_t m) {
|
||||||
const uint8x16_t mask = vmovq_n_u8(m);
|
const uint8x16_t mask = vmovq_n_u8(m);
|
||||||
uint8x16_t cmp_res_0 = vcleq_u8(in.i.val[0], mask);
|
uint8x16_t cmp_res_0 = vcleq_u8(in.i.val[0], mask);
|
||||||
uint8x16_t cmp_res_1 = vcleq_u8(in.i.val[1], mask);
|
uint8x16_t cmp_res_1 = vcleq_u8(in.i.val[1], mask);
|
||||||
|
@ -278,7 +278,7 @@ uint64_t unsigned_lteq_against_input<simdjson::instruction_set::neon>(simd_input
|
||||||
// indicate whether we end an iteration on an odd-length sequence of
|
// indicate whether we end an iteration on an odd-length sequence of
|
||||||
// backslashes, which modifies our subsequent search for odd-length
|
// backslashes, which modifies our subsequent search for odd-length
|
||||||
// sequences of backslashes in an obvious way.
|
// sequences of backslashes in an obvious way.
|
||||||
template<simdjson::instruction_set T> really_inline
|
template<instruction_set T> really_inline
|
||||||
uint64_t find_odd_backslash_sequences(simd_input<T> in, uint64_t &prev_iter_ends_odd_backslash) {
|
uint64_t find_odd_backslash_sequences(simd_input<T> in, uint64_t &prev_iter_ends_odd_backslash) {
|
||||||
const uint64_t even_bits = 0x5555555555555555ULL;
|
const uint64_t even_bits = 0x5555555555555555ULL;
|
||||||
const uint64_t odd_bits = ~even_bits;
|
const uint64_t odd_bits = ~even_bits;
|
||||||
|
@ -323,7 +323,7 @@ uint64_t find_odd_backslash_sequences(simd_input<T> in, uint64_t &prev_iter_ends
|
||||||
// Note that we don't do any error checking to see if we have backslash
|
// Note that we don't do any error checking to see if we have backslash
|
||||||
// sequences outside quotes; these
|
// sequences outside quotes; these
|
||||||
// backslash sequences (of any length) will be detected elsewhere.
|
// backslash sequences (of any length) will be detected elsewhere.
|
||||||
template<simdjson::instruction_set T> really_inline
|
template<instruction_set T> really_inline
|
||||||
uint64_t find_quote_mask_and_bits(simd_input<T> in, uint64_t odd_ends,
|
uint64_t find_quote_mask_and_bits(simd_input<T> in, uint64_t odd_ends,
|
||||||
uint64_t &prev_iter_inside_quote, uint64_t &quote_bits, uint64_t &error_mask) {
|
uint64_t &prev_iter_inside_quote, uint64_t &quote_bits, uint64_t &error_mask) {
|
||||||
quote_bits = cmp_mask_against_input<T>(in, '"');
|
quote_bits = cmp_mask_against_input<T>(in, '"');
|
||||||
|
@ -352,14 +352,14 @@ uint64_t find_quote_mask_and_bits(simd_input<T> in, uint64_t odd_ends,
|
||||||
// we are also interested in the four whitespace characters
|
// we are also interested in the four whitespace characters
|
||||||
// space 0x20, linefeed 0x0a, horizontal tab 0x09 and carriage return 0x0d
|
// space 0x20, linefeed 0x0a, horizontal tab 0x09 and carriage return 0x0d
|
||||||
// these go into the next 2 buckets of the comparison (8/16)
|
// these go into the next 2 buckets of the comparison (8/16)
|
||||||
template<simdjson::instruction_set T>
|
template<instruction_set T>
|
||||||
void find_whitespace_and_structurals(simd_input<T> in,
|
void find_whitespace_and_structurals(simd_input<T> in,
|
||||||
uint64_t &whitespace,
|
uint64_t &whitespace,
|
||||||
uint64_t &structurals);
|
uint64_t &structurals);
|
||||||
|
|
||||||
#ifdef __AVX2__
|
#ifdef __AVX2__
|
||||||
template<> really_inline
|
template<> really_inline
|
||||||
void find_whitespace_and_structurals<simdjson::instruction_set::avx2>(simd_input<simdjson::instruction_set::avx2> in,
|
void find_whitespace_and_structurals<instruction_set::avx2>(simd_input<instruction_set::avx2> in,
|
||||||
uint64_t &whitespace,
|
uint64_t &whitespace,
|
||||||
uint64_t &structurals) {
|
uint64_t &structurals) {
|
||||||
#ifdef SIMDJSON_NAIVE_STRUCTURAL
|
#ifdef SIMDJSON_NAIVE_STRUCTURAL
|
||||||
|
@ -451,8 +451,8 @@ void find_whitespace_and_structurals<simdjson::instruction_set::avx2>(simd_input
|
||||||
|
|
||||||
#ifdef __ARM_NEON
|
#ifdef __ARM_NEON
|
||||||
template<> really_inline
|
template<> really_inline
|
||||||
void find_whitespace_and_structurals<simdjson::instruction_set::neon>(
|
void find_whitespace_and_structurals<instruction_set::neon>(
|
||||||
simd_input<simdjson::instruction_set::neon> in,
|
simd_input<instruction_set::neon> in,
|
||||||
uint64_t &whitespace,
|
uint64_t &whitespace,
|
||||||
uint64_t &structurals) {
|
uint64_t &structurals) {
|
||||||
#ifndef FUNKY_BAD_TABLE
|
#ifndef FUNKY_BAD_TABLE
|
||||||
|
@ -698,7 +698,7 @@ really_inline uint64_t finalize_structurals(
|
||||||
return structurals;
|
return structurals;
|
||||||
}
|
}
|
||||||
|
|
||||||
template<simdjson::instruction_set T = simdjson::instruction_set::native>
|
template<instruction_set T = instruction_set::native>
|
||||||
WARN_UNUSED
|
WARN_UNUSED
|
||||||
/*never_inline*/ int find_structural_bits(const uint8_t *buf, size_t len,
|
/*never_inline*/ int find_structural_bits(const uint8_t *buf, size_t len,
|
||||||
ParsedJson &pj) {
|
ParsedJson &pj) {
|
||||||
|
@ -849,7 +849,7 @@ WARN_UNUSED
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
template<simdjson::instruction_set T = simdjson::instruction_set::native>
|
template<instruction_set T = instruction_set::native>
|
||||||
WARN_UNUSED
|
WARN_UNUSED
|
||||||
int find_structural_bits(const char *buf, size_t len, ParsedJson &pj) {
|
int find_structural_bits(const char *buf, size_t len, ParsedJson &pj) {
|
||||||
return find_structural_bits<T>(reinterpret_cast<const uint8_t *>(buf), len, pj);
|
return find_structural_bits<T>(reinterpret_cast<const uint8_t *>(buf), len, pj);
|
||||||
|
|
|
@ -1,18 +1,588 @@
|
||||||
#ifndef SIMDJSON_STAGE2_BUILD_TAPE_H
|
#ifndef SIMDJSON_STAGE2_BUILD_TAPE_H
|
||||||
#define SIMDJSON_STAGE2_BUILD_TAPE_H
|
#define SIMDJSON_STAGE2_BUILD_TAPE_H
|
||||||
|
|
||||||
|
#include <cassert>
|
||||||
|
#include <cstring>
|
||||||
|
#include <iostream>
|
||||||
|
|
||||||
#include "simdjson/common_defs.h"
|
#include "simdjson/common_defs.h"
|
||||||
|
#include "simdjson/jsoncharutils.h"
|
||||||
|
#include "simdjson/numberparsing.h"
|
||||||
|
#include "simdjson/parsedjson.h"
|
||||||
|
#include "simdjson/stringparsing.h"
|
||||||
|
#include "simdjson/simdjson.h"
|
||||||
|
|
||||||
|
#define PATH_SEP '/'
|
||||||
|
|
||||||
namespace simdjson {
|
namespace simdjson {
|
||||||
struct ParsedJson;
|
|
||||||
|
|
||||||
void init_state_machine();
|
void init_state_machine();
|
||||||
|
|
||||||
WARN_UNUSED
|
WARN_UNUSED
|
||||||
int unified_machine(const uint8_t *buf, size_t len, ParsedJson &pj);
|
// Returns true when the four bytes at loc spell "true" and the byte that
// follows them (loc[4]) is accepted by is_not_structural_or_whitespace(),
// i.e. that helper reports 0 for it.
//
// loc[0] through loc[4] are read, so all five bytes must be readable.
really_inline bool is_valid_true_atom(const uint8_t *loc) {
  // Build the reference word with memcpy instead of dereferencing a
  // reinterpret_cast'ed string literal: a literal carries no 8-byte alignment
  // guarantee and need not be eight bytes long, so a 64-bit load through it
  // is undefined behaviour.
  uint32_t expected;
  std::memcpy(&expected, "true", sizeof(expected));
  // Load exactly the four atom bytes. memcpy keeps the load legal whatever
  // the alignment of loc is, and because both words are assembled the same
  // way the comparison does not depend on the host byte order.
  uint32_t found;
  std::memcpy(&found, loc, sizeof(found));
  // Branch-free accumulation: any differing bit, or a bad terminator, makes
  // the result non-zero.
  uint32_t error = found ^ expected;
  error |= is_not_structural_or_whitespace(loc[4]);
  return error == 0;
}
|
||||||
|
|
||||||
WARN_UNUSED
|
WARN_UNUSED
|
||||||
int unified_machine(const char *buf, size_t len, ParsedJson &pj);
|
// Returns true when the five bytes at loc spell "false" and the byte that
// follows them (loc[5]) is accepted by is_not_structural_or_whitespace(),
// i.e. that helper reports 0 for it.
really_inline bool is_valid_false_atom(const uint8_t *loc) {
  // "false" packed into an integer with 'f' in the least significant byte
  // (little-endian layout) and the three unused high bytes zeroed. An integer
  // constant is used on purpose instead of casting a string literal.
  const uint64_t false_word = 0x00000065736c6166;
  // Selects the five low-order bytes of the loaded word.
  const uint64_t low_five_bytes = 0x000000ffffffffff;
  // The 8-byte load below may reach up to 7 bytes past the end of the
  // document; the required SIMDJSON_PADDING makes that safe.
  static_assert(sizeof(uint64_t) - 1 <= SIMDJSON_PADDING);
  // memcpy rather than a pointer cast: an unaligned 64-bit load through a
  // cast pointer is undefined in C/C++.
  uint64_t loaded;
  std::memcpy(&loaded, loc, sizeof(uint64_t));
  // The accumulator has to be 64 bits wide; a 32-bit one would silently drop
  // the fifth character of "false".
  uint64_t mismatch = (loaded & low_five_bytes) ^ false_word;
  mismatch |= is_not_structural_or_whitespace(loc[5]);
  return mismatch == 0;
}
|
||||||
|
|
||||||
|
// Returns true when the four bytes at loc spell "null" and the byte that
// follows them (loc[4]) is accepted by is_not_structural_or_whitespace(),
// i.e. that helper reports 0 for it.
//
// loc[0] through loc[4] are read, so all five bytes must be readable.
WARN_UNUSED
really_inline bool is_valid_null_atom(const uint8_t *loc) {
  // Build the reference word with memcpy instead of dereferencing a
  // reinterpret_cast'ed string literal: a literal carries no 8-byte alignment
  // guarantee and need not be eight bytes long, so a 64-bit load through it
  // is undefined behaviour.
  uint32_t expected;
  std::memcpy(&expected, "null", sizeof(expected));
  // Load exactly the four atom bytes. memcpy keeps the load legal whatever
  // the alignment of loc is, and because both words are assembled the same
  // way the comparison does not depend on the host byte order.
  uint32_t found;
  std::memcpy(&found, loc, sizeof(found));
  // Branch-free accumulation: any differing bit, or a bad terminator, makes
  // the result non-zero.
  uint32_t error = found ^ expected;
  error |= is_not_structural_or_whitespace(loc[4]);
  return error == 0;
}
|
||||||
|
|
||||||
|
|
||||||
|
/************
|
||||||
|
* The JSON is parsed to a tape, see the accompanying tape.md file
|
||||||
|
* for documentation.
|
||||||
|
***********/
|
||||||
|
template<instruction_set T = instruction_set::native>
|
||||||
|
WARN_UNUSED ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER LENIENT_MEM_SANITIZER
|
||||||
|
int unified_machine(const uint8_t *buf, size_t len, ParsedJson &pj) {
|
||||||
|
#ifndef ALLOW_SAME_PAGE_BUFFER_OVERRUN
|
||||||
|
memset((uint8_t*)buf + len, 0, SIMDJSON_PADDING); // to please valgrind
|
||||||
|
#endif
|
||||||
|
uint32_t i = 0; // index of the structural character (0,1,2,3...)
|
||||||
|
uint32_t idx; // location of the structural character in the input (buf)
|
||||||
|
uint8_t c; // used to track the (structural) character we are looking at, updated
|
||||||
|
// by UPDATE_CHAR macro
|
||||||
|
uint32_t depth = 0; // could have an arbitrary starting depth
|
||||||
|
pj.init(); // sets isvalid to false
|
||||||
|
if(pj.bytecapacity < len) {
|
||||||
|
pj.errorcode = simdjson::CAPACITY;
|
||||||
|
return pj.errorcode;
|
||||||
|
}
|
||||||
|
// this macro reads the next structural character, updating idx, i and c.
|
||||||
|
#define UPDATE_CHAR() \
|
||||||
|
{ \
|
||||||
|
idx = pj.structural_indexes[i++]; \
|
||||||
|
c = buf[idx]; \
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
////////////////////////////// START STATE /////////////////////////////
|
||||||
|
#ifdef SIMDJSON_USE_COMPUTED_GOTO
|
||||||
|
pj.ret_address[depth] = &&start_continue;
|
||||||
|
#else
|
||||||
|
pj.ret_address[depth] = 's';
|
||||||
|
#endif
|
||||||
|
pj.containing_scope_offset[depth] = pj.get_current_loc();
|
||||||
|
pj.write_tape(0, 'r'); // r for root, 0 is going to get overwritten
|
||||||
|
// the root is used, if nothing else, to capture the size of the tape
|
||||||
|
depth++; // everything starts at depth = 1, depth = 0 is just for the root, the root may contain an object, an array or something else.
|
||||||
|
if (depth >= pj.depthcapacity) {
|
||||||
|
goto fail;
|
||||||
|
}
|
||||||
|
|
||||||
|
UPDATE_CHAR();
|
||||||
|
switch (c) {
|
||||||
|
case '{':
|
||||||
|
pj.containing_scope_offset[depth] = pj.get_current_loc();
|
||||||
|
#ifdef SIMDJSON_USE_COMPUTED_GOTO
|
||||||
|
pj.ret_address[depth] = &&start_continue;
|
||||||
|
#else
|
||||||
|
pj.ret_address[depth] = 's';
|
||||||
|
#endif
|
||||||
|
depth++;
|
||||||
|
if (depth >= pj.depthcapacity) {
|
||||||
|
goto fail;
|
||||||
|
}
|
||||||
|
pj.write_tape(0, c); // strangely, moving this to object_begin slows things down
|
||||||
|
goto object_begin;
|
||||||
|
case '[':
|
||||||
|
pj.containing_scope_offset[depth] = pj.get_current_loc();
|
||||||
|
#ifdef SIMDJSON_USE_COMPUTED_GOTO
|
||||||
|
pj.ret_address[depth] = &&start_continue;
|
||||||
|
#else
|
||||||
|
pj.ret_address[depth] = 's';
|
||||||
|
#endif
|
||||||
|
depth++;
|
||||||
|
if (depth >= pj.depthcapacity) {
|
||||||
|
goto fail;
|
||||||
|
}
|
||||||
|
pj.write_tape(0, c);
|
||||||
|
goto array_begin;
|
||||||
|
#define SIMDJSON_ALLOWANYTHINGINROOT
|
||||||
|
// A JSON text is a serialized value. Note that certain previous
|
||||||
|
// specifications of JSON constrained a JSON text to be an object or an
|
||||||
|
// array. Implementations that generate only objects or arrays where a
|
||||||
|
// JSON text is called for will be interoperable in the sense that all
|
||||||
|
// implementations will accept these as conforming JSON texts.
|
||||||
|
// https://tools.ietf.org/html/rfc8259
|
||||||
|
#ifdef SIMDJSON_ALLOWANYTHINGINROOT
|
||||||
|
case '"': {
|
||||||
|
if (!parse_string<T>(buf, len, pj, depth, idx)) {
|
||||||
|
goto fail;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case 't': {
|
||||||
|
// we need to make a copy to make sure that the string is space terminated.
|
||||||
|
// this only applies to the JSON document made solely of the true value.
|
||||||
|
// this will almost never be called in practice
|
||||||
|
char * copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING));
|
||||||
|
if(copy == nullptr) {
|
||||||
|
goto fail;
|
||||||
|
}
|
||||||
|
memcpy(copy, buf, len);
|
||||||
|
copy[len] = ' ';
|
||||||
|
if (!is_valid_true_atom(reinterpret_cast<const uint8_t *>(copy) + idx)) {
|
||||||
|
free(copy);
|
||||||
|
goto fail;
|
||||||
|
}
|
||||||
|
free(copy);
|
||||||
|
pj.write_tape(0, c);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case 'f': {
|
||||||
|
// we need to make a copy to make sure that the string is space terminated.
|
||||||
|
// this only applies to the JSON document made solely of the false value.
|
||||||
|
// this will almost never be called in practice
|
||||||
|
char * copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING));
|
||||||
|
if(copy == nullptr) {
|
||||||
|
goto fail;
|
||||||
|
}
|
||||||
|
memcpy(copy, buf, len);
|
||||||
|
copy[len] = ' ';
|
||||||
|
if (!is_valid_false_atom(reinterpret_cast<const uint8_t *>(copy) + idx)) {
|
||||||
|
free(copy);
|
||||||
|
goto fail;
|
||||||
|
}
|
||||||
|
free(copy);
|
||||||
|
pj.write_tape(0, c);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case 'n': {
|
||||||
|
// we need to make a copy to make sure that the string is space terminated.
|
||||||
|
// this only applies to the JSON document made solely of the null value.
|
||||||
|
// this will almost never be called in practice
|
||||||
|
char * copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING));
|
||||||
|
if(copy == nullptr) {
|
||||||
|
goto fail;
|
||||||
|
}
|
||||||
|
memcpy(copy, buf, len);
|
||||||
|
copy[len] = ' ';
|
||||||
|
if (!is_valid_null_atom(reinterpret_cast<const uint8_t *>(copy) + idx)) {
|
||||||
|
free(copy);
|
||||||
|
goto fail;
|
||||||
|
}
|
||||||
|
free(copy);
|
||||||
|
pj.write_tape(0, c);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case '0':
|
||||||
|
case '1':
|
||||||
|
case '2':
|
||||||
|
case '3':
|
||||||
|
case '4':
|
||||||
|
case '5':
|
||||||
|
case '6':
|
||||||
|
case '7':
|
||||||
|
case '8':
|
||||||
|
case '9': {
|
||||||
|
// we need to make a copy to make sure that the string is space terminated.
|
||||||
|
// this is done only for JSON documents made of a sole number
|
||||||
|
// this will almost never be called in practice. We terminate with a space
|
||||||
|
// because we do not want to allow NULLs in the middle of a number (whereas a
|
||||||
|
// space in the middle of a number would be identified in stage 1).
|
||||||
|
char * copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING));
|
||||||
|
if(copy == nullptr) {
|
||||||
|
goto fail;
|
||||||
|
}
|
||||||
|
memcpy(copy, buf, len);
|
||||||
|
copy[len] = ' ';
|
||||||
|
if (!parse_number(reinterpret_cast<const uint8_t *>(copy), pj, idx, false)) {
|
||||||
|
free(copy);
|
||||||
|
goto fail;
|
||||||
|
}
|
||||||
|
free(copy);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case '-': {
|
||||||
|
// we need to make a copy to make sure that the string is NULL terminated.
|
||||||
|
// this is done only for JSON documents made of a sole number
|
||||||
|
// this will almost never be called in practice
|
||||||
|
char * copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING));
|
||||||
|
if(copy == nullptr) {
|
||||||
|
goto fail;
|
||||||
|
}
|
||||||
|
memcpy(copy, buf, len);
|
||||||
|
copy[len] = '\0';
|
||||||
|
if (!parse_number(reinterpret_cast<const uint8_t *>(copy), pj, idx, true)) {
|
||||||
|
free(copy);
|
||||||
|
goto fail;
|
||||||
|
}
|
||||||
|
free(copy);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
#endif // ALLOWANYTHINGINROOT
|
||||||
|
default:
|
||||||
|
goto fail;
|
||||||
|
}
|
||||||
|
start_continue:
|
||||||
|
// the string might not be NULL terminated.
|
||||||
|
if(i + 1 == pj.n_structural_indexes) {
|
||||||
|
goto succeed;
|
||||||
|
} else {
|
||||||
|
goto fail;
|
||||||
|
}
|
||||||
|
////////////////////////////// OBJECT STATES /////////////////////////////
|
||||||
|
|
||||||
|
object_begin:
|
||||||
|
UPDATE_CHAR();
|
||||||
|
switch (c) {
|
||||||
|
case '"': {
|
||||||
|
if (!parse_string<T>(buf, len, pj, depth, idx)) {
|
||||||
|
goto fail;
|
||||||
|
}
|
||||||
|
goto object_key_state;
|
||||||
|
}
|
||||||
|
case '}':
|
||||||
|
goto scope_end; // could also go to object_continue
|
||||||
|
default:
|
||||||
|
goto fail;
|
||||||
|
}
|
||||||
|
|
||||||
|
object_key_state:
|
||||||
|
UPDATE_CHAR();
|
||||||
|
if (c != ':') {
|
||||||
|
goto fail;
|
||||||
|
}
|
||||||
|
UPDATE_CHAR();
|
||||||
|
switch (c) {
|
||||||
|
case '"': {
|
||||||
|
if (!parse_string<T>(buf, len, pj, depth, idx)) {
|
||||||
|
goto fail;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case 't':
|
||||||
|
if (!is_valid_true_atom(buf + idx)) {
|
||||||
|
goto fail;
|
||||||
|
}
|
||||||
|
pj.write_tape(0, c);
|
||||||
|
break;
|
||||||
|
case 'f':
|
||||||
|
if (!is_valid_false_atom(buf + idx)) {
|
||||||
|
goto fail;
|
||||||
|
}
|
||||||
|
pj.write_tape(0, c);
|
||||||
|
break;
|
||||||
|
case 'n':
|
||||||
|
if (!is_valid_null_atom(buf + idx)) {
|
||||||
|
goto fail;
|
||||||
|
}
|
||||||
|
pj.write_tape(0, c);
|
||||||
|
break;
|
||||||
|
case '0':
|
||||||
|
case '1':
|
||||||
|
case '2':
|
||||||
|
case '3':
|
||||||
|
case '4':
|
||||||
|
case '5':
|
||||||
|
case '6':
|
||||||
|
case '7':
|
||||||
|
case '8':
|
||||||
|
case '9': {
|
||||||
|
if (!parse_number(buf, pj, idx, false)) {
|
||||||
|
goto fail;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case '-': {
|
||||||
|
if (!parse_number(buf, pj, idx, true)) {
|
||||||
|
goto fail;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case '{': {
|
||||||
|
pj.containing_scope_offset[depth] = pj.get_current_loc();
|
||||||
|
pj.write_tape(0, c); // here the compilers knows what c is so this gets optimized
|
||||||
|
// we have not yet encountered } so we need to come back for it
|
||||||
|
#ifdef SIMDJSON_USE_COMPUTED_GOTO
|
||||||
|
pj.ret_address[depth] = &&object_continue;
|
||||||
|
#else
|
||||||
|
pj.ret_address[depth] = 'o';
|
||||||
|
#endif
|
||||||
|
// we found an object inside an object, so we need to increment the depth
|
||||||
|
depth++;
|
||||||
|
if (depth >= pj.depthcapacity) {
|
||||||
|
goto fail;
|
||||||
|
}
|
||||||
|
|
||||||
|
goto object_begin;
|
||||||
|
}
|
||||||
|
case '[': {
|
||||||
|
pj.containing_scope_offset[depth] = pj.get_current_loc();
|
||||||
|
pj.write_tape(0, c); // here the compilers knows what c is so this gets optimized
|
||||||
|
// we have not yet encountered } so we need to come back for it
|
||||||
|
#ifdef SIMDJSON_USE_COMPUTED_GOTO
|
||||||
|
pj.ret_address[depth] = &&object_continue;
|
||||||
|
#else
|
||||||
|
pj.ret_address[depth] = 'o';
|
||||||
|
#endif
|
||||||
|
// we found an array inside an object, so we need to increment the depth
|
||||||
|
depth++;
|
||||||
|
if (depth >= pj.depthcapacity) {
|
||||||
|
goto fail;
|
||||||
|
}
|
||||||
|
goto array_begin;
|
||||||
|
}
|
||||||
|
default:
|
||||||
|
goto fail;
|
||||||
|
}
|
||||||
|
|
||||||
|
object_continue:
|
||||||
|
UPDATE_CHAR();
|
||||||
|
switch (c) {
|
||||||
|
case ',':
|
||||||
|
UPDATE_CHAR();
|
||||||
|
if (c != '"') {
|
||||||
|
goto fail;
|
||||||
|
} else {
|
||||||
|
if (!parse_string<T>(buf, len, pj, depth, idx)) {
|
||||||
|
goto fail;
|
||||||
|
}
|
||||||
|
goto object_key_state;
|
||||||
|
}
|
||||||
|
case '}':
|
||||||
|
goto scope_end;
|
||||||
|
default:
|
||||||
|
goto fail;
|
||||||
|
}
|
||||||
|
|
||||||
|
////////////////////////////// COMMON STATE /////////////////////////////
|
||||||
|
|
||||||
|
scope_end:
|
||||||
|
// write our tape location to the header scope
|
||||||
|
depth--;
|
||||||
|
pj.write_tape(pj.containing_scope_offset[depth], c);
|
||||||
|
pj.annotate_previousloc(pj.containing_scope_offset[depth],
|
||||||
|
pj.get_current_loc());
|
||||||
|
// goto saved_state
|
||||||
|
#ifdef SIMDJSON_USE_COMPUTED_GOTO
|
||||||
|
goto *pj.ret_address[depth];
|
||||||
|
#else
|
||||||
|
if(pj.ret_address[depth] == 'a') {
|
||||||
|
goto array_continue;
|
||||||
|
} else if (pj.ret_address[depth] == 'o') {
|
||||||
|
goto object_continue;
|
||||||
|
} else goto start_continue;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
////////////////////////////// ARRAY STATES /////////////////////////////
|
||||||
|
array_begin:
|
||||||
|
UPDATE_CHAR();
|
||||||
|
if (c == ']') {
|
||||||
|
goto scope_end; // could also go to array_continue
|
||||||
|
}
|
||||||
|
|
||||||
|
main_array_switch:
|
||||||
|
// we call update char on all paths in, so we can peek at c on the
|
||||||
|
// on paths that can accept a close square brace (post-, and at start)
|
||||||
|
switch (c) {
|
||||||
|
case '"': {
|
||||||
|
if (!parse_string<T>(buf, len, pj, depth, idx)) {
|
||||||
|
goto fail;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case 't':
|
||||||
|
if (!is_valid_true_atom(buf + idx)) {
|
||||||
|
goto fail;
|
||||||
|
}
|
||||||
|
pj.write_tape(0, c);
|
||||||
|
break;
|
||||||
|
case 'f':
|
||||||
|
if (!is_valid_false_atom(buf + idx)) {
|
||||||
|
goto fail;
|
||||||
|
}
|
||||||
|
pj.write_tape(0, c);
|
||||||
|
break;
|
||||||
|
case 'n':
|
||||||
|
if (!is_valid_null_atom(buf + idx)) {
|
||||||
|
goto fail;
|
||||||
|
}
|
||||||
|
pj.write_tape(0, c);
|
||||||
|
break; // goto array_continue;
|
||||||
|
|
||||||
|
case '0':
|
||||||
|
case '1':
|
||||||
|
case '2':
|
||||||
|
case '3':
|
||||||
|
case '4':
|
||||||
|
case '5':
|
||||||
|
case '6':
|
||||||
|
case '7':
|
||||||
|
case '8':
|
||||||
|
case '9': {
|
||||||
|
if (!parse_number(buf, pj, idx, false)) {
|
||||||
|
goto fail;
|
||||||
|
}
|
||||||
|
break; // goto array_continue;
|
||||||
|
}
|
||||||
|
case '-': {
|
||||||
|
if (!parse_number(buf, pj, idx, true)) {
|
||||||
|
goto fail;
|
||||||
|
}
|
||||||
|
break; // goto array_continue;
|
||||||
|
}
|
||||||
|
case '{': {
|
||||||
|
// we have not yet encountered ] so we need to come back for it
|
||||||
|
pj.containing_scope_offset[depth] = pj.get_current_loc();
|
||||||
|
pj.write_tape(0, c); // here the compilers knows what c is so this gets optimized
|
||||||
|
#ifdef SIMDJSON_USE_COMPUTED_GOTO
|
||||||
|
pj.ret_address[depth] = &&array_continue;
|
||||||
|
#else
|
||||||
|
pj.ret_address[depth] = 'a';
|
||||||
|
#endif
|
||||||
|
// we found an object inside an array, so we need to increment the depth
|
||||||
|
depth++;
|
||||||
|
if (depth >= pj.depthcapacity) {
|
||||||
|
goto fail;
|
||||||
|
}
|
||||||
|
|
||||||
|
goto object_begin;
|
||||||
|
}
|
||||||
|
case '[': {
|
||||||
|
// we have not yet encountered ] so we need to come back for it
|
||||||
|
pj.containing_scope_offset[depth] = pj.get_current_loc();
|
||||||
|
pj.write_tape(0, c); // here the compilers knows what c is so this gets optimized
|
||||||
|
#ifdef SIMDJSON_USE_COMPUTED_GOTO
|
||||||
|
pj.ret_address[depth] = &&array_continue;
|
||||||
|
#else
|
||||||
|
pj.ret_address[depth] = 'a';
|
||||||
|
#endif
|
||||||
|
// we found an array inside an array, so we need to increment the depth
|
||||||
|
depth++;
|
||||||
|
if (depth >= pj.depthcapacity) {
|
||||||
|
goto fail;
|
||||||
|
}
|
||||||
|
goto array_begin;
|
||||||
|
}
|
||||||
|
default:
|
||||||
|
goto fail;
|
||||||
|
}
|
||||||
|
|
||||||
|
array_continue:
|
||||||
|
UPDATE_CHAR();
|
||||||
|
switch (c) {
|
||||||
|
case ',':
|
||||||
|
UPDATE_CHAR();
|
||||||
|
goto main_array_switch;
|
||||||
|
case ']':
|
||||||
|
goto scope_end;
|
||||||
|
default:
|
||||||
|
goto fail;
|
||||||
|
}
|
||||||
|
|
||||||
|
////////////////////////////// FINAL STATES /////////////////////////////
|
||||||
|
|
||||||
|
succeed:
|
||||||
|
depth --;
|
||||||
|
if(depth != 0) {
|
||||||
|
fprintf(stderr, "internal bug\n");
|
||||||
|
abort();
|
||||||
|
}
|
||||||
|
if(pj.containing_scope_offset[depth] != 0) {
|
||||||
|
fprintf(stderr, "internal bug\n");
|
||||||
|
abort();
|
||||||
|
}
|
||||||
|
pj.annotate_previousloc(pj.containing_scope_offset[depth],
|
||||||
|
pj.get_current_loc());
|
||||||
|
pj.write_tape(pj.containing_scope_offset[depth], 'r'); // r is root
|
||||||
|
|
||||||
|
pj.isvalid = true;
|
||||||
|
pj.errorcode = simdjson::SUCCESS;
|
||||||
|
return pj.errorcode;
|
||||||
|
fail:
|
||||||
|
// we do not need the next line because this is done by pj.init(), pessimistically.
|
||||||
|
// pj.isvalid = false;
|
||||||
|
// At this point in the code, we have all the time in the world.
|
||||||
|
// Note that we know exactly where we are in the document so we could,
|
||||||
|
// without any overhead on the processing code, report a specific location.
|
||||||
|
// We could even trigger special code paths to assess what happened carefully,
|
||||||
|
// all without any added cost.
|
||||||
|
if (depth >= pj.depthcapacity) {
|
||||||
|
pj.errorcode = simdjson::DEPTH_ERROR;
|
||||||
|
return pj.errorcode;
|
||||||
|
}
|
||||||
|
switch(c) {
|
||||||
|
case '"':
|
||||||
|
pj.errorcode = simdjson::STRING_ERROR;
|
||||||
|
return pj.errorcode;
|
||||||
|
case '0':
|
||||||
|
case '1':
|
||||||
|
case '2':
|
||||||
|
case '3':
|
||||||
|
case '4':
|
||||||
|
case '5':
|
||||||
|
case '6':
|
||||||
|
case '7':
|
||||||
|
case '8':
|
||||||
|
case '9':
|
||||||
|
case '-':
|
||||||
|
pj.errorcode = simdjson::NUMBER_ERROR;
|
||||||
|
return pj.errorcode;
|
||||||
|
case 't':
|
||||||
|
pj.errorcode = simdjson::T_ATOM_ERROR;
|
||||||
|
return pj.errorcode;
|
||||||
|
case 'n':
|
||||||
|
pj.errorcode = simdjson::N_ATOM_ERROR;
|
||||||
|
return pj.errorcode;
|
||||||
|
case 'f':
|
||||||
|
pj.errorcode = simdjson::F_ATOM_ERROR;
|
||||||
|
return pj.errorcode;
|
||||||
|
default:
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
pj.errorcode = simdjson::TAPE_ERROR;
|
||||||
|
return pj.errorcode;
|
||||||
|
}
|
||||||
|
|
||||||
|
template<instruction_set T = instruction_set::native>
|
||||||
|
int unified_machine(const char *buf, size_t len, ParsedJson &pj) {
|
||||||
|
return unified_machine<T>(reinterpret_cast<const uint8_t*>(buf), len, pj);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -5,6 +5,11 @@
|
||||||
#include "simdjson/jsoncharutils.h"
|
#include "simdjson/jsoncharutils.h"
|
||||||
#include "simdjson/parsedjson.h"
|
#include "simdjson/parsedjson.h"
|
||||||
|
|
||||||
|
#ifdef JSON_TEST_STRINGS
|
||||||
|
void foundString(const uint8_t *buf, const uint8_t *parsed_begin, const uint8_t *parsed_end);
|
||||||
|
void foundBadString(const uint8_t *buf);
|
||||||
|
#endif
|
||||||
|
|
||||||
namespace simdjson {
|
namespace simdjson {
|
||||||
// begin copypasta
|
// begin copypasta
|
||||||
// These chars yield themselves: " \ /
|
// These chars yield themselves: " \ /
|
||||||
|
@ -76,19 +81,19 @@ really_inline bool handle_unicode_codepoint(const uint8_t **src_ptr, uint8_t **d
|
||||||
#include <arm_neon.h>
|
#include <arm_neon.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
WARN_UNUSED ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER LENIENT_MEM_SANITIZER
|
// Holds backslashes and quotes locations.
|
||||||
really_inline bool parse_string(UNUSED const uint8_t *buf, UNUSED size_t len,
|
struct parse_string_helper {
|
||||||
ParsedJson &pj, UNUSED const uint32_t depth, UNUSED uint32_t offset) {
|
uint32_t bs_bits;
|
||||||
#ifdef SIMDJSON_SKIPSTRINGPARSING // for performance analysis, it is sometimes useful to skip parsing
|
uint32_t quote_bits;
|
||||||
pj.write_tape(0, '"');// don't bother with the string parsing at all
|
};
|
||||||
return true; // always succeeds
|
|
||||||
#else
|
// Finds where the backslashes and quotes are located.
|
||||||
pj.write_tape(pj.current_string_buf_loc - pj.string_buf, '"');
|
template<instruction_set>
|
||||||
const uint8_t *src = &buf[offset + 1]; // we know that buf at offset is a "
|
parse_string_helper find_bs_bits_and_quote_bits(const uint8_t *src, uint8_t *dst);
|
||||||
uint8_t *dst = pj.current_string_buf_loc + sizeof(uint32_t);
|
|
||||||
const uint8_t *const start_of_string = dst;
|
|
||||||
while (1) {
|
|
||||||
#ifdef __AVX2__
|
#ifdef __AVX2__
|
||||||
|
template<> really_inline
|
||||||
|
parse_string_helper find_bs_bits_and_quote_bits<instruction_set::avx2> (const uint8_t *src, uint8_t *dst) {
|
||||||
// this can read up to 31 bytes beyond the buffer size, but we require
|
// this can read up to 31 bytes beyond the buffer size, but we require
|
||||||
// SIMDJSON_PADDING of padding
|
// SIMDJSON_PADDING of padding
|
||||||
static_assert(sizeof(__m256i) - 1 <= SIMDJSON_PADDING);
|
static_assert(sizeof(__m256i) - 1 <= SIMDJSON_PADDING);
|
||||||
|
@ -96,12 +101,17 @@ really_inline bool parse_string(UNUSED const uint8_t *buf, UNUSED size_t len,
|
||||||
// store to dest unconditionally - we can overwrite the bits we don't like
|
// store to dest unconditionally - we can overwrite the bits we don't like
|
||||||
// later
|
// later
|
||||||
_mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), v);
|
_mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), v);
|
||||||
auto bs_bits =
|
|
||||||
static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi8(v, _mm256_set1_epi8('\\'))));
|
|
||||||
auto quote_mask = _mm256_cmpeq_epi8(v, _mm256_set1_epi8('"'));
|
auto quote_mask = _mm256_cmpeq_epi8(v, _mm256_set1_epi8('"'));
|
||||||
auto quote_bits =
|
return {
|
||||||
static_cast<uint32_t>(_mm256_movemask_epi8(quote_mask));
|
static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi8(v, _mm256_set1_epi8('\\')))), // bs_bits
|
||||||
#else
|
static_cast<uint32_t>(_mm256_movemask_epi8(quote_mask)) // quote_bits
|
||||||
|
};
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef __ARM_NEON
|
||||||
|
template<> really_inline
|
||||||
|
parse_string_helper find_bs_bits_and_quote_bits<instruction_set::neon> (const uint8_t *src, uint8_t *dst) {
|
||||||
// this can read up to 31 bytes beyond the buffer size, but we require
|
// this can read up to 31 bytes beyond the buffer size, but we require
|
||||||
// SIMDJSON_PADDING of padding
|
// SIMDJSON_PADDING of padding
|
||||||
static_assert(2 * sizeof(uint8x16_t) - 1 <= SIMDJSON_PADDING);
|
static_assert(2 * sizeof(uint8x16_t) - 1 <= SIMDJSON_PADDING);
|
||||||
|
@ -128,14 +138,32 @@ really_inline bool parse_string(UNUSED const uint8_t *buf, UNUSED size_t len,
|
||||||
uint8x16_t sum1 = vpaddq_u8(cmp_qt_0, cmp_qt_1);
|
uint8x16_t sum1 = vpaddq_u8(cmp_qt_0, cmp_qt_1);
|
||||||
sum0 = vpaddq_u8(sum0, sum1);
|
sum0 = vpaddq_u8(sum0, sum1);
|
||||||
sum0 = vpaddq_u8(sum0, sum0);
|
sum0 = vpaddq_u8(sum0, sum0);
|
||||||
auto bs_bits = vgetq_lane_u32(vreinterpretq_u32_u8(sum0), 0);
|
return {
|
||||||
auto quote_bits = vgetq_lane_u32(vreinterpretq_u32_u8(sum0), 1);
|
vgetq_lane_u32(vreinterpretq_u32_u8(sum0), 0), // bs_bits
|
||||||
|
vgetq_lane_u32(vreinterpretq_u32_u8(sum0), 1) // quote_bits
|
||||||
|
};
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
if(((bs_bits - 1) & quote_bits) != 0 ) {
|
|
||||||
|
template<instruction_set T>
|
||||||
|
WARN_UNUSED ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER LENIENT_MEM_SANITIZER really_inline
|
||||||
|
bool parse_string(UNUSED const uint8_t *buf, UNUSED size_t len,
|
||||||
|
ParsedJson &pj, UNUSED const uint32_t depth, UNUSED uint32_t offset) {
|
||||||
|
#ifdef SIMDJSON_SKIPSTRINGPARSING // for performance analysis, it is sometimes useful to skip parsing
|
||||||
|
pj.write_tape(0, '"');// don't bother with the string parsing at all
|
||||||
|
return true; // always succeeds
|
||||||
|
#else
|
||||||
|
pj.write_tape(pj.current_string_buf_loc - pj.string_buf, '"');
|
||||||
|
const uint8_t *src = &buf[offset + 1]; // we know that buf at offset is a "
|
||||||
|
uint8_t *dst = pj.current_string_buf_loc + sizeof(uint32_t);
|
||||||
|
const uint8_t *const start_of_string = dst;
|
||||||
|
while (1) {
|
||||||
|
parse_string_helper helper = find_bs_bits_and_quote_bits<T>(src, dst);
|
||||||
|
if(((helper.bs_bits - 1) & helper.quote_bits) != 0 ) {
|
||||||
// we encountered quotes first. Move dst to point to quotes and exit
|
// we encountered quotes first. Move dst to point to quotes and exit
|
||||||
|
|
||||||
// find out where the quote is...
|
// find out where the quote is...
|
||||||
uint32_t quote_dist = trailingzeroes(quote_bits);
|
uint32_t quote_dist = trailingzeroes(helper.quote_bits);
|
||||||
|
|
||||||
// NULL termination is still handy if you expect all your strings to be NULL terminated?
|
// NULL termination is still handy if you expect all your strings to be NULL terminated?
|
||||||
// It comes at a small cost
|
// It comes at a small cost
|
||||||
|
@ -158,9 +186,9 @@ really_inline bool parse_string(UNUSED const uint8_t *buf, UNUSED size_t len,
|
||||||
#endif // JSON_TEST_STRINGS
|
#endif // JSON_TEST_STRINGS
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
if(((quote_bits - 1) & bs_bits ) != 0 ) {
|
if(((helper.quote_bits - 1) & helper.bs_bits ) != 0 ) {
|
||||||
// find out where the backspace is
|
// find out where the backspace is
|
||||||
uint32_t bs_dist = trailingzeroes(bs_bits);
|
uint32_t bs_dist = trailingzeroes(helper.bs_bits);
|
||||||
uint8_t escape_char = src[bs_dist + 1];
|
uint8_t escape_char = src[bs_dist + 1];
|
||||||
// we encountered backslash first. Handle backslash
|
// we encountered backslash first. Handle backslash
|
||||||
if (escape_char == 'u') {
|
if (escape_char == 'u') {
|
||||||
|
|
|
@ -1,580 +1 @@
|
||||||
#include <cassert>
|
// File kept in case we want to reuse it soon. (many configuration files to edit)
|
||||||
#include <cstring>
|
|
||||||
|
|
||||||
#include "simdjson/common_defs.h"
|
|
||||||
#include "simdjson/jsoncharutils.h"
|
|
||||||
#include "simdjson/numberparsing.h"
|
|
||||||
#include "simdjson/parsedjson.h"
|
|
||||||
#include "simdjson/stringparsing.h"
|
|
||||||
#include "simdjson/simdjson.h"
|
|
||||||
|
|
||||||
#include <iostream>
|
|
||||||
#define PATH_SEP '/'
|
|
||||||
|
|
||||||
namespace simdjson {
|
|
||||||
|
|
||||||
// Checks whether the literal `true` starts at loc.
//
// loc: pointer to the first byte of the candidate atom; bytes loc[0] through
//      loc[4] are read.
// Returns true when loc[0..3] spell "true" and
// is_not_structural_or_whitespace(loc[4]) evaluates to zero; false otherwise.
WARN_UNUSED
really_inline bool is_valid_true_atom(const uint8_t *loc) {
  // Compare the four literal bytes one-to-one instead of loading a string
  // literal through a uint64_t pointer: a byte-wise comparison never reads
  // past the end of the literal and does not depend on the alignment of the
  // literal or on the byte order of the host.
  uint32_t error = (std::memcmp(loc, "true", 4) != 0) ? 1u : 0u;
  // The byte following the literal must be an acceptable terminator: any
  // non-zero answer from the helper marks the atom as invalid.
  error |= is_not_structural_or_whitespace(loc[4]);
  return error == 0;
}
|
|
||||||
|
|
||||||
// Checks whether the literal `false` starts at loc.
//
// loc: pointer to the first byte of the candidate atom; bytes loc[0] through
//      loc[5] are read.
// Returns true when loc[0..4] spell "false" and
// is_not_structural_or_whitespace(loc[5]) evaluates to zero; false otherwise.
WARN_UNUSED
really_inline bool is_valid_false_atom(const uint8_t *loc) {
  // "false" is five bytes long, so it does not fit a 32-bit comparison.
  // A byte-wise comparison of exactly five bytes checks every character of
  // the literal and, unlike a hard-coded 64-bit integer constant, gives the
  // same answer regardless of the byte order of the host.
  uint64_t error = (std::memcmp(loc, "false", 5) != 0) ? 1u : 0u;
  // The byte following the literal must be an acceptable terminator: any
  // non-zero answer from the helper marks the atom as invalid.
  error |= is_not_structural_or_whitespace(loc[5]);
  return error == 0;
}
|
|
||||||
|
|
||||||
// Checks whether the literal `null` starts at loc.
//
// loc: pointer to the first byte of the candidate atom; bytes loc[0] through
//      loc[4] are read.
// Returns true when loc[0..3] spell "null" and
// is_not_structural_or_whitespace(loc[4]) evaluates to zero; false otherwise.
WARN_UNUSED
really_inline bool is_valid_null_atom(const uint8_t *loc) {
  // Compare the four literal bytes one-to-one instead of loading a string
  // literal through a uint64_t pointer: a byte-wise comparison never reads
  // past the end of the literal and does not depend on the alignment of the
  // literal or on the byte order of the host.
  uint32_t error = (std::memcmp(loc, "null", 4) != 0) ? 1u : 0u;
  // The byte following the literal must be an acceptable terminator: any
  // non-zero answer from the helper marks the atom as invalid.
  error |= is_not_structural_or_whitespace(loc[4]);
  return error == 0;
}
|
|
||||||
|
|
||||||
|
|
||||||
/************
|
|
||||||
* The JSON is parsed to a tape, see the accompanying tape.md file
|
|
||||||
* for documentation.
|
|
||||||
***********/
|
|
||||||
WARN_UNUSED ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER LENIENT_MEM_SANITIZER
|
|
||||||
int unified_machine(const uint8_t *buf, size_t len, ParsedJson &pj) {
|
|
||||||
#ifndef ALLOW_SAME_PAGE_BUFFER_OVERRUN
|
|
||||||
memset((uint8_t*)buf + len, 0, SIMDJSON_PADDING); // to please valgrind
|
|
||||||
#endif
|
|
||||||
uint32_t i = 0; // index of the structural character (0,1,2,3...)
|
|
||||||
uint32_t idx; // location of the structural character in the input (buf)
|
|
||||||
uint8_t c; // used to track the (structural) character we are looking at, updated
|
|
||||||
// by UPDATE_CHAR macro
|
|
||||||
uint32_t depth = 0; // could have an arbitrary starting depth
|
|
||||||
pj.init(); // sets isvalid to false
|
|
||||||
if(pj.bytecapacity < len) {
|
|
||||||
pj.errorcode = CAPACITY;
|
|
||||||
return pj.errorcode;
|
|
||||||
}
|
|
||||||
// this macro reads the next structural character, updating idx, i and c.
|
|
||||||
#define UPDATE_CHAR() \
|
|
||||||
{ \
|
|
||||||
idx = pj.structural_indexes[i++]; \
|
|
||||||
c = buf[idx]; \
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
////////////////////////////// START STATE /////////////////////////////
|
|
||||||
#ifdef SIMDJSON_USE_COMPUTED_GOTO
|
|
||||||
pj.ret_address[depth] = &&start_continue;
|
|
||||||
#else
|
|
||||||
pj.ret_address[depth] = 's';
|
|
||||||
#endif
|
|
||||||
pj.containing_scope_offset[depth] = pj.get_current_loc();
|
|
||||||
pj.write_tape(0, 'r'); // r for root, 0 is going to get overwritten
|
|
||||||
// the root is used, if nothing else, to capture the size of the tape
|
|
||||||
depth++; // everything starts at depth = 1, depth = 0 is just for the root, the root may contain an object, an array or something else.
|
|
||||||
if (depth >= pj.depthcapacity) {
|
|
||||||
goto fail;
|
|
||||||
}
|
|
||||||
|
|
||||||
UPDATE_CHAR();
|
|
||||||
switch (c) {
|
|
||||||
case '{':
|
|
||||||
pj.containing_scope_offset[depth] = pj.get_current_loc();
|
|
||||||
#ifdef SIMDJSON_USE_COMPUTED_GOTO
|
|
||||||
pj.ret_address[depth] = &&start_continue;
|
|
||||||
#else
|
|
||||||
pj.ret_address[depth] = 's';
|
|
||||||
#endif
|
|
||||||
depth++;
|
|
||||||
if (depth >= pj.depthcapacity) {
|
|
||||||
goto fail;
|
|
||||||
}
|
|
||||||
pj.write_tape(0, c); // strangely, moving this to object_begin slows things down
|
|
||||||
goto object_begin;
|
|
||||||
case '[':
|
|
||||||
pj.containing_scope_offset[depth] = pj.get_current_loc();
|
|
||||||
#ifdef SIMDJSON_USE_COMPUTED_GOTO
|
|
||||||
pj.ret_address[depth] = &&start_continue;
|
|
||||||
#else
|
|
||||||
pj.ret_address[depth] = 's';
|
|
||||||
#endif
|
|
||||||
depth++;
|
|
||||||
if (depth >= pj.depthcapacity) {
|
|
||||||
goto fail;
|
|
||||||
}
|
|
||||||
pj.write_tape(0, c);
|
|
||||||
goto array_begin;
|
|
||||||
#define SIMDJSON_ALLOWANYTHINGINROOT
|
|
||||||
// A JSON text is a serialized value. Note that certain previous
|
|
||||||
// specifications of JSON constrained a JSON text to be an object or an
|
|
||||||
// array. Implementations that generate only objects or arrays where a
|
|
||||||
// JSON text is called for will be interoperable in the sense that all
|
|
||||||
// implementations will accept these as conforming JSON texts.
|
|
||||||
// https://tools.ietf.org/html/rfc8259
|
|
||||||
#ifdef SIMDJSON_ALLOWANYTHINGINROOT
|
|
||||||
case '"': {
|
|
||||||
if (!parse_string(buf, len, pj, depth, idx)) {
|
|
||||||
goto fail;
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
case 't': {
|
|
||||||
// we need to make a copy to make sure that the string is space terminated.
|
|
||||||
// this only applies to the JSON document made solely of the true value.
|
|
||||||
// this will almost never be called in practice
|
|
||||||
char * copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING));
|
|
||||||
if(copy == nullptr) {
|
|
||||||
goto fail;
|
|
||||||
}
|
|
||||||
memcpy(copy, buf, len);
|
|
||||||
copy[len] = ' ';
|
|
||||||
if (!is_valid_true_atom(reinterpret_cast<const uint8_t *>(copy) + idx)) {
|
|
||||||
free(copy);
|
|
||||||
goto fail;
|
|
||||||
}
|
|
||||||
free(copy);
|
|
||||||
pj.write_tape(0, c);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
case 'f': {
|
|
||||||
// we need to make a copy to make sure that the string is space terminated.
|
|
||||||
// this only applies to the JSON document made solely of the false value.
|
|
||||||
// this will almost never be called in practice
|
|
||||||
char * copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING));
|
|
||||||
if(copy == nullptr) {
|
|
||||||
goto fail;
|
|
||||||
}
|
|
||||||
memcpy(copy, buf, len);
|
|
||||||
copy[len] = ' ';
|
|
||||||
if (!is_valid_false_atom(reinterpret_cast<const uint8_t *>(copy) + idx)) {
|
|
||||||
free(copy);
|
|
||||||
goto fail;
|
|
||||||
}
|
|
||||||
free(copy);
|
|
||||||
pj.write_tape(0, c);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
case 'n': {
|
|
||||||
// we need to make a copy to make sure that the string is space terminated.
|
|
||||||
// this only applies to the JSON document made solely of the null value.
|
|
||||||
// this will almost never be called in practice
|
|
||||||
char * copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING));
|
|
||||||
if(copy == nullptr) {
|
|
||||||
goto fail;
|
|
||||||
}
|
|
||||||
memcpy(copy, buf, len);
|
|
||||||
copy[len] = ' ';
|
|
||||||
if (!is_valid_null_atom(reinterpret_cast<const uint8_t *>(copy) + idx)) {
|
|
||||||
free(copy);
|
|
||||||
goto fail;
|
|
||||||
}
|
|
||||||
free(copy);
|
|
||||||
pj.write_tape(0, c);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
case '0':
|
|
||||||
case '1':
|
|
||||||
case '2':
|
|
||||||
case '3':
|
|
||||||
case '4':
|
|
||||||
case '5':
|
|
||||||
case '6':
|
|
||||||
case '7':
|
|
||||||
case '8':
|
|
||||||
case '9': {
|
|
||||||
// we need to make a copy to make sure that the string is space terminated.
|
|
||||||
// this is done only for JSON documents made of a sole number
|
|
||||||
// this will almost never be called in practice. We terminate with a space
|
|
||||||
// because we do not want to allow NULLs in the middle of a number (whereas a
|
|
||||||
// space in the middle of a number would be identified in stage 1).
|
|
||||||
char * copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING));
|
|
||||||
if(copy == nullptr) {
|
|
||||||
goto fail;
|
|
||||||
}
|
|
||||||
memcpy(copy, buf, len);
|
|
||||||
copy[len] = ' ';
|
|
||||||
if (!parse_number(reinterpret_cast<const uint8_t *>(copy), pj, idx, false)) {
|
|
||||||
free(copy);
|
|
||||||
goto fail;
|
|
||||||
}
|
|
||||||
free(copy);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
case '-': {
|
|
||||||
// we need to make a copy to make sure that the string is NULL terminated.
|
|
||||||
// this is done only for JSON documents made of a sole number
|
|
||||||
// this will almost never be called in practice
|
|
||||||
char * copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING));
|
|
||||||
if(copy == nullptr) {
|
|
||||||
goto fail;
|
|
||||||
}
|
|
||||||
memcpy(copy, buf, len);
|
|
||||||
copy[len] = '\0';
|
|
||||||
if (!parse_number(reinterpret_cast<const uint8_t *>(copy), pj, idx, true)) {
|
|
||||||
free(copy);
|
|
||||||
goto fail;
|
|
||||||
}
|
|
||||||
free(copy);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
#endif // ALLOWANYTHINGINROOT
|
|
||||||
default:
|
|
||||||
goto fail;
|
|
||||||
}
|
|
||||||
start_continue:
|
|
||||||
// the string might not be NULL terminated.
|
|
||||||
if(i + 1 == pj.n_structural_indexes) {
|
|
||||||
goto succeed;
|
|
||||||
} else {
|
|
||||||
goto fail;
|
|
||||||
}
|
|
||||||
////////////////////////////// OBJECT STATES /////////////////////////////
|
|
||||||
|
|
||||||
object_begin:
|
|
||||||
UPDATE_CHAR();
|
|
||||||
switch (c) {
|
|
||||||
case '"': {
|
|
||||||
if (!parse_string(buf, len, pj, depth, idx)) {
|
|
||||||
goto fail;
|
|
||||||
}
|
|
||||||
goto object_key_state;
|
|
||||||
}
|
|
||||||
case '}':
|
|
||||||
goto scope_end; // could also go to object_continue
|
|
||||||
default:
|
|
||||||
goto fail;
|
|
||||||
}
|
|
||||||
|
|
||||||
object_key_state:
|
|
||||||
UPDATE_CHAR();
|
|
||||||
if (c != ':') {
|
|
||||||
goto fail;
|
|
||||||
}
|
|
||||||
UPDATE_CHAR();
|
|
||||||
switch (c) {
|
|
||||||
case '"': {
|
|
||||||
if (!parse_string(buf, len, pj, depth, idx)) {
|
|
||||||
goto fail;
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
case 't':
|
|
||||||
if (!is_valid_true_atom(buf + idx)) {
|
|
||||||
goto fail;
|
|
||||||
}
|
|
||||||
pj.write_tape(0, c);
|
|
||||||
break;
|
|
||||||
case 'f':
|
|
||||||
if (!is_valid_false_atom(buf + idx)) {
|
|
||||||
goto fail;
|
|
||||||
}
|
|
||||||
pj.write_tape(0, c);
|
|
||||||
break;
|
|
||||||
case 'n':
|
|
||||||
if (!is_valid_null_atom(buf + idx)) {
|
|
||||||
goto fail;
|
|
||||||
}
|
|
||||||
pj.write_tape(0, c);
|
|
||||||
break;
|
|
||||||
case '0':
|
|
||||||
case '1':
|
|
||||||
case '2':
|
|
||||||
case '3':
|
|
||||||
case '4':
|
|
||||||
case '5':
|
|
||||||
case '6':
|
|
||||||
case '7':
|
|
||||||
case '8':
|
|
||||||
case '9': {
|
|
||||||
if (!parse_number(buf, pj, idx, false)) {
|
|
||||||
goto fail;
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
case '-': {
|
|
||||||
if (!parse_number(buf, pj, idx, true)) {
|
|
||||||
goto fail;
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
case '{': {
|
|
||||||
pj.containing_scope_offset[depth] = pj.get_current_loc();
|
|
||||||
pj.write_tape(0, c); // here the compilers knows what c is so this gets optimized
|
|
||||||
// we have not yet encountered } so we need to come back for it
|
|
||||||
#ifdef SIMDJSON_USE_COMPUTED_GOTO
|
|
||||||
pj.ret_address[depth] = &&object_continue;
|
|
||||||
#else
|
|
||||||
pj.ret_address[depth] = 'o';
|
|
||||||
#endif
|
|
||||||
// we found an object inside an object, so we need to increment the depth
|
|
||||||
depth++;
|
|
||||||
if (depth >= pj.depthcapacity) {
|
|
||||||
goto fail;
|
|
||||||
}
|
|
||||||
|
|
||||||
goto object_begin;
|
|
||||||
}
|
|
||||||
case '[': {
|
|
||||||
pj.containing_scope_offset[depth] = pj.get_current_loc();
|
|
||||||
pj.write_tape(0, c); // here the compilers knows what c is so this gets optimized
|
|
||||||
// we have not yet encountered } so we need to come back for it
|
|
||||||
#ifdef SIMDJSON_USE_COMPUTED_GOTO
|
|
||||||
pj.ret_address[depth] = &&object_continue;
|
|
||||||
#else
|
|
||||||
pj.ret_address[depth] = 'o';
|
|
||||||
#endif
|
|
||||||
// we found an array inside an object, so we need to increment the depth
|
|
||||||
depth++;
|
|
||||||
if (depth >= pj.depthcapacity) {
|
|
||||||
goto fail;
|
|
||||||
}
|
|
||||||
goto array_begin;
|
|
||||||
}
|
|
||||||
default:
|
|
||||||
goto fail;
|
|
||||||
}
|
|
||||||
|
|
||||||
object_continue:
|
|
||||||
UPDATE_CHAR();
|
|
||||||
switch (c) {
|
|
||||||
case ',':
|
|
||||||
UPDATE_CHAR();
|
|
||||||
if (c != '"') {
|
|
||||||
goto fail;
|
|
||||||
} else {
|
|
||||||
if (!parse_string(buf, len, pj, depth, idx)) {
|
|
||||||
goto fail;
|
|
||||||
}
|
|
||||||
goto object_key_state;
|
|
||||||
}
|
|
||||||
case '}':
|
|
||||||
goto scope_end;
|
|
||||||
default:
|
|
||||||
goto fail;
|
|
||||||
}
|
|
||||||
|
|
||||||
////////////////////////////// COMMON STATE /////////////////////////////
|
|
||||||
|
|
||||||
scope_end:
|
|
||||||
// write our tape location to the header scope
|
|
||||||
depth--;
|
|
||||||
pj.write_tape(pj.containing_scope_offset[depth], c);
|
|
||||||
pj.annotate_previousloc(pj.containing_scope_offset[depth],
|
|
||||||
pj.get_current_loc());
|
|
||||||
// goto saved_state
|
|
||||||
#ifdef SIMDJSON_USE_COMPUTED_GOTO
|
|
||||||
goto *pj.ret_address[depth];
|
|
||||||
#else
|
|
||||||
if(pj.ret_address[depth] == 'a') {
|
|
||||||
goto array_continue;
|
|
||||||
} else if (pj.ret_address[depth] == 'o') {
|
|
||||||
goto object_continue;
|
|
||||||
} else goto start_continue;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
////////////////////////////// ARRAY STATES /////////////////////////////
|
|
||||||
array_begin:
|
|
||||||
UPDATE_CHAR();
|
|
||||||
if (c == ']') {
|
|
||||||
goto scope_end; // could also go to array_continue
|
|
||||||
}
|
|
||||||
|
|
||||||
main_array_switch:
|
|
||||||
// we call update char on all paths in, so we can peek at c on the
|
|
||||||
// paths that can accept a close square brace (post-, and at start)
|
|
||||||
switch (c) {
|
|
||||||
case '"': {
|
|
||||||
if (!parse_string(buf, len, pj, depth, idx)) {
|
|
||||||
goto fail;
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
case 't':
|
|
||||||
if (!is_valid_true_atom(buf + idx)) {
|
|
||||||
goto fail;
|
|
||||||
}
|
|
||||||
pj.write_tape(0, c);
|
|
||||||
break;
|
|
||||||
case 'f':
|
|
||||||
if (!is_valid_false_atom(buf + idx)) {
|
|
||||||
goto fail;
|
|
||||||
}
|
|
||||||
pj.write_tape(0, c);
|
|
||||||
break;
|
|
||||||
case 'n':
|
|
||||||
if (!is_valid_null_atom(buf + idx)) {
|
|
||||||
goto fail;
|
|
||||||
}
|
|
||||||
pj.write_tape(0, c);
|
|
||||||
break; // goto array_continue;
|
|
||||||
|
|
||||||
case '0':
|
|
||||||
case '1':
|
|
||||||
case '2':
|
|
||||||
case '3':
|
|
||||||
case '4':
|
|
||||||
case '5':
|
|
||||||
case '6':
|
|
||||||
case '7':
|
|
||||||
case '8':
|
|
||||||
case '9': {
|
|
||||||
if (!parse_number(buf, pj, idx, false)) {
|
|
||||||
goto fail;
|
|
||||||
}
|
|
||||||
break; // goto array_continue;
|
|
||||||
}
|
|
||||||
case '-': {
|
|
||||||
if (!parse_number(buf, pj, idx, true)) {
|
|
||||||
goto fail;
|
|
||||||
}
|
|
||||||
break; // goto array_continue;
|
|
||||||
}
|
|
||||||
case '{': {
|
|
||||||
// we have not yet encountered ] so we need to come back for it
|
|
||||||
pj.containing_scope_offset[depth] = pj.get_current_loc();
|
|
||||||
pj.write_tape(0, c); // here the compilers knows what c is so this gets optimized
|
|
||||||
#ifdef SIMDJSON_USE_COMPUTED_GOTO
|
|
||||||
pj.ret_address[depth] = &&array_continue;
|
|
||||||
#else
|
|
||||||
pj.ret_address[depth] = 'a';
|
|
||||||
#endif
|
|
||||||
// we found an object inside an array, so we need to increment the depth
|
|
||||||
depth++;
|
|
||||||
if (depth >= pj.depthcapacity) {
|
|
||||||
goto fail;
|
|
||||||
}
|
|
||||||
|
|
||||||
goto object_begin;
|
|
||||||
}
|
|
||||||
case '[': {
|
|
||||||
// we have not yet encountered ] so we need to come back for it
|
|
||||||
pj.containing_scope_offset[depth] = pj.get_current_loc();
|
|
||||||
pj.write_tape(0, c); // here the compilers knows what c is so this gets optimized
|
|
||||||
#ifdef SIMDJSON_USE_COMPUTED_GOTO
|
|
||||||
pj.ret_address[depth] = &&array_continue;
|
|
||||||
#else
|
|
||||||
pj.ret_address[depth] = 'a';
|
|
||||||
#endif
|
|
||||||
// we found an array inside an array, so we need to increment the depth
|
|
||||||
depth++;
|
|
||||||
if (depth >= pj.depthcapacity) {
|
|
||||||
goto fail;
|
|
||||||
}
|
|
||||||
goto array_begin;
|
|
||||||
}
|
|
||||||
default:
|
|
||||||
goto fail;
|
|
||||||
}
|
|
||||||
|
|
||||||
array_continue:
|
|
||||||
UPDATE_CHAR();
|
|
||||||
switch (c) {
|
|
||||||
case ',':
|
|
||||||
UPDATE_CHAR();
|
|
||||||
goto main_array_switch;
|
|
||||||
case ']':
|
|
||||||
goto scope_end;
|
|
||||||
default:
|
|
||||||
goto fail;
|
|
||||||
}
|
|
||||||
|
|
||||||
////////////////////////////// FINAL STATES /////////////////////////////
|
|
||||||
|
|
||||||
succeed:
|
|
||||||
depth --;
|
|
||||||
if(depth != 0) {
|
|
||||||
fprintf(stderr, "internal bug\n");
|
|
||||||
abort();
|
|
||||||
}
|
|
||||||
if(pj.containing_scope_offset[depth] != 0) {
|
|
||||||
fprintf(stderr, "internal bug\n");
|
|
||||||
abort();
|
|
||||||
}
|
|
||||||
pj.annotate_previousloc(pj.containing_scope_offset[depth],
|
|
||||||
pj.get_current_loc());
|
|
||||||
pj.write_tape(pj.containing_scope_offset[depth], 'r'); // r is root
|
|
||||||
|
|
||||||
pj.isvalid = true;
|
|
||||||
pj.errorcode = SUCCESS;
|
|
||||||
return pj.errorcode;
|
|
||||||
fail:
|
|
||||||
// we do not need the next line because this is done by pj.init(), pessimistically.
|
|
||||||
// pj.isvalid = false;
|
|
||||||
// At this point in the code, we have all the time in the world.
|
|
||||||
// Note that we know exactly where we are in the document so we could,
|
|
||||||
// without any overhead on the processing code, report a specific location.
|
|
||||||
// We could even trigger special code paths to assess what happened carefully,
|
|
||||||
// all without any added cost.
|
|
||||||
if (depth >= pj.depthcapacity) {
|
|
||||||
pj.errorcode = DEPTH_ERROR;
|
|
||||||
return pj.errorcode;
|
|
||||||
}
|
|
||||||
switch(c) {
|
|
||||||
case '"':
|
|
||||||
pj.errorcode = STRING_ERROR;
|
|
||||||
return pj.errorcode;
|
|
||||||
case '0':
|
|
||||||
case '1':
|
|
||||||
case '2':
|
|
||||||
case '3':
|
|
||||||
case '4':
|
|
||||||
case '5':
|
|
||||||
case '6':
|
|
||||||
case '7':
|
|
||||||
case '8':
|
|
||||||
case '9':
|
|
||||||
case '-':
|
|
||||||
pj.errorcode = NUMBER_ERROR;
|
|
||||||
return pj.errorcode;
|
|
||||||
case 't':
|
|
||||||
pj.errorcode = T_ATOM_ERROR;
|
|
||||||
return pj.errorcode;
|
|
||||||
case 'n':
|
|
||||||
pj.errorcode = N_ATOM_ERROR;
|
|
||||||
return pj.errorcode;
|
|
||||||
case 'f':
|
|
||||||
pj.errorcode = F_ATOM_ERROR;
|
|
||||||
return pj.errorcode;
|
|
||||||
default:
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
pj.errorcode = TAPE_ERROR;
|
|
||||||
return pj.errorcode;
|
|
||||||
}
|
|
||||||
|
|
||||||
int unified_machine(const char *buf, size_t len, ParsedJson &pj) {
|
|
||||||
return unified_machine(reinterpret_cast<const uint8_t*>(buf), len, pj);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
|
@ -38,7 +38,7 @@ bool is_in_bad_list(const char *buf) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
inline void foundInvalidNumber(const uint8_t *buf) {
|
void foundInvalidNumber(const uint8_t *buf) {
|
||||||
invalid_count++;
|
invalid_count++;
|
||||||
char *endptr;
|
char *endptr;
|
||||||
double expected = strtod((const char *)buf, &endptr);
|
double expected = strtod((const char *)buf, &endptr);
|
||||||
|
@ -53,7 +53,7 @@ inline void foundInvalidNumber(const uint8_t *buf) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
inline void foundInteger(int64_t result, const uint8_t *buf) {
|
void foundInteger(int64_t result, const uint8_t *buf) {
|
||||||
int_count++;
|
int_count++;
|
||||||
char *endptr;
|
char *endptr;
|
||||||
long long expected = strtoll((const char *)buf, &endptr, 10);
|
long long expected = strtoll((const char *)buf, &endptr, 10);
|
||||||
|
@ -64,7 +64,7 @@ inline void foundInteger(int64_t result, const uint8_t *buf) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
inline void foundFloat(double result, const uint8_t *buf) {
|
void foundFloat(double result, const uint8_t *buf) {
|
||||||
char *endptr;
|
char *endptr;
|
||||||
float_count++;
|
float_count++;
|
||||||
double expected = strtod((const char *)buf, &endptr);
|
double expected = strtod((const char *)buf, &endptr);
|
||||||
|
|
|
@ -203,7 +203,7 @@ static bool parse_string(const char *p, char *output, char **end) {
|
||||||
// end of borrowed code
|
// end of borrowed code
|
||||||
char *bigbuffer; // global variable
|
char *bigbuffer; // global variable
|
||||||
|
|
||||||
inline void foundBadString(const uint8_t *buf) {
|
void foundBadString(const uint8_t *buf) {
|
||||||
bad_string++;
|
bad_string++;
|
||||||
char *end;
|
char *end;
|
||||||
if (parse_string((const char *)buf, bigbuffer, &end)) {
|
if (parse_string((const char *)buf, bigbuffer, &end)) {
|
||||||
|
@ -226,7 +226,7 @@ void print_cmp_hex(const char *s1, const char *s2, size_t len) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
inline void foundString(const uint8_t *buf, const uint8_t *parsed_begin,
|
void foundString(const uint8_t *buf, const uint8_t *parsed_begin,
|
||||||
const uint8_t *parsed_end) {
|
const uint8_t *parsed_end) {
|
||||||
size_t thislen = parsed_end - parsed_begin;
|
size_t thislen = parsed_end - parsed_begin;
|
||||||
total_string_length += thislen;
|
total_string_length += thislen;
|
||||||
|
|
Loading…
Reference in New Issue