AVX minifier coded (first version).
This commit is contained in:
parent
fcc0391b58
commit
9cc00ab584
|
@ -1,6 +1,6 @@
|
|||
HEADERS:=include/avxprocessing.h include/benchmark.h include/common_defs.h include/jsonstruct.h include/scalarprocessing.h include/util.h
|
||||
bench: benchmarks/bench.cpp rapidjson/license.txt $(HEADERS)
|
||||
$(CXX) -std=c++11 -O3 -o $@ benchmarks/bench.cpp -Irapidjson/include -Iinclude -march=native -lm -Wall -Wextra
|
||||
$(CXX) -std=c++11 -O3 -o $@ benchmarks/bench.cpp -Irapidjson/include -Iinclude -march=native -lm -Wall -Wextra -Wno-narrowing
|
||||
|
||||
rapidjson/license.txt:
|
||||
git submodule update --init --recursive
|
||||
|
|
|
@ -1,4 +1,6 @@
|
|||
#include "avxprocessing.h"
|
||||
|
||||
#include "avxminifier.h"
|
||||
#include "benchmark.h"
|
||||
#include "jsonstruct.h"
|
||||
// #define RAPIDJSON_SSE2 // bad
|
||||
|
@ -79,9 +81,11 @@ int main(int argc, char *argv[]) {
|
|||
true);
|
||||
|
||||
rapidjson::Document d;
|
||||
|
||||
char * buffer = (char *) malloc(p.second);
|
||||
memcpy(buffer, p.first, p.second);
|
||||
buffer[p.second] = '\0';
|
||||
|
||||
BEST_TIME(d.Parse<kParseValidateEncodingFlag>((const char *)buffer).HasParseError(), false,
|
||||
memcpy(buffer, p.first, p.second), repeat, volume, true);
|
||||
BEST_TIME(d.Parse((const char *)buffer).HasParseError(), false,
|
||||
|
@ -92,5 +96,15 @@ int main(int argc, char *argv[]) {
|
|||
std::cout << "input length is "<< p.second << " stringified length is " << strlength << std::endl;
|
||||
BEST_TIME_NOCHECK(rapidstringme((char*) p.first), , repeat, volume,
|
||||
true);
|
||||
memcpy(buffer, p.first, p.second);
|
||||
size_t outlength = copy_without_useless_spaces((const uint8_t *)buffer, p.second,(uint8_t *) buffer);
|
||||
printf("these should match: %zu %zu \n", strlength, outlength);
|
||||
|
||||
|
||||
uint8_t * cbuffer = (uint8_t *)buffer;
|
||||
BEST_TIME(copy_without_useless_spaces(cbuffer, p.second,cbuffer), outlength,
|
||||
memcpy(buffer, p.first, p.second), repeat, volume, true);
|
||||
buffer[outlength] = '\0';
|
||||
|
||||
free(buffer);
|
||||
}
|
||||
|
|
|
@ -0,0 +1,197 @@
|
|||
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include <x86intrin.h>

#include "simdprune_tables.h"
|
||||
|
||||
// Compares each byte of a 64-byte input, supplied as two 32-byte halves,
// against the corresponding byte of `mask`.
//
// Returns a 64-bit map: bit i is set when input byte i equals the mask byte.
// input_lo supplies bits 0-31 and input_hi supplies bits 32-63.
static uint64_t cmp_mask_against_input_mini(__m256i input_lo, __m256i input_hi,
                                            __m256i mask) {
  __m256i cmp_res_0 = _mm256_cmpeq_epi8(input_lo, mask);
  // movemask yields a signed int; going through uint32_t keeps a set bit 31
  // from being sign-extended into the upper half of the 64-bit value.
  uint64_t res_0 = (uint32_t)_mm256_movemask_epi8(cmp_res_0);
  // The result of an integer compare is an integer vector (__m256i), not the
  // float vector type __m256.
  __m256i cmp_res_1 = _mm256_cmpeq_epi8(input_hi, mask);
  uint64_t res_1 = (uint32_t)_mm256_movemask_epi8(cmp_res_1);
  return res_0 | (res_1 << 32);
}
|
||||
|
||||
// take input from buf and remove useless whitespace, input and output can be
|
||||
// the same
|
||||
static inline size_t copy_without_useless_spaces(const uint8_t *buf, size_t len,
|
||||
uint8_t *out) {
|
||||
// Useful constant masks
|
||||
const uint64_t even_bits = 0x5555555555555555ULL;
|
||||
const uint64_t odd_bits = ~even_bits;
|
||||
uint8_t *initout(out);
|
||||
uint64_t prev_iter_ends_odd_backslash =
|
||||
0ULL; // either 0 or 1, but a 64-bit value
|
||||
uint64_t prev_iter_inside_quote = 0ULL; // either all zeros or all ones
|
||||
size_t idx = 0;
|
||||
if (len >= 64) {
|
||||
size_t avxlen = len - 63;
|
||||
|
||||
for (; idx < avxlen; idx += 64) {
|
||||
__m256i input_lo = _mm256_load_si256((const __m256i *)(buf + idx + 0));
|
||||
__m256i input_hi = _mm256_load_si256((const __m256i *)(buf + idx + 32));
|
||||
uint64_t bs_bits = cmp_mask_against_input_mini(input_lo, input_hi,
|
||||
_mm256_set1_epi8('\\'));
|
||||
uint64_t start_edges = bs_bits & ~(bs_bits << 1);
|
||||
uint64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash;
|
||||
uint64_t even_starts = start_edges & even_start_mask;
|
||||
uint64_t odd_starts = start_edges & ~even_start_mask;
|
||||
uint64_t even_carries = bs_bits + even_starts;
|
||||
uint64_t odd_carries;
|
||||
bool iter_ends_odd_backslash =
|
||||
__builtin_uaddll_overflow(bs_bits, odd_starts, &odd_carries);
|
||||
odd_carries |= prev_iter_ends_odd_backslash;
|
||||
prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL;
|
||||
uint64_t even_carry_ends = even_carries & ~bs_bits;
|
||||
uint64_t odd_carry_ends = odd_carries & ~bs_bits;
|
||||
uint64_t even_start_odd_end = even_carry_ends & odd_bits;
|
||||
uint64_t odd_start_even_end = odd_carry_ends & even_bits;
|
||||
uint64_t odd_ends = even_start_odd_end | odd_start_even_end;
|
||||
uint64_t quote_bits = cmp_mask_against_input_mini(input_lo, input_hi,
|
||||
_mm256_set1_epi8('"'));
|
||||
quote_bits = quote_bits & ~odd_ends;
|
||||
uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128(
|
||||
_mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFF), 0));
|
||||
quote_mask ^= prev_iter_inside_quote;
|
||||
prev_iter_inside_quote = (uint64_t)((s64)quote_mask >> 63);
|
||||
const __m256 low_nibble_mask = _mm256_setr_epi8(
|
||||
// 0 9 a b c d
|
||||
16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0, 16, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 8, 12, 1, 2, 9, 0, 0);
|
||||
const __m256 high_nibble_mask = _mm256_setr_epi8(
|
||||
// 0 2 3 5 7
|
||||
8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0, 8, 0, 18, 4, 0, 1, 0,
|
||||
1, 0, 0, 0, 3, 2, 1, 0, 0);
|
||||
__m256 whitespace_shufti_mask = _mm256_set1_epi8(0x18);
|
||||
__m256 v_lo = _mm256_and_si256(
|
||||
_mm256_shuffle_epi8(low_nibble_mask, input_lo),
|
||||
_mm256_shuffle_epi8(high_nibble_mask,
|
||||
_mm256_and_si256(_mm256_srli_epi32(input_lo, 4),
|
||||
_mm256_set1_epi8(0x7f))));
|
||||
|
||||
__m256 v_hi = _mm256_and_si256(
|
||||
_mm256_shuffle_epi8(low_nibble_mask, input_hi),
|
||||
_mm256_shuffle_epi8(high_nibble_mask,
|
||||
_mm256_and_si256(_mm256_srli_epi32(input_hi, 4),
|
||||
_mm256_set1_epi8(0x7f))));
|
||||
__m256 tmp_ws_lo = _mm256_cmpeq_epi8(
|
||||
_mm256_and_si256(v_lo, whitespace_shufti_mask), _mm256_set1_epi8(0));
|
||||
__m256 tmp_ws_hi = _mm256_cmpeq_epi8(
|
||||
_mm256_and_si256(v_hi, whitespace_shufti_mask), _mm256_set1_epi8(0));
|
||||
|
||||
uint64_t ws_res_0 = (uint32_t)_mm256_movemask_epi8(tmp_ws_lo);
|
||||
uint64_t ws_res_1 = _mm256_movemask_epi8(tmp_ws_hi);
|
||||
uint64_t whitespace = ~(ws_res_0 | (ws_res_1 << 32));
|
||||
whitespace &= ~quote_mask;
|
||||
int mask1 = whitespace & 0xFFFF;
|
||||
int mask2 = (whitespace >> 16) & 0xFFFF;
|
||||
int mask3 = (whitespace >> 32) & 0xFFFF;
|
||||
int mask4 = (whitespace >> 48) & 0xFFFF;
|
||||
// dumpbits(whitespace,"whitespace");
|
||||
int pop1 = _popcnt64((~whitespace) & 0xFFFF);
|
||||
int pop2 = _popcnt64((~whitespace) & UINT64_C(0xFFFFFFFF));
|
||||
int pop3 = _popcnt64((~whitespace) & UINT64_C(0xFFFFFFFFFFFF));
|
||||
int pop4 = _popcnt64((~whitespace));
|
||||
__m256i vmask1 =
|
||||
_mm256_loadu2_m128i((const __m128i *)mask128_epi8 + mask2,
|
||||
(const __m128i *)mask128_epi8 + mask1);
|
||||
__m256i vmask2 =
|
||||
_mm256_loadu2_m128i((const __m128i *)mask128_epi8 + mask4,
|
||||
(const __m128i *)mask128_epi8 + mask3);
|
||||
__m256i result1 = _mm256_shuffle_epi8(input_lo, vmask1);
|
||||
__m256i result2 = _mm256_shuffle_epi8(input_hi, vmask2);
|
||||
_mm256_storeu2_m128i((__m128i *)(out + pop1), (__m128i *)out, result1);
|
||||
_mm256_storeu2_m128i((__m128i *)(out + pop3), (__m128i *)(out + pop2),
|
||||
result2);
|
||||
out += pop4;
|
||||
}
|
||||
}
|
||||
// we finish off the job... copying and pasting the code is not ideal here,
|
||||
// but it gets the job done.
|
||||
if (idx < len) {
|
||||
uint8_t buffer[64];
|
||||
memset(buffer, 0, 64);
|
||||
memcpy(buffer, buf + idx, 64);
|
||||
__m256i input_lo = _mm256_load_si256((const __m256i *)(buffer));
|
||||
__m256i input_hi = _mm256_load_si256((const __m256i *)(buffer + 32));
|
||||
uint64_t bs_bits =
|
||||
cmp_mask_against_input_mini(input_lo, input_hi, _mm256_set1_epi8('\\'));
|
||||
uint64_t start_edges = bs_bits & ~(bs_bits << 1);
|
||||
uint64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash;
|
||||
uint64_t even_starts = start_edges & even_start_mask;
|
||||
uint64_t odd_starts = start_edges & ~even_start_mask;
|
||||
uint64_t even_carries = bs_bits + even_starts;
|
||||
uint64_t odd_carries;
|
||||
bool iter_ends_odd_backslash =
|
||||
__builtin_uaddll_overflow(bs_bits, odd_starts, &odd_carries);
|
||||
odd_carries |= prev_iter_ends_odd_backslash;
|
||||
prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL;
|
||||
uint64_t even_carry_ends = even_carries & ~bs_bits;
|
||||
uint64_t odd_carry_ends = odd_carries & ~bs_bits;
|
||||
uint64_t even_start_odd_end = even_carry_ends & odd_bits;
|
||||
uint64_t odd_start_even_end = odd_carry_ends & even_bits;
|
||||
uint64_t odd_ends = even_start_odd_end | odd_start_even_end;
|
||||
uint64_t quote_bits =
|
||||
cmp_mask_against_input_mini(input_lo, input_hi, _mm256_set1_epi8('"'));
|
||||
quote_bits = quote_bits & ~odd_ends;
|
||||
uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128(
|
||||
_mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFF), 0));
|
||||
quote_mask ^= prev_iter_inside_quote;
|
||||
prev_iter_inside_quote = (uint64_t)((s64)quote_mask >> 63);
|
||||
const __m256 low_nibble_mask = _mm256_setr_epi8(
|
||||
// 0 9 a b c d
|
||||
16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0, 16, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 8, 12, 1, 2, 9, 0, 0);
|
||||
const __m256 high_nibble_mask = _mm256_setr_epi8(
|
||||
// 0 2 3 5 7
|
||||
8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0, 8, 0, 18, 4, 0, 1, 0,
|
||||
1, 0, 0, 0, 3, 2, 1, 0, 0);
|
||||
__m256 whitespace_shufti_mask = _mm256_set1_epi8(0x18);
|
||||
__m256 v_lo = _mm256_and_si256(
|
||||
_mm256_shuffle_epi8(low_nibble_mask, input_lo),
|
||||
_mm256_shuffle_epi8(high_nibble_mask,
|
||||
_mm256_and_si256(_mm256_srli_epi32(input_lo, 4),
|
||||
_mm256_set1_epi8(0x7f))));
|
||||
|
||||
__m256 v_hi = _mm256_and_si256(
|
||||
_mm256_shuffle_epi8(low_nibble_mask, input_hi),
|
||||
_mm256_shuffle_epi8(high_nibble_mask,
|
||||
_mm256_and_si256(_mm256_srli_epi32(input_hi, 4),
|
||||
_mm256_set1_epi8(0x7f))));
|
||||
__m256 tmp_ws_lo = _mm256_cmpeq_epi8(
|
||||
_mm256_and_si256(v_lo, whitespace_shufti_mask), _mm256_set1_epi8(0));
|
||||
__m256 tmp_ws_hi = _mm256_cmpeq_epi8(
|
||||
_mm256_and_si256(v_hi, whitespace_shufti_mask), _mm256_set1_epi8(0));
|
||||
|
||||
uint64_t ws_res_0 = (uint32_t)_mm256_movemask_epi8(tmp_ws_lo);
|
||||
uint64_t ws_res_1 = _mm256_movemask_epi8(tmp_ws_hi);
|
||||
uint64_t whitespace = ~(ws_res_0 | (ws_res_1 << 32));
|
||||
whitespace &= ~quote_mask;
|
||||
|
||||
//
|
||||
if (len - idx < 64) {
|
||||
whitespace |= UINT64_C(0xFFFFFFFFFFFFFFFF) << (len - idx);
|
||||
}
|
||||
int mask1 = whitespace & 0xFFFF;
|
||||
int mask2 = (whitespace >> 16) & 0xFFFF;
|
||||
int mask3 = (whitespace >> 32) & 0xFFFF;
|
||||
int mask4 = (whitespace >> 48) & 0xFFFF;
|
||||
// dumpbits(whitespace,"whitespace");
|
||||
int pop1 = _popcnt64((~whitespace) & 0xFFFF);
|
||||
int pop2 = _popcnt64((~whitespace) & UINT64_C(0xFFFFFFFF));
|
||||
int pop3 = _popcnt64((~whitespace) & UINT64_C(0xFFFFFFFFFFFF));
|
||||
int pop4 = _popcnt64((~whitespace));
|
||||
__m256i vmask1 = _mm256_loadu2_m128i((const __m128i *)mask128_epi8 + mask2,
|
||||
(const __m128i *)mask128_epi8 + mask1);
|
||||
__m256i vmask2 = _mm256_loadu2_m128i((const __m128i *)mask128_epi8 + mask4,
|
||||
(const __m128i *)mask128_epi8 + mask3);
|
||||
__m256i result1 = _mm256_shuffle_epi8(input_lo, vmask1);
|
||||
__m256i result2 = _mm256_shuffle_epi8(input_hi, vmask2);
|
||||
_mm256_storeu2_m128i((__m128i *)(out + pop1), (__m128i *)out, result1);
|
||||
_mm256_storeu2_m128i((__m128i *)(out + pop3), (__m128i *)(out + pop2),
|
||||
result2);
|
||||
out += pop4;
|
||||
}
|
||||
return out - initout;
|
||||
}
|
|
@ -15,10 +15,8 @@
|
|||
#include "jsonstruct.h"
|
||||
using namespace std;
|
||||
|
||||
|
||||
|
||||
// a straightforward comparison of a mask against input. 5 uops; would be cheaper in AVX512.
|
||||
static inline u64 cmp_mask_against_input(m256 input_lo, m256 input_hi, m256 mask) {
|
||||
static u64 cmp_mask_against_input(m256 input_lo, m256 input_hi, m256 mask) {
|
||||
m256 cmp_res_0 = _mm256_cmpeq_epi8(input_lo, mask);
|
||||
u64 res_0 = (u32)_mm256_movemask_epi8(cmp_res_0);
|
||||
m256 cmp_res_1 = _mm256_cmpeq_epi8(input_hi, mask);
|
||||
|
@ -26,7 +24,7 @@ static inline u64 cmp_mask_against_input(m256 input_lo, m256 input_hi, m256 mask
|
|||
return res_0 | (res_1 << 32);
|
||||
}
|
||||
|
||||
static bool find_structural_bits(const u8 * buf, size_t len, ParsedJson & pj) {
|
||||
static bool find_structural_bits(const u8 * buf, size_t len, ParsedJson & pj) {
|
||||
// Useful constant masks
|
||||
const u64 even_bits = 0x5555555555555555ULL;
|
||||
const u64 odd_bits = ~even_bits;
|
||||
|
@ -37,9 +35,10 @@ static bool find_structural_bits(const u8 * buf, size_t len, ParsedJson & pj) {
|
|||
// persistent state across loop
|
||||
u64 prev_iter_ends_odd_backslash = 0ULL; // either 0 or 1, but a 64-bit value
|
||||
u64 prev_iter_inside_quote = 0ULL; // either all zeros or all ones
|
||||
u64 prev_iter_pseudo_structural_carry = 0ULL;
|
||||
u64 prev_iter_ends_pseudo_pred = 0ULL;
|
||||
|
||||
for (size_t idx = 0; idx < len; idx+=64) {
|
||||
__builtin_prefetch(buf + idx + 128);
|
||||
m256 input_lo = _mm256_load_si256((const m256 *)(buf + idx + 0));
|
||||
m256 input_hi = _mm256_load_si256((const m256 *)(buf + idx + 32));
|
||||
|
||||
|
@ -66,10 +65,8 @@ static bool find_structural_bits(const u8 * buf, size_t len, ParsedJson & pj) {
|
|||
// if we had an odd-numbered run at the end of
|
||||
// the previous iteration
|
||||
prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL;
|
||||
|
||||
u64 even_carry_ends = even_carries & ~bs_bits;
|
||||
u64 odd_carry_ends = odd_carries & ~bs_bits;
|
||||
|
||||
u64 even_start_odd_end = even_carry_ends & odd_bits;
|
||||
u64 odd_start_even_end = odd_carry_ends & even_bits;
|
||||
|
||||
|
@ -136,40 +133,30 @@ static bool find_structural_bits(const u8 * buf, size_t len, ParsedJson & pj) {
|
|||
u64 ws_res_1 = _mm256_movemask_epi8(tmp_ws_hi);
|
||||
u64 whitespace = ~(ws_res_0 | (ws_res_1 << 32));
|
||||
|
||||
|
||||
// mask off anything inside quotes
|
||||
structurals &= ~quote_mask;
|
||||
|
||||
// whitespace inside our quotes also doesn't count; otherwise " foo" would generate a spurious
|
||||
// pseudo-structural-character at 'foo'
|
||||
whitespace &= ~quote_mask;
|
||||
|
||||
// add the real quote bits back into our bitmask as well, so we can
|
||||
// quickly traverse the strings we've spent all this trouble gathering
|
||||
structurals |= quote_bits;
|
||||
|
||||
// Now, establish "pseudo-structural characters". These are characters that follow a structural
|
||||
// character followed by zero or more whitespace
|
||||
// this allows us to discover true/false/null and numbers in any location where they might legally
|
||||
// occur; it will also create another 'checkpoint' where if a non-quoted region of our input
|
||||
// has whitespace after a structural character fullowed by a syntax error, we can detect this
|
||||
// and get an error in a later stage (i.e. the state machine)
|
||||
// Now, establish "pseudo-structural characters". These are non-whitespace characters
|
||||
// that are (a) outside quotes and (b) have a predecessor that's either whitespace or a structural
|
||||
// character. This means that subsequent passes will get a chance to encounter the first character
|
||||
// of every string of non-whitespace and, if we're parsing an atom like true/false/null or a number
|
||||
// we can stop at the first whitespace or structural character following it.
|
||||
|
||||
// Slightly more painful than it would seem. It's possible that either structurals or whitespace are
|
||||
// all 1s (e.g. {{{{{{{....{{{{x64, or a really long whitespace). As such there is no safe place
|
||||
// to add a '1' from the previous iteration without *that* triggering the carry we are looking
|
||||
// out for, so we must check both carries for overflow
|
||||
|
||||
u64 tmp = structurals | whitespace;
|
||||
u64 tmp2;
|
||||
bool ps_carry = __builtin_uaddll_overflow(tmp, structurals, &tmp2);
|
||||
u64 tmp3;
|
||||
ps_carry = ps_carry | __builtin_uaddll_overflow(tmp2, prev_iter_pseudo_structural_carry, &tmp3);
|
||||
prev_iter_pseudo_structural_carry = ps_carry ? 0x1ULL : 0x0ULL;
|
||||
tmp3 &= ~quote_mask;
|
||||
tmp3 &= ~whitespace;
|
||||
structurals |= tmp3;
|
||||
// a qualified predecessor is something that can happen 1 position before an
|
||||
// pseudo-structural character
|
||||
u64 pseudo_pred = structurals | whitespace;
|
||||
u64 shifted_pseudo_pred = (pseudo_pred << 1) | prev_iter_ends_pseudo_pred;
|
||||
prev_iter_ends_pseudo_pred = pseudo_pred >> 63;
|
||||
u64 pseudo_structurals = shifted_pseudo_pred & (~whitespace) & (~quote_mask);
|
||||
structurals |= pseudo_structurals;
|
||||
|
||||
// now, we've used our close quotes all we need to. So let's switch them off
|
||||
// they will be off in the quote mask and on in quote bits.
|
||||
structurals &= ~(quote_bits & ~quote_mask);
|
||||
*(u64 *)(pj.structurals + idx/8) = structurals;
|
||||
}
|
||||
return true;
|
||||
|
@ -191,7 +178,7 @@ static bool flatten_indexes(size_t len, ParsedJson & pj) {
|
|||
u32 cnt = __builtin_popcountll(s);
|
||||
u32 next_base = base + cnt;
|
||||
while (s) {
|
||||
// spoil the suspense
|
||||
// spoil the suspense by reducing dependency chains; actually a win even with cost of pdep
|
||||
u64 s3 = _pdep_u64(~0x7ULL, s); // s3 will have bottom 3 1-bits unset
|
||||
u64 s5 = _pdep_u64(~0x1fULL, s); // s5 will have bottom 5 1-bits unset
|
||||
|
||||
|
@ -202,63 +189,664 @@ static bool flatten_indexes(size_t len, ParsedJson & pj) {
|
|||
|
||||
base_ptr[base+4] = (u32)idx + __builtin_ctzll(s4); //u64 s5 = s4 & (s4 - 1ULL);
|
||||
base_ptr[base+5] = (u32)idx + __builtin_ctzll(s5); u64 s6 = s5 & (s5 - 1ULL);
|
||||
base_ptr[base+6] = (u32)idx + __builtin_ctzll(s6); u64 s7 = s6 & (s6 - 1ULL);
|
||||
s = s7;
|
||||
base += 7;
|
||||
s = s6;
|
||||
base += 6;
|
||||
}
|
||||
base = next_base;
|
||||
}
|
||||
pj.n_structural_indexes = base;
|
||||
base_ptr[pj.n_structural_indexes] = 0; // make it safe to dereference one beyond this array
|
||||
return true;
|
||||
}
|
||||
|
||||
// Parse our json given a big array of 32-bit integers telling us where
|
||||
// the interesting stuff is
|
||||
static bool json_parse(const u8 * buf, UNUSED size_t len, ParsedJson & pj) {
|
||||
u32 last; // index of previous structure at this level or 0 if none
|
||||
u32 up; // index of structure that contains this one
|
||||
|
||||
JsonNode * nodes = pj.nodes;
|
||||
const u32 MAX_DEPTH = 256;
|
||||
|
||||
JsonNode & dummy = nodes[DUMMY_NODE];
|
||||
JsonNode & root = nodes[ROOT_NODE];
|
||||
dummy.prev = dummy.up = DUMMY_NODE;
|
||||
root.prev = DUMMY_NODE;
|
||||
root.up = ROOT_NODE;
|
||||
last = up = ROOT_NODE;
|
||||
// the ape machine consists of two parts:
|
||||
//
|
||||
// 1) The "state machine", which is a multiple channel per-level state machine
|
||||
// It is a conventional DFA except in that it 'changes track' on {}[] characters
|
||||
//
|
||||
// 2) The "tape machine": this records offsets of various structures as they go by
|
||||
// These structures are either u32 offsets of other tapes or u32 offsets into our input
|
||||
// or structures.
|
||||
//
|
||||
// The state machine doesn't record output.
|
||||
// The tape machine doesn't validate.
|
||||
//
|
||||
// The output of the tape machine is meaningful only if the state machine is in non-error states.
|
||||
|
||||
// depth adjustment is strictly based on whether we are {[ or }]
|
||||
|
||||
// depth adjustment is a pre-increment which, in effect, means that a {[ contained in an object
|
||||
// is in the level one deeper, while the corresponding }] is at the level
|
||||
|
||||
|
||||
// TAPE MACHINE DEFINITIONS
|
||||
|
||||
const u32 DEPTH_PLUS_ONE = 0x01000000;
|
||||
const u32 DEPTH_ZERO = 0x00000000;
|
||||
const u32 DEPTH_MINUS_ONE = 0xff000000;
|
||||
const u32 WRITE_ZERO = 0x0;
|
||||
const u32 WRITE_FOUR = 0x1;
|
||||
|
||||
const u32 CDF = DEPTH_ZERO | WRITE_ZERO; // default 'control'
|
||||
const u32 C04 = DEPTH_ZERO | WRITE_FOUR;
|
||||
const u32 CP4 = DEPTH_PLUS_ONE | WRITE_FOUR;
|
||||
const u32 CM4 = DEPTH_MINUS_ONE | WRITE_FOUR;
|
||||
|
||||
// Returns the top byte of a control word as a signed 8-bit depth delta.
inline s8 get_depth_adjust(u32 control) {
    const s32 as_signed = (s32)control;
    return (s8)(as_signed >> 24);
}
|
||||
// Returns the low byte of a control word, widened to size_t.
inline size_t get_write_size(u32 control) {
    const u32 low_byte = control & 0xff;
    return low_byte;
}
|
||||
|
||||
// Control word for every possible input byte, one row per high nibble.
// Bytes not called out below map to CDF.
const u32 char_control[256] = {
    /* 0x00 */ CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF,
    /* 0x10 */ CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF,
    /* 0x20: " is 0x22, - is 0x2d */
               CDF,CDF,C04,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,C04,CDF,CDF,
    /* 0x30: digits are 0x30-0x39 */
               C04,C04,C04,C04, C04,C04,C04,C04, C04,C04,CDF,CDF, CDF,CDF,CDF,CDF,
    /* 0x40 */ CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF,
    /* 0x50: [ is 0x5b, ] is 0x5d */
               CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CP4, CDF,CM4,CDF,CDF,
    /* 0x60: f is 0x66, n is 0x6e */
               CDF,CDF,CDF,CDF, CDF,CDF,C04,CDF, CDF,CDF,CDF,CDF, CDF,CDF,C04,CDF,
    /* 0x70: t is 0x74, { is 0x7b, } is 0x7d */
               CDF,CDF,CDF,CDF, C04,CDF,CDF,CDF, CDF,CDF,CDF,CP4, CDF,CM4,CDF,CDF,
    /* 0x80 */ CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF,
    /* 0x90 */ CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF,
    /* 0xa0 */ CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF,
    /* 0xb0 */ CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF,
    /* 0xc0 */ CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF,
    /* 0xd0 */ CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF,
    /* 0xe0 */ CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF,
    /* 0xf0 */ CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF
};
|
||||
|
||||
const size_t MAX_TAPE_ENTRIES = 127*1024;
|
||||
const size_t MAX_TAPE = MAX_DEPTH * MAX_TAPE_ENTRIES;
|
||||
|
||||
// all of this stuff needs to get moved somewhere reasonable
|
||||
// like our ParsedJson structure
|
||||
u32 tape[MAX_TAPE];
|
||||
u32 tape_locs[MAX_DEPTH];
|
||||
u8 string_buf[512*1024];
|
||||
u8 * current_string_buf_loc;
|
||||
u8 number_buf[512*1024]; // holds either doubles or longs, really
|
||||
u8 * current_number_buf_loc;
|
||||
|
||||
// STATE MACHINE DECLARATIONS
|
||||
const u32 MAX_STATES = 16;
|
||||
u32 trans[MAX_STATES][256];
|
||||
u32 states[MAX_DEPTH];
|
||||
const int START_STATE = 1;
|
||||
|
||||
// Fills in the non-zero entries of the transition table `trans`.
// Every cell written here is written exactly once, so the order of the
// assignments does not matter.
never_inline void init_state_machine() {
    // states 10 and 6 eliminated

    // transitions that start with '{'
    trans[ 1]['{'] = 2;
    trans[ 2]['"'] = 4;
    trans[ 4][':'] = 5;
    trans[ 7][','] = 8;
    trans[ 8]['"'] = 4;

    // transitions that start with '['
    trans[ 1]['['] = 9;
    trans[11][','] = 12;

    // 5->7, 9->11 and 12->11 on each of the characters }]ftn0123456789-"
    for (const char * p = "}]ftn0123456789-\""; *p != '\0'; p++) {
        const u32 ch = (u32)*p;
        trans[ 5][ch] = 7;
        trans[ 9][ch] = 11;
        trans[12][ch] = 11;
    }

    // back transitions when new things are open
    const u32 reopen_from[4] = { 2, 7, 9, 11 };
    for (u32 k = 0; k < 4; k++) {
        trans[reopen_from[k]]['{'] = 2;
        trans[reopen_from[k]]['['] = 9;
    }
}
|
||||
|
||||
static bool ape_machine(const u8 * buf, UNUSED size_t len, ParsedJson & pj) {
|
||||
|
||||
// NOTE - our depth is used by both the tape machine and the state machine
|
||||
// Further, in production we will set it to a largish value in a generous buffer as a rogue input
|
||||
// could consist of many {[ characters or many }] characters. We aren't busily checking errors
|
||||
// (and in fact, a aggressive sequence of [ characters is actually valid input!) so something that
|
||||
// blows out maximum depth will need to be periodically checked for, as will something that tries
|
||||
// to set depth very low. If we set our starting depth, say, to 256, we can tolerate 256 bogus close brace
|
||||
// characters without aggressively going wrong and writing to bad memory
|
||||
// Note that any specious depth can have a specious tape associated with and all these specious depths
|
||||
// can share a region of the tape - it's harmless. Since tape is one-way, any movement in a specious tape
|
||||
// is an error (so we can detect max_depth violations by making sure that specious tape locations haven't
|
||||
// moved from their starting values)
|
||||
|
||||
u32 depth = 1;
|
||||
|
||||
for (u32 i = 0; i < MAX_DEPTH; i++) {
|
||||
tape_locs[i] = i*MAX_TAPE_ENTRIES;
|
||||
states[i] = START_STATE;
|
||||
}
|
||||
|
||||
current_string_buf_loc = string_buf;
|
||||
current_number_buf_loc = number_buf;
|
||||
|
||||
u32 error_sump = 0;
|
||||
u32 old_tape_loc = tape_locs[depth]; // need to initialize for first write
|
||||
|
||||
u32 next_idx = pj.structural_indexes[0];
|
||||
u8 next_c = buf[next_idx];
|
||||
u32 next_control = char_control[next_c];
|
||||
|
||||
for (u32 i = NUM_RESERVED_NODES; i < pj.n_structural_indexes; i++) {
|
||||
u32 idx = pj.structural_indexes[i];
|
||||
JsonNode & n = nodes[i];
|
||||
u8 c = buf[idx];
|
||||
if (unlikely((c & 0xdf) == 0x5b)) { // meaning 7b or 5b, { or [
|
||||
// open a scope
|
||||
n.prev = last;
|
||||
n.up = up;
|
||||
up = i;
|
||||
last = 0;
|
||||
} else if (unlikely((c & 0xdf) == 0x5d)) { // meaning 7d or 5d, } or ]
|
||||
// close a scope
|
||||
n.prev = up;
|
||||
n.up = pj.nodes[up].up;
|
||||
up = pj.nodes[up].up;
|
||||
last = i;
|
||||
} else {
|
||||
n.prev = last;
|
||||
n.up = up;
|
||||
last = i;
|
||||
}
|
||||
n.next = 0;
|
||||
nodes[n.prev].next = i;
|
||||
u32 idx = next_idx;
|
||||
u8 c = next_c;
|
||||
u32 control = next_control;
|
||||
|
||||
next_idx = pj.structural_indexes[i+1];
|
||||
next_c = buf[next_idx];
|
||||
next_control = char_control[next_c];
|
||||
|
||||
// TAPE MACHINE
|
||||
s8 depth_adjust = get_depth_adjust(control);
|
||||
u8 write_size = get_write_size(control);
|
||||
u32 write_val = (depth_adjust != 0) ? old_tape_loc : idx;
|
||||
depth += depth_adjust;
|
||||
//states[depth] = trans[states[depth]][c];
|
||||
// TAPE MACHINE, again
|
||||
tape[tape_locs[depth]] = write_val | (c << 24); // hack. Assumes no more than 2^24 tape items and buffer size for now
|
||||
old_tape_loc = tape_locs[depth] += write_size;
|
||||
}
|
||||
|
||||
if (error_sump) {
|
||||
return false;
|
||||
}
|
||||
dummy.next = DUMMY_NODE; // dummy.next is a sump for meaningless 'nexts', clear it
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
// they are { 0x7b } 0x7d : 0x3a [ 0x5b ] 0x5d , 0x2c
|
||||
// these go into the first 3 buckets of the comparison (1/2/4)
|
||||
|
||||
// we are also interested in the four whitespace characters
|
||||
// space 0x20, linefeed 0x0a, horizontal tab 0x09 and carriage return 0x0d
|
||||
|
||||
const u32 structural_or_whitespace_negated[256] = {
|
||||
1,1,1,1, 1,1,1,1, 1,0,0,1, 1,0,1,1,
|
||||
1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
|
||||
0,1,1,1, 1,1,1,1, 1,1,1,1, 0,1,1,1,
|
||||
1,1,1,1, 1,1,1,1, 1,1,0,1, 1,1,1,1,
|
||||
|
||||
1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
|
||||
1,1,1,1, 1,1,1,1, 1,1,1,0, 1,0,1,1,
|
||||
1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
|
||||
1,1,1,1, 1,1,1,1, 1,1,1,0, 1,0,1,1,
|
||||
|
||||
1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
|
||||
1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
|
||||
1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
|
||||
1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
|
||||
|
||||
1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
|
||||
1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
|
||||
1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
|
||||
1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1
|
||||
};
|
||||
|
||||
// return non-zero if not a structural or whitespace char
|
||||
// zero otherwise
|
||||
really_inline u32 is_not_structural_or_whitespace(u8 c) {
|
||||
return structural_or_whitespace_negated[c];
|
||||
}
|
||||
|
||||
// These chars yield themselves: " \ /
|
||||
// b -> backspace, f -> formfeed, n -> newline, r -> cr, t -> horizontal tab
|
||||
// u not handled in this table as it's complex
|
||||
const u8 escape_map[256] = {
|
||||
0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, //0x0.
|
||||
0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
|
||||
0,0,0x22,0, 0,0,0,0, 0,0,0,0, 0,0,0,0x2f,
|
||||
0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
|
||||
|
||||
0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, //0x4.
|
||||
0,0,0,0, 0,0,0,0, 0,0,0,0, 0x5c,0,0,0, //0x5.
|
||||
0,0,0x08,0, 0,0,0x12,0, 0,0,0,0, 0,0,0x0a,0, //0x6.
|
||||
0,0,0x0d,0, 0x09,0,0,0, 0,0,0,0, 0,0,0,0, //0x7.
|
||||
|
||||
0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
|
||||
0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
|
||||
0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
|
||||
0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
|
||||
|
||||
0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
|
||||
0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
|
||||
0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
|
||||
0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
|
||||
};
|
||||
|
||||
|
||||
// UTF-8 byte counts, 33 entries; 0 marks an error entry.
// NOTE(review): the grouping below (7 / 11 / 16 / 21) follows the number of
// significant bits of a code point rather than its leading-zero count,
// despite the name - confirm against the code that indexes this table.
const u32 leading_zeros_to_utf_bytes[33] = {
    1, 1, 1, 1, 1, 1, 1, 1,             // entries  0-7:  up to 7 bits
    2, 2, 2, 2,                         // entries  8-11: up to 11 bits
    3, 3, 3, 3, 3,                      // entries 12-16: up to 16 bits
    4, 4, 4, 4, 4,                      // entries 17-21: up to 21 bits
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0     // entries 22-32: error
};
|
||||
|
||||
|
||||
// Bit-deposit masks, indexed by UTF-8 byte count (index 0 is the error slot).
const u32 UTF_PDEP_MASK[5] = {
    0x00000000, // 0: error
    0x0000007f, // 1 byte
    0x00001f3f, // 2 bytes
    0x000f3f3f, // 3 bytes
    0x073f3f3f  // 4 bytes
};
|
||||
|
||||
// Constant bits OR-ed into the deposited value, indexed by UTF-8 byte count
// (index 0 is the error slot).
const u32 UTF_OR_MASK[5] = {
    0x00000000, // 0: error
    0x00000000, // 1 byte
    0x0000c080, // 2 bytes
    0x00e08080, // 3 bytes
    0xf0808080  // 4 bytes
};
|
||||
|
||||
// True when v is one of 0-9, a-f or A-F.
bool is_hex_digit(u8 v) {
    const bool is_decimal = (v >= '0') && (v <= '9');
    // clearing bit 0x20 maps a-f onto A-F
    const u8 folded = v & 0xdf;
    const bool is_alpha = (folded >= 'A') && (folded <= 'F');
    return is_decimal || is_alpha;
}
|
||||
|
||||
// Converts a hex digit character to its value: '0'-'9' give 0-9, and any
// other input is folded with 0xdf and measured from 'A' (so a-f / A-F give
// 10-15). The input is not validated here.
u8 digit_to_val(u8 v) {
    const bool is_decimal = (v >= '0') && (v <= '9');
    return is_decimal ? (u8)(v - '0') : (u8)((v & 0xdf) - 'A' + 10);
}
|
||||
|
||||
bool hex_to_u32(const u8 * src, u32 * res) {
|
||||
u8 v1 = src[0];
|
||||
u8 v2 = src[1];
|
||||
u8 v3 = src[2];
|
||||
u8 v4 = src[3];
|
||||
if (!is_hex_digit(v1) || !is_hex_digit(v2) || !is_hex_digit(v3) || !is_hex_digit(v4)) {
|
||||
return false;
|
||||
}
|
||||
*res = digit_to_val(v1) << 24 | digit_to_val(v2) << 16 | digit_to_val(v3) << 8 | digit_to_val(v4);
|
||||
return true;
|
||||
}
|
||||
|
||||
// handle a unicode codepoint
|
||||
// write appropriate values into dest
|
||||
// src will always advance 6 bytes
|
||||
// dest will advance a variable amount (return via pointer)
|
||||
// return true if the unicode codepoint was valid
|
||||
// We work in little-endian then swap at write time
|
||||
static bool handle_unicode_codepoint(const u8 ** src_ptr, u8 ** dst_ptr) {
|
||||
u32 code_point = 0; // read the hex, potentially reading another \u beyond if it's a // wacky one
|
||||
if (!hex_to_u32(*src_ptr + 2, &code_point)) {
|
||||
return false;
|
||||
}
|
||||
*src_ptr += 6;
|
||||
// check for the weirdo double-UTF-16 nonsense for things outside Basic Multilingual Plane.
|
||||
if (code_point >= 0xd800 && code_point < 0xdc00) {
|
||||
// TODO: sanity check and clean up; snippeted from RapidJSON and poorly understood at the moment
|
||||
if (( (*src_ptr)[0] != '\\') || (*src_ptr)[1] != 'u') {
|
||||
return false;
|
||||
}
|
||||
u32 code_point_2 = 0;
|
||||
if (!hex_to_u32(*src_ptr + 2, &code_point_2)) {
|
||||
return false;
|
||||
}
|
||||
if (code_point_2 < 0xdc00 || code_point_2 > 0xdfff) {
|
||||
return false;
|
||||
}
|
||||
code_point = (((code_point - 0xd800) << 10) | (code_point_2 - 0xdc00)) + 0x10000;
|
||||
*src_ptr += 6;
|
||||
}
|
||||
// TODO: check to see whether the below code is nonsense (it's really only a sketch at this point)
|
||||
u32 lz = __builtin_clz(code_point);
|
||||
u32 utf_bytes = leading_zeros_to_utf_bytes[lz];
|
||||
u32 tmp = _pdep_u32(code_point, UTF_PDEP_MASK[utf_bytes]) | UTF_OR_MASK[utf_bytes];
|
||||
// swap and move to the other side of the register
|
||||
tmp = __builtin_bswap32(tmp);
|
||||
tmp >>= (4 - utf_bytes) * 8;
|
||||
**(u32 **)dst_ptr = tmp;
|
||||
*dst_ptr += utf_bytes;
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool parse_string(const u8 * buf, UNUSED size_t len, UNUSED ParsedJson & pj, u32 tape_loc) {
|
||||
u32 offset = tape[tape_loc] & 0xffffff;
|
||||
const u8 * src = &buf[offset+1]; // we know that buf at offset is a "
|
||||
u8 * dst = current_string_buf_loc;
|
||||
// basic non-sexy parsing code
|
||||
while (1) {
|
||||
m256 v = _mm256_loadu_si256((const m256 *)(src));
|
||||
u32 bs_bits = (u32)_mm256_movemask_epi8(_mm256_cmpeq_epi8(v, _mm256_set1_epi8('\\')));
|
||||
u32 quote_bits = (u32)_mm256_movemask_epi8(_mm256_cmpeq_epi8(v, _mm256_set1_epi8('"')));
|
||||
u32 quote_dist = __builtin_ctz(quote_bits);
|
||||
u32 bs_dist = __builtin_ctz(bs_bits);
|
||||
// store to dest unconditionally - we can overwrite the bits we don't like later
|
||||
_mm256_storeu_si256((m256 *)(dst), v);
|
||||
if (quote_dist < bs_dist) {
|
||||
// we encountered quotes first. Move dst to point to quotes and exit
|
||||
dst[quote_dist] = 0; // null terminate and get out
|
||||
current_string_buf_loc = dst + quote_dist + 1;
|
||||
tape[tape_loc] = ((u32)'"') << 24 | (current_string_buf_loc - string_buf); // assume 2^24 will hold all strings for now
|
||||
return true;
|
||||
} else if (quote_dist > bs_dist) {
|
||||
u8 escape_char = src[bs_dist+1];
|
||||
// we encountered backslash first. Handle backslash
|
||||
if (escape_char == 'u') {
|
||||
// move src/dst up to the start; they will be further adjusted
|
||||
// within the unicode codepoint handling code.
|
||||
src += bs_dist;
|
||||
dst += bs_dist;
|
||||
if (!handle_unicode_codepoint(&src, &dst)) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
} else {
|
||||
// simple 1:1 conversion. Will eat bs_dist+2 characters in input and
|
||||
// write bs_dist+1 characters to output
|
||||
// note this may reach beyond the part of the buffer we've actually seen.
|
||||
// I think this is ok
|
||||
u8 escape_result = escape_map[escape_char];
|
||||
if (!escape_result)
|
||||
return false; // bogus escape value is an error
|
||||
dst[bs_dist] = escape_result;
|
||||
src += bs_dist+2;
|
||||
dst += bs_dist+1;
|
||||
}
|
||||
} else {
|
||||
// they are the same. Since they can't co-occur, it means we encountered neither.
|
||||
src+=32;
|
||||
dst+=32;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
// later extensions -
|
||||
// if \\ we could detect whether it's a substantial run of \ or just eat 2 chars and write 1
|
||||
// handle anything short of \u or \\\ (as a prefix) with clever PSHUFB stuff and don't leave SIMD
|
||||
return true;
|
||||
}
|
||||
|
||||
// put a parsed version of number (either as a double or a signed long) into the number buffer,
|
||||
// put a 'tag' indicating which type and where it is back onto the tape at that location
|
||||
// return false if we can't parse the number which means either
|
||||
// (a) the number isn't valid, or (b) the number is followed by something that isn't whitespace, comma or a close }] character
|
||||
// which are the only things that should follow a number at this stage
|
||||
// bools to detect what we found in our initial character already here - we are already
|
||||
// switching on 0 vs 1-9 vs - so we may as well keep separate paths where that's useful
|
||||
|
||||
// TODO: see if we really need a separate number_buf or whether we should just
|
||||
// have a generic scratch - would need to align before using for this
|
||||
static bool parse_number(const u8 * buf, UNUSED size_t len, UNUSED ParsedJson & pj, u32 tape_loc, UNUSED bool found_zero, bool found_minus) {
|
||||
u32 offset = tape[tape_loc] & 0xffffff;
|
||||
if (found_minus) {
|
||||
offset++;
|
||||
}
|
||||
const u8 * src = &buf[offset];
|
||||
m256 v = _mm256_loadu_si256((const m256 *)(src));
|
||||
u64 error_sump = 0;
|
||||
|
||||
// categories to extract
|
||||
// Digits:
|
||||
// 0 (0x30) - bucket 0
|
||||
// 1-9 (never any distinction except if we didn't get the free kick at 0 due to the leading minus) (0x31-0x39) - bucket 1
|
||||
// . (0x2e) - bucket 2
|
||||
// E or e - no distinction (0x45/0x65) - bucket 3
|
||||
// + (0x2b) - bucket 4
|
||||
// - (0x2d) - bucket 4
|
||||
// Terminators
|
||||
// Whitespace: 0x20, 0x09, 0x0a, 0x0d - bucket 5+6
|
||||
// Comma and the closes: 0x2c is comma, } is 0x5d, ] is 0x7d - bucket 5+7
|
||||
|
||||
// Another shufti - also a bit hand-hacked. Need to make a better construction
|
||||
const m256 low_nibble_mask = _mm256_setr_epi8(
|
||||
// 0 1 2 3 4 5 6 7 8 9 a b c d e f
|
||||
33, 2, 2, 2, 2, 10, 2, 2, 2, 66, 64, 16, 32,208, 4, 0,
|
||||
33, 2, 2, 2, 2, 10, 2, 2, 2, 66, 64, 16, 32,208, 4, 0
|
||||
);
|
||||
const m256 high_nibble_mask = _mm256_setr_epi8(
|
||||
// 0 1 2 3 4 5 6 7 8 9 a b c d e f
|
||||
64, 0, 52, 3, 8,128, 8,128, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
64, 0, 52, 3, 8,128, 8,128, 0, 0, 0, 0, 0, 0, 0, 0
|
||||
);
|
||||
|
||||
m256 tmp = _mm256_and_si256(
|
||||
_mm256_shuffle_epi8(low_nibble_mask, v),
|
||||
_mm256_shuffle_epi8(high_nibble_mask,
|
||||
_mm256_and_si256(_mm256_srli_epi32(v, 4), _mm256_set1_epi8(0x7f))));
|
||||
|
||||
m256 enders_mask = _mm256_set1_epi8(0xe0);
|
||||
m256 tmp_enders = _mm256_cmpeq_epi8(_mm256_and_si256(tmp, enders_mask),
|
||||
_mm256_set1_epi8(0));
|
||||
u32 enders = ~(u32)_mm256_movemask_epi8(tmp_enders);
|
||||
|
||||
if (enders == 0) {
|
||||
// TODO: scream for help if enders == 0 which means we have
|
||||
// a heroically long number string or some garbage
|
||||
}
|
||||
// TODO: make a mask that indicates where our digits are
|
||||
u32 number_mask = ~enders & (enders-1);
|
||||
|
||||
m256 n_mask = _mm256_set1_epi8(0x1f);
|
||||
m256 tmp_n = _mm256_cmpeq_epi8(_mm256_and_si256(tmp, n_mask),
|
||||
_mm256_set1_epi8(0));
|
||||
u32 number_characters = ~(u32)_mm256_movemask_epi8(tmp_n);
|
||||
|
||||
// put something into our error sump if we have something
|
||||
// before our ending characters that isn't a valid character
|
||||
// for the inside of our JSON
|
||||
number_characters &= number_mask;
|
||||
error_sump |= number_characters ^ number_mask;
|
||||
|
||||
m256 d_mask = _mm256_set1_epi8(0x03);
|
||||
m256 tmp_d = _mm256_cmpeq_epi8(_mm256_and_si256(tmp, d_mask),
|
||||
_mm256_set1_epi8(0));
|
||||
u32 digit_characters = ~(u32)_mm256_movemask_epi8(tmp_d);
|
||||
digit_characters &= number_mask;
|
||||
|
||||
m256 p_mask = _mm256_set1_epi8(0x04);
|
||||
m256 tmp_p = _mm256_cmpeq_epi8(_mm256_and_si256(tmp, p_mask),
|
||||
_mm256_set1_epi8(0));
|
||||
u32 decimal_characters = ~(u32)_mm256_movemask_epi8(tmp_p);
|
||||
decimal_characters &= number_mask;
|
||||
|
||||
m256 e_mask = _mm256_set1_epi8(0x08);
|
||||
m256 tmp_e = _mm256_cmpeq_epi8(_mm256_and_si256(tmp, e_mask),
|
||||
_mm256_set1_epi8(0));
|
||||
u32 exponent_characters = ~(u32)_mm256_movemask_epi8(tmp_e);
|
||||
exponent_characters &= number_mask;
|
||||
|
||||
m256 s_mask = _mm256_set1_epi8(0x10);
|
||||
m256 tmp_s = _mm256_cmpeq_epi8(_mm256_and_si256(tmp, s_mask),
|
||||
_mm256_set1_epi8(0));
|
||||
u32 sign_characters = ~(u32)_mm256_movemask_epi8(tmp_s);
|
||||
sign_characters &= number_mask;
|
||||
|
||||
u32 digit_edges = ~(digit_characters << 1) & digit_characters;
|
||||
|
||||
// check that we have 1-3 'edges' only
|
||||
u32 t = digit_edges;
|
||||
t &= t-1; t &= t-1; t &= t-1;
|
||||
error_sump |= t;
|
||||
|
||||
// check that we start with a digit
|
||||
error_sump |= ~digit_characters & 0x1;
|
||||
|
||||
// having done some checks, get lazy and fall back
|
||||
// to strtoll or strtod
|
||||
// TODO: handle the easy cases ourselves; these are
|
||||
// expensive and we've done a lot of the prepwork.
|
||||
// return errors if strto* fail, otherwise fill in a code on the tape
|
||||
// 'd' for floating point and 'l' for long and put a pointer to the
|
||||
// spot in the buffer.
|
||||
if (__builtin_popcount(digit_edges) == 1) {
|
||||
// try a strtoll
|
||||
char * end;
|
||||
u64 result = strtoll((const char *)src, &end, 10);
|
||||
if ((errno != 0) || (end == (const char *)src)) {
|
||||
error_sump |= 1;
|
||||
}
|
||||
error_sump |= is_not_structural_or_whitespace(*end);
|
||||
if (found_minus) {
|
||||
result = -result;
|
||||
}
|
||||
*((u64 *)current_number_buf_loc) = result;
|
||||
tape[tape_loc] = ((u32)'l') << 24 | (current_number_buf_loc - number_buf); // assume 2^24 will hold all numbers for now
|
||||
current_number_buf_loc += 8;
|
||||
} else {
|
||||
// try a strtod
|
||||
char * end;
|
||||
double result = strtod((const char *)src, &end);
|
||||
if ((errno != 0) || (end == (const char *)src)) {
|
||||
error_sump |= 1;
|
||||
}
|
||||
error_sump |= is_not_structural_or_whitespace(*end);
|
||||
if (found_minus) {
|
||||
result = -result;
|
||||
}
|
||||
*((double *)current_number_buf_loc) = result;
|
||||
tape[tape_loc] = ((u32)'d') << 24 | (current_number_buf_loc - number_buf); // assume 2^24 will hold all numbers for now
|
||||
current_number_buf_loc += 8;
|
||||
}
|
||||
// TODO: check the MSB element is a digit
|
||||
|
||||
// TODO: a whole bunch of checks
|
||||
|
||||
// TODO: <=1 decimal point, eE mark, +- construct
|
||||
|
||||
// TODO: first and last character in mask region must be
|
||||
// digit
|
||||
|
||||
// TODO: if it exists,
|
||||
// Decimal point is after the first cluster of numbers only
|
||||
// and before the second cluster of numbers only. It must
|
||||
// be digit_or_zero . digit_or_zero strictly
|
||||
|
||||
// TODO: eE mark and +- construct are adjacent with eE first
|
||||
// eE mark preceeds final cluster of numbers only
|
||||
// and immediately follows second-last cluster of numbers only (not
|
||||
// necessarily second, as we may have 4e10).
|
||||
// it may suffice to insist that eE is preceeded immediately
|
||||
// by a digit of any kind and that it's followed locally by
|
||||
// a digit immediately or a +- construct then a digit.
|
||||
|
||||
// TODO: if we have both . and the eE mark then the . must
|
||||
// precede the eE mark
|
||||
|
||||
// TODO: if first character is a zero (we know in advance except for -0)
|
||||
// second char must be . or eE.
|
||||
|
||||
if (error_sump)
|
||||
return true;
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool shovel_machine(const u8 * buf, size_t len, ParsedJson & pj) {
|
||||
// fixup the mess made by the ape_machine
|
||||
// as such it does a bunch of miscellaneous things on the tapes
|
||||
u32 error_sump = 0;
|
||||
u64 tv = *(const u64 *)"true ";
|
||||
u64 nv = *(const u64 *)"null ";
|
||||
u64 fv = *(const u64 *)"false ";
|
||||
u64 mask4 = 0x00000000ffffffff;
|
||||
u64 mask5 = 0x000000ffffffffff;
|
||||
|
||||
// walk over each tape
|
||||
for (u32 i = 0; i < MAX_DEPTH; i++) {
|
||||
u32 start_loc = i*MAX_TAPE_ENTRIES;
|
||||
u32 end_loc = tape_locs[i];
|
||||
|
||||
for (u32 j = start_loc; j < end_loc; j++) {
|
||||
switch (tape[j]>>24) {
|
||||
case '{': case '[': {
|
||||
// pivot our tapes
|
||||
// point the enclosing structural char (}]) to the head marker ({[) and
|
||||
// put the end of the sequence on the tape at the head marker
|
||||
// we start with head marker pointing at the enclosing structural char
|
||||
// and the enclosing structural char pointing at the end. Just swap them.
|
||||
// also check the balanced-{} or [] property here
|
||||
u8 head_marker_c = tape[j] >> 24;
|
||||
u32 head_marker_loc = tape[j] & 0xffffff;
|
||||
u32 tape_enclosing = tape[head_marker_loc];
|
||||
u8 enclosing_c = tape_enclosing >> 24;
|
||||
tape[head_marker_loc] = tape[j];
|
||||
tape[j] = tape_enclosing;
|
||||
error_sump |= (enclosing_c - head_marker_c - 2); // [] and {} only differ by 2 chars
|
||||
break;
|
||||
}
|
||||
case '"': {
|
||||
error_sump |= !parse_string(buf, len, pj, j);
|
||||
break;
|
||||
}
|
||||
case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
|
||||
error_sump |= !parse_number(buf, len, pj, j, false, false);
|
||||
break;
|
||||
case '0':
|
||||
error_sump |= !parse_number(buf, len, pj, j, true, false);
|
||||
break;
|
||||
case '-':
|
||||
error_sump |= !parse_number(buf, len, pj, j, false, true);
|
||||
break;
|
||||
case 't': {
|
||||
u32 offset = tape[j] & 0xffffff;
|
||||
const u8 * loc = buf + offset;
|
||||
error_sump |= ((*(const u64 *)loc) & mask4) ^ tv;
|
||||
error_sump |= is_not_structural_or_whitespace(loc[4]);
|
||||
break;
|
||||
}
|
||||
case 'f': {
|
||||
u32 offset = tape[j] & 0xffffff;
|
||||
const u8 * loc = buf + offset;
|
||||
error_sump |= ((*(const u64 *)loc) & mask5) ^ fv;
|
||||
error_sump |= is_not_structural_or_whitespace(loc[5]);
|
||||
break;
|
||||
}
|
||||
case 'n': {
|
||||
u32 offset = tape[j] & 0xffffff;
|
||||
const u8 * loc = buf + offset;
|
||||
error_sump |= ((*(const u64 *)loc) & mask4) ^ nv;
|
||||
error_sump |= is_not_structural_or_whitespace(loc[4]);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (error_sump) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
static bool avx_json_parse(const u8 * buf, size_t len, ParsedJson & pj) {
|
||||
find_structural_bits(buf, len, pj);
|
||||
flatten_indexes(len, pj);
|
||||
json_parse(buf, len, pj);
|
||||
return true;
|
||||
return ape_machine(buf, len, pj) && shovel_machine(buf, len, pj);
|
||||
}
|
||||
|
|
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue