Cleaning.
This commit is contained in:
parent
bf4089b33b
commit
46ef59c679
Binary file not shown.
After Width: | Height: | Size: 49 KiB |
|
@ -713,39 +713,6 @@ private :
|
|||
|
||||
};
|
||||
|
||||
#ifdef DEBUG
|
||||
// Debug helper: prints the first 32 bytes of `d` to std::cout as decimal
// values right-aligned in a field of width 3, then a space, `msg` and a
// newline. Separators between bytes: '|' after every 8th byte, ':' after
// every other 4th byte, and a single space otherwise.
// NOTE(review): m256 is declared elsewhere; presumably a 32-byte SIMD vector
// type -- confirm. The static_assert makes the 32-byte read explicit.
inline void dump256(m256 d, const std::string &msg) {
  static_assert(sizeof(d) >= 32, "dump256 reads 32 bytes from its argument");
  // Byte-wise view of the argument; reading through uint8_t* is alias-safe.
  const uint8_t *bytes = reinterpret_cast<const uint8_t *>(&d);
  for (uint32_t i = 0; i < 32; i++) {
    std::cout << std::setw(3) << static_cast<int>(bytes[i]);
    if (!((i + 1) % 8))
      std::cout << "|";
    else if (!((i + 1) % 4))
      std::cout << ":";
    else
      std::cout << " ";
  }
  std::cout << " " << msg << "\n";
}
|
||||
|
||||
// dump bits low to high
// Debug helper: writes the 64 bits of `v` to std::cout, least-significant bit
// first ("1" for a set bit, "_" for a clear bit), followed by a space, `msg`
// and a newline.
inline void dumpbits(uint64_t v, const std::string &msg) {
  for (uint32_t i = 0; i < 64; i++) {
    std::cout << (((v >> i) & 0x1ULL) ? "1" : "_");
  }
  std::cout << " " << msg << "\n";
}
|
||||
|
||||
// Debug helper: 32-bit counterpart of dumpbits. Writes the 32 bits of `v` to
// std::cout, least-significant bit first ("1" for a set bit, "_" for a clear
// bit), followed by a space, `msg` and a newline.
inline void dumpbits32(uint32_t v, const std::string &msg) {
  for (uint32_t i = 0; i < 32; i++) {
    // 32-bit mask: keeps the test in 32-bit arithmetic instead of promoting
    // the shifted value to 64 bits.
    std::cout << (((v >> i) & 0x1U) ? "1" : "_");
  }
  std::cout << " " << msg << "\n";
}
|
||||
#else
|
||||
// Non-DEBUG stand-ins for the dump helpers. Each expands to a no-op
// expression rather than a bare ';', so that `if (c) dumpbits(a, b); else ...`
// still parses as a single statement. The arguments are not evaluated.
#define dump256(a, b) ((void)0)
#define dumpbits(a, b) ((void)0)
#define dumpbits32(a, b) ((void)0)
|
||||
#endif
|
||||
|
||||
// dump bits low to high
|
||||
inline void dumpbits_always(uint64_t v, const std::string &msg) {
|
||||
|
|
|
@ -69,29 +69,13 @@ really_inline bool parse_string(const uint8_t *buf, UNUSED size_t len,
|
|||
uint8_t *dst = pj.current_string_buf_loc;
|
||||
#ifdef JSON_TEST_STRINGS // for unit testing
|
||||
uint8_t *const start_of_string = dst;
|
||||
#endif
|
||||
#ifdef DEBUG
|
||||
cout << "Entering parse string with offset " << offset << "\n";
|
||||
#endif
|
||||
while (1) {
|
||||
#ifdef DEBUG
|
||||
for (uint32_t j = 0; j < 32; j++) {
|
||||
char c = *(src + j);
|
||||
if (isprint(c)) {
|
||||
cout << c;
|
||||
} else {
|
||||
cout << '_';
|
||||
}
|
||||
}
|
||||
cout << "| ... string handling input\n";
|
||||
#endif
|
||||
__m256i v = _mm256_loadu_si256((const __m256i *)(src));
|
||||
uint32_t bs_bits =
|
||||
(uint32_t)_mm256_movemask_epi8(_mm256_cmpeq_epi8(v, _mm256_set1_epi8('\\')));
|
||||
dumpbits32(bs_bits, "backslash bits 2");
|
||||
uint32_t quote_bits =
|
||||
(uint32_t)_mm256_movemask_epi8(_mm256_cmpeq_epi8(v, _mm256_set1_epi8('"')));
|
||||
dumpbits32(quote_bits, "quote_bits");
|
||||
#define CHECKUNESCAPED
|
||||
// All Unicode characters may be placed within the
|
||||
// quotation marks, except for the characters that MUST be escaped:
|
||||
|
@ -108,14 +92,7 @@ really_inline bool parse_string(const uint8_t *buf, UNUSED size_t len,
|
|||
// store to dest unconditionally - we can overwrite the bits we don't like
|
||||
// later
|
||||
_mm256_storeu_si256((__m256i *)(dst), v);
|
||||
#ifdef DEBUG
|
||||
cout << "quote dist: " << quote_dist << " bs dist: " << bs_dist << "\n";
|
||||
#endif
|
||||
|
||||
if (quote_dist < bs_dist) {
|
||||
#ifdef DEBUG
|
||||
cout << "Found end, leaving!\n";
|
||||
#endif
|
||||
// we encountered quotes first. Move dst to point to quotes and exit
|
||||
dst[quote_dist] = 0; // null terminate and get out
|
||||
|
||||
|
@ -139,9 +116,6 @@ really_inline bool parse_string(const uint8_t *buf, UNUSED size_t len,
|
|||
#endif //CHECKUNESCAPED
|
||||
} else if (quote_dist > bs_dist) {
|
||||
uint8_t escape_char = src[bs_dist + 1];
|
||||
#ifdef DEBUG
|
||||
cout << "Found escape char: " << escape_char << "\n";
|
||||
#endif
|
||||
#ifdef CHECKUNESCAPED
|
||||
// we are going to need the unescaped_bits to check for unescaped chars
|
||||
uint32_t unescaped_bits = (uint32_t)_mm256_movemask_epi8(unescaped_vec);
|
||||
|
|
|
@ -67,18 +67,6 @@ WARN_UNUSED
|
|||
size_t idx = 0;
|
||||
for (; idx < lenminus64; idx += 64) {
|
||||
__builtin_prefetch(buf + idx + 128);
|
||||
#ifdef DEBUG
|
||||
cout << "Idx is " << idx << "\n";
|
||||
for (uint32_t j = 0; j < 64; j++) {
|
||||
char c = *(buf + idx + j);
|
||||
if (isprint(c)) {
|
||||
cout << c;
|
||||
} else {
|
||||
cout << '_';
|
||||
}
|
||||
}
|
||||
cout << "| ... input\n";
|
||||
#endif
|
||||
__m256i input_lo = _mm256_loadu_si256((const __m256i *)(buf + idx + 0));
|
||||
__m256i input_hi = _mm256_loadu_si256((const __m256i *)(buf + idx + 32));
|
||||
#ifdef SIMDJSON_UTF8VALIDATE
|
||||
|
@ -103,19 +91,12 @@ WARN_UNUSED
|
|||
|
||||
uint64_t bs_bits =
|
||||
cmp_mask_against_input(input_lo, input_hi, _mm256_set1_epi8('\\'));
|
||||
dumpbits(bs_bits, "backslash bits");
|
||||
uint64_t start_edges = bs_bits & ~(bs_bits << 1);
|
||||
dumpbits(start_edges, "start_edges");
|
||||
|
||||
// flip lowest if we have an odd-length run at the end of the prior
|
||||
// iteration
|
||||
uint64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash;
|
||||
uint64_t even_starts = start_edges & even_start_mask;
|
||||
uint64_t odd_starts = start_edges & ~even_start_mask;
|
||||
|
||||
dumpbits(even_starts, "even_starts");
|
||||
dumpbits(odd_starts, "odd_starts");
|
||||
|
||||
uint64_t even_carries = bs_bits + even_starts;
|
||||
|
||||
uint64_t odd_carries;
|
||||
|
@ -130,22 +111,11 @@ WARN_UNUSED
|
|||
// if we had an odd-numbered run at the
|
||||
// end of the previous iteration
|
||||
prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL;
|
||||
|
||||
dumpbits(even_carries, "even_carries");
|
||||
dumpbits(odd_carries, "odd_carries");
|
||||
|
||||
uint64_t even_carry_ends = even_carries & ~bs_bits;
|
||||
uint64_t odd_carry_ends = odd_carries & ~bs_bits;
|
||||
dumpbits(even_carry_ends, "even_carry_ends");
|
||||
dumpbits(odd_carry_ends, "odd_carry_ends");
|
||||
|
||||
uint64_t even_start_odd_end = even_carry_ends & odd_bits;
|
||||
uint64_t odd_start_even_end = odd_carry_ends & even_bits;
|
||||
dumpbits(even_start_odd_end, "esoe");
|
||||
dumpbits(odd_start_even_end, "osee");
|
||||
|
||||
uint64_t odd_ends = even_start_odd_end | odd_start_even_end;
|
||||
dumpbits(odd_ends, "odd_ends");
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Step 2: detect insides of quote pairs
|
||||
|
@ -154,12 +124,10 @@ WARN_UNUSED
|
|||
uint64_t quote_bits =
|
||||
cmp_mask_against_input(input_lo, input_hi, _mm256_set1_epi8('"'));
|
||||
quote_bits = quote_bits & ~odd_ends;
|
||||
dumpbits(quote_bits, "quote_bits");
|
||||
uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128(
|
||||
_mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFF), 0));
|
||||
quote_mask ^= prev_iter_inside_quote;
|
||||
prev_iter_inside_quote = (uint64_t)((int64_t)quote_mask >> 63); // right shift of a signed value expected to be well-defined and standard compliant as of C++20, John Regher from Utah U. says this is fine code
|
||||
dumpbits(quote_mask, "quote_mask");
|
||||
|
||||
// How do we build up a user traversable data structure
|
||||
// first, do a 'shufti' to detect structural JSON characters
|
||||
|
@ -211,10 +179,6 @@ WARN_UNUSED
|
|||
uint64_t ws_res_0 = (uint32_t)_mm256_movemask_epi8(tmp_ws_lo);
|
||||
uint64_t ws_res_1 = _mm256_movemask_epi8(tmp_ws_hi);
|
||||
uint64_t whitespace = ~(ws_res_0 | (ws_res_1 << 32));
|
||||
|
||||
dumpbits(structurals, "structurals");
|
||||
dumpbits(whitespace, "whitespace");
|
||||
|
||||
// mask off anything inside quotes
|
||||
structurals &= ~quote_mask;
|
||||
|
||||
|
@ -233,23 +197,15 @@ WARN_UNUSED
|
|||
// a qualified predecessor is something that can happen 1 position before a
|
||||
// pseudo-structural character
|
||||
uint64_t pseudo_pred = structurals | whitespace;
|
||||
dumpbits(pseudo_pred, "pseudo_pred");
|
||||
uint64_t shifted_pseudo_pred = (pseudo_pred << 1) | prev_iter_ends_pseudo_pred;
|
||||
dumpbits(shifted_pseudo_pred, "shifted_pseudo_pred");
|
||||
prev_iter_ends_pseudo_pred = pseudo_pred >> 63;
|
||||
uint64_t pseudo_structurals =
|
||||
shifted_pseudo_pred & (~whitespace) & (~quote_mask);
|
||||
dumpbits(pseudo_structurals, "pseudo_structurals");
|
||||
dumpbits(structurals, "final structurals without pseudos");
|
||||
structurals |= pseudo_structurals;
|
||||
dumpbits(structurals, "final structurals and pseudo structurals");
|
||||
|
||||
// now, we've used our close quotes all we need to. So let's switch them off
|
||||
// they will be off in the quote mask and on in quote bits.
|
||||
structurals &= ~(quote_bits & ~quote_mask);
|
||||
dumpbits(
|
||||
structurals,
|
||||
"final structurals and pseudo structurals after close quote removal");
|
||||
*(uint64_t *)(pj.structurals + idx / 8) = structurals;
|
||||
}
|
||||
|
||||
|
|
|
@ -15,20 +15,8 @@
|
|||
#include "simdjson/stringparsing.h"
|
||||
|
||||
#include <iostream>
|
||||
//#define DEBUG
|
||||
#define PATH_SEP '/'
|
||||
|
||||
// DEBUG_PRINTF(format, ...): when DEBUG is defined, a printf-style trace
// prefixed with "<file name>:<function>:<line>:"; otherwise an empty
// statement whose arguments are not evaluated. A DEBUG_PRINTF defined before
// this point is left untouched.
#if defined(DEBUG) && !defined(DEBUG_PRINTF)
#include <stdio.h>
#include <string.h>
// strrchr returns NULL when __FILE__ contains no PATH_SEP; fall back to the
// whole __FILE__ in that case instead of computing NULL + 1.
#define DEBUG_PRINTF(format, ...)                                              \
  printf("%s:%s:%d:" format,                                                   \
         (strrchr(__FILE__, PATH_SEP) ? strrchr(__FILE__, PATH_SEP) + 1        \
                                      : __FILE__),                             \
         __func__, __LINE__, ##__VA_ARGS__)
#elif !defined(DEBUG_PRINTF)
#define DEBUG_PRINTF(format, ...)                                              \
  do {                                                                         \
  } while (0)
#endif
|
||||
|
||||
using namespace std;
|
||||
|
||||
|
@ -93,13 +81,10 @@ bool unified_machine(const uint8_t *buf, size_t len, ParsedJson &pj) {
|
|||
{ \
|
||||
idx = pj.structural_indexes[i++]; \
|
||||
c = buf[idx]; \
|
||||
DEBUG_PRINTF("Got %c at %d (%d offset) (depth %d)\n", c, idx, i - 1, \
|
||||
depth); \
|
||||
}
|
||||
|
||||
|
||||
////////////////////////////// START STATE /////////////////////////////
|
||||
DEBUG_PRINTF("at start\n");
|
||||
pj.ret_address[depth] = &&start_continue;
|
||||
pj.containing_scope_offset[depth] = pj.get_current_loc();
|
||||
pj.write_tape(0, 'r'); // r for root, 0 is going to get overwritten
|
||||
|
@ -235,7 +220,6 @@ bool unified_machine(const uint8_t *buf, size_t len, ParsedJson &pj) {
|
|||
goto fail;
|
||||
}
|
||||
start_continue:
|
||||
DEBUG_PRINTF("in start_object_close\n");
|
||||
// the string might not be NULL terminated.
|
||||
if(i + 1 == pj.n_structural_indexes) {
|
||||
goto succeed;
|
||||
|
@ -245,7 +229,6 @@ start_continue:
|
|||
////////////////////////////// OBJECT STATES /////////////////////////////
|
||||
|
||||
object_begin:
|
||||
DEBUG_PRINTF("in object_begin\n");
|
||||
UPDATE_CHAR();
|
||||
switch (c) {
|
||||
case '"': {
|
||||
|
@ -261,7 +244,6 @@ object_begin:
|
|||
}
|
||||
|
||||
object_key_state:
|
||||
DEBUG_PRINTF("in object_key_state\n");
|
||||
UPDATE_CHAR();
|
||||
if (c != ':') {
|
||||
goto fail;
|
||||
|
@ -343,7 +325,6 @@ object_key_state:
|
|||
}
|
||||
|
||||
object_continue:
|
||||
DEBUG_PRINTF("in object_continue\n");
|
||||
UPDATE_CHAR();
|
||||
switch (c) {
|
||||
case ',':
|
||||
|
@ -375,8 +356,6 @@ scope_end:
|
|||
|
||||
////////////////////////////// ARRAY STATES /////////////////////////////
|
||||
array_begin:
|
||||
DEBUG_PRINTF("in array_begin\n");
|
||||
//pj.containing_scope_offset[depth] = pj.get_current_loc();
|
||||
UPDATE_CHAR();
|
||||
if (c == ']') {
|
||||
goto scope_end; // could also go to array_continue
|
||||
|
@ -462,7 +441,6 @@ main_array_switch:
|
|||
}
|
||||
|
||||
array_continue:
|
||||
DEBUG_PRINTF("in array_continue\n");
|
||||
UPDATE_CHAR();
|
||||
switch (c) {
|
||||
case ',':
|
||||
|
@ -477,7 +455,6 @@ array_continue:
|
|||
////////////////////////////// FINAL STATES /////////////////////////////
|
||||
|
||||
succeed:
|
||||
DEBUG_PRINTF("in succeed, depth = %d \n", depth);
|
||||
depth --;
|
||||
if(depth != 0) {
|
||||
printf("internal bug\n");
|
||||
|
@ -492,16 +469,10 @@ succeed:
|
|||
pj.write_tape(pj.containing_scope_offset[depth], 'r'); // r is root
|
||||
|
||||
|
||||
#ifdef DEBUG
|
||||
pj.dump_raw_tape();
|
||||
#endif
|
||||
|
||||
pj.isvalid = true;
|
||||
return true;
|
||||
|
||||
fail:
|
||||
DEBUG_PRINTF("in fail\n");
|
||||
#ifdef DEBUG
|
||||
pj.dump_tapes();
|
||||
#endif
|
||||
return false;
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue