Cleaning.

This commit is contained in:
Daniel Lemire 2018-12-27 20:19:10 -05:00
parent bf4089b33b
commit 46ef59c679
5 changed files with 1 additions and 133 deletions

BIN
doc/gbps.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 49 KiB

View File

@ -713,39 +713,6 @@ private :
};
#ifdef DEBUG
// Debug helper: prints the 32 bytes of `d` as decimal values, each
// right-aligned in a 3-character field. A '|' follows every 8th byte, a ':'
// follows every other 4th byte, and a single space follows the rest. The
// line ends with a space, `msg`, and a newline.
inline void dump256(m256 d, const std::string &msg) {
  const uint8_t *bytes = reinterpret_cast<const uint8_t *>(&d);
  for (uint32_t pos = 1; pos <= 32; ++pos) {
    std::cout << std::setw(3) << static_cast<int>(bytes[pos - 1]);
    // Pick the separator that follows this byte.
    const char *separator = " ";
    if (pos % 8 == 0) {
      separator = "|";
    } else if (pos % 4 == 0) {
      separator = ":";
    }
    std::cout << separator;
  }
  std::cout << " " << msg << "\n";
}
// Debug helper: writes the 64 bits of `v` to std::cout, least-significant bit
// first, using '1' for a set bit and '_' for a clear bit. The bit string is
// followed by a space, `msg`, and a newline.
inline void dumpbits(uint64_t v, const std::string &msg) {
  uint64_t remaining = v;
  for (uint32_t left = 64; left != 0; --left) {
    if ((remaining & 1) != 0) {
      std::cout << "1";
    } else {
      std::cout << "_";
    }
    remaining >>= 1; // expose the next-higher bit in position 0
  }
  std::cout << " " << msg << "\n";
}
// Debug helper: 32-bit counterpart of dumpbits. Writes the bits of `v` to
// std::cout from least- to most-significant ('1' = set, '_' = clear), then a
// space, `msg`, and a newline.
inline void dumpbits32(uint32_t v, const std::string &msg) {
  uint32_t probe = 1; // single-bit mask walking from bit 0 up to bit 31
  for (uint32_t n = 0; n < 32; ++n) {
    std::cout << ((v & probe) != 0 ? "1" : "_");
    probe <<= 1;
  }
  std::cout << " " << msg << "\n";
}
#else
#define dump256(a, b) ;
#define dumpbits(a, b) ;
#define dumpbits32(a, b) ;
#endif
// dump bits low to high
inline void dumpbits_always(uint64_t v, const std::string &msg) {

View File

@ -69,29 +69,13 @@ really_inline bool parse_string(const uint8_t *buf, UNUSED size_t len,
uint8_t *dst = pj.current_string_buf_loc;
#ifdef JSON_TEST_STRINGS // for unit testing
uint8_t *const start_of_string = dst;
#endif
#ifdef DEBUG
cout << "Entering parse string with offset " << offset << "\n";
#endif
while (1) {
#ifdef DEBUG
for (uint32_t j = 0; j < 32; j++) {
char c = *(src + j);
if (isprint(c)) {
cout << c;
} else {
cout << '_';
}
}
cout << "| ... string handling input\n";
#endif
__m256i v = _mm256_loadu_si256((const __m256i *)(src));
uint32_t bs_bits =
(uint32_t)_mm256_movemask_epi8(_mm256_cmpeq_epi8(v, _mm256_set1_epi8('\\')));
dumpbits32(bs_bits, "backslash bits 2");
uint32_t quote_bits =
(uint32_t)_mm256_movemask_epi8(_mm256_cmpeq_epi8(v, _mm256_set1_epi8('"')));
dumpbits32(quote_bits, "quote_bits");
#define CHECKUNESCAPED
// All Unicode characters may be placed within the
// quotation marks, except for the characters that MUST be escaped:
@ -108,14 +92,7 @@ really_inline bool parse_string(const uint8_t *buf, UNUSED size_t len,
// store to dest unconditionally - we can overwrite the bits we don't like
// later
_mm256_storeu_si256((__m256i *)(dst), v);
#ifdef DEBUG
cout << "quote dist: " << quote_dist << " bs dist: " << bs_dist << "\n";
#endif
if (quote_dist < bs_dist) {
#ifdef DEBUG
cout << "Found end, leaving!\n";
#endif
// we encountered quotes first. Move dst to point to quotes and exit
dst[quote_dist] = 0; // null terminate and get out
@ -139,9 +116,6 @@ really_inline bool parse_string(const uint8_t *buf, UNUSED size_t len,
#endif //CHECKUNESCAPED
} else if (quote_dist > bs_dist) {
uint8_t escape_char = src[bs_dist + 1];
#ifdef DEBUG
cout << "Found escape char: " << escape_char << "\n";
#endif
#ifdef CHECKUNESCAPED
// we are going to need the unescaped_bits to check for unescaped chars
uint32_t unescaped_bits = (uint32_t)_mm256_movemask_epi8(unescaped_vec);

View File

@ -67,18 +67,6 @@ WARN_UNUSED
size_t idx = 0;
for (; idx < lenminus64; idx += 64) {
__builtin_prefetch(buf + idx + 128);
#ifdef DEBUG
cout << "Idx is " << idx << "\n";
for (uint32_t j = 0; j < 64; j++) {
char c = *(buf + idx + j);
if (isprint(c)) {
cout << c;
} else {
cout << '_';
}
}
cout << "| ... input\n";
#endif
__m256i input_lo = _mm256_loadu_si256((const __m256i *)(buf + idx + 0));
__m256i input_hi = _mm256_loadu_si256((const __m256i *)(buf + idx + 32));
#ifdef SIMDJSON_UTF8VALIDATE
@ -103,19 +91,12 @@ WARN_UNUSED
uint64_t bs_bits =
cmp_mask_against_input(input_lo, input_hi, _mm256_set1_epi8('\\'));
dumpbits(bs_bits, "backslash bits");
uint64_t start_edges = bs_bits & ~(bs_bits << 1);
dumpbits(start_edges, "start_edges");
// flip lowest if we have an odd-length run at the end of the prior
// iteration
uint64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash;
uint64_t even_starts = start_edges & even_start_mask;
uint64_t odd_starts = start_edges & ~even_start_mask;
dumpbits(even_starts, "even_starts");
dumpbits(odd_starts, "odd_starts");
uint64_t even_carries = bs_bits + even_starts;
uint64_t odd_carries;
@ -130,22 +111,11 @@ WARN_UNUSED
// if we had an odd-numbered run at the
// end of the previous iteration
prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL;
dumpbits(even_carries, "even_carries");
dumpbits(odd_carries, "odd_carries");
uint64_t even_carry_ends = even_carries & ~bs_bits;
uint64_t odd_carry_ends = odd_carries & ~bs_bits;
dumpbits(even_carry_ends, "even_carry_ends");
dumpbits(odd_carry_ends, "odd_carry_ends");
uint64_t even_start_odd_end = even_carry_ends & odd_bits;
uint64_t odd_start_even_end = odd_carry_ends & even_bits;
dumpbits(even_start_odd_end, "esoe");
dumpbits(odd_start_even_end, "osee");
uint64_t odd_ends = even_start_odd_end | odd_start_even_end;
dumpbits(odd_ends, "odd_ends");
////////////////////////////////////////////////////////////////////////////////////////////
// Step 2: detect insides of quote pairs
@ -154,12 +124,10 @@ WARN_UNUSED
uint64_t quote_bits =
cmp_mask_against_input(input_lo, input_hi, _mm256_set1_epi8('"'));
quote_bits = quote_bits & ~odd_ends;
dumpbits(quote_bits, "quote_bits");
uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128(
_mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFF), 0));
quote_mask ^= prev_iter_inside_quote;
prev_iter_inside_quote = (uint64_t)((int64_t)quote_mask >> 63); // right shift of a signed value expected to be well-defined and standard compliant as of C++20, John Regher from Utah U. says this is fine code
dumpbits(quote_mask, "quote_mask");
// How do we build up a user traversable data structure
// first, do a 'shufti' to detect structural JSON characters
@ -211,10 +179,6 @@ WARN_UNUSED
uint64_t ws_res_0 = (uint32_t)_mm256_movemask_epi8(tmp_ws_lo);
uint64_t ws_res_1 = _mm256_movemask_epi8(tmp_ws_hi);
uint64_t whitespace = ~(ws_res_0 | (ws_res_1 << 32));
dumpbits(structurals, "structurals");
dumpbits(whitespace, "whitespace");
// mask off anything inside quotes
structurals &= ~quote_mask;
@ -233,23 +197,15 @@ WARN_UNUSED
// a qualified predecessor is something that can happen 1 position before a
// pseudo-structural character
uint64_t pseudo_pred = structurals | whitespace;
dumpbits(pseudo_pred, "pseudo_pred");
uint64_t shifted_pseudo_pred = (pseudo_pred << 1) | prev_iter_ends_pseudo_pred;
dumpbits(shifted_pseudo_pred, "shifted_pseudo_pred");
prev_iter_ends_pseudo_pred = pseudo_pred >> 63;
uint64_t pseudo_structurals =
shifted_pseudo_pred & (~whitespace) & (~quote_mask);
dumpbits(pseudo_structurals, "pseudo_structurals");
dumpbits(structurals, "final structurals without pseudos");
structurals |= pseudo_structurals;
dumpbits(structurals, "final structurals and pseudo structurals");
// now, we've used our close quotes all we need to. So let's switch them off
// they will be off in the quote mask and on in quote bits.
structurals &= ~(quote_bits & ~quote_mask);
dumpbits(
structurals,
"final structurals and pseudo structurals after close quote removal");
*(uint64_t *)(pj.structurals + idx / 8) = structurals;
}

View File

@ -15,20 +15,8 @@
#include "simdjson/stringparsing.h"
#include <iostream>
//#define DEBUG
#define PATH_SEP '/'

// Returns a pointer to the portion of `path` that follows the last PATH_SEP,
// or `path` itself when it contains no separator. Used by DEBUG_PRINTF so
// that a __FILE__ without any directory component (e.g. "foo.cpp") is printed
// as-is; the previous `strrchr(__FILE__, PATH_SEP) + 1` produced an invalid
// pointer in that case because strrchr returns NULL when nothing is found.
static inline const char *debug_printf_basename(const char *path) {
  const char *base = path;
  for (const char *cursor = path; *cursor != '\0'; ++cursor) {
    if (*cursor == PATH_SEP) {
      base = cursor + 1;
    }
  }
  return base;
}

// DEBUG_PRINTF(format, ...): when DEBUG is defined, prints
// "<file>:<function>:<line>:" followed by the printf-style message; otherwise
// it expands to an empty statement. A DEBUG_PRINTF already defined by the
// includer is left untouched. `##__VA_ARGS__` is a GNU extension that drops
// the trailing comma when no variadic arguments are supplied.
#if defined(DEBUG) && !defined(DEBUG_PRINTF)
#include <stdio.h>
#include <string.h>
#define DEBUG_PRINTF(format, ...) \
  printf("%s:%s:%d:" format, debug_printf_basename(__FILE__), __func__, \
         __LINE__, ##__VA_ARGS__)
#elif !defined(DEBUG_PRINTF)
#define DEBUG_PRINTF(format, ...) \
  do { \
  } while (0)
#endif
using namespace std;
@ -93,13 +81,10 @@ bool unified_machine(const uint8_t *buf, size_t len, ParsedJson &pj) {
{ \
idx = pj.structural_indexes[i++]; \
c = buf[idx]; \
DEBUG_PRINTF("Got %c at %d (%d offset) (depth %d)\n", c, idx, i - 1, \
depth); \
}
////////////////////////////// START STATE /////////////////////////////
DEBUG_PRINTF("at start\n");
pj.ret_address[depth] = &&start_continue;
pj.containing_scope_offset[depth] = pj.get_current_loc();
pj.write_tape(0, 'r'); // r for root, 0 is going to get overwritten
@ -235,7 +220,6 @@ bool unified_machine(const uint8_t *buf, size_t len, ParsedJson &pj) {
goto fail;
}
start_continue:
DEBUG_PRINTF("in start_object_close\n");
// the string might not be NULL terminated.
if(i + 1 == pj.n_structural_indexes) {
goto succeed;
@ -245,7 +229,6 @@ start_continue:
////////////////////////////// OBJECT STATES /////////////////////////////
object_begin:
DEBUG_PRINTF("in object_begin\n");
UPDATE_CHAR();
switch (c) {
case '"': {
@ -261,7 +244,6 @@ object_begin:
}
object_key_state:
DEBUG_PRINTF("in object_key_state\n");
UPDATE_CHAR();
if (c != ':') {
goto fail;
@ -343,7 +325,6 @@ object_key_state:
}
object_continue:
DEBUG_PRINTF("in object_continue\n");
UPDATE_CHAR();
switch (c) {
case ',':
@ -375,8 +356,6 @@ scope_end:
////////////////////////////// ARRAY STATES /////////////////////////////
array_begin:
DEBUG_PRINTF("in array_begin\n");
//pj.containing_scope_offset[depth] = pj.get_current_loc();
UPDATE_CHAR();
if (c == ']') {
goto scope_end; // could also go to array_continue
@ -462,7 +441,6 @@ main_array_switch:
}
array_continue:
DEBUG_PRINTF("in array_continue\n");
UPDATE_CHAR();
switch (c) {
case ',':
@ -477,7 +455,6 @@ array_continue:
////////////////////////////// FINAL STATES /////////////////////////////
succeed:
DEBUG_PRINTF("in succeed, depth = %d \n", depth);
depth --;
if(depth != 0) {
printf("internal bug\n");
@ -492,16 +469,10 @@ succeed:
pj.write_tape(pj.containing_scope_offset[depth], 'r'); // r is root
#ifdef DEBUG
pj.dump_raw_tape();
#endif
pj.isvalid = true;
return true;
fail:
DEBUG_PRINTF("in fail\n");
#ifdef DEBUG
pj.dump_tapes();
#endif
return false;
}