Merge branch 'master' of https://github.com/lemire/simdjson
This commit is contained in:
commit
59bfb807e1
4
Makefile
4
Makefile
|
@ -6,7 +6,7 @@
|
|||
|
||||
.PHONY: clean cleandist
|
||||
|
||||
CXXFLAGS = -std=c++11 -O2 -march=native -Wall -Wextra -Wshadow
|
||||
CXXFLAGS = -std=c++11 -O2 -march=native -Wall -Wextra -Wshadow
|
||||
#CXXFLAGS = -std=c++11 -O2 -march=native -Wall -Wextra -Wshadow -Wno-implicit-function-declaration
|
||||
|
||||
EXECUTABLES=parse
|
||||
|
@ -15,7 +15,7 @@ EXECUTABLES=parse
|
|||
|
||||
all: $(EXECUTABLES)
|
||||
|
||||
parse: main.cpp common_defs.h linux-perf-events.h
|
||||
parse: main.cpp common_defs.h vecdecode.h linux-perf-events.h
|
||||
$(CXX) $(CXXFLAGS) -o parse main.cpp
|
||||
|
||||
|
||||
|
|
|
@ -83,7 +83,7 @@ APA
|
|||
- N. Nakasato, "Implementation of a parallel tree method on a GPU", Journal of Computational Science, vol. 3, no. 3, pp. 132-141, 2012.
|
||||
|
||||
## References
|
||||
|
||||
- [How to implement atoi using SIMD?](https://stackoverflow.com/questions/35127060/how-to-implement-atoi-using-simd)
|
||||
- [Parsing JSON is a Minefield 💣](http://seriot.ch/parsing_json.php)
|
||||
- https://tools.ietf.org/html/rfc7159
|
||||
- The only public Mison implementation (in rust) https://github.com/pikkr/pikkr
|
||||
|
|
118
main.cpp
118
main.cpp
|
@ -245,7 +245,7 @@ never_inline bool find_structural_bits(const u8 * buf, size_t len, ParsedJson &
|
|||
|
||||
// Now, establish "pseudo-structural characters". These are non-whitespace characters
|
||||
// that are (a) outside quotes and (b) have a predecessor that's either whitespace or a structural
|
||||
// character. This means that subsequent passes will get a chance to encounter the first character
|
||||
// character. This means that subsequent passes will get a chance to encounter the first character
|
||||
// of every string of non-whitespace and, if we're parsing an atom like true/false/null or a number
|
||||
// we can stop at the first whitespace or structural character following it.
|
||||
|
||||
|
@ -275,13 +275,23 @@ const u32 NUM_RESERVED_NODES = 2;
|
|||
const u32 DUMMY_NODE = 0;
|
||||
const u32 ROOT_NODE = 1;
|
||||
|
||||
#define VECDECODE
|
||||
#ifdef VECDECODE
|
||||
#include "vecdecode.h"
|
||||
#endif
|
||||
// just transform the bitmask to a big list of 32-bit integers for now
|
||||
// that's all; the type of character the offset points to will
|
||||
// tell us exactly what we need to know. Naive but straightforward implementation
|
||||
never_inline bool flatten_indexes(size_t len, ParsedJson & pj) {
|
||||
u32 base = NUM_RESERVED_NODES;
|
||||
u32 * base_ptr = pj.structural_indexes;
|
||||
base_ptr[DUMMY_NODE] = base_ptr[ROOT_NODE] = 0; // really shouldn't matter
|
||||
#ifdef VECDECODE
|
||||
u32 number = bitmap_decode_avx2(pj.structurals, len, base_ptr + NUM_RESERVED_NODES) + NUM_RESERVED_NODES;
|
||||
pj.n_structural_indexes = number;
|
||||
base_ptr[pj.n_structural_indexes] = 0; // make it safe to dereference one beyond this array
|
||||
return true;
|
||||
#else
|
||||
u32 base = NUM_RESERVED_NODES;
|
||||
for (size_t idx = 0; idx < len; idx+=64) {
|
||||
u64 s = *(u64 *)(pj.structurals + idx/8);
|
||||
#ifdef SUPPRESS_CHEESY_FLATTEN
|
||||
|
@ -312,12 +322,13 @@ never_inline bool flatten_indexes(size_t len, ParsedJson & pj) {
|
|||
pj.n_structural_indexes = base;
|
||||
base_ptr[pj.n_structural_indexes] = 0; // make it safe to dereference one beyond this array
|
||||
return true;
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
const u32 MAX_DEPTH = 256;
|
||||
const u32 DEPTH_SAFETY_MARGIN = 32; // should be power-of-2 as we check this with a modulo in our
|
||||
// hot stage 3 loop
|
||||
// hot stage 3 loop
|
||||
const u32 START_DEPTH = DEPTH_SAFETY_MARGIN;
|
||||
const u32 REDLINE_DEPTH = MAX_DEPTH - DEPTH_SAFETY_MARGIN;
|
||||
|
||||
|
@ -330,7 +341,7 @@ const u32 REDLINE_DEPTH = MAX_DEPTH - DEPTH_SAFETY_MARGIN;
|
|||
// These structures are either u32 offsets of other tapes or u32 offsets into our input
|
||||
// or structures.
|
||||
//
|
||||
// The state machine doesn't record ouput.
|
||||
// The state machine doesn't record ouput.
|
||||
// The tape machine doesn't validate.
|
||||
//
|
||||
// The output of the tape machine is meaningful only if the state machine is in non-error states.
|
||||
|
@ -360,7 +371,7 @@ inline size_t get_write_size(u32 control) { return control & 0xff; }
|
|||
const u32 char_control[256] = {
|
||||
// nothing interesting from 0x00-0x20
|
||||
CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF,
|
||||
CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF,
|
||||
CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF,
|
||||
|
||||
// " is 0x22, - is 0x2d
|
||||
CDF,CDF,C04,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,C04,CDF,CDF,
|
||||
|
@ -369,24 +380,24 @@ const u32 char_control[256] = {
|
|||
C04,C04,C04,C04, C04,C04,C04,C04, C04,C04,CDF,CDF, CDF,CDF,CDF,CDF,
|
||||
|
||||
// nothing interesting from 0x40-0x49
|
||||
CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF,
|
||||
CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF,
|
||||
|
||||
// 0x5b/5d are []
|
||||
CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CP4, CDF,CM4,CDF,CDF,
|
||||
CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CP4, CDF,CM4,CDF,CDF,
|
||||
|
||||
// f is 0x66 n is 0x6e
|
||||
CDF,CDF,CDF,CDF, CDF,CDF,C04,CDF, CDF,CDF,CDF,CDF, CDF,CDF,C04,CDF,
|
||||
CDF,CDF,CDF,CDF, CDF,CDF,C04,CDF, CDF,CDF,CDF,CDF, CDF,CDF,C04,CDF,
|
||||
|
||||
// 0x7b/7d are {}, 74 is t
|
||||
CDF,CDF,CDF,CDF, C04,CDF,CDF,CDF, CDF,CDF,CDF,CP4, CDF,CM4,CDF,CDF,
|
||||
CDF,CDF,CDF,CDF, C04,CDF,CDF,CDF, CDF,CDF,CDF,CP4, CDF,CM4,CDF,CDF,
|
||||
|
||||
// nothing interesting from 0x80-0xff
|
||||
CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF,
|
||||
CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF,
|
||||
CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF,
|
||||
CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF,
|
||||
CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF,
|
||||
CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF,
|
||||
CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF,
|
||||
CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF,
|
||||
CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF,
|
||||
CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF,
|
||||
CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF,
|
||||
CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF
|
||||
};
|
||||
|
@ -396,7 +407,7 @@ const size_t MAX_TAPE = MAX_DEPTH * MAX_TAPE_ENTRIES;
|
|||
|
||||
// all of this stuff needs to get moved somewhere reasonable
|
||||
// like our ParsedJson structure
|
||||
u64 tape[MAX_TAPE];
|
||||
u64 tape[MAX_TAPE];
|
||||
u32 tape_locs[MAX_DEPTH];
|
||||
u8 string_buf[512*1024];
|
||||
u8 * current_string_buf_loc;
|
||||
|
@ -416,7 +427,7 @@ const int START_STATE = 1;
|
|||
const int START_DEPTH_START_STATE = 13;
|
||||
|
||||
// ANYTHING_IS_ERROR_STATE is useful both as a target
|
||||
// for a transition at the start depth and also as
|
||||
// for a transition at the start depth and also as
|
||||
// a good initial value for "red line" depths; that
|
||||
// is, depths that are maintained strictly to avoid
|
||||
// undefined behavior (e.g. depths below the starting
|
||||
|
@ -445,7 +456,7 @@ never_inline void init_state_machine() {
|
|||
trans[12][(u32)UNARIES[i]] = 11;
|
||||
trans[13][(u32)UNARIES[i]] = 14;
|
||||
}
|
||||
|
||||
|
||||
// back transitions when new things are open
|
||||
trans[2]['{'] = 2;
|
||||
trans[7]['{'] = 2;
|
||||
|
@ -469,7 +480,7 @@ never_inline bool ape_machine(const u8 * buf, UNUSED size_t len, ParsedJson & pj
|
|||
// characters without aggressively going wrong and writing to bad memory
|
||||
// Note that any specious depth can have a specious tape associated with and all these specious depths
|
||||
// can share a region of the tape - it's harmless. Since tape is one-way, any movement in a specious tape
|
||||
// is an error (so we can detect max_depth violations by making sure that specious tape locations haven't
|
||||
// is an error (so we can detect max_depth violations by making sure that specious tape locations haven't
|
||||
// moved from their starting values)
|
||||
|
||||
u32 depth = START_DEPTH;
|
||||
|
@ -510,12 +521,12 @@ never_inline bool ape_machine(const u8 * buf, UNUSED size_t len, ParsedJson & pj
|
|||
|
||||
u32 idx = next_idx;
|
||||
u8 c = next_c;
|
||||
u32 control = next_control;
|
||||
u32 control = next_control;
|
||||
|
||||
next_idx = pj.structural_indexes[i+1];
|
||||
next_c = buf[next_idx];
|
||||
next_control = char_control[next_c];
|
||||
|
||||
|
||||
// TAPE MACHINE
|
||||
s8 depth_adjust = get_depth_adjust(control);
|
||||
u8 write_size = get_write_size(control);
|
||||
|
@ -523,8 +534,8 @@ never_inline bool ape_machine(const u8 * buf, UNUSED size_t len, ParsedJson & pj
|
|||
depth += depth_adjust;
|
||||
#ifdef DEBUG
|
||||
cout << "i: " << i << " idx: " << idx << " c " << c << "\n";
|
||||
cout << "TAPE MACHINE: depth change " << (s32)depth_adjust
|
||||
<< " write_size " << (u32)write_size << " current_depth: " << depth << "\n";
|
||||
cout << "TAPE MACHINE: depth change " << (s32)depth_adjust
|
||||
<< " write_size " << (u32)write_size << " current_depth: " << depth << "\n";
|
||||
#endif
|
||||
|
||||
// STATE MACHINE - hoisted here to fill in during the tape machine's latencies
|
||||
|
@ -536,15 +547,16 @@ never_inline bool ape_machine(const u8 * buf, UNUSED size_t len, ParsedJson & pj
|
|||
cout << "post " << states[depth] << "\n";
|
||||
#endif
|
||||
// TAPE MACHINE, again
|
||||
tape[tape_locs[depth]] = write_val | (((u64)c) << 56);
|
||||
tape[tape_locs[depth]] = write_val | (((u64)c) << 56);
|
||||
old_tape_loc = tape_locs[depth] += write_size;
|
||||
}
|
||||
|
||||
for (u32 i = 0; i < MAX_DEPTH; i++) {
|
||||
if (states[i] == 0) {
|
||||
printf("states[%d] == 0\n", i);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#define DUMP_TAPES
|
||||
#ifdef DEBUG
|
||||
|
@ -559,14 +571,14 @@ never_inline bool ape_machine(const u8 * buf, UNUSED size_t len, ParsedJson & pj
|
|||
cout << " (NORMAL) ";
|
||||
}
|
||||
|
||||
cout << " from: " << start_loc
|
||||
cout << " from: " << start_loc
|
||||
<< " to: " << tape_locs[i] << " "
|
||||
<< " size: " << (tape_locs[i]-start_loc) << "\n";
|
||||
cout << " state: " << states[i] << "\n";
|
||||
cout << " state: " << states[i] << "\n";
|
||||
#ifdef DUMP_TAPES
|
||||
for (u32 j = start_loc; j < tape_locs[i]; j++) {
|
||||
if (tape[j]) {
|
||||
cout << "j: " << j << " tape[j] char " << (char)(tape[j]>>56)
|
||||
cout << "j: " << j << " tape[j] char " << (char)(tape[j]>>56)
|
||||
<< " tape[j][0..55]: " << (tape[j]&0xffffffffffffffULL ) << "\n";
|
||||
}
|
||||
}
|
||||
|
@ -609,12 +621,12 @@ const u32 structural_or_whitespace_negated[256] = {
|
|||
};
|
||||
|
||||
// return non-zero if not a structural or whitespace char
|
||||
// zero otherwise
|
||||
// zero otherwise
|
||||
really_inline u32 is_not_structural_or_whitespace(u8 c) {
|
||||
return structural_or_whitespace_negated[c];
|
||||
}
|
||||
|
||||
// These chars yield themselves: " \ /
|
||||
// These chars yield themselves: " \ /
|
||||
// b -> backspace, f -> formfeed, n -> newline, r -> cr, t -> horizontal tab
|
||||
// u not handled in this table as it's complex
|
||||
const u8 escape_map[256] = {
|
||||
|
@ -628,7 +640,7 @@ const u8 escape_map[256] = {
|
|||
0,0,0x08,0, 0,0,0x12,0, 0,0,0,0, 0,0,0x0a,0, //0x6.
|
||||
0,0,0x0d,0, 0x09,0,0,0, 0,0,0,0, 0,0,0,0, //0x7.
|
||||
|
||||
0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
|
||||
0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
|
||||
0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
|
||||
0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
|
||||
0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
|
||||
|
@ -642,7 +654,7 @@ const u8 escape_map[256] = {
|
|||
|
||||
const u32 leading_zeros_to_utf_bytes[33] = {
|
||||
1,
|
||||
1, 1, 1, 1, 1, 1, 1, // 7 bits for first one
|
||||
1, 1, 1, 1, 1, 1, 1, // 7 bits for first one
|
||||
2, 2, 2, 2, // 11 bits for next
|
||||
3, 3, 3, 3, 3, // 16 bits for next
|
||||
4, 4, 4, 4, 4, // 21 bits for next
|
||||
|
@ -724,7 +736,7 @@ really_inline bool handle_unicode_codepoint(const u8 ** src_ptr, u8 ** dst_ptr)
|
|||
// TODO: check to see whether the below code is nonsense (it's really only a sketch at this point)
|
||||
u32 lz = __builtin_clz(code_point);
|
||||
u32 utf_bytes = leading_zeros_to_utf_bytes[lz];
|
||||
u32 tmp = _pdep_u32(code_point, UTF_PDEP_MASK[utf_bytes]) | UTF_OR_MASK[utf_bytes];
|
||||
u32 tmp = _pdep_u32(code_point, UTF_PDEP_MASK[utf_bytes]) | UTF_OR_MASK[utf_bytes];
|
||||
// swap and move to the other side of the register
|
||||
tmp = __builtin_bswap32(tmp);
|
||||
tmp >>= (4 - utf_bytes) * 8;
|
||||
|
@ -734,7 +746,7 @@ really_inline bool handle_unicode_codepoint(const u8 ** src_ptr, u8 ** dst_ptr)
|
|||
}
|
||||
|
||||
really_inline bool parse_string(const u8 * buf, UNUSED size_t len, UNUSED ParsedJson & pj, u32 tape_loc) {
|
||||
u32 offset = tape[tape_loc] & 0xffffff;
|
||||
u32 offset = tape[tape_loc] & 0xffffff;
|
||||
const u8 * src = &buf[offset+1]; // we know that buf at offset is a "
|
||||
u8 * dst = current_string_buf_loc;
|
||||
#ifdef DEBUG
|
||||
|
@ -789,11 +801,11 @@ really_inline bool parse_string(const u8 * buf, UNUSED size_t len, UNUSED Parsed
|
|||
if (!handle_unicode_codepoint(&src, &dst)) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
return true;
|
||||
} else {
|
||||
// simple 1:1 conversion. Will eat bs_dist+2 characters in input and
|
||||
// write bs_dist+1 characters to output
|
||||
// note this may reach beyond the part of the buffer we've actually seen.
|
||||
// note this may reach beyond the part of the buffer we've actually seen.
|
||||
// I think this is ok
|
||||
u8 escape_result = escape_map[escape_char];
|
||||
if (!escape_result)
|
||||
|
@ -801,15 +813,15 @@ really_inline bool parse_string(const u8 * buf, UNUSED size_t len, UNUSED Parsed
|
|||
dst[bs_dist] = escape_result;
|
||||
src += bs_dist+2;
|
||||
dst += bs_dist+1;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// they are the same. Since they can't co-occur, it means we encountered neither.
|
||||
src+=32;
|
||||
dst+=32;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
// later extensions -
|
||||
// later extensions -
|
||||
// if \\ we could detect whether it's a substantial run of \ or just eat 2 chars and write 1
|
||||
// handle anything short of \u or \\\ (as a prefix) with clever PSHUFB stuff and don't leave SIMD
|
||||
return true;
|
||||
|
@ -826,11 +838,11 @@ really_inline bool parse_string(const u8 * buf, UNUSED size_t len, UNUSED Parsed
|
|||
// TODO: see if we really need a separate number_buf or whether we should just
|
||||
// have a generic scratch - would need to align before using for this
|
||||
really_inline bool parse_number(const u8 * buf, UNUSED size_t len, UNUSED ParsedJson & pj, u32 tape_loc, UNUSED bool found_zero, bool found_minus) {
|
||||
u32 offset = tape[tape_loc] & 0xffffff;
|
||||
u32 offset = tape[tape_loc] & 0xffffff;
|
||||
if (found_minus) {
|
||||
offset++;
|
||||
}
|
||||
const u8 * src = &buf[offset];
|
||||
const u8 * src = &buf[offset];
|
||||
m256 v = _mm256_loadu_si256((const m256 *)(src));
|
||||
u64 error_sump = 0;
|
||||
#ifdef DEBUG
|
||||
|
@ -856,7 +868,7 @@ really_inline bool parse_number(const u8 * buf, UNUSED size_t len, UNUSED Parsed
|
|||
// Terminators
|
||||
// Whitespace: 0x20, 0x09, 0x0a, 0x0d - bucket 5+6
|
||||
// Comma and the closes: 0x2c is comma, } is 0x5d, ] is 0x7d - bucket 5+7
|
||||
|
||||
|
||||
// Another shufti - also a bit hand-hacked. Need to make a better construction
|
||||
const m256 low_nibble_mask = _mm256_setr_epi8(
|
||||
// 0 1 2 3 4 5 6 7 8 9 a b c d e f
|
||||
|
@ -887,7 +899,7 @@ really_inline bool parse_number(const u8 * buf, UNUSED size_t len, UNUSED Parsed
|
|||
// TODO: make a mask that indicates where our digits are
|
||||
u32 number_mask = ~enders & (enders-1);
|
||||
dumpbits32(number_mask, "number mask");
|
||||
|
||||
|
||||
m256 n_mask = _mm256_set1_epi8(0x1f);
|
||||
m256 tmp_n = _mm256_cmpeq_epi8(_mm256_and_si256(tmp, n_mask),
|
||||
_mm256_set1_epi8(0));
|
||||
|
@ -895,11 +907,11 @@ really_inline bool parse_number(const u8 * buf, UNUSED size_t len, UNUSED Parsed
|
|||
|
||||
// put something into our error sump if we have something
|
||||
// before our ending characters that isn't a valid character
|
||||
// for the inside of our JSON
|
||||
// for the inside of our JSON
|
||||
number_characters &= number_mask;
|
||||
error_sump |= number_characters ^ number_mask;
|
||||
error_sump |= number_characters ^ number_mask;
|
||||
dumpbits32(number_characters, "number characters");
|
||||
|
||||
|
||||
m256 d_mask = _mm256_set1_epi8(0x03);
|
||||
m256 tmp_d = _mm256_cmpeq_epi8(_mm256_and_si256(tmp, d_mask),
|
||||
_mm256_set1_epi8(0));
|
||||
|
@ -927,10 +939,10 @@ really_inline bool parse_number(const u8 * buf, UNUSED size_t len, UNUSED Parsed
|
|||
u32 sign_characters = ~(u32)_mm256_movemask_epi8(tmp_s);
|
||||
sign_characters &= number_mask;
|
||||
dumpbits32(sign_characters, "sign characters");
|
||||
|
||||
|
||||
u32 digit_edges = ~(digit_characters << 1) & digit_characters;
|
||||
dumpbits32(digit_edges, "digit_edges");
|
||||
|
||||
|
||||
// check that we have 1-3 'edges' only
|
||||
u32 t = digit_edges;
|
||||
t &= t-1; t &= t-1; t &= t-1;
|
||||
|
@ -962,7 +974,7 @@ really_inline bool parse_number(const u8 * buf, UNUSED size_t len, UNUSED Parsed
|
|||
#endif
|
||||
*((u64 *)current_number_buf_loc) = result;
|
||||
tape[tape_loc] = ((u32)'l') << 24 | (current_number_buf_loc - number_buf); // assume 2^24 will hold all numbers for now
|
||||
current_number_buf_loc += 8;
|
||||
current_number_buf_loc += 8;
|
||||
} else {
|
||||
// try a strtod
|
||||
char * end;
|
||||
|
@ -979,7 +991,7 @@ really_inline bool parse_number(const u8 * buf, UNUSED size_t len, UNUSED Parsed
|
|||
#endif
|
||||
*((double *)current_number_buf_loc) = result;
|
||||
tape[tape_loc] = ((u32)'d') << 24 | (current_number_buf_loc - number_buf); // assume 2^24 will hold all numbers for now
|
||||
current_number_buf_loc += 8;
|
||||
current_number_buf_loc += 8;
|
||||
}
|
||||
// TODO: check the MSB element is a digit
|
||||
|
||||
|
@ -997,12 +1009,12 @@ really_inline bool parse_number(const u8 * buf, UNUSED size_t len, UNUSED Parsed
|
|||
|
||||
// TODO: eE mark and +- construct are adjacent with eE first
|
||||
// eE mark preceeds final cluster of numbers only
|
||||
// and immediately follows second-last cluster of numbers only (not
|
||||
// and immediately follows second-last cluster of numbers only (not
|
||||
// necessarily second, as we may have 4e10).
|
||||
// it may suffice to insist that eE is preceeded immediately
|
||||
// by a digit of any kind and that it's followed locally by
|
||||
// a digit immediately or a +- construct then a digit.
|
||||
|
||||
|
||||
// TODO: if we have both . and the eE mark then the . must
|
||||
// precede the eE mark
|
||||
|
||||
|
@ -1058,7 +1070,7 @@ never_inline bool shovel_machine(const u8 * buf, size_t len, ParsedJson & pj) {
|
|||
u32 head_marker_loc = tape[j] & 0xffffffffffffffULL;
|
||||
u64 tape_enclosing = tape[head_marker_loc];
|
||||
u8 enclosing_c = tape_enclosing >> 56;
|
||||
tape[head_marker_loc] = tape[j];
|
||||
tape[head_marker_loc] = tape[j];
|
||||
tape[j] = tape_enclosing;
|
||||
error_sump |= (enclosing_c - head_marker_c - 2); // [] and {} only differ by 2 chars
|
||||
break;
|
||||
|
@ -1073,7 +1085,7 @@ never_inline bool shovel_machine(const u8 * buf, size_t len, ParsedJson & pj) {
|
|||
case '0':
|
||||
error_sump |= !parse_number(buf, len, pj, j, true, false);
|
||||
break;
|
||||
case '-':
|
||||
case '-':
|
||||
error_sump |= !parse_number(buf, len, pj, j, false, true);
|
||||
break;
|
||||
case 't': {
|
||||
|
@ -1187,7 +1199,7 @@ int main(int argc, char * argv[]) {
|
|||
pj.structural_indexes = new u32[max_structures];
|
||||
pj.nodes = new JsonNode[max_structures];
|
||||
|
||||
#if defined(DEBUG)
|
||||
#if defined(DEBUG)
|
||||
const u32 iterations = 1;
|
||||
#else
|
||||
const u32 iterations = 1000;
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
HEADERS:=include/avxprocessing.h include/benchmark.h include/common_defs.h include/jsonstruct.h include/scalarprocessing.h include/util.h
|
||||
HEADERS:=include/avxprocessing.h include/benchmark.h include/common_defs.h include/jsonstruct.h include/ include/util.h
|
||||
bench: benchmarks/bench.cpp rapidjson/license.txt $(HEADERS)
|
||||
$(CXX) -std=c++11 -O3 -o $@ benchmarks/bench.cpp -Irapidjson/include -Iinclude -march=native -lm -Wall -Wextra -Wno-narrowing
|
||||
|
||||
|
|
|
@ -11,7 +11,6 @@
|
|||
#include "rapidjson/reader.h" // you have to check in the submodule
|
||||
#include "rapidjson/stringbuffer.h"
|
||||
#include "rapidjson/writer.h"
|
||||
#include "scalarprocessing.h"
|
||||
#include "util.h"
|
||||
|
||||
// colorfuldisplay(ParsedJson & pj, const u8 * buf)
|
||||
|
@ -82,22 +81,11 @@ int main(int argc, char *argv[]) {
|
|||
u32 max_structures = ROUNDUP_N(p.second, 64) + 2 + 7;
|
||||
pj.structural_indexes = new u32[max_structures];
|
||||
pj.nodes = new JsonNode[max_structures];
|
||||
if (verbose) {
|
||||
std::cout << "Parsing SIMD (once) " << std::endl;
|
||||
avx_json_parse(p.first, p.second, pj);
|
||||
colorfuldisplay(pj, p.first);
|
||||
debugdisplay(pj, p.first);
|
||||
std::cout << "Parsing scalar (once) " << std::endl;
|
||||
scalar_json_parse(p.first, p.second, pj);
|
||||
colorfuldisplay(pj, p.first);
|
||||
debugdisplay(pj, p.first);
|
||||
}
|
||||
|
||||
|
||||
int repeat = 10;
|
||||
int volume = p.second;
|
||||
BEST_TIME_NOCHECK(avx_json_parse(p.first, p.second, pj), , repeat, volume,
|
||||
true);
|
||||
BEST_TIME_NOCHECK(scalar_json_parse(p.first, p.second, pj), , repeat, volume,
|
||||
BEST_TIME(avx_json_parse(p.first, p.second, pj), true , , repeat, volume,
|
||||
true);
|
||||
|
||||
rapidjson::Document d;
|
||||
|
@ -120,18 +108,18 @@ int main(int argc, char *argv[]) {
|
|||
true);
|
||||
memcpy(buffer, p.first, p.second);
|
||||
|
||||
size_t outlength = copy_without_useless_spaces((const uint8_t *)buffer, p.second,(uint8_t *) buffer);
|
||||
size_t outlength = copy_without_useless_spaces_avx((const uint8_t *)buffer, p.second,(uint8_t *) buffer);
|
||||
printf("these should match: %zu %zu \n", strlength, outlength);
|
||||
|
||||
|
||||
uint8_t * cbuffer = (uint8_t *)buffer;
|
||||
BEST_TIME(copy_without_useless_spaces(cbuffer, p.second,cbuffer), outlength,
|
||||
BEST_TIME(copy_without_useless_spaces_avx(cbuffer, p.second,cbuffer), outlength,
|
||||
memcpy(buffer, p.first, p.second), repeat, volume, true);
|
||||
|
||||
BEST_TIME(despace(cbuffer, p.second,cbuffer), outlength,
|
||||
BEST_TIME(scalar_despace(cbuffer, p.second,cbuffer), outlength,
|
||||
memcpy(buffer, p.first, p.second), repeat, volume, true);
|
||||
|
||||
BEST_TIME(d.ParseInsitu(buffer).HasParseError(),false, cbuffer[copy_without_useless_spaces((const uint8_t *)p.first, p.second,cbuffer)]='\0' , repeat, volume,
|
||||
printf("parsing with RapidJSON after despacing:\n");
|
||||
BEST_TIME(d.ParseInsitu(buffer).HasParseError(),false, cbuffer[copy_without_useless_spaces_avx((const uint8_t *)p.first, p.second,cbuffer)]='\0' , repeat, volume,
|
||||
true);
|
||||
|
||||
free(buffer);
|
||||
|
|
|
@ -33,7 +33,7 @@ static uint64_t cmp_mask_against_input_mini(__m256i input_lo, __m256i input_hi,
|
|||
|
||||
// take input from buf and remove useless whitespace, input and output can be
|
||||
// the same
|
||||
static inline size_t copy_without_useless_spaces(const uint8_t *buf, size_t len,
|
||||
static inline size_t copy_without_useless_spaces_avx(const uint8_t *buf, size_t len,
|
||||
uint8_t *out) {
|
||||
// Useful constant masks
|
||||
const uint64_t even_bits = 0x5555555555555555ULL;
|
||||
|
@ -117,11 +117,11 @@ static inline size_t copy_without_useless_spaces(const uint8_t *buf, size_t len,
|
|||
int pop3 = _popcnt64((~whitespace) & UINT64_C(0xFFFFFFFFFFFF));
|
||||
int pop4 = _popcnt64((~whitespace));
|
||||
__m256i vmask1 =
|
||||
_mm256_loadu2_m128i((const __m128i *)mask128_epi8 + mask2,
|
||||
(const __m128i *)mask128_epi8 + mask1);
|
||||
_mm256_loadu2_m128i((const __m128i *)mask128_epi8 + (mask2 & 0x7FFFF),
|
||||
(const __m128i *)mask128_epi8 + (mask1 & 0x7FFFF));
|
||||
__m256i vmask2 =
|
||||
_mm256_loadu2_m128i((const __m128i *)mask128_epi8 + mask4,
|
||||
(const __m128i *)mask128_epi8 + mask3);
|
||||
_mm256_loadu2_m128i((const __m128i *)mask128_epi8 + (mask4 & 0x7FFFF),
|
||||
(const __m128i *)mask128_epi8 + (mask3 & 0x7FFFF));
|
||||
__m256i result1 = _mm256_shuffle_epi8(input_lo, vmask1);
|
||||
__m256i result2 = _mm256_shuffle_epi8(input_hi, vmask2);
|
||||
_mm256_storeu2_m128i((__m128i *)(out + pop1), (__m128i *)out, result1);
|
||||
|
@ -163,36 +163,7 @@ static inline size_t copy_without_useless_spaces(const uint8_t *buf, size_t len,
|
|||
_mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFF), 0));
|
||||
quote_mask ^= prev_iter_inside_quote;
|
||||
prev_iter_inside_quote = (uint64_t)((s64)quote_mask >> 63);
|
||||
/*
|
||||
const __m256i low_nibble_mask = _mm256_setr_epi8(
|
||||
// 0 9 a b c d
|
||||
16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0, 16, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 8, 12, 1, 2, 9, 0, 0);
|
||||
const __m256i high_nibble_mask = _mm256_setr_epi8(
|
||||
// 0 2 3 5 7
|
||||
8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0, 8, 0, 18, 4, 0, 1, 0,
|
||||
1, 0, 0, 0, 3, 2, 1, 0, 0);
|
||||
__m256i whitespace_shufti_mask = _mm256_set1_epi8(0x18);
|
||||
__m256i v_lo = _mm256_and_si256(
|
||||
_mm256_shuffle_epi8(low_nibble_mask, input_lo),
|
||||
_mm256_shuffle_epi8(high_nibble_mask,
|
||||
_mm256_and_si256(_mm256_srli_epi32(input_lo, 4),
|
||||
_mm256_set1_epi8(0x7f))));
|
||||
|
||||
__m256i v_hi = _mm256_and_si256(
|
||||
_mm256_shuffle_epi8(low_nibble_mask, input_hi),
|
||||
_mm256_shuffle_epi8(high_nibble_mask,
|
||||
_mm256_and_si256(_mm256_srli_epi32(input_hi, 4),
|
||||
_mm256_set1_epi8(0x7f))));
|
||||
__m256i tmp_ws_lo = _mm256_cmpeq_epi8(
|
||||
_mm256_and_si256(v_lo, whitespace_shufti_mask), _mm256_set1_epi8(0));
|
||||
__m256i tmp_ws_hi = _mm256_cmpeq_epi8(
|
||||
_mm256_and_si256(v_hi, whitespace_shufti_mask), _mm256_set1_epi8(0));
|
||||
|
||||
uint64_t ws_res_0 = (uint32_t)_mm256_movemask_epi8(tmp_ws_lo);
|
||||
uint64_t ws_res_1 = _mm256_movemask_epi8(tmp_ws_hi);
|
||||
uint64_t whitespace = ~(ws_res_0 | (ws_res_1 << 32));
|
||||
*/
|
||||
__m256i mask_20 = _mm256_set1_epi8(0x20); // c==32
|
||||
__m256i mask_70 =
|
||||
_mm256_set1_epi8(0x70); // adding 0x70 does not check low 4-bits
|
||||
|
@ -228,10 +199,10 @@ static inline size_t copy_without_useless_spaces(const uint8_t *buf, size_t len,
|
|||
int pop2 = _popcnt64((~whitespace) & UINT64_C(0xFFFFFFFF));
|
||||
int pop3 = _popcnt64((~whitespace) & UINT64_C(0xFFFFFFFFFFFF));
|
||||
int pop4 = _popcnt64((~whitespace));
|
||||
__m256i vmask1 = _mm256_loadu2_m128i((const __m128i *)mask128_epi8 + mask2,
|
||||
(const __m128i *)mask128_epi8 + mask1);
|
||||
__m256i vmask2 = _mm256_loadu2_m128i((const __m128i *)mask128_epi8 + mask4,
|
||||
(const __m128i *)mask128_epi8 + mask3);
|
||||
__m256i vmask1 = _mm256_loadu2_m128i((const __m128i *)mask128_epi8 + (mask2 & 0x7FFF),
|
||||
(const __m128i *)mask128_epi8 + (mask1 & 0x7FFF));
|
||||
__m256i vmask2 = _mm256_loadu2_m128i((const __m128i *)mask128_epi8 + (mask4 & 0x7FFF),
|
||||
(const __m128i *)mask128_epi8 + (mask3 & 0x7FFF));
|
||||
__m256i result1 = _mm256_shuffle_epi8(input_lo, vmask1);
|
||||
__m256i result2 = _mm256_shuffle_epi8(input_hi, vmask2);
|
||||
_mm256_storeu2_m128i((__m128i *)(buffer + pop1), (__m128i *)buffer,
|
||||
|
|
|
@ -15,8 +15,41 @@
|
|||
#include "jsonstruct.h"
|
||||
using namespace std;
|
||||
|
||||
#ifdef DEBUG
|
||||
inline void dump256(m256 d, string msg) {
|
||||
for (u32 i = 0; i < 32; i++) {
|
||||
cout << setw(3) << (int)*(((u8 *)(&d)) + i);
|
||||
if (!((i+1)%8))
|
||||
cout << "|";
|
||||
else if (!((i+1)%4))
|
||||
cout << ":";
|
||||
else
|
||||
cout << " ";
|
||||
}
|
||||
cout << " " << msg << "\n";
|
||||
}
|
||||
|
||||
// dump bits low to high
|
||||
void dumpbits(u64 v, string msg) {
|
||||
for (u32 i = 0; i < 64; i++) {
|
||||
std::cout << (((v>>(u64)i) & 0x1ULL) ? "1" : "_");
|
||||
}
|
||||
cout << " " << msg << "\n";
|
||||
}
|
||||
|
||||
void dumpbits32(u32 v, string msg) {
|
||||
for (u32 i = 0; i < 32; i++) {
|
||||
std::cout << (((v>>(u32)i) & 0x1ULL) ? "1" : "_");
|
||||
}
|
||||
cout << " " << msg << "\n";
|
||||
}
|
||||
#else
|
||||
#define dump256(a,b) ;
|
||||
#define dumpbits(a,b) ;
|
||||
#define dumpbits32(a,b) ;
|
||||
#endif
|
||||
// a straightforward comparison of a mask against input. 5 uops; would be cheaper in AVX512.
|
||||
static u64 cmp_mask_against_input(m256 input_lo, m256 input_hi, m256 mask) {
|
||||
really_inline u64 cmp_mask_against_input(m256 input_lo, m256 input_hi, m256 mask) {
|
||||
m256 cmp_res_0 = _mm256_cmpeq_epi8(input_lo, mask);
|
||||
u64 res_0 = (u32)_mm256_movemask_epi8(cmp_res_0);
|
||||
m256 cmp_res_1 = _mm256_cmpeq_epi8(input_hi, mask);
|
||||
|
@ -24,7 +57,7 @@ static u64 cmp_mask_against_input(m256 input_lo, m256 input_hi, m256 mask) {
|
|||
return res_0 | (res_1 << 32);
|
||||
}
|
||||
|
||||
static bool find_structural_bits(const u8 * buf, size_t len, ParsedJson & pj) {
|
||||
never_inline bool find_structural_bits(const u8 * buf, size_t len, ParsedJson & pj) {
|
||||
// Useful constant masks
|
||||
const u64 even_bits = 0x5555555555555555ULL;
|
||||
const u64 odd_bits = ~even_bits;
|
||||
|
@ -39,6 +72,18 @@ static bool find_structural_bits(const u8 * buf, size_t len, ParsedJson & pj) {
|
|||
|
||||
for (size_t idx = 0; idx < len; idx+=64) {
|
||||
__builtin_prefetch(buf + idx + 128);
|
||||
#ifdef DEBUG
|
||||
cout << "Idx is " << idx << "\n";
|
||||
for (u32 j = 0; j < 64; j++) {
|
||||
char c = *(buf+idx+j);
|
||||
if (isprint(c)) {
|
||||
cout << c;
|
||||
} else {
|
||||
cout << '_';
|
||||
}
|
||||
}
|
||||
cout << "| ... input\n";
|
||||
#endif
|
||||
m256 input_lo = _mm256_load_si256((const m256 *)(buf + idx + 0));
|
||||
m256 input_hi = _mm256_load_si256((const m256 *)(buf + idx + 32));
|
||||
|
||||
|
@ -47,13 +92,18 @@ static bool find_structural_bits(const u8 * buf, size_t len, ParsedJson & pj) {
|
|||
////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
u64 bs_bits = cmp_mask_against_input(input_lo, input_hi, _mm256_set1_epi8('\\'));
|
||||
dumpbits(bs_bits, "backslash bits");
|
||||
u64 start_edges = bs_bits & ~(bs_bits << 1);
|
||||
dumpbits(start_edges, "start_edges");
|
||||
|
||||
// flip lowest if we have an odd-length run at the end of the prior iteration
|
||||
u64 even_start_mask = even_bits ^ prev_iter_ends_odd_backslash;
|
||||
u64 even_starts = start_edges & even_start_mask;
|
||||
u64 odd_starts = start_edges & ~even_start_mask;
|
||||
|
||||
dumpbits(even_starts, "even_starts");
|
||||
dumpbits(odd_starts, "odd_starts");
|
||||
|
||||
u64 even_carries = bs_bits + even_starts;
|
||||
|
||||
u64 odd_carries;
|
||||
|
@ -65,12 +115,22 @@ static bool find_structural_bits(const u8 * buf, size_t len, ParsedJson & pj) {
|
|||
// if we had an odd-numbered run at the end of
|
||||
// the previous iteration
|
||||
prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL;
|
||||
|
||||
dumpbits(even_carries, "even_carries");
|
||||
dumpbits(odd_carries, "odd_carries");
|
||||
|
||||
u64 even_carry_ends = even_carries & ~bs_bits;
|
||||
u64 odd_carry_ends = odd_carries & ~bs_bits;
|
||||
dumpbits(even_carry_ends, "even_carry_ends");
|
||||
dumpbits(odd_carry_ends, "odd_carry_ends");
|
||||
|
||||
u64 even_start_odd_end = even_carry_ends & odd_bits;
|
||||
u64 odd_start_even_end = odd_carry_ends & even_bits;
|
||||
dumpbits(even_start_odd_end, "esoe");
|
||||
dumpbits(odd_start_even_end, "osee");
|
||||
|
||||
u64 odd_ends = even_start_odd_end | odd_start_even_end;
|
||||
dumpbits(odd_ends, "odd_ends");
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Step 2: detect insides of quote pairs
|
||||
|
@ -78,10 +138,12 @@ static bool find_structural_bits(const u8 * buf, size_t len, ParsedJson & pj) {
|
|||
|
||||
u64 quote_bits = cmp_mask_against_input(input_lo, input_hi, _mm256_set1_epi8('"'));
|
||||
quote_bits = quote_bits & ~odd_ends;
|
||||
dumpbits(quote_bits, "quote_bits");
|
||||
u64 quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128(_mm_set_epi64x(0ULL, quote_bits),
|
||||
_mm_set1_epi8(0xFF), 0));
|
||||
quote_mask ^= prev_iter_inside_quote;
|
||||
prev_iter_inside_quote = (u64)((s64)quote_mask>>63);
|
||||
dumpbits(quote_mask, "quote_mask");
|
||||
|
||||
// How do we build up a user traversable data structure
|
||||
// first, do a 'shufti' to detect structural JSON characters
|
||||
|
@ -133,6 +195,9 @@ static bool find_structural_bits(const u8 * buf, size_t len, ParsedJson & pj) {
|
|||
u64 ws_res_1 = _mm256_movemask_epi8(tmp_ws_hi);
|
||||
u64 whitespace = ~(ws_res_0 | (ws_res_1 << 32));
|
||||
|
||||
dumpbits(structurals, "structurals");
|
||||
dumpbits(whitespace, "whitespace");
|
||||
|
||||
// mask off anything inside quotes
|
||||
structurals &= ~quote_mask;
|
||||
|
||||
|
@ -149,14 +214,20 @@ static bool find_structural_bits(const u8 * buf, size_t len, ParsedJson & pj) {
|
|||
// a qualified predecessor is something that can happen 1 position before an
|
||||
// psuedo-structural character
|
||||
u64 pseudo_pred = structurals | whitespace;
|
||||
dumpbits(pseudo_pred, "pseudo_pred");
|
||||
u64 shifted_pseudo_pred = (pseudo_pred << 1) | prev_iter_ends_pseudo_pred;
|
||||
dumpbits(shifted_pseudo_pred, "shifted_pseudo_pred");
|
||||
prev_iter_ends_pseudo_pred = pseudo_pred >> 63;
|
||||
u64 pseudo_structurals = shifted_pseudo_pred & (~whitespace) & (~quote_mask);
|
||||
dumpbits(pseudo_structurals, "pseudo_structurals");
|
||||
dumpbits(structurals, "final structurals without pseudos");
|
||||
structurals |= pseudo_structurals;
|
||||
dumpbits(structurals, "final structurals and pseudo structurals");
|
||||
|
||||
// now, we've used our close quotes all we need to. So let's switch them off
|
||||
// they will be off in the quote mask and on in quote bits.
|
||||
structurals &= ~(quote_bits & ~quote_mask);
|
||||
dumpbits(structurals, "final structurals and pseudo structurals after close quote removal");
|
||||
*(u64 *)(pj.structurals + idx/8) = structurals;
|
||||
}
|
||||
return true;
|
||||
|
@ -166,15 +237,30 @@ const u32 NUM_RESERVED_NODES = 2;
|
|||
const u32 DUMMY_NODE = 0;
|
||||
const u32 ROOT_NODE = 1;
|
||||
|
||||
#define VECDECODE
|
||||
#ifdef VECDECODE
|
||||
#include "vecdecode.h"
|
||||
#endif
|
||||
// just transform the bitmask to a big list of 32-bit integers for now
|
||||
// that's all; the type of character the offset points to will
|
||||
// tell us exactly what we need to know. Naive but straightforward implementation
|
||||
static bool flatten_indexes(size_t len, ParsedJson & pj) {
|
||||
u32 base = NUM_RESERVED_NODES;
|
||||
never_inline bool flatten_indexes(size_t len, ParsedJson & pj) {
|
||||
u32 * base_ptr = pj.structural_indexes;
|
||||
base_ptr[DUMMY_NODE] = base_ptr[ROOT_NODE] = 0; // really shouldn't matter
|
||||
#ifdef VECDECODE
|
||||
u32 number = bitmap_decode_avx2(pj.structurals, len, base_ptr + NUM_RESERVED_NODES) + NUM_RESERVED_NODES;
|
||||
pj.n_structural_indexes = number;
|
||||
base_ptr[pj.n_structural_indexes] = 0; // make it safe to dereference one beyond this array
|
||||
return true;
|
||||
#else
|
||||
u32 base = NUM_RESERVED_NODES;
|
||||
for (size_t idx = 0; idx < len; idx+=64) {
|
||||
u64 s = *(u64 *)(pj.structurals + idx/8);
|
||||
#ifdef SUPPRESS_CHEESY_FLATTEN
|
||||
while (s) {
|
||||
base_ptr[base++] = (u32)idx + __builtin_ctzll(s); s &= s - 1ULL;
|
||||
}
|
||||
#else
|
||||
u32 cnt = __builtin_popcountll(s);
|
||||
u32 next_base = base + cnt;
|
||||
while (s) {
|
||||
|
@ -193,14 +279,20 @@ static bool flatten_indexes(size_t len, ParsedJson & pj) {
|
|||
base += 6;
|
||||
}
|
||||
base = next_base;
|
||||
#endif
|
||||
}
|
||||
pj.n_structural_indexes = base;
|
||||
base_ptr[pj.n_structural_indexes] = 0; // make it safe to dereference one beyond this array
|
||||
return true;
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
const u32 MAX_DEPTH = 256;
|
||||
const u32 DEPTH_SAFETY_MARGIN = 32; // should be power-of-2 as we check this with a modulo in our
|
||||
// hot stage 3 loop
|
||||
const u32 START_DEPTH = DEPTH_SAFETY_MARGIN;
|
||||
const u32 REDLINE_DEPTH = MAX_DEPTH - DEPTH_SAFETY_MARGIN;
|
||||
|
||||
// the ape machine consists of two parts:
|
||||
//
|
||||
|
@ -277,7 +369,7 @@ const size_t MAX_TAPE = MAX_DEPTH * MAX_TAPE_ENTRIES;
|
|||
|
||||
// all of this stuff needs to get moved somewhere reasonable
|
||||
// like our ParsedJson structure
|
||||
u32 tape[MAX_TAPE];
|
||||
u64 tape[MAX_TAPE];
|
||||
u32 tape_locs[MAX_DEPTH];
|
||||
u8 string_buf[512*1024];
|
||||
u8 * current_string_buf_loc;
|
||||
|
@ -290,6 +382,20 @@ u32 trans[MAX_STATES][256];
|
|||
u32 states[MAX_DEPTH];
|
||||
const int START_STATE = 1;
|
||||
|
||||
// weird sub-machine for starting depth only
|
||||
// we start at 13 and go to 14 on a single UNARY
|
||||
// 14 doesn't have to have any transitions. Anything
|
||||
// else arrives after the single thing it's an error
|
||||
const int START_DEPTH_START_STATE = 13;
|
||||
|
||||
// ANYTHING_IS_ERROR_STATE is useful both as a target
|
||||
// for a transition at the start depth and also as
|
||||
// a good initial value for "red line" depths; that
|
||||
// is, depths that are maintained strictly to avoid
|
||||
// undefined behavior (e.g. depths below the starting
|
||||
// depth).
|
||||
const int ANYTHING_IS_ERROR_STATE = 14;
|
||||
|
||||
never_inline void init_state_machine() {
|
||||
// states 10 and 6 eliminated
|
||||
|
||||
|
@ -310,6 +416,7 @@ never_inline void init_state_machine() {
|
|||
trans[ 5][(u32)UNARIES[i]] = 7;
|
||||
trans[ 9][(u32)UNARIES[i]] = 11;
|
||||
trans[12][(u32)UNARIES[i]] = 11;
|
||||
trans[13][(u32)UNARIES[i]] = 14;
|
||||
}
|
||||
|
||||
// back transitions when new things are open
|
||||
|
@ -321,10 +428,10 @@ never_inline void init_state_machine() {
|
|||
trans[7]['['] = 9;
|
||||
trans[9]['['] = 9;
|
||||
trans[11]['['] = 9;
|
||||
|
||||
}
|
||||
|
||||
static bool ape_machine(const u8 * buf, UNUSED size_t len, ParsedJson & pj) {
|
||||
|
||||
never_inline bool ape_machine(const u8 * buf, UNUSED size_t len, ParsedJson & pj) {
|
||||
// NOTE - our depth is used by both the tape machine and the state machine
|
||||
// Further, in production we will set it to a largish value in a generous buffer as a rogue input
|
||||
// could consist of many {[ characters or many }] characters. We aren't busily checking errors
|
||||
|
@ -337,11 +444,17 @@ static bool ape_machine(const u8 * buf, UNUSED size_t len, ParsedJson & pj) {
|
|||
// is an error (so we can detect max_depth violations by making sure that specious tape locations haven't
|
||||
// moved from their starting values)
|
||||
|
||||
u32 depth = 1;
|
||||
u32 depth = START_DEPTH;
|
||||
|
||||
for (u32 i = 0; i < MAX_DEPTH; i++) {
|
||||
tape_locs[i] = i*MAX_TAPE_ENTRIES;
|
||||
states[i] = START_STATE;
|
||||
if (i == START_DEPTH) {
|
||||
states[i] = START_DEPTH_START_STATE;
|
||||
} else if ((i < START_DEPTH) || (i >= REDLINE_DEPTH)) {
|
||||
states[i] = ANYTHING_IS_ERROR_STATE;
|
||||
} else {
|
||||
states[i] = START_STATE;
|
||||
}
|
||||
}
|
||||
|
||||
current_string_buf_loc = string_buf;
|
||||
|
@ -355,6 +468,18 @@ static bool ape_machine(const u8 * buf, UNUSED size_t len, ParsedJson & pj) {
|
|||
u32 next_control = char_control[next_c];
|
||||
|
||||
for (u32 i = NUM_RESERVED_NODES; i < pj.n_structural_indexes; i++) {
|
||||
|
||||
// very periodic safety checking. This does NOT guarantee that we
|
||||
// haven't been in our dangerous zones above or below our normal
|
||||
// depths. It ONLY checks to be sure that we don't manage to leave
|
||||
// these zones and write completely off our tape.
|
||||
if (!(i%DEPTH_SAFETY_MARGIN)) {
|
||||
if (depth < START_DEPTH || depth >= REDLINE_DEPTH) {
|
||||
error_sump |= 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
u32 idx = next_idx;
|
||||
u8 c = next_c;
|
||||
u32 control = next_control;
|
||||
|
@ -368,13 +493,61 @@ static bool ape_machine(const u8 * buf, UNUSED size_t len, ParsedJson & pj) {
|
|||
u8 write_size = get_write_size(control);
|
||||
u32 write_val = (depth_adjust != 0) ? old_tape_loc : idx;
|
||||
depth += depth_adjust;
|
||||
//states[depth] = trans[states[depth]][c];
|
||||
#ifdef DEBUG
|
||||
cout << "i: " << i << " idx: " << idx << " c " << c << "\n";
|
||||
cout << "TAPE MACHINE: depth change " << (s32)depth_adjust
|
||||
<< " write_size " << (u32)write_size << " current_depth: " << depth << "\n";
|
||||
#endif
|
||||
|
||||
// STATE MACHINE - hoisted here to fill in during the tape machine's latencies
|
||||
#ifdef DEBUG
|
||||
cout << "STATE MACHINE: state[depth] pre " << states[depth] << " ";
|
||||
#endif
|
||||
states[depth] = trans[states[depth]][c];
|
||||
#ifdef DEBUG
|
||||
cout << "post " << states[depth] << "\n";
|
||||
#endif
|
||||
// TAPE MACHINE, again
|
||||
tape[tape_locs[depth]] = write_val | (c << 24); // hack. Assumes no more than 2^24 tape items and buffer size for now
|
||||
tape[tape_locs[depth]] = write_val | (((u64)c) << 56);
|
||||
old_tape_loc = tape_locs[depth] += write_size;
|
||||
}
|
||||
/*
|
||||
for (u32 i = 0; i < MAX_DEPTH; i++) {
|
||||
if (states[i] == 0) {
|
||||
printf("duuh\n");
|
||||
return false;
|
||||
}
|
||||
}*/
|
||||
|
||||
#define DUMP_TAPES
|
||||
#ifdef DEBUG
|
||||
for (u32 i = 0; i < MAX_DEPTH; i++) {
|
||||
u32 start_loc = i*MAX_TAPE_ENTRIES;
|
||||
cout << " tape section i " << i;
|
||||
if (i == START_DEPTH) {
|
||||
cout << " (START) ";
|
||||
} else if ((i < START_DEPTH) || (i >= REDLINE_DEPTH)) {
|
||||
cout << " (REDLINE) ";
|
||||
} else {
|
||||
cout << " (NORMAL) ";
|
||||
}
|
||||
|
||||
cout << " from: " << start_loc
|
||||
<< " to: " << tape_locs[i] << " "
|
||||
<< " size: " << (tape_locs[i]-start_loc) << "\n";
|
||||
cout << " state: " << states[i] << "\n";
|
||||
#ifdef DUMP_TAPES
|
||||
for (u32 j = start_loc; j < tape_locs[i]; j++) {
|
||||
if (tape[j]) {
|
||||
cout << "j: " << j << " tape[j] char " << (char)(tape[j]>>56)
|
||||
<< " tape[j][0..55]: " << (tape[j]&0xffffffffffffffULL ) << "\n";
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
if (error_sump) {
|
||||
printf("error_sump\n");
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
|
@ -500,7 +673,7 @@ bool hex_to_u32(const u8 * src, u32 * res) {
|
|||
// dest will advance a variable amount (return via pointer)
|
||||
// return true if the unicode codepoint was valid
|
||||
// We work in little-endian then swap at write time
|
||||
static bool handle_unicode_codepoint(const u8 ** src_ptr, u8 ** dst_ptr) {
|
||||
really_inline bool handle_unicode_codepoint(const u8 ** src_ptr, u8 ** dst_ptr) {
|
||||
u32 code_point = 0; // read the hex, potentially reading another \u beyond if it's a // wacky one
|
||||
if (!hex_to_u32(*src_ptr + 2, &code_point)) {
|
||||
return false;
|
||||
|
@ -534,20 +707,43 @@ static bool handle_unicode_codepoint(const u8 ** src_ptr, u8 ** dst_ptr) {
|
|||
return true;
|
||||
}
|
||||
|
||||
static bool parse_string(const u8 * buf, UNUSED size_t len, UNUSED ParsedJson & pj, u32 tape_loc) {
|
||||
really_inline bool parse_string(const u8 * buf, UNUSED size_t len, UNUSED ParsedJson & pj, u32 tape_loc) {
|
||||
u32 offset = tape[tape_loc] & 0xffffff;
|
||||
const u8 * src = &buf[offset+1]; // we know that buf at offset is a "
|
||||
u8 * dst = current_string_buf_loc;
|
||||
#ifdef DEBUG
|
||||
cout << "Entering parse string with offset " << offset << "\n";
|
||||
#endif
|
||||
// basic non-sexy parsing code
|
||||
while (1) {
|
||||
#ifdef DEBUG
|
||||
for (u32 j = 0; j < 32; j++) {
|
||||
char c = *(src+j);
|
||||
if (isprint(c)) {
|
||||
cout << c;
|
||||
} else {
|
||||
cout << '_';
|
||||
}
|
||||
}
|
||||
cout << "| ... string handling input\n";
|
||||
#endif
|
||||
m256 v = _mm256_loadu_si256((const m256 *)(src));
|
||||
u32 bs_bits = (u32)_mm256_movemask_epi8(_mm256_cmpeq_epi8(v, _mm256_set1_epi8('\\')));
|
||||
dumpbits32(bs_bits, "backslash bits 2");
|
||||
u32 quote_bits = (u32)_mm256_movemask_epi8(_mm256_cmpeq_epi8(v, _mm256_set1_epi8('"')));
|
||||
dumpbits32(quote_bits, "quote_bits");
|
||||
u32 quote_dist = __builtin_ctz(quote_bits);
|
||||
u32 bs_dist = __builtin_ctz(bs_bits);
|
||||
// store to dest unconditionally - we can overwrite the bits we don't like later
|
||||
_mm256_storeu_si256((m256 *)(dst), v);
|
||||
#ifdef DEBUG
|
||||
cout << "quote dist: " << quote_dist << " bs dist: " << bs_dist << "\n";
|
||||
#endif
|
||||
|
||||
if (quote_dist < bs_dist) {
|
||||
#ifdef DEBUG
|
||||
cout << "Found end, leaving!\n";
|
||||
#endif
|
||||
// we encountered quotes first. Move dst to point to quotes and exit
|
||||
dst[quote_dist] = 0; // null terminate and get out
|
||||
current_string_buf_loc = dst + quote_dist + 1;
|
||||
|
@ -555,6 +751,9 @@ static bool parse_string(const u8 * buf, UNUSED size_t len, UNUSED ParsedJson &
|
|||
return true;
|
||||
} else if (quote_dist > bs_dist) {
|
||||
u8 escape_char = src[bs_dist+1];
|
||||
#ifdef DEBUG
|
||||
cout << "Found escape char: " << escape_char << "\n";
|
||||
#endif
|
||||
// we encountered backslash first. Handle backslash
|
||||
if (escape_char == 'u') {
|
||||
// move src/dst up to the start; they will be further adjusted
|
||||
|
@ -600,7 +799,7 @@ static bool parse_string(const u8 * buf, UNUSED size_t len, UNUSED ParsedJson &
|
|||
|
||||
// TODO: see if we really need a separate number_buf or whether we should just
|
||||
// have a generic scratch - would need to align before using for this
|
||||
static bool parse_number(const u8 * buf, UNUSED size_t len, UNUSED ParsedJson & pj, u32 tape_loc, UNUSED bool found_zero, bool found_minus) {
|
||||
really_inline bool parse_number(const u8 * buf, UNUSED size_t len, UNUSED ParsedJson & pj, u32 tape_loc, UNUSED bool found_zero, bool found_minus) {
|
||||
u32 offset = tape[tape_loc] & 0xffffff;
|
||||
if (found_minus) {
|
||||
offset++;
|
||||
|
@ -608,6 +807,17 @@ static bool parse_number(const u8 * buf, UNUSED size_t len, UNUSED ParsedJson &
|
|||
const u8 * src = &buf[offset];
|
||||
m256 v = _mm256_loadu_si256((const m256 *)(src));
|
||||
u64 error_sump = 0;
|
||||
#ifdef DEBUG
|
||||
for (u32 j = 0; j < 32; j++) {
|
||||
char c = *(src+j);
|
||||
if (isprint(c)) {
|
||||
cout << c;
|
||||
} else {
|
||||
cout << '_';
|
||||
}
|
||||
}
|
||||
cout << "| ... number handling input\n";
|
||||
#endif
|
||||
|
||||
// categories to extract
|
||||
// Digits:
|
||||
|
@ -642,6 +852,7 @@ static bool parse_number(const u8 * buf, UNUSED size_t len, UNUSED ParsedJson &
|
|||
m256 tmp_enders = _mm256_cmpeq_epi8(_mm256_and_si256(tmp, enders_mask),
|
||||
_mm256_set1_epi8(0));
|
||||
u32 enders = ~(u32)_mm256_movemask_epi8(tmp_enders);
|
||||
dumpbits32(enders, "ender characters");
|
||||
|
||||
if (enders == 0) {
|
||||
// TODO: scream for help if enders == 0 which means we have
|
||||
|
@ -649,6 +860,7 @@ static bool parse_number(const u8 * buf, UNUSED size_t len, UNUSED ParsedJson &
|
|||
}
|
||||
// TODO: make a mask that indicates where our digits are
|
||||
u32 number_mask = ~enders & (enders-1);
|
||||
dumpbits32(number_mask, "number mask");
|
||||
|
||||
m256 n_mask = _mm256_set1_epi8(0x1f);
|
||||
m256 tmp_n = _mm256_cmpeq_epi8(_mm256_and_si256(tmp, n_mask),
|
||||
|
@ -660,32 +872,38 @@ static bool parse_number(const u8 * buf, UNUSED size_t len, UNUSED ParsedJson &
|
|||
// for the inside of our JSON
|
||||
number_characters &= number_mask;
|
||||
error_sump |= number_characters ^ number_mask;
|
||||
dumpbits32(number_characters, "number characters");
|
||||
|
||||
m256 d_mask = _mm256_set1_epi8(0x03);
|
||||
m256 tmp_d = _mm256_cmpeq_epi8(_mm256_and_si256(tmp, d_mask),
|
||||
_mm256_set1_epi8(0));
|
||||
u32 digit_characters = ~(u32)_mm256_movemask_epi8(tmp_d);
|
||||
digit_characters &= number_mask;
|
||||
dumpbits32(digit_characters, "digit characters");
|
||||
|
||||
m256 p_mask = _mm256_set1_epi8(0x04);
|
||||
m256 tmp_p = _mm256_cmpeq_epi8(_mm256_and_si256(tmp, p_mask),
|
||||
_mm256_set1_epi8(0));
|
||||
u32 decimal_characters = ~(u32)_mm256_movemask_epi8(tmp_p);
|
||||
decimal_characters &= number_mask;
|
||||
dumpbits32(decimal_characters, "decimal characters");
|
||||
|
||||
m256 e_mask = _mm256_set1_epi8(0x08);
|
||||
m256 tmp_e = _mm256_cmpeq_epi8(_mm256_and_si256(tmp, e_mask),
|
||||
_mm256_set1_epi8(0));
|
||||
u32 exponent_characters = ~(u32)_mm256_movemask_epi8(tmp_e);
|
||||
exponent_characters &= number_mask;
|
||||
dumpbits32(exponent_characters, "exponent characters");
|
||||
|
||||
m256 s_mask = _mm256_set1_epi8(0x10);
|
||||
m256 tmp_s = _mm256_cmpeq_epi8(_mm256_and_si256(tmp, s_mask),
|
||||
_mm256_set1_epi8(0));
|
||||
u32 sign_characters = ~(u32)_mm256_movemask_epi8(tmp_s);
|
||||
sign_characters &= number_mask;
|
||||
dumpbits32(sign_characters, "sign characters");
|
||||
|
||||
u32 digit_edges = ~(digit_characters << 1) & digit_characters;
|
||||
dumpbits32(digit_edges, "digit_edges");
|
||||
|
||||
// check that we have 1-3 'edges' only
|
||||
u32 t = digit_edges;
|
||||
|
@ -713,6 +931,9 @@ static bool parse_number(const u8 * buf, UNUSED size_t len, UNUSED ParsedJson &
|
|||
if (found_minus) {
|
||||
result = -result;
|
||||
}
|
||||
#ifdef DEBUG
|
||||
cout << "Found number " << result << "\n";
|
||||
#endif
|
||||
*((u64 *)current_number_buf_loc) = result;
|
||||
tape[tape_loc] = ((u32)'l') << 24 | (current_number_buf_loc - number_buf); // assume 2^24 will hold all numbers for now
|
||||
current_number_buf_loc += 8;
|
||||
|
@ -727,6 +948,9 @@ static bool parse_number(const u8 * buf, UNUSED size_t len, UNUSED ParsedJson &
|
|||
if (found_minus) {
|
||||
result = -result;
|
||||
}
|
||||
#ifdef DEBUG
|
||||
cout << "Found number " << result << "\n";
|
||||
#endif
|
||||
*((double *)current_number_buf_loc) = result;
|
||||
tape[tape_loc] = ((u32)'d') << 24 | (current_number_buf_loc - number_buf); // assume 2^24 will hold all numbers for now
|
||||
current_number_buf_loc += 8;
|
||||
|
@ -764,7 +988,13 @@ static bool parse_number(const u8 * buf, UNUSED size_t len, UNUSED ParsedJson &
|
|||
return true;
|
||||
}
|
||||
|
||||
static bool shovel_machine(const u8 * buf, size_t len, ParsedJson & pj) {
|
||||
bool tape_disturbed(u32 i) {
|
||||
u32 start_loc = i*MAX_TAPE_ENTRIES;
|
||||
u32 end_loc = tape_locs[i];
|
||||
return start_loc != end_loc;
|
||||
}
|
||||
|
||||
never_inline bool shovel_machine(const u8 * buf, size_t len, ParsedJson & pj) {
|
||||
// fixup the mess made by the ape_machine
|
||||
// as such it does a bunch of miscellaneous things on the tapes
|
||||
u32 error_sump = 0;
|
||||
|
@ -774,13 +1004,23 @@ static bool shovel_machine(const u8 * buf, size_t len, ParsedJson & pj) {
|
|||
u64 mask4 = 0x00000000ffffffff;
|
||||
u64 mask5 = 0x000000ffffffffff;
|
||||
|
||||
// if the tape has been touched at all at the depths outside the safe
|
||||
// zone we need to quit. Note that our periodic checks to see that we're
|
||||
// inside our safe zone in stage 3 don't guarantee that the system did
|
||||
// not get into the danger area briefly.
|
||||
if (tape_disturbed(START_DEPTH - 1) || tape_disturbed(REDLINE_DEPTH)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// walk over each tape
|
||||
for (u32 i = 0; i < MAX_DEPTH; i++) {
|
||||
for (u32 i = START_DEPTH; i < MAX_DEPTH; i++) {
|
||||
u32 start_loc = i*MAX_TAPE_ENTRIES;
|
||||
u32 end_loc = tape_locs[i];
|
||||
|
||||
if (start_loc == end_loc) {
|
||||
break;
|
||||
}
|
||||
for (u32 j = start_loc; j < end_loc; j++) {
|
||||
switch (tape[j]>>24) {
|
||||
switch (tape[j]>>56) {
|
||||
case '{': case '[': {
|
||||
// pivot our tapes
|
||||
// point the enclosing structural char (}]) to the head marker ({[) and
|
||||
|
@ -788,10 +1028,10 @@ static bool shovel_machine(const u8 * buf, size_t len, ParsedJson & pj) {
|
|||
// we start with head marker pointing at the enclosing structural char
|
||||
// and the enclosing structural char pointing at the end. Just swap them.
|
||||
// also check the balanced-{} or [] property here
|
||||
u8 head_marker_c = tape[j] >> 24;
|
||||
u32 head_marker_loc = tape[j] & 0xffffff;
|
||||
u32 tape_enclosing = tape[head_marker_loc];
|
||||
u8 enclosing_c = tape_enclosing >> 24;
|
||||
u8 head_marker_c = tape[j] >> 56;
|
||||
u32 head_marker_loc = tape[j] & 0xffffffffffffffULL;
|
||||
u64 tape_enclosing = tape[head_marker_loc];
|
||||
u8 enclosing_c = tape_enclosing >> 56;
|
||||
tape[head_marker_loc] = tape[j];
|
||||
tape[j] = tape_enclosing;
|
||||
error_sump |= (enclosing_c - head_marker_c - 2); // [] and {} only differ by 2 chars
|
||||
|
@ -811,21 +1051,21 @@ static bool shovel_machine(const u8 * buf, size_t len, ParsedJson & pj) {
|
|||
error_sump |= !parse_number(buf, len, pj, j, false, true);
|
||||
break;
|
||||
case 't': {
|
||||
u32 offset = tape[j] & 0xffffff;
|
||||
u32 offset = tape[j] & 0xffffffffffffffULL;
|
||||
const u8 * loc = buf + offset;
|
||||
error_sump |= ((*(const u64 *)loc) & mask4) ^ tv;
|
||||
error_sump |= is_not_structural_or_whitespace(loc[4]);
|
||||
break;
|
||||
}
|
||||
case 'f': {
|
||||
u32 offset = tape[j] & 0xffffff;
|
||||
u32 offset = tape[j] & 0xffffffffffffffULL;
|
||||
const u8 * loc = buf + offset;
|
||||
error_sump |= ((*(const u64 *)loc) & mask5) ^ fv;
|
||||
error_sump |= is_not_structural_or_whitespace(loc[5]);
|
||||
break;
|
||||
}
|
||||
case 'n': {
|
||||
u32 offset = tape[j] & 0xffffff;
|
||||
u32 offset = tape[j] & 0xffffffffffffffULL;
|
||||
const u8 * loc = buf + offset;
|
||||
error_sump |= ((*(const u64 *)loc) & mask4) ^ nv;
|
||||
error_sump |= is_not_structural_or_whitespace(loc[4]);
|
||||
|
@ -837,6 +1077,7 @@ static bool shovel_machine(const u8 * buf, size_t len, ParsedJson & pj) {
|
|||
}
|
||||
}
|
||||
if (error_sump) {
|
||||
cerr << "Ugh!\n";
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
|
@ -845,8 +1086,14 @@ static bool shovel_machine(const u8 * buf, size_t len, ParsedJson & pj) {
|
|||
|
||||
|
||||
|
||||
|
||||
|
||||
static bool avx_json_parse(const u8 * buf, size_t len, ParsedJson & pj) {
|
||||
find_structural_bits(buf, len, pj);
|
||||
flatten_indexes(len, pj);
|
||||
return ape_machine(buf, len, pj) && shovel_machine(buf, len, pj);
|
||||
bool apeok = ape_machine(buf, len, pj);
|
||||
if(!apeok) {
|
||||
return false;
|
||||
}
|
||||
return shovel_machine(buf, len, pj);
|
||||
}
|
||||
|
|
|
@ -3,9 +3,9 @@
|
|||
#include "common_defs.h"
|
||||
|
||||
struct JsonNode {
|
||||
u32 up;
|
||||
u32 next;
|
||||
u32 prev;
|
||||
u32 next_type;
|
||||
u64 payload; // a freeform 'payload' holding a parsed representation of *something*
|
||||
};
|
||||
|
||||
struct ParsedJson {
|
||||
|
@ -68,15 +68,3 @@ void colorfuldisplay(ParsedJson & pj, const u8 * buf) {
|
|||
}
|
||||
std::cout << std::endl;
|
||||
}
|
||||
|
||||
void debugdisplay(ParsedJson & pj, const u8 * buf) {
|
||||
for (u32 i = 0; i < pj.n_structural_indexes; i++) {
|
||||
u32 idx = pj.structural_indexes[i];
|
||||
JsonNode & n = pj.nodes[i];
|
||||
std::cout << "i: " << i;
|
||||
std::cout << " n.up: " << n.up;
|
||||
std::cout << " n.next: " << n.next;
|
||||
std::cout << " n.prev: " << n.prev;
|
||||
std::cout << " idx: " << idx << " buf[idx] " << buf[idx] << std::endl;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -34,7 +34,7 @@ static uint8_t jump_table[256 * 3] = {
|
|||
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
|
||||
};
|
||||
|
||||
static inline size_t despace(const unsigned char *bytes, size_t howmany,
|
||||
static inline size_t scalar_despace(const unsigned char *bytes, size_t howmany,
|
||||
unsigned char *out) {
|
||||
size_t i = 0, pos = 0;
|
||||
uint8_t quote = 0;
|
||||
|
|
|
@ -1,79 +0,0 @@
|
|||
#include "common_defs.h"
|
||||
#include "jsonstruct.h"
|
||||
|
||||
bool is_valid_escape(char c) {
|
||||
return (c == '"') || (c == '\\') || (c == '/') || (c == 'b') || (c == 'f') || (c == 'n') || (c == 'r') || (c == 't') || (c == 'u');
|
||||
}
|
||||
|
||||
bool scalar_json_parse(const u8 * buf, size_t len, ParsedJson & pj) {
|
||||
// this is a naive attempt at this point
|
||||
// it will probably be subject to failures given adversarial inputs
|
||||
size_t pos = 0;
|
||||
size_t last = 0;
|
||||
size_t up = 0;
|
||||
|
||||
const u32 DUMMY_NODE = 0;
|
||||
const u32 ROOT_NODE = 1;
|
||||
pj.structural_indexes[DUMMY_NODE] = 0;
|
||||
pj.structural_indexes[ROOT_NODE] = 0;
|
||||
JsonNode & dummy = pj.nodes[DUMMY_NODE];
|
||||
JsonNode & root = pj.nodes[ROOT_NODE];
|
||||
dummy.prev = dummy.up = DUMMY_NODE;
|
||||
dummy.next = 0;
|
||||
root.prev = DUMMY_NODE;
|
||||
root.up = ROOT_NODE;
|
||||
root.next = 0;
|
||||
|
||||
last = up = ROOT_NODE;
|
||||
|
||||
pos = 2;
|
||||
for(size_t i = 0; i < len; i++) {
|
||||
JsonNode & n = pj.nodes[pos];
|
||||
switch (buf[i]) {
|
||||
case '[':
|
||||
case '{':
|
||||
pj.structural_indexes[pos] = i;
|
||||
n.prev = last;
|
||||
pj.nodes[last].next = pos;// two-way linked list
|
||||
n.up = up;
|
||||
up = pos;// new possible up
|
||||
last = 0;
|
||||
pos += 1;
|
||||
|
||||
break;
|
||||
case ']':
|
||||
case '}':
|
||||
pj.structural_indexes[pos] = i;
|
||||
n.prev = up;
|
||||
n.next = 0;// necessary?
|
||||
pj.nodes[up].next = pos;// two-way linked list
|
||||
n.up = pj.nodes[up].up;
|
||||
up = pj.nodes[up].up;
|
||||
last = pos;// potential previous
|
||||
pos += 1;
|
||||
break;
|
||||
case '"':
|
||||
case ':':
|
||||
case ',':
|
||||
pj.structural_indexes[pos] = i;
|
||||
n.prev = last;
|
||||
n.next = 0;// necessary
|
||||
pj.nodes[last].next = pos;// two-way linked list
|
||||
n.up = up;
|
||||
last = pos;// potential previous
|
||||
pos += 1;
|
||||
break;
|
||||
case '\\':
|
||||
if(i == len - 1) return false;
|
||||
if(!is_valid_escape(buf[i+1])) return false;
|
||||
i = i + 1; // skip valid escape
|
||||
default:
|
||||
// nothing
|
||||
break;
|
||||
}
|
||||
|
||||
}
|
||||
pj.n_structural_indexes = pos;
|
||||
dummy.next = DUMMY_NODE; // dummy.next is a sump for meaningless 'nexts', clear it
|
||||
return true;
|
||||
}
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,335 @@
|
|||
#ifndef VECDECODE_H
|
||||
#define VECDECODE_H
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#define ALIGNED(x) __declspec(align(x))
|
||||
#else
|
||||
#if defined(__GNUC__)
|
||||
#define ALIGNED(x) __attribute__((aligned(x)))
|
||||
#endif
|
||||
#endif
|
||||
|
||||
static uint8_t lengthTable[256] = {
|
||||
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4,
|
||||
2, 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
|
||||
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4,
|
||||
2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
|
||||
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6,
|
||||
4, 5, 5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
|
||||
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5,
|
||||
3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
|
||||
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6,
|
||||
4, 5, 5, 6, 5, 6, 6, 7, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
|
||||
4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8};
|
||||
|
||||
static uint32_t vecDecodeTable[256][8] ALIGNED(16) = {
|
||||
{0, 0, 0, 0, 0, 0, 0, 0}, /* 0x00 (00000000) */
|
||||
{1, 0, 0, 0, 0, 0, 0, 0}, /* 0x01 (00000001) */
|
||||
{2, 0, 0, 0, 0, 0, 0, 0}, /* 0x02 (00000010) */
|
||||
{1, 2, 0, 0, 0, 0, 0, 0}, /* 0x03 (00000011) */
|
||||
{3, 0, 0, 0, 0, 0, 0, 0}, /* 0x04 (00000100) */
|
||||
{1, 3, 0, 0, 0, 0, 0, 0}, /* 0x05 (00000101) */
|
||||
{2, 3, 0, 0, 0, 0, 0, 0}, /* 0x06 (00000110) */
|
||||
{1, 2, 3, 0, 0, 0, 0, 0}, /* 0x07 (00000111) */
|
||||
{4, 0, 0, 0, 0, 0, 0, 0}, /* 0x08 (00001000) */
|
||||
{1, 4, 0, 0, 0, 0, 0, 0}, /* 0x09 (00001001) */
|
||||
{2, 4, 0, 0, 0, 0, 0, 0}, /* 0x0A (00001010) */
|
||||
{1, 2, 4, 0, 0, 0, 0, 0}, /* 0x0B (00001011) */
|
||||
{3, 4, 0, 0, 0, 0, 0, 0}, /* 0x0C (00001100) */
|
||||
{1, 3, 4, 0, 0, 0, 0, 0}, /* 0x0D (00001101) */
|
||||
{2, 3, 4, 0, 0, 0, 0, 0}, /* 0x0E (00001110) */
|
||||
{1, 2, 3, 4, 0, 0, 0, 0}, /* 0x0F (00001111) */
|
||||
{5, 0, 0, 0, 0, 0, 0, 0}, /* 0x10 (00010000) */
|
||||
{1, 5, 0, 0, 0, 0, 0, 0}, /* 0x11 (00010001) */
|
||||
{2, 5, 0, 0, 0, 0, 0, 0}, /* 0x12 (00010010) */
|
||||
{1, 2, 5, 0, 0, 0, 0, 0}, /* 0x13 (00010011) */
|
||||
{3, 5, 0, 0, 0, 0, 0, 0}, /* 0x14 (00010100) */
|
||||
{1, 3, 5, 0, 0, 0, 0, 0}, /* 0x15 (00010101) */
|
||||
{2, 3, 5, 0, 0, 0, 0, 0}, /* 0x16 (00010110) */
|
||||
{1, 2, 3, 5, 0, 0, 0, 0}, /* 0x17 (00010111) */
|
||||
{4, 5, 0, 0, 0, 0, 0, 0}, /* 0x18 (00011000) */
|
||||
{1, 4, 5, 0, 0, 0, 0, 0}, /* 0x19 (00011001) */
|
||||
{2, 4, 5, 0, 0, 0, 0, 0}, /* 0x1A (00011010) */
|
||||
{1, 2, 4, 5, 0, 0, 0, 0}, /* 0x1B (00011011) */
|
||||
{3, 4, 5, 0, 0, 0, 0, 0}, /* 0x1C (00011100) */
|
||||
{1, 3, 4, 5, 0, 0, 0, 0}, /* 0x1D (00011101) */
|
||||
{2, 3, 4, 5, 0, 0, 0, 0}, /* 0x1E (00011110) */
|
||||
{1, 2, 3, 4, 5, 0, 0, 0}, /* 0x1F (00011111) */
|
||||
{6, 0, 0, 0, 0, 0, 0, 0}, /* 0x20 (00100000) */
|
||||
{1, 6, 0, 0, 0, 0, 0, 0}, /* 0x21 (00100001) */
|
||||
{2, 6, 0, 0, 0, 0, 0, 0}, /* 0x22 (00100010) */
|
||||
{1, 2, 6, 0, 0, 0, 0, 0}, /* 0x23 (00100011) */
|
||||
{3, 6, 0, 0, 0, 0, 0, 0}, /* 0x24 (00100100) */
|
||||
{1, 3, 6, 0, 0, 0, 0, 0}, /* 0x25 (00100101) */
|
||||
{2, 3, 6, 0, 0, 0, 0, 0}, /* 0x26 (00100110) */
|
||||
{1, 2, 3, 6, 0, 0, 0, 0}, /* 0x27 (00100111) */
|
||||
{4, 6, 0, 0, 0, 0, 0, 0}, /* 0x28 (00101000) */
|
||||
{1, 4, 6, 0, 0, 0, 0, 0}, /* 0x29 (00101001) */
|
||||
{2, 4, 6, 0, 0, 0, 0, 0}, /* 0x2A (00101010) */
|
||||
{1, 2, 4, 6, 0, 0, 0, 0}, /* 0x2B (00101011) */
|
||||
{3, 4, 6, 0, 0, 0, 0, 0}, /* 0x2C (00101100) */
|
||||
{1, 3, 4, 6, 0, 0, 0, 0}, /* 0x2D (00101101) */
|
||||
{2, 3, 4, 6, 0, 0, 0, 0}, /* 0x2E (00101110) */
|
||||
{1, 2, 3, 4, 6, 0, 0, 0}, /* 0x2F (00101111) */
|
||||
{5, 6, 0, 0, 0, 0, 0, 0}, /* 0x30 (00110000) */
|
||||
{1, 5, 6, 0, 0, 0, 0, 0}, /* 0x31 (00110001) */
|
||||
{2, 5, 6, 0, 0, 0, 0, 0}, /* 0x32 (00110010) */
|
||||
{1, 2, 5, 6, 0, 0, 0, 0}, /* 0x33 (00110011) */
|
||||
{3, 5, 6, 0, 0, 0, 0, 0}, /* 0x34 (00110100) */
|
||||
{1, 3, 5, 6, 0, 0, 0, 0}, /* 0x35 (00110101) */
|
||||
{2, 3, 5, 6, 0, 0, 0, 0}, /* 0x36 (00110110) */
|
||||
{1, 2, 3, 5, 6, 0, 0, 0}, /* 0x37 (00110111) */
|
||||
{4, 5, 6, 0, 0, 0, 0, 0}, /* 0x38 (00111000) */
|
||||
{1, 4, 5, 6, 0, 0, 0, 0}, /* 0x39 (00111001) */
|
||||
{2, 4, 5, 6, 0, 0, 0, 0}, /* 0x3A (00111010) */
|
||||
{1, 2, 4, 5, 6, 0, 0, 0}, /* 0x3B (00111011) */
|
||||
{3, 4, 5, 6, 0, 0, 0, 0}, /* 0x3C (00111100) */
|
||||
{1, 3, 4, 5, 6, 0, 0, 0}, /* 0x3D (00111101) */
|
||||
{2, 3, 4, 5, 6, 0, 0, 0}, /* 0x3E (00111110) */
|
||||
{1, 2, 3, 4, 5, 6, 0, 0}, /* 0x3F (00111111) */
|
||||
{7, 0, 0, 0, 0, 0, 0, 0}, /* 0x40 (01000000) */
|
||||
{1, 7, 0, 0, 0, 0, 0, 0}, /* 0x41 (01000001) */
|
||||
{2, 7, 0, 0, 0, 0, 0, 0}, /* 0x42 (01000010) */
|
||||
{1, 2, 7, 0, 0, 0, 0, 0}, /* 0x43 (01000011) */
|
||||
{3, 7, 0, 0, 0, 0, 0, 0}, /* 0x44 (01000100) */
|
||||
{1, 3, 7, 0, 0, 0, 0, 0}, /* 0x45 (01000101) */
|
||||
{2, 3, 7, 0, 0, 0, 0, 0}, /* 0x46 (01000110) */
|
||||
{1, 2, 3, 7, 0, 0, 0, 0}, /* 0x47 (01000111) */
|
||||
{4, 7, 0, 0, 0, 0, 0, 0}, /* 0x48 (01001000) */
|
||||
{1, 4, 7, 0, 0, 0, 0, 0}, /* 0x49 (01001001) */
|
||||
{2, 4, 7, 0, 0, 0, 0, 0}, /* 0x4A (01001010) */
|
||||
{1, 2, 4, 7, 0, 0, 0, 0}, /* 0x4B (01001011) */
|
||||
{3, 4, 7, 0, 0, 0, 0, 0}, /* 0x4C (01001100) */
|
||||
{1, 3, 4, 7, 0, 0, 0, 0}, /* 0x4D (01001101) */
|
||||
{2, 3, 4, 7, 0, 0, 0, 0}, /* 0x4E (01001110) */
|
||||
{1, 2, 3, 4, 7, 0, 0, 0}, /* 0x4F (01001111) */
|
||||
{5, 7, 0, 0, 0, 0, 0, 0}, /* 0x50 (01010000) */
|
||||
{1, 5, 7, 0, 0, 0, 0, 0}, /* 0x51 (01010001) */
|
||||
{2, 5, 7, 0, 0, 0, 0, 0}, /* 0x52 (01010010) */
|
||||
{1, 2, 5, 7, 0, 0, 0, 0}, /* 0x53 (01010011) */
|
||||
{3, 5, 7, 0, 0, 0, 0, 0}, /* 0x54 (01010100) */
|
||||
{1, 3, 5, 7, 0, 0, 0, 0}, /* 0x55 (01010101) */
|
||||
{2, 3, 5, 7, 0, 0, 0, 0}, /* 0x56 (01010110) */
|
||||
{1, 2, 3, 5, 7, 0, 0, 0}, /* 0x57 (01010111) */
|
||||
{4, 5, 7, 0, 0, 0, 0, 0}, /* 0x58 (01011000) */
|
||||
{1, 4, 5, 7, 0, 0, 0, 0}, /* 0x59 (01011001) */
|
||||
{2, 4, 5, 7, 0, 0, 0, 0}, /* 0x5A (01011010) */
|
||||
{1, 2, 4, 5, 7, 0, 0, 0}, /* 0x5B (01011011) */
|
||||
{3, 4, 5, 7, 0, 0, 0, 0}, /* 0x5C (01011100) */
|
||||
{1, 3, 4, 5, 7, 0, 0, 0}, /* 0x5D (01011101) */
|
||||
{2, 3, 4, 5, 7, 0, 0, 0}, /* 0x5E (01011110) */
|
||||
{1, 2, 3, 4, 5, 7, 0, 0}, /* 0x5F (01011111) */
|
||||
{6, 7, 0, 0, 0, 0, 0, 0}, /* 0x60 (01100000) */
|
||||
{1, 6, 7, 0, 0, 0, 0, 0}, /* 0x61 (01100001) */
|
||||
{2, 6, 7, 0, 0, 0, 0, 0}, /* 0x62 (01100010) */
|
||||
{1, 2, 6, 7, 0, 0, 0, 0}, /* 0x63 (01100011) */
|
||||
{3, 6, 7, 0, 0, 0, 0, 0}, /* 0x64 (01100100) */
|
||||
{1, 3, 6, 7, 0, 0, 0, 0}, /* 0x65 (01100101) */
|
||||
{2, 3, 6, 7, 0, 0, 0, 0}, /* 0x66 (01100110) */
|
||||
{1, 2, 3, 6, 7, 0, 0, 0}, /* 0x67 (01100111) */
|
||||
{4, 6, 7, 0, 0, 0, 0, 0}, /* 0x68 (01101000) */
|
||||
{1, 4, 6, 7, 0, 0, 0, 0}, /* 0x69 (01101001) */
|
||||
{2, 4, 6, 7, 0, 0, 0, 0}, /* 0x6A (01101010) */
|
||||
{1, 2, 4, 6, 7, 0, 0, 0}, /* 0x6B (01101011) */
|
||||
{3, 4, 6, 7, 0, 0, 0, 0}, /* 0x6C (01101100) */
|
||||
{1, 3, 4, 6, 7, 0, 0, 0}, /* 0x6D (01101101) */
|
||||
{2, 3, 4, 6, 7, 0, 0, 0}, /* 0x6E (01101110) */
|
||||
{1, 2, 3, 4, 6, 7, 0, 0}, /* 0x6F (01101111) */
|
||||
{5, 6, 7, 0, 0, 0, 0, 0}, /* 0x70 (01110000) */
|
||||
{1, 5, 6, 7, 0, 0, 0, 0}, /* 0x71 (01110001) */
|
||||
{2, 5, 6, 7, 0, 0, 0, 0}, /* 0x72 (01110010) */
|
||||
{1, 2, 5, 6, 7, 0, 0, 0}, /* 0x73 (01110011) */
|
||||
{3, 5, 6, 7, 0, 0, 0, 0}, /* 0x74 (01110100) */
|
||||
{1, 3, 5, 6, 7, 0, 0, 0}, /* 0x75 (01110101) */
|
||||
{2, 3, 5, 6, 7, 0, 0, 0}, /* 0x76 (01110110) */
|
||||
{1, 2, 3, 5, 6, 7, 0, 0}, /* 0x77 (01110111) */
|
||||
{4, 5, 6, 7, 0, 0, 0, 0}, /* 0x78 (01111000) */
|
||||
{1, 4, 5, 6, 7, 0, 0, 0}, /* 0x79 (01111001) */
|
||||
{2, 4, 5, 6, 7, 0, 0, 0}, /* 0x7A (01111010) */
|
||||
{1, 2, 4, 5, 6, 7, 0, 0}, /* 0x7B (01111011) */
|
||||
{3, 4, 5, 6, 7, 0, 0, 0}, /* 0x7C (01111100) */
|
||||
{1, 3, 4, 5, 6, 7, 0, 0}, /* 0x7D (01111101) */
|
||||
{2, 3, 4, 5, 6, 7, 0, 0}, /* 0x7E (01111110) */
|
||||
{1, 2, 3, 4, 5, 6, 7, 0}, /* 0x7F (01111111) */
|
||||
{8, 0, 0, 0, 0, 0, 0, 0}, /* 0x80 (10000000) */
|
||||
{1, 8, 0, 0, 0, 0, 0, 0}, /* 0x81 (10000001) */
|
||||
{2, 8, 0, 0, 0, 0, 0, 0}, /* 0x82 (10000010) */
|
||||
{1, 2, 8, 0, 0, 0, 0, 0}, /* 0x83 (10000011) */
|
||||
{3, 8, 0, 0, 0, 0, 0, 0}, /* 0x84 (10000100) */
|
||||
{1, 3, 8, 0, 0, 0, 0, 0}, /* 0x85 (10000101) */
|
||||
{2, 3, 8, 0, 0, 0, 0, 0}, /* 0x86 (10000110) */
|
||||
{1, 2, 3, 8, 0, 0, 0, 0}, /* 0x87 (10000111) */
|
||||
{4, 8, 0, 0, 0, 0, 0, 0}, /* 0x88 (10001000) */
|
||||
{1, 4, 8, 0, 0, 0, 0, 0}, /* 0x89 (10001001) */
|
||||
{2, 4, 8, 0, 0, 0, 0, 0}, /* 0x8A (10001010) */
|
||||
{1, 2, 4, 8, 0, 0, 0, 0}, /* 0x8B (10001011) */
|
||||
{3, 4, 8, 0, 0, 0, 0, 0}, /* 0x8C (10001100) */
|
||||
{1, 3, 4, 8, 0, 0, 0, 0}, /* 0x8D (10001101) */
|
||||
{2, 3, 4, 8, 0, 0, 0, 0}, /* 0x8E (10001110) */
|
||||
{1, 2, 3, 4, 8, 0, 0, 0}, /* 0x8F (10001111) */
|
||||
{5, 8, 0, 0, 0, 0, 0, 0}, /* 0x90 (10010000) */
|
||||
{1, 5, 8, 0, 0, 0, 0, 0}, /* 0x91 (10010001) */
|
||||
{2, 5, 8, 0, 0, 0, 0, 0}, /* 0x92 (10010010) */
|
||||
{1, 2, 5, 8, 0, 0, 0, 0}, /* 0x93 (10010011) */
|
||||
{3, 5, 8, 0, 0, 0, 0, 0}, /* 0x94 (10010100) */
|
||||
{1, 3, 5, 8, 0, 0, 0, 0}, /* 0x95 (10010101) */
|
||||
{2, 3, 5, 8, 0, 0, 0, 0}, /* 0x96 (10010110) */
|
||||
{1, 2, 3, 5, 8, 0, 0, 0}, /* 0x97 (10010111) */
|
||||
{4, 5, 8, 0, 0, 0, 0, 0}, /* 0x98 (10011000) */
|
||||
{1, 4, 5, 8, 0, 0, 0, 0}, /* 0x99 (10011001) */
|
||||
{2, 4, 5, 8, 0, 0, 0, 0}, /* 0x9A (10011010) */
|
||||
{1, 2, 4, 5, 8, 0, 0, 0}, /* 0x9B (10011011) */
|
||||
{3, 4, 5, 8, 0, 0, 0, 0}, /* 0x9C (10011100) */
|
||||
{1, 3, 4, 5, 8, 0, 0, 0}, /* 0x9D (10011101) */
|
||||
{2, 3, 4, 5, 8, 0, 0, 0}, /* 0x9E (10011110) */
|
||||
{1, 2, 3, 4, 5, 8, 0, 0}, /* 0x9F (10011111) */
|
||||
{6, 8, 0, 0, 0, 0, 0, 0}, /* 0xA0 (10100000) */
|
||||
{1, 6, 8, 0, 0, 0, 0, 0}, /* 0xA1 (10100001) */
|
||||
{2, 6, 8, 0, 0, 0, 0, 0}, /* 0xA2 (10100010) */
|
||||
{1, 2, 6, 8, 0, 0, 0, 0}, /* 0xA3 (10100011) */
|
||||
{3, 6, 8, 0, 0, 0, 0, 0}, /* 0xA4 (10100100) */
|
||||
{1, 3, 6, 8, 0, 0, 0, 0}, /* 0xA5 (10100101) */
|
||||
{2, 3, 6, 8, 0, 0, 0, 0}, /* 0xA6 (10100110) */
|
||||
{1, 2, 3, 6, 8, 0, 0, 0}, /* 0xA7 (10100111) */
|
||||
{4, 6, 8, 0, 0, 0, 0, 0}, /* 0xA8 (10101000) */
|
||||
{1, 4, 6, 8, 0, 0, 0, 0}, /* 0xA9 (10101001) */
|
||||
{2, 4, 6, 8, 0, 0, 0, 0}, /* 0xAA (10101010) */
|
||||
{1, 2, 4, 6, 8, 0, 0, 0}, /* 0xAB (10101011) */
|
||||
{3, 4, 6, 8, 0, 0, 0, 0}, /* 0xAC (10101100) */
|
||||
{1, 3, 4, 6, 8, 0, 0, 0}, /* 0xAD (10101101) */
|
||||
{2, 3, 4, 6, 8, 0, 0, 0}, /* 0xAE (10101110) */
|
||||
{1, 2, 3, 4, 6, 8, 0, 0}, /* 0xAF (10101111) */
|
||||
{5, 6, 8, 0, 0, 0, 0, 0}, /* 0xB0 (10110000) */
|
||||
{1, 5, 6, 8, 0, 0, 0, 0}, /* 0xB1 (10110001) */
|
||||
{2, 5, 6, 8, 0, 0, 0, 0}, /* 0xB2 (10110010) */
|
||||
{1, 2, 5, 6, 8, 0, 0, 0}, /* 0xB3 (10110011) */
|
||||
{3, 5, 6, 8, 0, 0, 0, 0}, /* 0xB4 (10110100) */
|
||||
{1, 3, 5, 6, 8, 0, 0, 0}, /* 0xB5 (10110101) */
|
||||
{2, 3, 5, 6, 8, 0, 0, 0}, /* 0xB6 (10110110) */
|
||||
{1, 2, 3, 5, 6, 8, 0, 0}, /* 0xB7 (10110111) */
|
||||
{4, 5, 6, 8, 0, 0, 0, 0}, /* 0xB8 (10111000) */
|
||||
{1, 4, 5, 6, 8, 0, 0, 0}, /* 0xB9 (10111001) */
|
||||
{2, 4, 5, 6, 8, 0, 0, 0}, /* 0xBA (10111010) */
|
||||
{1, 2, 4, 5, 6, 8, 0, 0}, /* 0xBB (10111011) */
|
||||
{3, 4, 5, 6, 8, 0, 0, 0}, /* 0xBC (10111100) */
|
||||
{1, 3, 4, 5, 6, 8, 0, 0}, /* 0xBD (10111101) */
|
||||
{2, 3, 4, 5, 6, 8, 0, 0}, /* 0xBE (10111110) */
|
||||
{1, 2, 3, 4, 5, 6, 8, 0}, /* 0xBF (10111111) */
|
||||
{7, 8, 0, 0, 0, 0, 0, 0}, /* 0xC0 (11000000) */
|
||||
{1, 7, 8, 0, 0, 0, 0, 0}, /* 0xC1 (11000001) */
|
||||
{2, 7, 8, 0, 0, 0, 0, 0}, /* 0xC2 (11000010) */
|
||||
{1, 2, 7, 8, 0, 0, 0, 0}, /* 0xC3 (11000011) */
|
||||
{3, 7, 8, 0, 0, 0, 0, 0}, /* 0xC4 (11000100) */
|
||||
{1, 3, 7, 8, 0, 0, 0, 0}, /* 0xC5 (11000101) */
|
||||
{2, 3, 7, 8, 0, 0, 0, 0}, /* 0xC6 (11000110) */
|
||||
{1, 2, 3, 7, 8, 0, 0, 0}, /* 0xC7 (11000111) */
|
||||
{4, 7, 8, 0, 0, 0, 0, 0}, /* 0xC8 (11001000) */
|
||||
{1, 4, 7, 8, 0, 0, 0, 0}, /* 0xC9 (11001001) */
|
||||
{2, 4, 7, 8, 0, 0, 0, 0}, /* 0xCA (11001010) */
|
||||
{1, 2, 4, 7, 8, 0, 0, 0}, /* 0xCB (11001011) */
|
||||
{3, 4, 7, 8, 0, 0, 0, 0}, /* 0xCC (11001100) */
|
||||
{1, 3, 4, 7, 8, 0, 0, 0}, /* 0xCD (11001101) */
|
||||
{2, 3, 4, 7, 8, 0, 0, 0}, /* 0xCE (11001110) */
|
||||
{1, 2, 3, 4, 7, 8, 0, 0}, /* 0xCF (11001111) */
|
||||
{5, 7, 8, 0, 0, 0, 0, 0}, /* 0xD0 (11010000) */
|
||||
{1, 5, 7, 8, 0, 0, 0, 0}, /* 0xD1 (11010001) */
|
||||
{2, 5, 7, 8, 0, 0, 0, 0}, /* 0xD2 (11010010) */
|
||||
{1, 2, 5, 7, 8, 0, 0, 0}, /* 0xD3 (11010011) */
|
||||
{3, 5, 7, 8, 0, 0, 0, 0}, /* 0xD4 (11010100) */
|
||||
{1, 3, 5, 7, 8, 0, 0, 0}, /* 0xD5 (11010101) */
|
||||
{2, 3, 5, 7, 8, 0, 0, 0}, /* 0xD6 (11010110) */
|
||||
{1, 2, 3, 5, 7, 8, 0, 0}, /* 0xD7 (11010111) */
|
||||
{4, 5, 7, 8, 0, 0, 0, 0}, /* 0xD8 (11011000) */
|
||||
{1, 4, 5, 7, 8, 0, 0, 0}, /* 0xD9 (11011001) */
|
||||
{2, 4, 5, 7, 8, 0, 0, 0}, /* 0xDA (11011010) */
|
||||
{1, 2, 4, 5, 7, 8, 0, 0}, /* 0xDB (11011011) */
|
||||
{3, 4, 5, 7, 8, 0, 0, 0}, /* 0xDC (11011100) */
|
||||
{1, 3, 4, 5, 7, 8, 0, 0}, /* 0xDD (11011101) */
|
||||
{2, 3, 4, 5, 7, 8, 0, 0}, /* 0xDE (11011110) */
|
||||
{1, 2, 3, 4, 5, 7, 8, 0}, /* 0xDF (11011111) */
|
||||
{6, 7, 8, 0, 0, 0, 0, 0}, /* 0xE0 (11100000) */
|
||||
{1, 6, 7, 8, 0, 0, 0, 0}, /* 0xE1 (11100001) */
|
||||
{2, 6, 7, 8, 0, 0, 0, 0}, /* 0xE2 (11100010) */
|
||||
{1, 2, 6, 7, 8, 0, 0, 0}, /* 0xE3 (11100011) */
|
||||
{3, 6, 7, 8, 0, 0, 0, 0}, /* 0xE4 (11100100) */
|
||||
{1, 3, 6, 7, 8, 0, 0, 0}, /* 0xE5 (11100101) */
|
||||
{2, 3, 6, 7, 8, 0, 0, 0}, /* 0xE6 (11100110) */
|
||||
{1, 2, 3, 6, 7, 8, 0, 0}, /* 0xE7 (11100111) */
|
||||
{4, 6, 7, 8, 0, 0, 0, 0}, /* 0xE8 (11101000) */
|
||||
{1, 4, 6, 7, 8, 0, 0, 0}, /* 0xE9 (11101001) */
|
||||
{2, 4, 6, 7, 8, 0, 0, 0}, /* 0xEA (11101010) */
|
||||
{1, 2, 4, 6, 7, 8, 0, 0}, /* 0xEB (11101011) */
|
||||
{3, 4, 6, 7, 8, 0, 0, 0}, /* 0xEC (11101100) */
|
||||
{1, 3, 4, 6, 7, 8, 0, 0}, /* 0xED (11101101) */
|
||||
{2, 3, 4, 6, 7, 8, 0, 0}, /* 0xEE (11101110) */
|
||||
{1, 2, 3, 4, 6, 7, 8, 0}, /* 0xEF (11101111) */
|
||||
{5, 6, 7, 8, 0, 0, 0, 0}, /* 0xF0 (11110000) */
|
||||
{1, 5, 6, 7, 8, 0, 0, 0}, /* 0xF1 (11110001) */
|
||||
{2, 5, 6, 7, 8, 0, 0, 0}, /* 0xF2 (11110010) */
|
||||
{1, 2, 5, 6, 7, 8, 0, 0}, /* 0xF3 (11110011) */
|
||||
{3, 5, 6, 7, 8, 0, 0, 0}, /* 0xF4 (11110100) */
|
||||
{1, 3, 5, 6, 7, 8, 0, 0}, /* 0xF5 (11110101) */
|
||||
{2, 3, 5, 6, 7, 8, 0, 0}, /* 0xF6 (11110110) */
|
||||
{1, 2, 3, 5, 6, 7, 8, 0}, /* 0xF7 (11110111) */
|
||||
{4, 5, 6, 7, 8, 0, 0, 0}, /* 0xF8 (11111000) */
|
||||
{1, 4, 5, 6, 7, 8, 0, 0}, /* 0xF9 (11111001) */
|
||||
{2, 4, 5, 6, 7, 8, 0, 0}, /* 0xFA (11111010) */
|
||||
{1, 2, 4, 5, 6, 7, 8, 0}, /* 0xFB (11111011) */
|
||||
{3, 4, 5, 6, 7, 8, 0, 0}, /* 0xFC (11111100) */
|
||||
{1, 3, 4, 5, 6, 7, 8, 0}, /* 0xFD (11111101) */
|
||||
{2, 3, 4, 5, 6, 7, 8, 0}, /* 0xFE (11111110) */
|
||||
{1, 2, 3, 4, 5, 6, 7, 8} /* 0xFF (11111111) */
|
||||
};
|
||||
|
||||
static size_t bitmap_decode_avx2(uint8_t *bitmapInPtr, size_t bitsin, uint32_t *out) {
|
||||
uint32_t *initout = out;
|
||||
__m256i baseVec = _mm256_set1_epi32(-1);
|
||||
__m256i incVec = _mm256_set1_epi32(64);
|
||||
__m256i add8 = _mm256_set1_epi32(8);
|
||||
|
||||
int sizeinwords = bitsin / 64;
|
||||
uint64_t *array = (uint64_t *)bitmapInPtr;
|
||||
|
||||
for (int i = 0; i < sizeinwords; ++i) {
|
||||
uint64_t w = array[i];
|
||||
if (w == 0) {
|
||||
baseVec = _mm256_add_epi32(baseVec, incVec);
|
||||
} else {
|
||||
for (int k = 0; k < 4; ++k) {
|
||||
uint8_t byteA = (uint8_t)w;
|
||||
uint8_t byteB = (uint8_t)(w >> 8);
|
||||
w >>= 16;
|
||||
__m256i vecA =
|
||||
_mm256_load_si256((const __m256i *)vecDecodeTable[byteA]);
|
||||
__m256i vecB =
|
||||
_mm256_load_si256((const __m256i *)vecDecodeTable[byteB]);
|
||||
uint8_t advanceA = lengthTable[byteA];
|
||||
uint8_t advanceB = lengthTable[byteB];
|
||||
vecA = _mm256_add_epi32(baseVec, vecA);
|
||||
baseVec = _mm256_add_epi32(baseVec, add8);
|
||||
vecB = _mm256_add_epi32(baseVec, vecB);
|
||||
baseVec = _mm256_add_epi32(baseVec, add8);
|
||||
_mm256_storeu_si256((__m256i *)out, vecA);
|
||||
out += advanceA;
|
||||
_mm256_storeu_si256((__m256i *)out, vecB);
|
||||
out += advanceB;
|
||||
}
|
||||
}
|
||||
}
|
||||
if ((bitsin % 64) != 0) {
|
||||
// finish off the work the slow way.
|
||||
uint64_t bitset = 0;
|
||||
memcpy(&bitset, bitmapInPtr, sizeof(bitset));
|
||||
bitset = bitset & ((UINT64_C(1) << (bitsin % 64)) - 1);
|
||||
while (bitset != 0) {
|
||||
uint64_t t = bitset & -bitset;
|
||||
int r = __builtin_ctzll(bitset);
|
||||
*out = sizeinwords * 64 + r;
|
||||
out++;
|
||||
bitset ^= t;
|
||||
}
|
||||
}
|
||||
return out - initout;
|
||||
}
|
||||
|
||||
#endif
|
|
@ -0,0 +1,335 @@
|
|||
#ifndef VECDECODE_H
|
||||
#define VECDECODE_H
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#define ALIGNED(x) __declspec(align(x))
|
||||
#else
|
||||
#if defined(__GNUC__)
|
||||
#define ALIGNED(x) __attribute__((aligned(x)))
|
||||
#endif
|
||||
#endif
|
||||
|
||||
static uint8_t lengthTable[256] = {
|
||||
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4,
|
||||
2, 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
|
||||
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4,
|
||||
2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
|
||||
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6,
|
||||
4, 5, 5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
|
||||
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5,
|
||||
3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
|
||||
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6,
|
||||
4, 5, 5, 6, 5, 6, 6, 7, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
|
||||
4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8};
|
||||
|
||||
static uint32_t vecDecodeTable[256][8] ALIGNED(16) = {
|
||||
{0, 0, 0, 0, 0, 0, 0, 0}, /* 0x00 (00000000) */
|
||||
{1, 0, 0, 0, 0, 0, 0, 0}, /* 0x01 (00000001) */
|
||||
{2, 0, 0, 0, 0, 0, 0, 0}, /* 0x02 (00000010) */
|
||||
{1, 2, 0, 0, 0, 0, 0, 0}, /* 0x03 (00000011) */
|
||||
{3, 0, 0, 0, 0, 0, 0, 0}, /* 0x04 (00000100) */
|
||||
{1, 3, 0, 0, 0, 0, 0, 0}, /* 0x05 (00000101) */
|
||||
{2, 3, 0, 0, 0, 0, 0, 0}, /* 0x06 (00000110) */
|
||||
{1, 2, 3, 0, 0, 0, 0, 0}, /* 0x07 (00000111) */
|
||||
{4, 0, 0, 0, 0, 0, 0, 0}, /* 0x08 (00001000) */
|
||||
{1, 4, 0, 0, 0, 0, 0, 0}, /* 0x09 (00001001) */
|
||||
{2, 4, 0, 0, 0, 0, 0, 0}, /* 0x0A (00001010) */
|
||||
{1, 2, 4, 0, 0, 0, 0, 0}, /* 0x0B (00001011) */
|
||||
{3, 4, 0, 0, 0, 0, 0, 0}, /* 0x0C (00001100) */
|
||||
{1, 3, 4, 0, 0, 0, 0, 0}, /* 0x0D (00001101) */
|
||||
{2, 3, 4, 0, 0, 0, 0, 0}, /* 0x0E (00001110) */
|
||||
{1, 2, 3, 4, 0, 0, 0, 0}, /* 0x0F (00001111) */
|
||||
{5, 0, 0, 0, 0, 0, 0, 0}, /* 0x10 (00010000) */
|
||||
{1, 5, 0, 0, 0, 0, 0, 0}, /* 0x11 (00010001) */
|
||||
{2, 5, 0, 0, 0, 0, 0, 0}, /* 0x12 (00010010) */
|
||||
{1, 2, 5, 0, 0, 0, 0, 0}, /* 0x13 (00010011) */
|
||||
{3, 5, 0, 0, 0, 0, 0, 0}, /* 0x14 (00010100) */
|
||||
{1, 3, 5, 0, 0, 0, 0, 0}, /* 0x15 (00010101) */
|
||||
{2, 3, 5, 0, 0, 0, 0, 0}, /* 0x16 (00010110) */
|
||||
{1, 2, 3, 5, 0, 0, 0, 0}, /* 0x17 (00010111) */
|
||||
{4, 5, 0, 0, 0, 0, 0, 0}, /* 0x18 (00011000) */
|
||||
{1, 4, 5, 0, 0, 0, 0, 0}, /* 0x19 (00011001) */
|
||||
{2, 4, 5, 0, 0, 0, 0, 0}, /* 0x1A (00011010) */
|
||||
{1, 2, 4, 5, 0, 0, 0, 0}, /* 0x1B (00011011) */
|
||||
{3, 4, 5, 0, 0, 0, 0, 0}, /* 0x1C (00011100) */
|
||||
{1, 3, 4, 5, 0, 0, 0, 0}, /* 0x1D (00011101) */
|
||||
{2, 3, 4, 5, 0, 0, 0, 0}, /* 0x1E (00011110) */
|
||||
{1, 2, 3, 4, 5, 0, 0, 0}, /* 0x1F (00011111) */
|
||||
{6, 0, 0, 0, 0, 0, 0, 0}, /* 0x20 (00100000) */
|
||||
{1, 6, 0, 0, 0, 0, 0, 0}, /* 0x21 (00100001) */
|
||||
{2, 6, 0, 0, 0, 0, 0, 0}, /* 0x22 (00100010) */
|
||||
{1, 2, 6, 0, 0, 0, 0, 0}, /* 0x23 (00100011) */
|
||||
{3, 6, 0, 0, 0, 0, 0, 0}, /* 0x24 (00100100) */
|
||||
{1, 3, 6, 0, 0, 0, 0, 0}, /* 0x25 (00100101) */
|
||||
{2, 3, 6, 0, 0, 0, 0, 0}, /* 0x26 (00100110) */
|
||||
{1, 2, 3, 6, 0, 0, 0, 0}, /* 0x27 (00100111) */
|
||||
{4, 6, 0, 0, 0, 0, 0, 0}, /* 0x28 (00101000) */
|
||||
{1, 4, 6, 0, 0, 0, 0, 0}, /* 0x29 (00101001) */
|
||||
{2, 4, 6, 0, 0, 0, 0, 0}, /* 0x2A (00101010) */
|
||||
{1, 2, 4, 6, 0, 0, 0, 0}, /* 0x2B (00101011) */
|
||||
{3, 4, 6, 0, 0, 0, 0, 0}, /* 0x2C (00101100) */
|
||||
{1, 3, 4, 6, 0, 0, 0, 0}, /* 0x2D (00101101) */
|
||||
{2, 3, 4, 6, 0, 0, 0, 0}, /* 0x2E (00101110) */
|
||||
{1, 2, 3, 4, 6, 0, 0, 0}, /* 0x2F (00101111) */
|
||||
{5, 6, 0, 0, 0, 0, 0, 0}, /* 0x30 (00110000) */
|
||||
{1, 5, 6, 0, 0, 0, 0, 0}, /* 0x31 (00110001) */
|
||||
{2, 5, 6, 0, 0, 0, 0, 0}, /* 0x32 (00110010) */
|
||||
{1, 2, 5, 6, 0, 0, 0, 0}, /* 0x33 (00110011) */
|
||||
{3, 5, 6, 0, 0, 0, 0, 0}, /* 0x34 (00110100) */
|
||||
{1, 3, 5, 6, 0, 0, 0, 0}, /* 0x35 (00110101) */
|
||||
{2, 3, 5, 6, 0, 0, 0, 0}, /* 0x36 (00110110) */
|
||||
{1, 2, 3, 5, 6, 0, 0, 0}, /* 0x37 (00110111) */
|
||||
{4, 5, 6, 0, 0, 0, 0, 0}, /* 0x38 (00111000) */
|
||||
{1, 4, 5, 6, 0, 0, 0, 0}, /* 0x39 (00111001) */
|
||||
{2, 4, 5, 6, 0, 0, 0, 0}, /* 0x3A (00111010) */
|
||||
{1, 2, 4, 5, 6, 0, 0, 0}, /* 0x3B (00111011) */
|
||||
{3, 4, 5, 6, 0, 0, 0, 0}, /* 0x3C (00111100) */
|
||||
{1, 3, 4, 5, 6, 0, 0, 0}, /* 0x3D (00111101) */
|
||||
{2, 3, 4, 5, 6, 0, 0, 0}, /* 0x3E (00111110) */
|
||||
{1, 2, 3, 4, 5, 6, 0, 0}, /* 0x3F (00111111) */
|
||||
{7, 0, 0, 0, 0, 0, 0, 0}, /* 0x40 (01000000) */
|
||||
{1, 7, 0, 0, 0, 0, 0, 0}, /* 0x41 (01000001) */
|
||||
{2, 7, 0, 0, 0, 0, 0, 0}, /* 0x42 (01000010) */
|
||||
{1, 2, 7, 0, 0, 0, 0, 0}, /* 0x43 (01000011) */
|
||||
{3, 7, 0, 0, 0, 0, 0, 0}, /* 0x44 (01000100) */
|
||||
{1, 3, 7, 0, 0, 0, 0, 0}, /* 0x45 (01000101) */
|
||||
{2, 3, 7, 0, 0, 0, 0, 0}, /* 0x46 (01000110) */
|
||||
{1, 2, 3, 7, 0, 0, 0, 0}, /* 0x47 (01000111) */
|
||||
{4, 7, 0, 0, 0, 0, 0, 0}, /* 0x48 (01001000) */
|
||||
{1, 4, 7, 0, 0, 0, 0, 0}, /* 0x49 (01001001) */
|
||||
{2, 4, 7, 0, 0, 0, 0, 0}, /* 0x4A (01001010) */
|
||||
{1, 2, 4, 7, 0, 0, 0, 0}, /* 0x4B (01001011) */
|
||||
{3, 4, 7, 0, 0, 0, 0, 0}, /* 0x4C (01001100) */
|
||||
{1, 3, 4, 7, 0, 0, 0, 0}, /* 0x4D (01001101) */
|
||||
{2, 3, 4, 7, 0, 0, 0, 0}, /* 0x4E (01001110) */
|
||||
{1, 2, 3, 4, 7, 0, 0, 0}, /* 0x4F (01001111) */
|
||||
{5, 7, 0, 0, 0, 0, 0, 0}, /* 0x50 (01010000) */
|
||||
{1, 5, 7, 0, 0, 0, 0, 0}, /* 0x51 (01010001) */
|
||||
{2, 5, 7, 0, 0, 0, 0, 0}, /* 0x52 (01010010) */
|
||||
{1, 2, 5, 7, 0, 0, 0, 0}, /* 0x53 (01010011) */
|
||||
{3, 5, 7, 0, 0, 0, 0, 0}, /* 0x54 (01010100) */
|
||||
{1, 3, 5, 7, 0, 0, 0, 0}, /* 0x55 (01010101) */
|
||||
{2, 3, 5, 7, 0, 0, 0, 0}, /* 0x56 (01010110) */
|
||||
{1, 2, 3, 5, 7, 0, 0, 0}, /* 0x57 (01010111) */
|
||||
{4, 5, 7, 0, 0, 0, 0, 0}, /* 0x58 (01011000) */
|
||||
{1, 4, 5, 7, 0, 0, 0, 0}, /* 0x59 (01011001) */
|
||||
{2, 4, 5, 7, 0, 0, 0, 0}, /* 0x5A (01011010) */
|
||||
{1, 2, 4, 5, 7, 0, 0, 0}, /* 0x5B (01011011) */
|
||||
{3, 4, 5, 7, 0, 0, 0, 0}, /* 0x5C (01011100) */
|
||||
{1, 3, 4, 5, 7, 0, 0, 0}, /* 0x5D (01011101) */
|
||||
{2, 3, 4, 5, 7, 0, 0, 0}, /* 0x5E (01011110) */
|
||||
{1, 2, 3, 4, 5, 7, 0, 0}, /* 0x5F (01011111) */
|
||||
{6, 7, 0, 0, 0, 0, 0, 0}, /* 0x60 (01100000) */
|
||||
{1, 6, 7, 0, 0, 0, 0, 0}, /* 0x61 (01100001) */
|
||||
{2, 6, 7, 0, 0, 0, 0, 0}, /* 0x62 (01100010) */
|
||||
{1, 2, 6, 7, 0, 0, 0, 0}, /* 0x63 (01100011) */
|
||||
{3, 6, 7, 0, 0, 0, 0, 0}, /* 0x64 (01100100) */
|
||||
{1, 3, 6, 7, 0, 0, 0, 0}, /* 0x65 (01100101) */
|
||||
{2, 3, 6, 7, 0, 0, 0, 0}, /* 0x66 (01100110) */
|
||||
{1, 2, 3, 6, 7, 0, 0, 0}, /* 0x67 (01100111) */
|
||||
{4, 6, 7, 0, 0, 0, 0, 0}, /* 0x68 (01101000) */
|
||||
{1, 4, 6, 7, 0, 0, 0, 0}, /* 0x69 (01101001) */
|
||||
{2, 4, 6, 7, 0, 0, 0, 0}, /* 0x6A (01101010) */
|
||||
{1, 2, 4, 6, 7, 0, 0, 0}, /* 0x6B (01101011) */
|
||||
{3, 4, 6, 7, 0, 0, 0, 0}, /* 0x6C (01101100) */
|
||||
{1, 3, 4, 6, 7, 0, 0, 0}, /* 0x6D (01101101) */
|
||||
{2, 3, 4, 6, 7, 0, 0, 0}, /* 0x6E (01101110) */
|
||||
{1, 2, 3, 4, 6, 7, 0, 0}, /* 0x6F (01101111) */
|
||||
{5, 6, 7, 0, 0, 0, 0, 0}, /* 0x70 (01110000) */
|
||||
{1, 5, 6, 7, 0, 0, 0, 0}, /* 0x71 (01110001) */
|
||||
{2, 5, 6, 7, 0, 0, 0, 0}, /* 0x72 (01110010) */
|
||||
{1, 2, 5, 6, 7, 0, 0, 0}, /* 0x73 (01110011) */
|
||||
{3, 5, 6, 7, 0, 0, 0, 0}, /* 0x74 (01110100) */
|
||||
{1, 3, 5, 6, 7, 0, 0, 0}, /* 0x75 (01110101) */
|
||||
{2, 3, 5, 6, 7, 0, 0, 0}, /* 0x76 (01110110) */
|
||||
{1, 2, 3, 5, 6, 7, 0, 0}, /* 0x77 (01110111) */
|
||||
{4, 5, 6, 7, 0, 0, 0, 0}, /* 0x78 (01111000) */
|
||||
{1, 4, 5, 6, 7, 0, 0, 0}, /* 0x79 (01111001) */
|
||||
{2, 4, 5, 6, 7, 0, 0, 0}, /* 0x7A (01111010) */
|
||||
{1, 2, 4, 5, 6, 7, 0, 0}, /* 0x7B (01111011) */
|
||||
{3, 4, 5, 6, 7, 0, 0, 0}, /* 0x7C (01111100) */
|
||||
{1, 3, 4, 5, 6, 7, 0, 0}, /* 0x7D (01111101) */
|
||||
{2, 3, 4, 5, 6, 7, 0, 0}, /* 0x7E (01111110) */
|
||||
{1, 2, 3, 4, 5, 6, 7, 0}, /* 0x7F (01111111) */
|
||||
{8, 0, 0, 0, 0, 0, 0, 0}, /* 0x80 (10000000) */
|
||||
{1, 8, 0, 0, 0, 0, 0, 0}, /* 0x81 (10000001) */
|
||||
{2, 8, 0, 0, 0, 0, 0, 0}, /* 0x82 (10000010) */
|
||||
{1, 2, 8, 0, 0, 0, 0, 0}, /* 0x83 (10000011) */
|
||||
{3, 8, 0, 0, 0, 0, 0, 0}, /* 0x84 (10000100) */
|
||||
{1, 3, 8, 0, 0, 0, 0, 0}, /* 0x85 (10000101) */
|
||||
{2, 3, 8, 0, 0, 0, 0, 0}, /* 0x86 (10000110) */
|
||||
{1, 2, 3, 8, 0, 0, 0, 0}, /* 0x87 (10000111) */
|
||||
{4, 8, 0, 0, 0, 0, 0, 0}, /* 0x88 (10001000) */
|
||||
{1, 4, 8, 0, 0, 0, 0, 0}, /* 0x89 (10001001) */
|
||||
{2, 4, 8, 0, 0, 0, 0, 0}, /* 0x8A (10001010) */
|
||||
{1, 2, 4, 8, 0, 0, 0, 0}, /* 0x8B (10001011) */
|
||||
{3, 4, 8, 0, 0, 0, 0, 0}, /* 0x8C (10001100) */
|
||||
{1, 3, 4, 8, 0, 0, 0, 0}, /* 0x8D (10001101) */
|
||||
{2, 3, 4, 8, 0, 0, 0, 0}, /* 0x8E (10001110) */
|
||||
{1, 2, 3, 4, 8, 0, 0, 0}, /* 0x8F (10001111) */
|
||||
{5, 8, 0, 0, 0, 0, 0, 0}, /* 0x90 (10010000) */
|
||||
{1, 5, 8, 0, 0, 0, 0, 0}, /* 0x91 (10010001) */
|
||||
{2, 5, 8, 0, 0, 0, 0, 0}, /* 0x92 (10010010) */
|
||||
{1, 2, 5, 8, 0, 0, 0, 0}, /* 0x93 (10010011) */
|
||||
{3, 5, 8, 0, 0, 0, 0, 0}, /* 0x94 (10010100) */
|
||||
{1, 3, 5, 8, 0, 0, 0, 0}, /* 0x95 (10010101) */
|
||||
{2, 3, 5, 8, 0, 0, 0, 0}, /* 0x96 (10010110) */
|
||||
{1, 2, 3, 5, 8, 0, 0, 0}, /* 0x97 (10010111) */
|
||||
{4, 5, 8, 0, 0, 0, 0, 0}, /* 0x98 (10011000) */
|
||||
{1, 4, 5, 8, 0, 0, 0, 0}, /* 0x99 (10011001) */
|
||||
{2, 4, 5, 8, 0, 0, 0, 0}, /* 0x9A (10011010) */
|
||||
{1, 2, 4, 5, 8, 0, 0, 0}, /* 0x9B (10011011) */
|
||||
{3, 4, 5, 8, 0, 0, 0, 0}, /* 0x9C (10011100) */
|
||||
{1, 3, 4, 5, 8, 0, 0, 0}, /* 0x9D (10011101) */
|
||||
{2, 3, 4, 5, 8, 0, 0, 0}, /* 0x9E (10011110) */
|
||||
{1, 2, 3, 4, 5, 8, 0, 0}, /* 0x9F (10011111) */
|
||||
{6, 8, 0, 0, 0, 0, 0, 0}, /* 0xA0 (10100000) */
|
||||
{1, 6, 8, 0, 0, 0, 0, 0}, /* 0xA1 (10100001) */
|
||||
{2, 6, 8, 0, 0, 0, 0, 0}, /* 0xA2 (10100010) */
|
||||
{1, 2, 6, 8, 0, 0, 0, 0}, /* 0xA3 (10100011) */
|
||||
{3, 6, 8, 0, 0, 0, 0, 0}, /* 0xA4 (10100100) */
|
||||
{1, 3, 6, 8, 0, 0, 0, 0}, /* 0xA5 (10100101) */
|
||||
{2, 3, 6, 8, 0, 0, 0, 0}, /* 0xA6 (10100110) */
|
||||
{1, 2, 3, 6, 8, 0, 0, 0}, /* 0xA7 (10100111) */
|
||||
{4, 6, 8, 0, 0, 0, 0, 0}, /* 0xA8 (10101000) */
|
||||
{1, 4, 6, 8, 0, 0, 0, 0}, /* 0xA9 (10101001) */
|
||||
{2, 4, 6, 8, 0, 0, 0, 0}, /* 0xAA (10101010) */
|
||||
{1, 2, 4, 6, 8, 0, 0, 0}, /* 0xAB (10101011) */
|
||||
{3, 4, 6, 8, 0, 0, 0, 0}, /* 0xAC (10101100) */
|
||||
{1, 3, 4, 6, 8, 0, 0, 0}, /* 0xAD (10101101) */
|
||||
{2, 3, 4, 6, 8, 0, 0, 0}, /* 0xAE (10101110) */
|
||||
{1, 2, 3, 4, 6, 8, 0, 0}, /* 0xAF (10101111) */
|
||||
{5, 6, 8, 0, 0, 0, 0, 0}, /* 0xB0 (10110000) */
|
||||
{1, 5, 6, 8, 0, 0, 0, 0}, /* 0xB1 (10110001) */
|
||||
{2, 5, 6, 8, 0, 0, 0, 0}, /* 0xB2 (10110010) */
|
||||
{1, 2, 5, 6, 8, 0, 0, 0}, /* 0xB3 (10110011) */
|
||||
{3, 5, 6, 8, 0, 0, 0, 0}, /* 0xB4 (10110100) */
|
||||
{1, 3, 5, 6, 8, 0, 0, 0}, /* 0xB5 (10110101) */
|
||||
{2, 3, 5, 6, 8, 0, 0, 0}, /* 0xB6 (10110110) */
|
||||
{1, 2, 3, 5, 6, 8, 0, 0}, /* 0xB7 (10110111) */
|
||||
{4, 5, 6, 8, 0, 0, 0, 0}, /* 0xB8 (10111000) */
|
||||
{1, 4, 5, 6, 8, 0, 0, 0}, /* 0xB9 (10111001) */
|
||||
{2, 4, 5, 6, 8, 0, 0, 0}, /* 0xBA (10111010) */
|
||||
{1, 2, 4, 5, 6, 8, 0, 0}, /* 0xBB (10111011) */
|
||||
{3, 4, 5, 6, 8, 0, 0, 0}, /* 0xBC (10111100) */
|
||||
{1, 3, 4, 5, 6, 8, 0, 0}, /* 0xBD (10111101) */
|
||||
{2, 3, 4, 5, 6, 8, 0, 0}, /* 0xBE (10111110) */
|
||||
{1, 2, 3, 4, 5, 6, 8, 0}, /* 0xBF (10111111) */
|
||||
{7, 8, 0, 0, 0, 0, 0, 0}, /* 0xC0 (11000000) */
|
||||
{1, 7, 8, 0, 0, 0, 0, 0}, /* 0xC1 (11000001) */
|
||||
{2, 7, 8, 0, 0, 0, 0, 0}, /* 0xC2 (11000010) */
|
||||
{1, 2, 7, 8, 0, 0, 0, 0}, /* 0xC3 (11000011) */
|
||||
{3, 7, 8, 0, 0, 0, 0, 0}, /* 0xC4 (11000100) */
|
||||
{1, 3, 7, 8, 0, 0, 0, 0}, /* 0xC5 (11000101) */
|
||||
{2, 3, 7, 8, 0, 0, 0, 0}, /* 0xC6 (11000110) */
|
||||
{1, 2, 3, 7, 8, 0, 0, 0}, /* 0xC7 (11000111) */
|
||||
{4, 7, 8, 0, 0, 0, 0, 0}, /* 0xC8 (11001000) */
|
||||
{1, 4, 7, 8, 0, 0, 0, 0}, /* 0xC9 (11001001) */
|
||||
{2, 4, 7, 8, 0, 0, 0, 0}, /* 0xCA (11001010) */
|
||||
{1, 2, 4, 7, 8, 0, 0, 0}, /* 0xCB (11001011) */
|
||||
{3, 4, 7, 8, 0, 0, 0, 0}, /* 0xCC (11001100) */
|
||||
{1, 3, 4, 7, 8, 0, 0, 0}, /* 0xCD (11001101) */
|
||||
{2, 3, 4, 7, 8, 0, 0, 0}, /* 0xCE (11001110) */
|
||||
{1, 2, 3, 4, 7, 8, 0, 0}, /* 0xCF (11001111) */
|
||||
{5, 7, 8, 0, 0, 0, 0, 0}, /* 0xD0 (11010000) */
|
||||
{1, 5, 7, 8, 0, 0, 0, 0}, /* 0xD1 (11010001) */
|
||||
{2, 5, 7, 8, 0, 0, 0, 0}, /* 0xD2 (11010010) */
|
||||
{1, 2, 5, 7, 8, 0, 0, 0}, /* 0xD3 (11010011) */
|
||||
{3, 5, 7, 8, 0, 0, 0, 0}, /* 0xD4 (11010100) */
|
||||
{1, 3, 5, 7, 8, 0, 0, 0}, /* 0xD5 (11010101) */
|
||||
{2, 3, 5, 7, 8, 0, 0, 0}, /* 0xD6 (11010110) */
|
||||
{1, 2, 3, 5, 7, 8, 0, 0}, /* 0xD7 (11010111) */
|
||||
{4, 5, 7, 8, 0, 0, 0, 0}, /* 0xD8 (11011000) */
|
||||
{1, 4, 5, 7, 8, 0, 0, 0}, /* 0xD9 (11011001) */
|
||||
{2, 4, 5, 7, 8, 0, 0, 0}, /* 0xDA (11011010) */
|
||||
{1, 2, 4, 5, 7, 8, 0, 0}, /* 0xDB (11011011) */
|
||||
{3, 4, 5, 7, 8, 0, 0, 0}, /* 0xDC (11011100) */
|
||||
{1, 3, 4, 5, 7, 8, 0, 0}, /* 0xDD (11011101) */
|
||||
{2, 3, 4, 5, 7, 8, 0, 0}, /* 0xDE (11011110) */
|
||||
{1, 2, 3, 4, 5, 7, 8, 0}, /* 0xDF (11011111) */
|
||||
{6, 7, 8, 0, 0, 0, 0, 0}, /* 0xE0 (11100000) */
|
||||
{1, 6, 7, 8, 0, 0, 0, 0}, /* 0xE1 (11100001) */
|
||||
{2, 6, 7, 8, 0, 0, 0, 0}, /* 0xE2 (11100010) */
|
||||
{1, 2, 6, 7, 8, 0, 0, 0}, /* 0xE3 (11100011) */
|
||||
{3, 6, 7, 8, 0, 0, 0, 0}, /* 0xE4 (11100100) */
|
||||
{1, 3, 6, 7, 8, 0, 0, 0}, /* 0xE5 (11100101) */
|
||||
{2, 3, 6, 7, 8, 0, 0, 0}, /* 0xE6 (11100110) */
|
||||
{1, 2, 3, 6, 7, 8, 0, 0}, /* 0xE7 (11100111) */
|
||||
{4, 6, 7, 8, 0, 0, 0, 0}, /* 0xE8 (11101000) */
|
||||
{1, 4, 6, 7, 8, 0, 0, 0}, /* 0xE9 (11101001) */
|
||||
{2, 4, 6, 7, 8, 0, 0, 0}, /* 0xEA (11101010) */
|
||||
{1, 2, 4, 6, 7, 8, 0, 0}, /* 0xEB (11101011) */
|
||||
{3, 4, 6, 7, 8, 0, 0, 0}, /* 0xEC (11101100) */
|
||||
{1, 3, 4, 6, 7, 8, 0, 0}, /* 0xED (11101101) */
|
||||
{2, 3, 4, 6, 7, 8, 0, 0}, /* 0xEE (11101110) */
|
||||
{1, 2, 3, 4, 6, 7, 8, 0}, /* 0xEF (11101111) */
|
||||
{5, 6, 7, 8, 0, 0, 0, 0}, /* 0xF0 (11110000) */
|
||||
{1, 5, 6, 7, 8, 0, 0, 0}, /* 0xF1 (11110001) */
|
||||
{2, 5, 6, 7, 8, 0, 0, 0}, /* 0xF2 (11110010) */
|
||||
{1, 2, 5, 6, 7, 8, 0, 0}, /* 0xF3 (11110011) */
|
||||
{3, 5, 6, 7, 8, 0, 0, 0}, /* 0xF4 (11110100) */
|
||||
{1, 3, 5, 6, 7, 8, 0, 0}, /* 0xF5 (11110101) */
|
||||
{2, 3, 5, 6, 7, 8, 0, 0}, /* 0xF6 (11110110) */
|
||||
{1, 2, 3, 5, 6, 7, 8, 0}, /* 0xF7 (11110111) */
|
||||
{4, 5, 6, 7, 8, 0, 0, 0}, /* 0xF8 (11111000) */
|
||||
{1, 4, 5, 6, 7, 8, 0, 0}, /* 0xF9 (11111001) */
|
||||
{2, 4, 5, 6, 7, 8, 0, 0}, /* 0xFA (11111010) */
|
||||
{1, 2, 4, 5, 6, 7, 8, 0}, /* 0xFB (11111011) */
|
||||
{3, 4, 5, 6, 7, 8, 0, 0}, /* 0xFC (11111100) */
|
||||
{1, 3, 4, 5, 6, 7, 8, 0}, /* 0xFD (11111101) */
|
||||
{2, 3, 4, 5, 6, 7, 8, 0}, /* 0xFE (11111110) */
|
||||
{1, 2, 3, 4, 5, 6, 7, 8} /* 0xFF (11111111) */
|
||||
};
|
||||
|
||||
static size_t bitmap_decode_avx2(uint8_t *bitmapInPtr, size_t bitsin, uint32_t *out) {
|
||||
uint32_t *initout = out;
|
||||
__m256i baseVec = _mm256_set1_epi32(-1);
|
||||
__m256i incVec = _mm256_set1_epi32(64);
|
||||
__m256i add8 = _mm256_set1_epi32(8);
|
||||
|
||||
int sizeinwords = bitsin / 64;
|
||||
uint64_t *array = (uint64_t *)bitmapInPtr;
|
||||
|
||||
for (int i = 0; i < sizeinwords; ++i) {
|
||||
uint64_t w = array[i];
|
||||
if (w == 0) {
|
||||
baseVec = _mm256_add_epi32(baseVec, incVec);
|
||||
} else {
|
||||
for (int k = 0; k < 4; ++k) {
|
||||
uint8_t byteA = (uint8_t)w;
|
||||
uint8_t byteB = (uint8_t)(w >> 8);
|
||||
w >>= 16;
|
||||
__m256i vecA =
|
||||
_mm256_load_si256((const __m256i *)vecDecodeTable[byteA]);
|
||||
__m256i vecB =
|
||||
_mm256_load_si256((const __m256i *)vecDecodeTable[byteB]);
|
||||
uint8_t advanceA = lengthTable[byteA];
|
||||
uint8_t advanceB = lengthTable[byteB];
|
||||
vecA = _mm256_add_epi32(baseVec, vecA);
|
||||
baseVec = _mm256_add_epi32(baseVec, add8);
|
||||
vecB = _mm256_add_epi32(baseVec, vecB);
|
||||
baseVec = _mm256_add_epi32(baseVec, add8);
|
||||
_mm256_storeu_si256((__m256i *)out, vecA);
|
||||
out += advanceA;
|
||||
_mm256_storeu_si256((__m256i *)out, vecB);
|
||||
out += advanceB;
|
||||
}
|
||||
}
|
||||
}
|
||||
if ((bitsin % 64) != 0) {
|
||||
// finish off the work the slow way.
|
||||
uint64_t bitset = 0;
|
||||
memcpy(&bitset, bitmapInPtr, sizeof(bitset));
|
||||
bitset = bitset & ((UINT64_C(1) << (bitsin % 64)) - 1);
|
||||
while (bitset != 0) {
|
||||
uint64_t t = bitset & -bitset;
|
||||
int r = __builtin_ctzll(bitset);
|
||||
*out = sizeinwords * 64 + r;
|
||||
out++;
|
||||
bitset ^= t;
|
||||
}
|
||||
}
|
||||
return out - initout;
|
||||
}
|
||||
|
||||
#endif
|
Loading…
Reference in New Issue