This commit is contained in:
Geoff Langdale 2018-07-24 14:42:53 +10:00
commit 59bfb807e1
13 changed files with 1032 additions and 33262 deletions

View File

@ -6,7 +6,7 @@
.PHONY: clean cleandist
CXXFLAGS = -std=c++11 -O2 -march=native -Wall -Wextra -Wshadow
CXXFLAGS = -std=c++11 -O2 -march=native -Wall -Wextra -Wshadow
#CXXFLAGS = -std=c++11 -O2 -march=native -Wall -Wextra -Wshadow -Wno-implicit-function-declaration
EXECUTABLES=parse
@ -15,7 +15,7 @@ EXECUTABLES=parse
all: $(EXECUTABLES)
parse: main.cpp common_defs.h linux-perf-events.h
parse: main.cpp common_defs.h vecdecode.h linux-perf-events.h
$(CXX) $(CXXFLAGS) -o parse main.cpp

View File

@ -83,7 +83,7 @@ APA
- N. Nakasato, "Implementation of a parallel tree method on a GPU", Journal of Computational Science, vol. 3, no. 3, pp. 132-141, 2012.
## References
- [How to implement atoi using SIMD?](https://stackoverflow.com/questions/35127060/how-to-implement-atoi-using-simd)
- [Parsing JSON is a Minefield 💣](http://seriot.ch/parsing_json.php)
- https://tools.ietf.org/html/rfc7159
- The only public Mison implementation (in rust) https://github.com/pikkr/pikkr

118
main.cpp
View File

@ -245,7 +245,7 @@ never_inline bool find_structural_bits(const u8 * buf, size_t len, ParsedJson &
// Now, establish "pseudo-structural characters". These are non-whitespace characters
// that are (a) outside quotes and (b) have a predecessor that's either whitespace or a structural
// character. This means that subsequent passes will get a chance to encounter the first character
// character. This means that subsequent passes will get a chance to encounter the first character
// of every string of non-whitespace and, if we're parsing an atom like true/false/null or a number
// we can stop at the first whitespace or structural character following it.
@ -275,13 +275,23 @@ const u32 NUM_RESERVED_NODES = 2;
const u32 DUMMY_NODE = 0;
const u32 ROOT_NODE = 1;
#define VECDECODE
#ifdef VECDECODE
#include "vecdecode.h"
#endif
// just transform the bitmask to a big list of 32-bit integers for now
// that's all; the type of character the offset points to will
// tell us exactly what we need to know. Naive but straightforward implementation
never_inline bool flatten_indexes(size_t len, ParsedJson & pj) {
u32 base = NUM_RESERVED_NODES;
u32 * base_ptr = pj.structural_indexes;
base_ptr[DUMMY_NODE] = base_ptr[ROOT_NODE] = 0; // really shouldn't matter
#ifdef VECDECODE
u32 number = bitmap_decode_avx2(pj.structurals, len, base_ptr + NUM_RESERVED_NODES) + NUM_RESERVED_NODES;
pj.n_structural_indexes = number;
base_ptr[pj.n_structural_indexes] = 0; // make it safe to dereference one beyond this array
return true;
#else
u32 base = NUM_RESERVED_NODES;
for (size_t idx = 0; idx < len; idx+=64) {
u64 s = *(u64 *)(pj.structurals + idx/8);
#ifdef SUPPRESS_CHEESY_FLATTEN
@ -312,12 +322,13 @@ never_inline bool flatten_indexes(size_t len, ParsedJson & pj) {
pj.n_structural_indexes = base;
base_ptr[pj.n_structural_indexes] = 0; // make it safe to dereference one beyond this array
return true;
#endif
}
const u32 MAX_DEPTH = 256;
const u32 DEPTH_SAFETY_MARGIN = 32; // should be power-of-2 as we check this with a modulo in our
// hot stage 3 loop
// hot stage 3 loop
const u32 START_DEPTH = DEPTH_SAFETY_MARGIN;
const u32 REDLINE_DEPTH = MAX_DEPTH - DEPTH_SAFETY_MARGIN;
@ -330,7 +341,7 @@ const u32 REDLINE_DEPTH = MAX_DEPTH - DEPTH_SAFETY_MARGIN;
// These structures are either u32 offsets of other tapes or u32 offsets into our input
// or structures.
//
// The state machine doesn't record ouput.
// The state machine doesn't record ouput.
// The tape machine doesn't validate.
//
// The output of the tape machine is meaningful only if the state machine is in non-error states.
@ -360,7 +371,7 @@ inline size_t get_write_size(u32 control) { return control & 0xff; }
const u32 char_control[256] = {
// nothing interesting from 0x00-0x20
CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF,
CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF,
CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF,
// " is 0x22, - is 0x2d
CDF,CDF,C04,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,C04,CDF,CDF,
@ -369,24 +380,24 @@ const u32 char_control[256] = {
C04,C04,C04,C04, C04,C04,C04,C04, C04,C04,CDF,CDF, CDF,CDF,CDF,CDF,
// nothing interesting from 0x40-0x49
CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF,
CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF,
// 0x5b/5d are []
CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CP4, CDF,CM4,CDF,CDF,
CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CP4, CDF,CM4,CDF,CDF,
// f is 0x66 n is 0x6e
CDF,CDF,CDF,CDF, CDF,CDF,C04,CDF, CDF,CDF,CDF,CDF, CDF,CDF,C04,CDF,
CDF,CDF,CDF,CDF, CDF,CDF,C04,CDF, CDF,CDF,CDF,CDF, CDF,CDF,C04,CDF,
// 0x7b/7d are {}, 74 is t
CDF,CDF,CDF,CDF, C04,CDF,CDF,CDF, CDF,CDF,CDF,CP4, CDF,CM4,CDF,CDF,
CDF,CDF,CDF,CDF, C04,CDF,CDF,CDF, CDF,CDF,CDF,CP4, CDF,CM4,CDF,CDF,
// nothing interesting from 0x80-0xff
CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF,
CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF,
CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF,
CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF,
CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF,
CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF,
CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF,
CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF,
CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF,
CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF,
CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF,
CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF, CDF,CDF,CDF,CDF
};
@ -396,7 +407,7 @@ const size_t MAX_TAPE = MAX_DEPTH * MAX_TAPE_ENTRIES;
// all of this stuff needs to get moved somewhere reasonable
// like our ParsedJson structure
u64 tape[MAX_TAPE];
u64 tape[MAX_TAPE];
u32 tape_locs[MAX_DEPTH];
u8 string_buf[512*1024];
u8 * current_string_buf_loc;
@ -416,7 +427,7 @@ const int START_STATE = 1;
const int START_DEPTH_START_STATE = 13;
// ANYTHING_IS_ERROR_STATE is useful both as a target
// for a transition at the start depth and also as
// for a transition at the start depth and also as
// a good initial value for "red line" depths; that
// is, depths that are maintained strictly to avoid
// undefined behavior (e.g. depths below the starting
@ -445,7 +456,7 @@ never_inline void init_state_machine() {
trans[12][(u32)UNARIES[i]] = 11;
trans[13][(u32)UNARIES[i]] = 14;
}
// back transitions when new things are open
trans[2]['{'] = 2;
trans[7]['{'] = 2;
@ -469,7 +480,7 @@ never_inline bool ape_machine(const u8 * buf, UNUSED size_t len, ParsedJson & pj
// characters without aggressively going wrong and writing to bad memory
// Note that any specious depth can have a specious tape associated with and all these specious depths
// can share a region of the tape - it's harmless. Since tape is one-way, any movement in a specious tape
// is an error (so we can detect max_depth violations by making sure that specious tape locations haven't
// is an error (so we can detect max_depth violations by making sure that specious tape locations haven't
// moved from their starting values)
u32 depth = START_DEPTH;
@ -510,12 +521,12 @@ never_inline bool ape_machine(const u8 * buf, UNUSED size_t len, ParsedJson & pj
u32 idx = next_idx;
u8 c = next_c;
u32 control = next_control;
u32 control = next_control;
next_idx = pj.structural_indexes[i+1];
next_c = buf[next_idx];
next_control = char_control[next_c];
// TAPE MACHINE
s8 depth_adjust = get_depth_adjust(control);
u8 write_size = get_write_size(control);
@ -523,8 +534,8 @@ never_inline bool ape_machine(const u8 * buf, UNUSED size_t len, ParsedJson & pj
depth += depth_adjust;
#ifdef DEBUG
cout << "i: " << i << " idx: " << idx << " c " << c << "\n";
cout << "TAPE MACHINE: depth change " << (s32)depth_adjust
<< " write_size " << (u32)write_size << " current_depth: " << depth << "\n";
cout << "TAPE MACHINE: depth change " << (s32)depth_adjust
<< " write_size " << (u32)write_size << " current_depth: " << depth << "\n";
#endif
// STATE MACHINE - hoisted here to fill in during the tape machine's latencies
@ -536,15 +547,16 @@ never_inline bool ape_machine(const u8 * buf, UNUSED size_t len, ParsedJson & pj
cout << "post " << states[depth] << "\n";
#endif
// TAPE MACHINE, again
tape[tape_locs[depth]] = write_val | (((u64)c) << 56);
tape[tape_locs[depth]] = write_val | (((u64)c) << 56);
old_tape_loc = tape_locs[depth] += write_size;
}
for (u32 i = 0; i < MAX_DEPTH; i++) {
if (states[i] == 0) {
printf("states[%d] == 0\n", i);
return false;
}
}
}
#define DUMP_TAPES
#ifdef DEBUG
@ -559,14 +571,14 @@ never_inline bool ape_machine(const u8 * buf, UNUSED size_t len, ParsedJson & pj
cout << " (NORMAL) ";
}
cout << " from: " << start_loc
cout << " from: " << start_loc
<< " to: " << tape_locs[i] << " "
<< " size: " << (tape_locs[i]-start_loc) << "\n";
cout << " state: " << states[i] << "\n";
cout << " state: " << states[i] << "\n";
#ifdef DUMP_TAPES
for (u32 j = start_loc; j < tape_locs[i]; j++) {
if (tape[j]) {
cout << "j: " << j << " tape[j] char " << (char)(tape[j]>>56)
cout << "j: " << j << " tape[j] char " << (char)(tape[j]>>56)
<< " tape[j][0..55]: " << (tape[j]&0xffffffffffffffULL ) << "\n";
}
}
@ -609,12 +621,12 @@ const u32 structural_or_whitespace_negated[256] = {
};
// return non-zero if not a structural or whitespace char
// zero otherwise
// zero otherwise
really_inline u32 is_not_structural_or_whitespace(u8 c) {
return structural_or_whitespace_negated[c];
}
// These chars yield themselves: " \ /
// These chars yield themselves: " \ /
// b -> backspace, f -> formfeed, n -> newline, r -> cr, t -> horizontal tab
// u not handled in this table as it's complex
const u8 escape_map[256] = {
@ -628,7 +640,7 @@ const u8 escape_map[256] = {
0,0,0x08,0, 0,0,0x12,0, 0,0,0,0, 0,0,0x0a,0, //0x6.
0,0,0x0d,0, 0x09,0,0,0, 0,0,0,0, 0,0,0,0, //0x7.
0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
@ -642,7 +654,7 @@ const u8 escape_map[256] = {
const u32 leading_zeros_to_utf_bytes[33] = {
1,
1, 1, 1, 1, 1, 1, 1, // 7 bits for first one
1, 1, 1, 1, 1, 1, 1, // 7 bits for first one
2, 2, 2, 2, // 11 bits for next
3, 3, 3, 3, 3, // 16 bits for next
4, 4, 4, 4, 4, // 21 bits for next
@ -724,7 +736,7 @@ really_inline bool handle_unicode_codepoint(const u8 ** src_ptr, u8 ** dst_ptr)
// TODO: check to see whether the below code is nonsense (it's really only a sketch at this point)
u32 lz = __builtin_clz(code_point);
u32 utf_bytes = leading_zeros_to_utf_bytes[lz];
u32 tmp = _pdep_u32(code_point, UTF_PDEP_MASK[utf_bytes]) | UTF_OR_MASK[utf_bytes];
u32 tmp = _pdep_u32(code_point, UTF_PDEP_MASK[utf_bytes]) | UTF_OR_MASK[utf_bytes];
// swap and move to the other side of the register
tmp = __builtin_bswap32(tmp);
tmp >>= (4 - utf_bytes) * 8;
@ -734,7 +746,7 @@ really_inline bool handle_unicode_codepoint(const u8 ** src_ptr, u8 ** dst_ptr)
}
really_inline bool parse_string(const u8 * buf, UNUSED size_t len, UNUSED ParsedJson & pj, u32 tape_loc) {
u32 offset = tape[tape_loc] & 0xffffff;
u32 offset = tape[tape_loc] & 0xffffff;
const u8 * src = &buf[offset+1]; // we know that buf at offset is a "
u8 * dst = current_string_buf_loc;
#ifdef DEBUG
@ -789,11 +801,11 @@ really_inline bool parse_string(const u8 * buf, UNUSED size_t len, UNUSED Parsed
if (!handle_unicode_codepoint(&src, &dst)) {
return false;
}
return true;
return true;
} else {
// simple 1:1 conversion. Will eat bs_dist+2 characters in input and
// write bs_dist+1 characters to output
// note this may reach beyond the part of the buffer we've actually seen.
// note this may reach beyond the part of the buffer we've actually seen.
// I think this is ok
u8 escape_result = escape_map[escape_char];
if (!escape_result)
@ -801,15 +813,15 @@ really_inline bool parse_string(const u8 * buf, UNUSED size_t len, UNUSED Parsed
dst[bs_dist] = escape_result;
src += bs_dist+2;
dst += bs_dist+1;
}
}
} else {
// they are the same. Since they can't co-occur, it means we encountered neither.
src+=32;
dst+=32;
}
}
return true;
}
// later extensions -
// later extensions -
// if \\ we could detect whether it's a substantial run of \ or just eat 2 chars and write 1
// handle anything short of \u or \\\ (as a prefix) with clever PSHUFB stuff and don't leave SIMD
return true;
@ -826,11 +838,11 @@ really_inline bool parse_string(const u8 * buf, UNUSED size_t len, UNUSED Parsed
// TODO: see if we really need a separate number_buf or whether we should just
// have a generic scratch - would need to align before using for this
really_inline bool parse_number(const u8 * buf, UNUSED size_t len, UNUSED ParsedJson & pj, u32 tape_loc, UNUSED bool found_zero, bool found_minus) {
u32 offset = tape[tape_loc] & 0xffffff;
u32 offset = tape[tape_loc] & 0xffffff;
if (found_minus) {
offset++;
}
const u8 * src = &buf[offset];
const u8 * src = &buf[offset];
m256 v = _mm256_loadu_si256((const m256 *)(src));
u64 error_sump = 0;
#ifdef DEBUG
@ -856,7 +868,7 @@ really_inline bool parse_number(const u8 * buf, UNUSED size_t len, UNUSED Parsed
// Terminators
// Whitespace: 0x20, 0x09, 0x0a, 0x0d - bucket 5+6
// Comma and the closes: 0x2c is comma, } is 0x5d, ] is 0x7d - bucket 5+7
// Another shufti - also a bit hand-hacked. Need to make a better construction
const m256 low_nibble_mask = _mm256_setr_epi8(
// 0 1 2 3 4 5 6 7 8 9 a b c d e f
@ -887,7 +899,7 @@ really_inline bool parse_number(const u8 * buf, UNUSED size_t len, UNUSED Parsed
// TODO: make a mask that indicates where our digits are
u32 number_mask = ~enders & (enders-1);
dumpbits32(number_mask, "number mask");
m256 n_mask = _mm256_set1_epi8(0x1f);
m256 tmp_n = _mm256_cmpeq_epi8(_mm256_and_si256(tmp, n_mask),
_mm256_set1_epi8(0));
@ -895,11 +907,11 @@ really_inline bool parse_number(const u8 * buf, UNUSED size_t len, UNUSED Parsed
// put something into our error sump if we have something
// before our ending characters that isn't a valid character
// for the inside of our JSON
// for the inside of our JSON
number_characters &= number_mask;
error_sump |= number_characters ^ number_mask;
error_sump |= number_characters ^ number_mask;
dumpbits32(number_characters, "number characters");
m256 d_mask = _mm256_set1_epi8(0x03);
m256 tmp_d = _mm256_cmpeq_epi8(_mm256_and_si256(tmp, d_mask),
_mm256_set1_epi8(0));
@ -927,10 +939,10 @@ really_inline bool parse_number(const u8 * buf, UNUSED size_t len, UNUSED Parsed
u32 sign_characters = ~(u32)_mm256_movemask_epi8(tmp_s);
sign_characters &= number_mask;
dumpbits32(sign_characters, "sign characters");
u32 digit_edges = ~(digit_characters << 1) & digit_characters;
dumpbits32(digit_edges, "digit_edges");
// check that we have 1-3 'edges' only
u32 t = digit_edges;
t &= t-1; t &= t-1; t &= t-1;
@ -962,7 +974,7 @@ really_inline bool parse_number(const u8 * buf, UNUSED size_t len, UNUSED Parsed
#endif
*((u64 *)current_number_buf_loc) = result;
tape[tape_loc] = ((u32)'l') << 24 | (current_number_buf_loc - number_buf); // assume 2^24 will hold all numbers for now
current_number_buf_loc += 8;
current_number_buf_loc += 8;
} else {
// try a strtod
char * end;
@ -979,7 +991,7 @@ really_inline bool parse_number(const u8 * buf, UNUSED size_t len, UNUSED Parsed
#endif
*((double *)current_number_buf_loc) = result;
tape[tape_loc] = ((u32)'d') << 24 | (current_number_buf_loc - number_buf); // assume 2^24 will hold all numbers for now
current_number_buf_loc += 8;
current_number_buf_loc += 8;
}
// TODO: check the MSB element is a digit
@ -997,12 +1009,12 @@ really_inline bool parse_number(const u8 * buf, UNUSED size_t len, UNUSED Parsed
// TODO: eE mark and +- construct are adjacent with eE first
// eE mark preceeds final cluster of numbers only
// and immediately follows second-last cluster of numbers only (not
// and immediately follows second-last cluster of numbers only (not
// necessarily second, as we may have 4e10).
// it may suffice to insist that eE is preceeded immediately
// by a digit of any kind and that it's followed locally by
// a digit immediately or a +- construct then a digit.
// TODO: if we have both . and the eE mark then the . must
// precede the eE mark
@ -1058,7 +1070,7 @@ never_inline bool shovel_machine(const u8 * buf, size_t len, ParsedJson & pj) {
u32 head_marker_loc = tape[j] & 0xffffffffffffffULL;
u64 tape_enclosing = tape[head_marker_loc];
u8 enclosing_c = tape_enclosing >> 56;
tape[head_marker_loc] = tape[j];
tape[head_marker_loc] = tape[j];
tape[j] = tape_enclosing;
error_sump |= (enclosing_c - head_marker_c - 2); // [] and {} only differ by 2 chars
break;
@ -1073,7 +1085,7 @@ never_inline bool shovel_machine(const u8 * buf, size_t len, ParsedJson & pj) {
case '0':
error_sump |= !parse_number(buf, len, pj, j, true, false);
break;
case '-':
case '-':
error_sump |= !parse_number(buf, len, pj, j, false, true);
break;
case 't': {
@ -1187,7 +1199,7 @@ int main(int argc, char * argv[]) {
pj.structural_indexes = new u32[max_structures];
pj.nodes = new JsonNode[max_structures];
#if defined(DEBUG)
#if defined(DEBUG)
const u32 iterations = 1;
#else
const u32 iterations = 1000;

View File

@ -1,4 +1,4 @@
HEADERS:=include/avxprocessing.h include/benchmark.h include/common_defs.h include/jsonstruct.h include/scalarprocessing.h include/util.h
HEADERS:=include/avxprocessing.h include/benchmark.h include/common_defs.h include/jsonstruct.h include/ include/util.h
bench: benchmarks/bench.cpp rapidjson/license.txt $(HEADERS)
$(CXX) -std=c++11 -O3 -o $@ benchmarks/bench.cpp -Irapidjson/include -Iinclude -march=native -lm -Wall -Wextra -Wno-narrowing

View File

@ -11,7 +11,6 @@
#include "rapidjson/reader.h" // you have to check in the submodule
#include "rapidjson/stringbuffer.h"
#include "rapidjson/writer.h"
#include "scalarprocessing.h"
#include "util.h"
// colorfuldisplay(ParsedJson & pj, const u8 * buf)
@ -82,22 +81,11 @@ int main(int argc, char *argv[]) {
u32 max_structures = ROUNDUP_N(p.second, 64) + 2 + 7;
pj.structural_indexes = new u32[max_structures];
pj.nodes = new JsonNode[max_structures];
if (verbose) {
std::cout << "Parsing SIMD (once) " << std::endl;
avx_json_parse(p.first, p.second, pj);
colorfuldisplay(pj, p.first);
debugdisplay(pj, p.first);
std::cout << "Parsing scalar (once) " << std::endl;
scalar_json_parse(p.first, p.second, pj);
colorfuldisplay(pj, p.first);
debugdisplay(pj, p.first);
}
int repeat = 10;
int volume = p.second;
BEST_TIME_NOCHECK(avx_json_parse(p.first, p.second, pj), , repeat, volume,
true);
BEST_TIME_NOCHECK(scalar_json_parse(p.first, p.second, pj), , repeat, volume,
BEST_TIME(avx_json_parse(p.first, p.second, pj), true , , repeat, volume,
true);
rapidjson::Document d;
@ -120,18 +108,18 @@ int main(int argc, char *argv[]) {
true);
memcpy(buffer, p.first, p.second);
size_t outlength = copy_without_useless_spaces((const uint8_t *)buffer, p.second,(uint8_t *) buffer);
size_t outlength = copy_without_useless_spaces_avx((const uint8_t *)buffer, p.second,(uint8_t *) buffer);
printf("these should match: %zu %zu \n", strlength, outlength);
uint8_t * cbuffer = (uint8_t *)buffer;
BEST_TIME(copy_without_useless_spaces(cbuffer, p.second,cbuffer), outlength,
BEST_TIME(copy_without_useless_spaces_avx(cbuffer, p.second,cbuffer), outlength,
memcpy(buffer, p.first, p.second), repeat, volume, true);
BEST_TIME(despace(cbuffer, p.second,cbuffer), outlength,
BEST_TIME(scalar_despace(cbuffer, p.second,cbuffer), outlength,
memcpy(buffer, p.first, p.second), repeat, volume, true);
BEST_TIME(d.ParseInsitu(buffer).HasParseError(),false, cbuffer[copy_without_useless_spaces((const uint8_t *)p.first, p.second,cbuffer)]='\0' , repeat, volume,
printf("parsing with RapidJSON after despacing:\n");
BEST_TIME(d.ParseInsitu(buffer).HasParseError(),false, cbuffer[copy_without_useless_spaces_avx((const uint8_t *)p.first, p.second,cbuffer)]='\0' , repeat, volume,
true);
free(buffer);

View File

@ -33,7 +33,7 @@ static uint64_t cmp_mask_against_input_mini(__m256i input_lo, __m256i input_hi,
// take input from buf and remove useless whitespace, input and output can be
// the same
static inline size_t copy_without_useless_spaces(const uint8_t *buf, size_t len,
static inline size_t copy_without_useless_spaces_avx(const uint8_t *buf, size_t len,
uint8_t *out) {
// Useful constant masks
const uint64_t even_bits = 0x5555555555555555ULL;
@ -117,11 +117,11 @@ static inline size_t copy_without_useless_spaces(const uint8_t *buf, size_t len,
int pop3 = _popcnt64((~whitespace) & UINT64_C(0xFFFFFFFFFFFF));
int pop4 = _popcnt64((~whitespace));
__m256i vmask1 =
_mm256_loadu2_m128i((const __m128i *)mask128_epi8 + mask2,
(const __m128i *)mask128_epi8 + mask1);
_mm256_loadu2_m128i((const __m128i *)mask128_epi8 + (mask2 & 0x7FFFF),
(const __m128i *)mask128_epi8 + (mask1 & 0x7FFFF));
__m256i vmask2 =
_mm256_loadu2_m128i((const __m128i *)mask128_epi8 + mask4,
(const __m128i *)mask128_epi8 + mask3);
_mm256_loadu2_m128i((const __m128i *)mask128_epi8 + (mask4 & 0x7FFFF),
(const __m128i *)mask128_epi8 + (mask3 & 0x7FFFF));
__m256i result1 = _mm256_shuffle_epi8(input_lo, vmask1);
__m256i result2 = _mm256_shuffle_epi8(input_hi, vmask2);
_mm256_storeu2_m128i((__m128i *)(out + pop1), (__m128i *)out, result1);
@ -163,36 +163,7 @@ static inline size_t copy_without_useless_spaces(const uint8_t *buf, size_t len,
_mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFF), 0));
quote_mask ^= prev_iter_inside_quote;
prev_iter_inside_quote = (uint64_t)((s64)quote_mask >> 63);
/*
const __m256i low_nibble_mask = _mm256_setr_epi8(
// 0 9 a b c d
16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0, 16, 0, 0, 0, 0, 0, 0,
0, 0, 8, 12, 1, 2, 9, 0, 0);
const __m256i high_nibble_mask = _mm256_setr_epi8(
// 0 2 3 5 7
8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0, 8, 0, 18, 4, 0, 1, 0,
1, 0, 0, 0, 3, 2, 1, 0, 0);
__m256i whitespace_shufti_mask = _mm256_set1_epi8(0x18);
__m256i v_lo = _mm256_and_si256(
_mm256_shuffle_epi8(low_nibble_mask, input_lo),
_mm256_shuffle_epi8(high_nibble_mask,
_mm256_and_si256(_mm256_srli_epi32(input_lo, 4),
_mm256_set1_epi8(0x7f))));
__m256i v_hi = _mm256_and_si256(
_mm256_shuffle_epi8(low_nibble_mask, input_hi),
_mm256_shuffle_epi8(high_nibble_mask,
_mm256_and_si256(_mm256_srli_epi32(input_hi, 4),
_mm256_set1_epi8(0x7f))));
__m256i tmp_ws_lo = _mm256_cmpeq_epi8(
_mm256_and_si256(v_lo, whitespace_shufti_mask), _mm256_set1_epi8(0));
__m256i tmp_ws_hi = _mm256_cmpeq_epi8(
_mm256_and_si256(v_hi, whitespace_shufti_mask), _mm256_set1_epi8(0));
uint64_t ws_res_0 = (uint32_t)_mm256_movemask_epi8(tmp_ws_lo);
uint64_t ws_res_1 = _mm256_movemask_epi8(tmp_ws_hi);
uint64_t whitespace = ~(ws_res_0 | (ws_res_1 << 32));
*/
__m256i mask_20 = _mm256_set1_epi8(0x20); // c==32
__m256i mask_70 =
_mm256_set1_epi8(0x70); // adding 0x70 does not check low 4-bits
@ -228,10 +199,10 @@ static inline size_t copy_without_useless_spaces(const uint8_t *buf, size_t len,
int pop2 = _popcnt64((~whitespace) & UINT64_C(0xFFFFFFFF));
int pop3 = _popcnt64((~whitespace) & UINT64_C(0xFFFFFFFFFFFF));
int pop4 = _popcnt64((~whitespace));
__m256i vmask1 = _mm256_loadu2_m128i((const __m128i *)mask128_epi8 + mask2,
(const __m128i *)mask128_epi8 + mask1);
__m256i vmask2 = _mm256_loadu2_m128i((const __m128i *)mask128_epi8 + mask4,
(const __m128i *)mask128_epi8 + mask3);
__m256i vmask1 = _mm256_loadu2_m128i((const __m128i *)mask128_epi8 + (mask2 & 0x7FFF),
(const __m128i *)mask128_epi8 + (mask1 & 0x7FFF));
__m256i vmask2 = _mm256_loadu2_m128i((const __m128i *)mask128_epi8 + (mask4 & 0x7FFF),
(const __m128i *)mask128_epi8 + (mask3 & 0x7FFF));
__m256i result1 = _mm256_shuffle_epi8(input_lo, vmask1);
__m256i result2 = _mm256_shuffle_epi8(input_hi, vmask2);
_mm256_storeu2_m128i((__m128i *)(buffer + pop1), (__m128i *)buffer,

View File

@ -15,8 +15,41 @@
#include "jsonstruct.h"
using namespace std;
#ifdef DEBUG
inline void dump256(m256 d, string msg) {
for (u32 i = 0; i < 32; i++) {
cout << setw(3) << (int)*(((u8 *)(&d)) + i);
if (!((i+1)%8))
cout << "|";
else if (!((i+1)%4))
cout << ":";
else
cout << " ";
}
cout << " " << msg << "\n";
}
// dump bits low to high
void dumpbits(u64 v, string msg) {
for (u32 i = 0; i < 64; i++) {
std::cout << (((v>>(u64)i) & 0x1ULL) ? "1" : "_");
}
cout << " " << msg << "\n";
}
void dumpbits32(u32 v, string msg) {
for (u32 i = 0; i < 32; i++) {
std::cout << (((v>>(u32)i) & 0x1ULL) ? "1" : "_");
}
cout << " " << msg << "\n";
}
#else
#define dump256(a,b) ;
#define dumpbits(a,b) ;
#define dumpbits32(a,b) ;
#endif
// a straightforward comparison of a mask against input. 5 uops; would be cheaper in AVX512.
static u64 cmp_mask_against_input(m256 input_lo, m256 input_hi, m256 mask) {
really_inline u64 cmp_mask_against_input(m256 input_lo, m256 input_hi, m256 mask) {
m256 cmp_res_0 = _mm256_cmpeq_epi8(input_lo, mask);
u64 res_0 = (u32)_mm256_movemask_epi8(cmp_res_0);
m256 cmp_res_1 = _mm256_cmpeq_epi8(input_hi, mask);
@ -24,7 +57,7 @@ static u64 cmp_mask_against_input(m256 input_lo, m256 input_hi, m256 mask) {
return res_0 | (res_1 << 32);
}
static bool find_structural_bits(const u8 * buf, size_t len, ParsedJson & pj) {
never_inline bool find_structural_bits(const u8 * buf, size_t len, ParsedJson & pj) {
// Useful constant masks
const u64 even_bits = 0x5555555555555555ULL;
const u64 odd_bits = ~even_bits;
@ -39,6 +72,18 @@ static bool find_structural_bits(const u8 * buf, size_t len, ParsedJson & pj) {
for (size_t idx = 0; idx < len; idx+=64) {
__builtin_prefetch(buf + idx + 128);
#ifdef DEBUG
cout << "Idx is " << idx << "\n";
for (u32 j = 0; j < 64; j++) {
char c = *(buf+idx+j);
if (isprint(c)) {
cout << c;
} else {
cout << '_';
}
}
cout << "| ... input\n";
#endif
m256 input_lo = _mm256_load_si256((const m256 *)(buf + idx + 0));
m256 input_hi = _mm256_load_si256((const m256 *)(buf + idx + 32));
@ -47,13 +92,18 @@ static bool find_structural_bits(const u8 * buf, size_t len, ParsedJson & pj) {
////////////////////////////////////////////////////////////////////////////////////////////
u64 bs_bits = cmp_mask_against_input(input_lo, input_hi, _mm256_set1_epi8('\\'));
dumpbits(bs_bits, "backslash bits");
u64 start_edges = bs_bits & ~(bs_bits << 1);
dumpbits(start_edges, "start_edges");
// flip lowest if we have an odd-length run at the end of the prior iteration
u64 even_start_mask = even_bits ^ prev_iter_ends_odd_backslash;
u64 even_starts = start_edges & even_start_mask;
u64 odd_starts = start_edges & ~even_start_mask;
dumpbits(even_starts, "even_starts");
dumpbits(odd_starts, "odd_starts");
u64 even_carries = bs_bits + even_starts;
u64 odd_carries;
@ -65,12 +115,22 @@ static bool find_structural_bits(const u8 * buf, size_t len, ParsedJson & pj) {
// if we had an odd-numbered run at the end of
// the previous iteration
prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL;
dumpbits(even_carries, "even_carries");
dumpbits(odd_carries, "odd_carries");
u64 even_carry_ends = even_carries & ~bs_bits;
u64 odd_carry_ends = odd_carries & ~bs_bits;
dumpbits(even_carry_ends, "even_carry_ends");
dumpbits(odd_carry_ends, "odd_carry_ends");
u64 even_start_odd_end = even_carry_ends & odd_bits;
u64 odd_start_even_end = odd_carry_ends & even_bits;
dumpbits(even_start_odd_end, "esoe");
dumpbits(odd_start_even_end, "osee");
u64 odd_ends = even_start_odd_end | odd_start_even_end;
dumpbits(odd_ends, "odd_ends");
////////////////////////////////////////////////////////////////////////////////////////////
// Step 2: detect insides of quote pairs
@ -78,10 +138,12 @@ static bool find_structural_bits(const u8 * buf, size_t len, ParsedJson & pj) {
u64 quote_bits = cmp_mask_against_input(input_lo, input_hi, _mm256_set1_epi8('"'));
quote_bits = quote_bits & ~odd_ends;
dumpbits(quote_bits, "quote_bits");
u64 quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128(_mm_set_epi64x(0ULL, quote_bits),
_mm_set1_epi8(0xFF), 0));
quote_mask ^= prev_iter_inside_quote;
prev_iter_inside_quote = (u64)((s64)quote_mask>>63);
dumpbits(quote_mask, "quote_mask");
// How do we build up a user traversable data structure
// first, do a 'shufti' to detect structural JSON characters
@ -133,6 +195,9 @@ static bool find_structural_bits(const u8 * buf, size_t len, ParsedJson & pj) {
u64 ws_res_1 = _mm256_movemask_epi8(tmp_ws_hi);
u64 whitespace = ~(ws_res_0 | (ws_res_1 << 32));
dumpbits(structurals, "structurals");
dumpbits(whitespace, "whitespace");
// mask off anything inside quotes
structurals &= ~quote_mask;
@ -149,14 +214,20 @@ static bool find_structural_bits(const u8 * buf, size_t len, ParsedJson & pj) {
// a qualified predecessor is something that can happen 1 position before an
// psuedo-structural character
u64 pseudo_pred = structurals | whitespace;
dumpbits(pseudo_pred, "pseudo_pred");
u64 shifted_pseudo_pred = (pseudo_pred << 1) | prev_iter_ends_pseudo_pred;
dumpbits(shifted_pseudo_pred, "shifted_pseudo_pred");
prev_iter_ends_pseudo_pred = pseudo_pred >> 63;
u64 pseudo_structurals = shifted_pseudo_pred & (~whitespace) & (~quote_mask);
dumpbits(pseudo_structurals, "pseudo_structurals");
dumpbits(structurals, "final structurals without pseudos");
structurals |= pseudo_structurals;
dumpbits(structurals, "final structurals and pseudo structurals");
// now, we've used our close quotes all we need to. So let's switch them off
// they will be off in the quote mask and on in quote bits.
structurals &= ~(quote_bits & ~quote_mask);
dumpbits(structurals, "final structurals and pseudo structurals after close quote removal");
*(u64 *)(pj.structurals + idx/8) = structurals;
}
return true;
@ -166,15 +237,30 @@ const u32 NUM_RESERVED_NODES = 2;
const u32 DUMMY_NODE = 0;
const u32 ROOT_NODE = 1;
#define VECDECODE
#ifdef VECDECODE
#include "vecdecode.h"
#endif
// just transform the bitmask to a big list of 32-bit integers for now
// that's all; the type of character the offset points to will
// tell us exactly what we need to know. Naive but straightforward implementation
static bool flatten_indexes(size_t len, ParsedJson & pj) {
u32 base = NUM_RESERVED_NODES;
never_inline bool flatten_indexes(size_t len, ParsedJson & pj) {
u32 * base_ptr = pj.structural_indexes;
base_ptr[DUMMY_NODE] = base_ptr[ROOT_NODE] = 0; // really shouldn't matter
#ifdef VECDECODE
u32 number = bitmap_decode_avx2(pj.structurals, len, base_ptr + NUM_RESERVED_NODES) + NUM_RESERVED_NODES;
pj.n_structural_indexes = number;
base_ptr[pj.n_structural_indexes] = 0; // make it safe to dereference one beyond this array
return true;
#else
u32 base = NUM_RESERVED_NODES;
for (size_t idx = 0; idx < len; idx+=64) {
u64 s = *(u64 *)(pj.structurals + idx/8);
#ifdef SUPPRESS_CHEESY_FLATTEN
while (s) {
base_ptr[base++] = (u32)idx + __builtin_ctzll(s); s &= s - 1ULL;
}
#else
u32 cnt = __builtin_popcountll(s);
u32 next_base = base + cnt;
while (s) {
@ -193,14 +279,20 @@ static bool flatten_indexes(size_t len, ParsedJson & pj) {
base += 6;
}
base = next_base;
#endif
}
pj.n_structural_indexes = base;
base_ptr[pj.n_structural_indexes] = 0; // make it safe to dereference one beyond this array
return true;
#endif
}
const u32 MAX_DEPTH = 256;
const u32 DEPTH_SAFETY_MARGIN = 32; // should be power-of-2 as we check this with a modulo in our
// hot stage 3 loop
const u32 START_DEPTH = DEPTH_SAFETY_MARGIN;
const u32 REDLINE_DEPTH = MAX_DEPTH - DEPTH_SAFETY_MARGIN;
// the ape machine consists of two parts:
//
@ -277,7 +369,7 @@ const size_t MAX_TAPE = MAX_DEPTH * MAX_TAPE_ENTRIES;
// all of this stuff needs to get moved somewhere reasonable
// like our ParsedJson structure
u32 tape[MAX_TAPE];
u64 tape[MAX_TAPE];
u32 tape_locs[MAX_DEPTH];
u8 string_buf[512*1024];
u8 * current_string_buf_loc;
@ -290,6 +382,20 @@ u32 trans[MAX_STATES][256];
u32 states[MAX_DEPTH];
const int START_STATE = 1;
// weird sub-machine for starting depth only
// we start at 13 and go to 14 on a single UNARY
// 14 doesn't have to have any transitions. Anything
// else arrives after the single thing it's an error
const int START_DEPTH_START_STATE = 13;
// ANYTHING_IS_ERROR_STATE is useful both as a target
// for a transition at the start depth and also as
// a good initial value for "red line" depths; that
// is, depths that are maintained strictly to avoid
// undefined behavior (e.g. depths below the starting
// depth).
const int ANYTHING_IS_ERROR_STATE = 14;
never_inline void init_state_machine() {
// states 10 and 6 eliminated
@ -310,6 +416,7 @@ never_inline void init_state_machine() {
trans[ 5][(u32)UNARIES[i]] = 7;
trans[ 9][(u32)UNARIES[i]] = 11;
trans[12][(u32)UNARIES[i]] = 11;
trans[13][(u32)UNARIES[i]] = 14;
}
// back transitions when new things are open
@ -321,10 +428,10 @@ never_inline void init_state_machine() {
trans[7]['['] = 9;
trans[9]['['] = 9;
trans[11]['['] = 9;
}
static bool ape_machine(const u8 * buf, UNUSED size_t len, ParsedJson & pj) {
never_inline bool ape_machine(const u8 * buf, UNUSED size_t len, ParsedJson & pj) {
// NOTE - our depth is used by both the tape machine and the state machine
// Further, in production we will set it to a largish value in a generous buffer as a rogue input
// could consist of many {[ characters or many }] characters. We aren't busily checking errors
@ -337,11 +444,17 @@ static bool ape_machine(const u8 * buf, UNUSED size_t len, ParsedJson & pj) {
// is an error (so we can detect max_depth violations by making sure that specious tape locations haven't
// moved from their starting values)
u32 depth = 1;
u32 depth = START_DEPTH;
for (u32 i = 0; i < MAX_DEPTH; i++) {
tape_locs[i] = i*MAX_TAPE_ENTRIES;
states[i] = START_STATE;
if (i == START_DEPTH) {
states[i] = START_DEPTH_START_STATE;
} else if ((i < START_DEPTH) || (i >= REDLINE_DEPTH)) {
states[i] = ANYTHING_IS_ERROR_STATE;
} else {
states[i] = START_STATE;
}
}
current_string_buf_loc = string_buf;
@ -355,6 +468,18 @@ static bool ape_machine(const u8 * buf, UNUSED size_t len, ParsedJson & pj) {
u32 next_control = char_control[next_c];
for (u32 i = NUM_RESERVED_NODES; i < pj.n_structural_indexes; i++) {
// very periodic safety checking. This does NOT guarantee that we
// haven't been in our dangerous zones above or below our normal
// depths. It ONLY checks to be sure that we don't manage to leave
// these zones and write completely off our tape.
if (!(i%DEPTH_SAFETY_MARGIN)) {
if (depth < START_DEPTH || depth >= REDLINE_DEPTH) {
error_sump |= 1;
break;
}
}
u32 idx = next_idx;
u8 c = next_c;
u32 control = next_control;
@ -368,13 +493,61 @@ static bool ape_machine(const u8 * buf, UNUSED size_t len, ParsedJson & pj) {
u8 write_size = get_write_size(control);
u32 write_val = (depth_adjust != 0) ? old_tape_loc : idx;
depth += depth_adjust;
//states[depth] = trans[states[depth]][c];
#ifdef DEBUG
cout << "i: " << i << " idx: " << idx << " c " << c << "\n";
cout << "TAPE MACHINE: depth change " << (s32)depth_adjust
<< " write_size " << (u32)write_size << " current_depth: " << depth << "\n";
#endif
// STATE MACHINE - hoisted here to fill in during the tape machine's latencies
#ifdef DEBUG
cout << "STATE MACHINE: state[depth] pre " << states[depth] << " ";
#endif
states[depth] = trans[states[depth]][c];
#ifdef DEBUG
cout << "post " << states[depth] << "\n";
#endif
// TAPE MACHINE, again
tape[tape_locs[depth]] = write_val | (c << 24); // hack. Assumes no more than 2^24 tape items and buffer size for now
tape[tape_locs[depth]] = write_val | (((u64)c) << 56);
old_tape_loc = tape_locs[depth] += write_size;
}
/*
for (u32 i = 0; i < MAX_DEPTH; i++) {
if (states[i] == 0) {
printf("duuh\n");
return false;
}
}*/
#define DUMP_TAPES
#ifdef DEBUG
for (u32 i = 0; i < MAX_DEPTH; i++) {
u32 start_loc = i*MAX_TAPE_ENTRIES;
cout << " tape section i " << i;
if (i == START_DEPTH) {
cout << " (START) ";
} else if ((i < START_DEPTH) || (i >= REDLINE_DEPTH)) {
cout << " (REDLINE) ";
} else {
cout << " (NORMAL) ";
}
cout << " from: " << start_loc
<< " to: " << tape_locs[i] << " "
<< " size: " << (tape_locs[i]-start_loc) << "\n";
cout << " state: " << states[i] << "\n";
#ifdef DUMP_TAPES
for (u32 j = start_loc; j < tape_locs[i]; j++) {
if (tape[j]) {
cout << "j: " << j << " tape[j] char " << (char)(tape[j]>>56)
<< " tape[j][0..55]: " << (tape[j]&0xffffffffffffffULL ) << "\n";
}
}
#endif
}
#endif
if (error_sump) {
printf("error_sump\n");
return false;
}
return true;
@ -500,7 +673,7 @@ bool hex_to_u32(const u8 * src, u32 * res) {
// dest will advance a variable amount (return via pointer)
// return true if the unicode codepoint was valid
// We work in little-endian then swap at write time
static bool handle_unicode_codepoint(const u8 ** src_ptr, u8 ** dst_ptr) {
really_inline bool handle_unicode_codepoint(const u8 ** src_ptr, u8 ** dst_ptr) {
u32 code_point = 0; // read the hex, potentially reading another \u beyond if it's a // wacky one
if (!hex_to_u32(*src_ptr + 2, &code_point)) {
return false;
@ -534,20 +707,43 @@ static bool handle_unicode_codepoint(const u8 ** src_ptr, u8 ** dst_ptr) {
return true;
}
static bool parse_string(const u8 * buf, UNUSED size_t len, UNUSED ParsedJson & pj, u32 tape_loc) {
really_inline bool parse_string(const u8 * buf, UNUSED size_t len, UNUSED ParsedJson & pj, u32 tape_loc) {
u32 offset = tape[tape_loc] & 0xffffff;
const u8 * src = &buf[offset+1]; // we know that buf at offset is a "
u8 * dst = current_string_buf_loc;
#ifdef DEBUG
cout << "Entering parse string with offset " << offset << "\n";
#endif
// basic non-sexy parsing code
while (1) {
#ifdef DEBUG
for (u32 j = 0; j < 32; j++) {
char c = *(src+j);
if (isprint(c)) {
cout << c;
} else {
cout << '_';
}
}
cout << "| ... string handling input\n";
#endif
m256 v = _mm256_loadu_si256((const m256 *)(src));
u32 bs_bits = (u32)_mm256_movemask_epi8(_mm256_cmpeq_epi8(v, _mm256_set1_epi8('\\')));
dumpbits32(bs_bits, "backslash bits 2");
u32 quote_bits = (u32)_mm256_movemask_epi8(_mm256_cmpeq_epi8(v, _mm256_set1_epi8('"')));
dumpbits32(quote_bits, "quote_bits");
u32 quote_dist = __builtin_ctz(quote_bits);
u32 bs_dist = __builtin_ctz(bs_bits);
// store to dest unconditionally - we can overwrite the bits we don't like later
_mm256_storeu_si256((m256 *)(dst), v);
#ifdef DEBUG
cout << "quote dist: " << quote_dist << " bs dist: " << bs_dist << "\n";
#endif
if (quote_dist < bs_dist) {
#ifdef DEBUG
cout << "Found end, leaving!\n";
#endif
// we encountered quotes first. Move dst to point to quotes and exit
dst[quote_dist] = 0; // null terminate and get out
current_string_buf_loc = dst + quote_dist + 1;
@ -555,6 +751,9 @@ static bool parse_string(const u8 * buf, UNUSED size_t len, UNUSED ParsedJson &
return true;
} else if (quote_dist > bs_dist) {
u8 escape_char = src[bs_dist+1];
#ifdef DEBUG
cout << "Found escape char: " << escape_char << "\n";
#endif
// we encountered backslash first. Handle backslash
if (escape_char == 'u') {
// move src/dst up to the start; they will be further adjusted
@ -600,7 +799,7 @@ static bool parse_string(const u8 * buf, UNUSED size_t len, UNUSED ParsedJson &
// TODO: see if we really need a separate number_buf or whether we should just
// have a generic scratch - would need to align before using for this
static bool parse_number(const u8 * buf, UNUSED size_t len, UNUSED ParsedJson & pj, u32 tape_loc, UNUSED bool found_zero, bool found_minus) {
really_inline bool parse_number(const u8 * buf, UNUSED size_t len, UNUSED ParsedJson & pj, u32 tape_loc, UNUSED bool found_zero, bool found_minus) {
u32 offset = tape[tape_loc] & 0xffffff;
if (found_minus) {
offset++;
@ -608,6 +807,17 @@ static bool parse_number(const u8 * buf, UNUSED size_t len, UNUSED ParsedJson &
const u8 * src = &buf[offset];
m256 v = _mm256_loadu_si256((const m256 *)(src));
u64 error_sump = 0;
#ifdef DEBUG
for (u32 j = 0; j < 32; j++) {
char c = *(src+j);
if (isprint(c)) {
cout << c;
} else {
cout << '_';
}
}
cout << "| ... number handling input\n";
#endif
// categories to extract
// Digits:
@ -642,6 +852,7 @@ static bool parse_number(const u8 * buf, UNUSED size_t len, UNUSED ParsedJson &
m256 tmp_enders = _mm256_cmpeq_epi8(_mm256_and_si256(tmp, enders_mask),
_mm256_set1_epi8(0));
u32 enders = ~(u32)_mm256_movemask_epi8(tmp_enders);
dumpbits32(enders, "ender characters");
if (enders == 0) {
// TODO: scream for help if enders == 0 which means we have
@ -649,6 +860,7 @@ static bool parse_number(const u8 * buf, UNUSED size_t len, UNUSED ParsedJson &
}
// TODO: make a mask that indicates where our digits are
u32 number_mask = ~enders & (enders-1);
dumpbits32(number_mask, "number mask");
m256 n_mask = _mm256_set1_epi8(0x1f);
m256 tmp_n = _mm256_cmpeq_epi8(_mm256_and_si256(tmp, n_mask),
@ -660,32 +872,38 @@ static bool parse_number(const u8 * buf, UNUSED size_t len, UNUSED ParsedJson &
// for the inside of our JSON
number_characters &= number_mask;
error_sump |= number_characters ^ number_mask;
dumpbits32(number_characters, "number characters");
m256 d_mask = _mm256_set1_epi8(0x03);
m256 tmp_d = _mm256_cmpeq_epi8(_mm256_and_si256(tmp, d_mask),
_mm256_set1_epi8(0));
u32 digit_characters = ~(u32)_mm256_movemask_epi8(tmp_d);
digit_characters &= number_mask;
dumpbits32(digit_characters, "digit characters");
m256 p_mask = _mm256_set1_epi8(0x04);
m256 tmp_p = _mm256_cmpeq_epi8(_mm256_and_si256(tmp, p_mask),
_mm256_set1_epi8(0));
u32 decimal_characters = ~(u32)_mm256_movemask_epi8(tmp_p);
decimal_characters &= number_mask;
dumpbits32(decimal_characters, "decimal characters");
m256 e_mask = _mm256_set1_epi8(0x08);
m256 tmp_e = _mm256_cmpeq_epi8(_mm256_and_si256(tmp, e_mask),
_mm256_set1_epi8(0));
u32 exponent_characters = ~(u32)_mm256_movemask_epi8(tmp_e);
exponent_characters &= number_mask;
dumpbits32(exponent_characters, "exponent characters");
m256 s_mask = _mm256_set1_epi8(0x10);
m256 tmp_s = _mm256_cmpeq_epi8(_mm256_and_si256(tmp, s_mask),
_mm256_set1_epi8(0));
u32 sign_characters = ~(u32)_mm256_movemask_epi8(tmp_s);
sign_characters &= number_mask;
dumpbits32(sign_characters, "sign characters");
u32 digit_edges = ~(digit_characters << 1) & digit_characters;
dumpbits32(digit_edges, "digit_edges");
// check that we have 1-3 'edges' only
u32 t = digit_edges;
@ -713,6 +931,9 @@ static bool parse_number(const u8 * buf, UNUSED size_t len, UNUSED ParsedJson &
if (found_minus) {
result = -result;
}
#ifdef DEBUG
cout << "Found number " << result << "\n";
#endif
*((u64 *)current_number_buf_loc) = result;
tape[tape_loc] = ((u32)'l') << 24 | (current_number_buf_loc - number_buf); // assume 2^24 will hold all numbers for now
current_number_buf_loc += 8;
@ -727,6 +948,9 @@ static bool parse_number(const u8 * buf, UNUSED size_t len, UNUSED ParsedJson &
if (found_minus) {
result = -result;
}
#ifdef DEBUG
cout << "Found number " << result << "\n";
#endif
*((double *)current_number_buf_loc) = result;
tape[tape_loc] = ((u32)'d') << 24 | (current_number_buf_loc - number_buf); // assume 2^24 will hold all numbers for now
current_number_buf_loc += 8;
@ -764,7 +988,13 @@ static bool parse_number(const u8 * buf, UNUSED size_t len, UNUSED ParsedJson &
return true;
}
static bool shovel_machine(const u8 * buf, size_t len, ParsedJson & pj) {
bool tape_disturbed(u32 i) {
u32 start_loc = i*MAX_TAPE_ENTRIES;
u32 end_loc = tape_locs[i];
return start_loc != end_loc;
}
never_inline bool shovel_machine(const u8 * buf, size_t len, ParsedJson & pj) {
// fixup the mess made by the ape_machine
// as such it does a bunch of miscellaneous things on the tapes
u32 error_sump = 0;
@ -774,13 +1004,23 @@ static bool shovel_machine(const u8 * buf, size_t len, ParsedJson & pj) {
u64 mask4 = 0x00000000ffffffff;
u64 mask5 = 0x000000ffffffffff;
// if the tape has been touched at all at the depths outside the safe
// zone we need to quit. Note that our periodic checks to see that we're
// inside our safe zone in stage 3 don't guarantee that the system did
// not get into the danger area briefly.
if (tape_disturbed(START_DEPTH - 1) || tape_disturbed(REDLINE_DEPTH)) {
return false;
}
// walk over each tape
for (u32 i = 0; i < MAX_DEPTH; i++) {
for (u32 i = START_DEPTH; i < MAX_DEPTH; i++) {
u32 start_loc = i*MAX_TAPE_ENTRIES;
u32 end_loc = tape_locs[i];
if (start_loc == end_loc) {
break;
}
for (u32 j = start_loc; j < end_loc; j++) {
switch (tape[j]>>24) {
switch (tape[j]>>56) {
case '{': case '[': {
// pivot our tapes
// point the enclosing structural char (}]) to the head marker ({[) and
@ -788,10 +1028,10 @@ static bool shovel_machine(const u8 * buf, size_t len, ParsedJson & pj) {
// we start with head marker pointing at the enclosing structural char
// and the enclosing structural char pointing at the end. Just swap them.
// also check the balanced-{} or [] property here
u8 head_marker_c = tape[j] >> 24;
u32 head_marker_loc = tape[j] & 0xffffff;
u32 tape_enclosing = tape[head_marker_loc];
u8 enclosing_c = tape_enclosing >> 24;
u8 head_marker_c = tape[j] >> 56;
u32 head_marker_loc = tape[j] & 0xffffffffffffffULL;
u64 tape_enclosing = tape[head_marker_loc];
u8 enclosing_c = tape_enclosing >> 56;
tape[head_marker_loc] = tape[j];
tape[j] = tape_enclosing;
error_sump |= (enclosing_c - head_marker_c - 2); // [] and {} only differ by 2 chars
@ -811,21 +1051,21 @@ static bool shovel_machine(const u8 * buf, size_t len, ParsedJson & pj) {
error_sump |= !parse_number(buf, len, pj, j, false, true);
break;
case 't': {
u32 offset = tape[j] & 0xffffff;
u32 offset = tape[j] & 0xffffffffffffffULL;
const u8 * loc = buf + offset;
error_sump |= ((*(const u64 *)loc) & mask4) ^ tv;
error_sump |= is_not_structural_or_whitespace(loc[4]);
break;
}
case 'f': {
u32 offset = tape[j] & 0xffffff;
u32 offset = tape[j] & 0xffffffffffffffULL;
const u8 * loc = buf + offset;
error_sump |= ((*(const u64 *)loc) & mask5) ^ fv;
error_sump |= is_not_structural_or_whitespace(loc[5]);
break;
}
case 'n': {
u32 offset = tape[j] & 0xffffff;
u32 offset = tape[j] & 0xffffffffffffffULL;
const u8 * loc = buf + offset;
error_sump |= ((*(const u64 *)loc) & mask4) ^ nv;
error_sump |= is_not_structural_or_whitespace(loc[4]);
@ -837,6 +1077,7 @@ static bool shovel_machine(const u8 * buf, size_t len, ParsedJson & pj) {
}
}
if (error_sump) {
cerr << "Ugh!\n";
return false;
}
return true;
@ -845,8 +1086,14 @@ static bool shovel_machine(const u8 * buf, size_t len, ParsedJson & pj) {
static bool avx_json_parse(const u8 * buf, size_t len, ParsedJson & pj) {
find_structural_bits(buf, len, pj);
flatten_indexes(len, pj);
return ape_machine(buf, len, pj) && shovel_machine(buf, len, pj);
bool apeok = ape_machine(buf, len, pj);
if(!apeok) {
return false;
}
return shovel_machine(buf, len, pj);
}

View File

@ -3,9 +3,9 @@
#include "common_defs.h"
struct JsonNode {
u32 up;
u32 next;
u32 prev;
u32 next_type;
u64 payload; // a freeform 'payload' holding a parsed representation of *something*
};
struct ParsedJson {
@ -68,15 +68,3 @@ void colorfuldisplay(ParsedJson & pj, const u8 * buf) {
}
std::cout << std::endl;
}
void debugdisplay(ParsedJson & pj, const u8 * buf) {
for (u32 i = 0; i < pj.n_structural_indexes; i++) {
u32 idx = pj.structural_indexes[i];
JsonNode & n = pj.nodes[i];
std::cout << "i: " << i;
std::cout << " n.up: " << n.up;
std::cout << " n.next: " << n.next;
std::cout << " n.prev: " << n.prev;
std::cout << " idx: " << idx << " buf[idx] " << buf[idx] << std::endl;
}
}

View File

@ -34,7 +34,7 @@ static uint8_t jump_table[256 * 3] = {
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
};
static inline size_t despace(const unsigned char *bytes, size_t howmany,
static inline size_t scalar_despace(const unsigned char *bytes, size_t howmany,
unsigned char *out) {
size_t i = 0, pos = 0;
uint8_t quote = 0;

View File

@ -1,79 +0,0 @@
#include "common_defs.h"
#include "jsonstruct.h"
bool is_valid_escape(char c) {
return (c == '"') || (c == '\\') || (c == '/') || (c == 'b') || (c == 'f') || (c == 'n') || (c == 'r') || (c == 't') || (c == 'u');
}
bool scalar_json_parse(const u8 * buf, size_t len, ParsedJson & pj) {
// this is a naive attempt at this point
// it will probably be subject to failures given adversarial inputs
size_t pos = 0;
size_t last = 0;
size_t up = 0;
const u32 DUMMY_NODE = 0;
const u32 ROOT_NODE = 1;
pj.structural_indexes[DUMMY_NODE] = 0;
pj.structural_indexes[ROOT_NODE] = 0;
JsonNode & dummy = pj.nodes[DUMMY_NODE];
JsonNode & root = pj.nodes[ROOT_NODE];
dummy.prev = dummy.up = DUMMY_NODE;
dummy.next = 0;
root.prev = DUMMY_NODE;
root.up = ROOT_NODE;
root.next = 0;
last = up = ROOT_NODE;
pos = 2;
for(size_t i = 0; i < len; i++) {
JsonNode & n = pj.nodes[pos];
switch (buf[i]) {
case '[':
case '{':
pj.structural_indexes[pos] = i;
n.prev = last;
pj.nodes[last].next = pos;// two-way linked list
n.up = up;
up = pos;// new possible up
last = 0;
pos += 1;
break;
case ']':
case '}':
pj.structural_indexes[pos] = i;
n.prev = up;
n.next = 0;// necessary?
pj.nodes[up].next = pos;// two-way linked list
n.up = pj.nodes[up].up;
up = pj.nodes[up].up;
last = pos;// potential previous
pos += 1;
break;
case '"':
case ':':
case ',':
pj.structural_indexes[pos] = i;
n.prev = last;
n.next = 0;// necessary
pj.nodes[last].next = pos;// two-way linked list
n.up = up;
last = pos;// potential previous
pos += 1;
break;
case '\\':
if(i == len - 1) return false;
if(!is_valid_escape(buf[i+1])) return false;
i = i + 1; // skip valid escape
default:
// nothing
break;
}
}
pj.n_structural_indexes = pos;
dummy.next = DUMMY_NODE; // dummy.next is a sump for meaningless 'nexts', clear it
return true;
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,335 @@
#ifndef VECDECODE_H
#define VECDECODE_H
#if defined(_MSC_VER)
#define ALIGNED(x) __declspec(align(x))
#else
#if defined(__GNUC__)
#define ALIGNED(x) __attribute__((aligned(x)))
#endif
#endif
static uint8_t lengthTable[256] = {
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4,
2, 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4,
2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6,
4, 5, 5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5,
3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6,
4, 5, 5, 6, 5, 6, 6, 7, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8};
static uint32_t vecDecodeTable[256][8] ALIGNED(16) = {
{0, 0, 0, 0, 0, 0, 0, 0}, /* 0x00 (00000000) */
{1, 0, 0, 0, 0, 0, 0, 0}, /* 0x01 (00000001) */
{2, 0, 0, 0, 0, 0, 0, 0}, /* 0x02 (00000010) */
{1, 2, 0, 0, 0, 0, 0, 0}, /* 0x03 (00000011) */
{3, 0, 0, 0, 0, 0, 0, 0}, /* 0x04 (00000100) */
{1, 3, 0, 0, 0, 0, 0, 0}, /* 0x05 (00000101) */
{2, 3, 0, 0, 0, 0, 0, 0}, /* 0x06 (00000110) */
{1, 2, 3, 0, 0, 0, 0, 0}, /* 0x07 (00000111) */
{4, 0, 0, 0, 0, 0, 0, 0}, /* 0x08 (00001000) */
{1, 4, 0, 0, 0, 0, 0, 0}, /* 0x09 (00001001) */
{2, 4, 0, 0, 0, 0, 0, 0}, /* 0x0A (00001010) */
{1, 2, 4, 0, 0, 0, 0, 0}, /* 0x0B (00001011) */
{3, 4, 0, 0, 0, 0, 0, 0}, /* 0x0C (00001100) */
{1, 3, 4, 0, 0, 0, 0, 0}, /* 0x0D (00001101) */
{2, 3, 4, 0, 0, 0, 0, 0}, /* 0x0E (00001110) */
{1, 2, 3, 4, 0, 0, 0, 0}, /* 0x0F (00001111) */
{5, 0, 0, 0, 0, 0, 0, 0}, /* 0x10 (00010000) */
{1, 5, 0, 0, 0, 0, 0, 0}, /* 0x11 (00010001) */
{2, 5, 0, 0, 0, 0, 0, 0}, /* 0x12 (00010010) */
{1, 2, 5, 0, 0, 0, 0, 0}, /* 0x13 (00010011) */
{3, 5, 0, 0, 0, 0, 0, 0}, /* 0x14 (00010100) */
{1, 3, 5, 0, 0, 0, 0, 0}, /* 0x15 (00010101) */
{2, 3, 5, 0, 0, 0, 0, 0}, /* 0x16 (00010110) */
{1, 2, 3, 5, 0, 0, 0, 0}, /* 0x17 (00010111) */
{4, 5, 0, 0, 0, 0, 0, 0}, /* 0x18 (00011000) */
{1, 4, 5, 0, 0, 0, 0, 0}, /* 0x19 (00011001) */
{2, 4, 5, 0, 0, 0, 0, 0}, /* 0x1A (00011010) */
{1, 2, 4, 5, 0, 0, 0, 0}, /* 0x1B (00011011) */
{3, 4, 5, 0, 0, 0, 0, 0}, /* 0x1C (00011100) */
{1, 3, 4, 5, 0, 0, 0, 0}, /* 0x1D (00011101) */
{2, 3, 4, 5, 0, 0, 0, 0}, /* 0x1E (00011110) */
{1, 2, 3, 4, 5, 0, 0, 0}, /* 0x1F (00011111) */
{6, 0, 0, 0, 0, 0, 0, 0}, /* 0x20 (00100000) */
{1, 6, 0, 0, 0, 0, 0, 0}, /* 0x21 (00100001) */
{2, 6, 0, 0, 0, 0, 0, 0}, /* 0x22 (00100010) */
{1, 2, 6, 0, 0, 0, 0, 0}, /* 0x23 (00100011) */
{3, 6, 0, 0, 0, 0, 0, 0}, /* 0x24 (00100100) */
{1, 3, 6, 0, 0, 0, 0, 0}, /* 0x25 (00100101) */
{2, 3, 6, 0, 0, 0, 0, 0}, /* 0x26 (00100110) */
{1, 2, 3, 6, 0, 0, 0, 0}, /* 0x27 (00100111) */
{4, 6, 0, 0, 0, 0, 0, 0}, /* 0x28 (00101000) */
{1, 4, 6, 0, 0, 0, 0, 0}, /* 0x29 (00101001) */
{2, 4, 6, 0, 0, 0, 0, 0}, /* 0x2A (00101010) */
{1, 2, 4, 6, 0, 0, 0, 0}, /* 0x2B (00101011) */
{3, 4, 6, 0, 0, 0, 0, 0}, /* 0x2C (00101100) */
{1, 3, 4, 6, 0, 0, 0, 0}, /* 0x2D (00101101) */
{2, 3, 4, 6, 0, 0, 0, 0}, /* 0x2E (00101110) */
{1, 2, 3, 4, 6, 0, 0, 0}, /* 0x2F (00101111) */
{5, 6, 0, 0, 0, 0, 0, 0}, /* 0x30 (00110000) */
{1, 5, 6, 0, 0, 0, 0, 0}, /* 0x31 (00110001) */
{2, 5, 6, 0, 0, 0, 0, 0}, /* 0x32 (00110010) */
{1, 2, 5, 6, 0, 0, 0, 0}, /* 0x33 (00110011) */
{3, 5, 6, 0, 0, 0, 0, 0}, /* 0x34 (00110100) */
{1, 3, 5, 6, 0, 0, 0, 0}, /* 0x35 (00110101) */
{2, 3, 5, 6, 0, 0, 0, 0}, /* 0x36 (00110110) */
{1, 2, 3, 5, 6, 0, 0, 0}, /* 0x37 (00110111) */
{4, 5, 6, 0, 0, 0, 0, 0}, /* 0x38 (00111000) */
{1, 4, 5, 6, 0, 0, 0, 0}, /* 0x39 (00111001) */
{2, 4, 5, 6, 0, 0, 0, 0}, /* 0x3A (00111010) */
{1, 2, 4, 5, 6, 0, 0, 0}, /* 0x3B (00111011) */
{3, 4, 5, 6, 0, 0, 0, 0}, /* 0x3C (00111100) */
{1, 3, 4, 5, 6, 0, 0, 0}, /* 0x3D (00111101) */
{2, 3, 4, 5, 6, 0, 0, 0}, /* 0x3E (00111110) */
{1, 2, 3, 4, 5, 6, 0, 0}, /* 0x3F (00111111) */
{7, 0, 0, 0, 0, 0, 0, 0}, /* 0x40 (01000000) */
{1, 7, 0, 0, 0, 0, 0, 0}, /* 0x41 (01000001) */
{2, 7, 0, 0, 0, 0, 0, 0}, /* 0x42 (01000010) */
{1, 2, 7, 0, 0, 0, 0, 0}, /* 0x43 (01000011) */
{3, 7, 0, 0, 0, 0, 0, 0}, /* 0x44 (01000100) */
{1, 3, 7, 0, 0, 0, 0, 0}, /* 0x45 (01000101) */
{2, 3, 7, 0, 0, 0, 0, 0}, /* 0x46 (01000110) */
{1, 2, 3, 7, 0, 0, 0, 0}, /* 0x47 (01000111) */
{4, 7, 0, 0, 0, 0, 0, 0}, /* 0x48 (01001000) */
{1, 4, 7, 0, 0, 0, 0, 0}, /* 0x49 (01001001) */
{2, 4, 7, 0, 0, 0, 0, 0}, /* 0x4A (01001010) */
{1, 2, 4, 7, 0, 0, 0, 0}, /* 0x4B (01001011) */
{3, 4, 7, 0, 0, 0, 0, 0}, /* 0x4C (01001100) */
{1, 3, 4, 7, 0, 0, 0, 0}, /* 0x4D (01001101) */
{2, 3, 4, 7, 0, 0, 0, 0}, /* 0x4E (01001110) */
{1, 2, 3, 4, 7, 0, 0, 0}, /* 0x4F (01001111) */
{5, 7, 0, 0, 0, 0, 0, 0}, /* 0x50 (01010000) */
{1, 5, 7, 0, 0, 0, 0, 0}, /* 0x51 (01010001) */
{2, 5, 7, 0, 0, 0, 0, 0}, /* 0x52 (01010010) */
{1, 2, 5, 7, 0, 0, 0, 0}, /* 0x53 (01010011) */
{3, 5, 7, 0, 0, 0, 0, 0}, /* 0x54 (01010100) */
{1, 3, 5, 7, 0, 0, 0, 0}, /* 0x55 (01010101) */
{2, 3, 5, 7, 0, 0, 0, 0}, /* 0x56 (01010110) */
{1, 2, 3, 5, 7, 0, 0, 0}, /* 0x57 (01010111) */
{4, 5, 7, 0, 0, 0, 0, 0}, /* 0x58 (01011000) */
{1, 4, 5, 7, 0, 0, 0, 0}, /* 0x59 (01011001) */
{2, 4, 5, 7, 0, 0, 0, 0}, /* 0x5A (01011010) */
{1, 2, 4, 5, 7, 0, 0, 0}, /* 0x5B (01011011) */
{3, 4, 5, 7, 0, 0, 0, 0}, /* 0x5C (01011100) */
{1, 3, 4, 5, 7, 0, 0, 0}, /* 0x5D (01011101) */
{2, 3, 4, 5, 7, 0, 0, 0}, /* 0x5E (01011110) */
{1, 2, 3, 4, 5, 7, 0, 0}, /* 0x5F (01011111) */
{6, 7, 0, 0, 0, 0, 0, 0}, /* 0x60 (01100000) */
{1, 6, 7, 0, 0, 0, 0, 0}, /* 0x61 (01100001) */
{2, 6, 7, 0, 0, 0, 0, 0}, /* 0x62 (01100010) */
{1, 2, 6, 7, 0, 0, 0, 0}, /* 0x63 (01100011) */
{3, 6, 7, 0, 0, 0, 0, 0}, /* 0x64 (01100100) */
{1, 3, 6, 7, 0, 0, 0, 0}, /* 0x65 (01100101) */
{2, 3, 6, 7, 0, 0, 0, 0}, /* 0x66 (01100110) */
{1, 2, 3, 6, 7, 0, 0, 0}, /* 0x67 (01100111) */
{4, 6, 7, 0, 0, 0, 0, 0}, /* 0x68 (01101000) */
{1, 4, 6, 7, 0, 0, 0, 0}, /* 0x69 (01101001) */
{2, 4, 6, 7, 0, 0, 0, 0}, /* 0x6A (01101010) */
{1, 2, 4, 6, 7, 0, 0, 0}, /* 0x6B (01101011) */
{3, 4, 6, 7, 0, 0, 0, 0}, /* 0x6C (01101100) */
{1, 3, 4, 6, 7, 0, 0, 0}, /* 0x6D (01101101) */
{2, 3, 4, 6, 7, 0, 0, 0}, /* 0x6E (01101110) */
{1, 2, 3, 4, 6, 7, 0, 0}, /* 0x6F (01101111) */
{5, 6, 7, 0, 0, 0, 0, 0}, /* 0x70 (01110000) */
{1, 5, 6, 7, 0, 0, 0, 0}, /* 0x71 (01110001) */
{2, 5, 6, 7, 0, 0, 0, 0}, /* 0x72 (01110010) */
{1, 2, 5, 6, 7, 0, 0, 0}, /* 0x73 (01110011) */
{3, 5, 6, 7, 0, 0, 0, 0}, /* 0x74 (01110100) */
{1, 3, 5, 6, 7, 0, 0, 0}, /* 0x75 (01110101) */
{2, 3, 5, 6, 7, 0, 0, 0}, /* 0x76 (01110110) */
{1, 2, 3, 5, 6, 7, 0, 0}, /* 0x77 (01110111) */
{4, 5, 6, 7, 0, 0, 0, 0}, /* 0x78 (01111000) */
{1, 4, 5, 6, 7, 0, 0, 0}, /* 0x79 (01111001) */
{2, 4, 5, 6, 7, 0, 0, 0}, /* 0x7A (01111010) */
{1, 2, 4, 5, 6, 7, 0, 0}, /* 0x7B (01111011) */
{3, 4, 5, 6, 7, 0, 0, 0}, /* 0x7C (01111100) */
{1, 3, 4, 5, 6, 7, 0, 0}, /* 0x7D (01111101) */
{2, 3, 4, 5, 6, 7, 0, 0}, /* 0x7E (01111110) */
{1, 2, 3, 4, 5, 6, 7, 0}, /* 0x7F (01111111) */
{8, 0, 0, 0, 0, 0, 0, 0}, /* 0x80 (10000000) */
{1, 8, 0, 0, 0, 0, 0, 0}, /* 0x81 (10000001) */
{2, 8, 0, 0, 0, 0, 0, 0}, /* 0x82 (10000010) */
{1, 2, 8, 0, 0, 0, 0, 0}, /* 0x83 (10000011) */
{3, 8, 0, 0, 0, 0, 0, 0}, /* 0x84 (10000100) */
{1, 3, 8, 0, 0, 0, 0, 0}, /* 0x85 (10000101) */
{2, 3, 8, 0, 0, 0, 0, 0}, /* 0x86 (10000110) */
{1, 2, 3, 8, 0, 0, 0, 0}, /* 0x87 (10000111) */
{4, 8, 0, 0, 0, 0, 0, 0}, /* 0x88 (10001000) */
{1, 4, 8, 0, 0, 0, 0, 0}, /* 0x89 (10001001) */
{2, 4, 8, 0, 0, 0, 0, 0}, /* 0x8A (10001010) */
{1, 2, 4, 8, 0, 0, 0, 0}, /* 0x8B (10001011) */
{3, 4, 8, 0, 0, 0, 0, 0}, /* 0x8C (10001100) */
{1, 3, 4, 8, 0, 0, 0, 0}, /* 0x8D (10001101) */
{2, 3, 4, 8, 0, 0, 0, 0}, /* 0x8E (10001110) */
{1, 2, 3, 4, 8, 0, 0, 0}, /* 0x8F (10001111) */
{5, 8, 0, 0, 0, 0, 0, 0}, /* 0x90 (10010000) */
{1, 5, 8, 0, 0, 0, 0, 0}, /* 0x91 (10010001) */
{2, 5, 8, 0, 0, 0, 0, 0}, /* 0x92 (10010010) */
{1, 2, 5, 8, 0, 0, 0, 0}, /* 0x93 (10010011) */
{3, 5, 8, 0, 0, 0, 0, 0}, /* 0x94 (10010100) */
{1, 3, 5, 8, 0, 0, 0, 0}, /* 0x95 (10010101) */
{2, 3, 5, 8, 0, 0, 0, 0}, /* 0x96 (10010110) */
{1, 2, 3, 5, 8, 0, 0, 0}, /* 0x97 (10010111) */
{4, 5, 8, 0, 0, 0, 0, 0}, /* 0x98 (10011000) */
{1, 4, 5, 8, 0, 0, 0, 0}, /* 0x99 (10011001) */
{2, 4, 5, 8, 0, 0, 0, 0}, /* 0x9A (10011010) */
{1, 2, 4, 5, 8, 0, 0, 0}, /* 0x9B (10011011) */
{3, 4, 5, 8, 0, 0, 0, 0}, /* 0x9C (10011100) */
{1, 3, 4, 5, 8, 0, 0, 0}, /* 0x9D (10011101) */
{2, 3, 4, 5, 8, 0, 0, 0}, /* 0x9E (10011110) */
{1, 2, 3, 4, 5, 8, 0, 0}, /* 0x9F (10011111) */
{6, 8, 0, 0, 0, 0, 0, 0}, /* 0xA0 (10100000) */
{1, 6, 8, 0, 0, 0, 0, 0}, /* 0xA1 (10100001) */
{2, 6, 8, 0, 0, 0, 0, 0}, /* 0xA2 (10100010) */
{1, 2, 6, 8, 0, 0, 0, 0}, /* 0xA3 (10100011) */
{3, 6, 8, 0, 0, 0, 0, 0}, /* 0xA4 (10100100) */
{1, 3, 6, 8, 0, 0, 0, 0}, /* 0xA5 (10100101) */
{2, 3, 6, 8, 0, 0, 0, 0}, /* 0xA6 (10100110) */
{1, 2, 3, 6, 8, 0, 0, 0}, /* 0xA7 (10100111) */
{4, 6, 8, 0, 0, 0, 0, 0}, /* 0xA8 (10101000) */
{1, 4, 6, 8, 0, 0, 0, 0}, /* 0xA9 (10101001) */
{2, 4, 6, 8, 0, 0, 0, 0}, /* 0xAA (10101010) */
{1, 2, 4, 6, 8, 0, 0, 0}, /* 0xAB (10101011) */
{3, 4, 6, 8, 0, 0, 0, 0}, /* 0xAC (10101100) */
{1, 3, 4, 6, 8, 0, 0, 0}, /* 0xAD (10101101) */
{2, 3, 4, 6, 8, 0, 0, 0}, /* 0xAE (10101110) */
{1, 2, 3, 4, 6, 8, 0, 0}, /* 0xAF (10101111) */
{5, 6, 8, 0, 0, 0, 0, 0}, /* 0xB0 (10110000) */
{1, 5, 6, 8, 0, 0, 0, 0}, /* 0xB1 (10110001) */
{2, 5, 6, 8, 0, 0, 0, 0}, /* 0xB2 (10110010) */
{1, 2, 5, 6, 8, 0, 0, 0}, /* 0xB3 (10110011) */
{3, 5, 6, 8, 0, 0, 0, 0}, /* 0xB4 (10110100) */
{1, 3, 5, 6, 8, 0, 0, 0}, /* 0xB5 (10110101) */
{2, 3, 5, 6, 8, 0, 0, 0}, /* 0xB6 (10110110) */
{1, 2, 3, 5, 6, 8, 0, 0}, /* 0xB7 (10110111) */
{4, 5, 6, 8, 0, 0, 0, 0}, /* 0xB8 (10111000) */
{1, 4, 5, 6, 8, 0, 0, 0}, /* 0xB9 (10111001) */
{2, 4, 5, 6, 8, 0, 0, 0}, /* 0xBA (10111010) */
{1, 2, 4, 5, 6, 8, 0, 0}, /* 0xBB (10111011) */
{3, 4, 5, 6, 8, 0, 0, 0}, /* 0xBC (10111100) */
{1, 3, 4, 5, 6, 8, 0, 0}, /* 0xBD (10111101) */
{2, 3, 4, 5, 6, 8, 0, 0}, /* 0xBE (10111110) */
{1, 2, 3, 4, 5, 6, 8, 0}, /* 0xBF (10111111) */
{7, 8, 0, 0, 0, 0, 0, 0}, /* 0xC0 (11000000) */
{1, 7, 8, 0, 0, 0, 0, 0}, /* 0xC1 (11000001) */
{2, 7, 8, 0, 0, 0, 0, 0}, /* 0xC2 (11000010) */
{1, 2, 7, 8, 0, 0, 0, 0}, /* 0xC3 (11000011) */
{3, 7, 8, 0, 0, 0, 0, 0}, /* 0xC4 (11000100) */
{1, 3, 7, 8, 0, 0, 0, 0}, /* 0xC5 (11000101) */
{2, 3, 7, 8, 0, 0, 0, 0}, /* 0xC6 (11000110) */
{1, 2, 3, 7, 8, 0, 0, 0}, /* 0xC7 (11000111) */
{4, 7, 8, 0, 0, 0, 0, 0}, /* 0xC8 (11001000) */
{1, 4, 7, 8, 0, 0, 0, 0}, /* 0xC9 (11001001) */
{2, 4, 7, 8, 0, 0, 0, 0}, /* 0xCA (11001010) */
{1, 2, 4, 7, 8, 0, 0, 0}, /* 0xCB (11001011) */
{3, 4, 7, 8, 0, 0, 0, 0}, /* 0xCC (11001100) */
{1, 3, 4, 7, 8, 0, 0, 0}, /* 0xCD (11001101) */
{2, 3, 4, 7, 8, 0, 0, 0}, /* 0xCE (11001110) */
{1, 2, 3, 4, 7, 8, 0, 0}, /* 0xCF (11001111) */
{5, 7, 8, 0, 0, 0, 0, 0}, /* 0xD0 (11010000) */
{1, 5, 7, 8, 0, 0, 0, 0}, /* 0xD1 (11010001) */
{2, 5, 7, 8, 0, 0, 0, 0}, /* 0xD2 (11010010) */
{1, 2, 5, 7, 8, 0, 0, 0}, /* 0xD3 (11010011) */
{3, 5, 7, 8, 0, 0, 0, 0}, /* 0xD4 (11010100) */
{1, 3, 5, 7, 8, 0, 0, 0}, /* 0xD5 (11010101) */
{2, 3, 5, 7, 8, 0, 0, 0}, /* 0xD6 (11010110) */
{1, 2, 3, 5, 7, 8, 0, 0}, /* 0xD7 (11010111) */
{4, 5, 7, 8, 0, 0, 0, 0}, /* 0xD8 (11011000) */
{1, 4, 5, 7, 8, 0, 0, 0}, /* 0xD9 (11011001) */
{2, 4, 5, 7, 8, 0, 0, 0}, /* 0xDA (11011010) */
{1, 2, 4, 5, 7, 8, 0, 0}, /* 0xDB (11011011) */
{3, 4, 5, 7, 8, 0, 0, 0}, /* 0xDC (11011100) */
{1, 3, 4, 5, 7, 8, 0, 0}, /* 0xDD (11011101) */
{2, 3, 4, 5, 7, 8, 0, 0}, /* 0xDE (11011110) */
{1, 2, 3, 4, 5, 7, 8, 0}, /* 0xDF (11011111) */
{6, 7, 8, 0, 0, 0, 0, 0}, /* 0xE0 (11100000) */
{1, 6, 7, 8, 0, 0, 0, 0}, /* 0xE1 (11100001) */
{2, 6, 7, 8, 0, 0, 0, 0}, /* 0xE2 (11100010) */
{1, 2, 6, 7, 8, 0, 0, 0}, /* 0xE3 (11100011) */
{3, 6, 7, 8, 0, 0, 0, 0}, /* 0xE4 (11100100) */
{1, 3, 6, 7, 8, 0, 0, 0}, /* 0xE5 (11100101) */
{2, 3, 6, 7, 8, 0, 0, 0}, /* 0xE6 (11100110) */
{1, 2, 3, 6, 7, 8, 0, 0}, /* 0xE7 (11100111) */
{4, 6, 7, 8, 0, 0, 0, 0}, /* 0xE8 (11101000) */
{1, 4, 6, 7, 8, 0, 0, 0}, /* 0xE9 (11101001) */
{2, 4, 6, 7, 8, 0, 0, 0}, /* 0xEA (11101010) */
{1, 2, 4, 6, 7, 8, 0, 0}, /* 0xEB (11101011) */
{3, 4, 6, 7, 8, 0, 0, 0}, /* 0xEC (11101100) */
{1, 3, 4, 6, 7, 8, 0, 0}, /* 0xED (11101101) */
{2, 3, 4, 6, 7, 8, 0, 0}, /* 0xEE (11101110) */
{1, 2, 3, 4, 6, 7, 8, 0}, /* 0xEF (11101111) */
{5, 6, 7, 8, 0, 0, 0, 0}, /* 0xF0 (11110000) */
{1, 5, 6, 7, 8, 0, 0, 0}, /* 0xF1 (11110001) */
{2, 5, 6, 7, 8, 0, 0, 0}, /* 0xF2 (11110010) */
{1, 2, 5, 6, 7, 8, 0, 0}, /* 0xF3 (11110011) */
{3, 5, 6, 7, 8, 0, 0, 0}, /* 0xF4 (11110100) */
{1, 3, 5, 6, 7, 8, 0, 0}, /* 0xF5 (11110101) */
{2, 3, 5, 6, 7, 8, 0, 0}, /* 0xF6 (11110110) */
{1, 2, 3, 5, 6, 7, 8, 0}, /* 0xF7 (11110111) */
{4, 5, 6, 7, 8, 0, 0, 0}, /* 0xF8 (11111000) */
{1, 4, 5, 6, 7, 8, 0, 0}, /* 0xF9 (11111001) */
{2, 4, 5, 6, 7, 8, 0, 0}, /* 0xFA (11111010) */
{1, 2, 4, 5, 6, 7, 8, 0}, /* 0xFB (11111011) */
{3, 4, 5, 6, 7, 8, 0, 0}, /* 0xFC (11111100) */
{1, 3, 4, 5, 6, 7, 8, 0}, /* 0xFD (11111101) */
{2, 3, 4, 5, 6, 7, 8, 0}, /* 0xFE (11111110) */
{1, 2, 3, 4, 5, 6, 7, 8} /* 0xFF (11111111) */
};
static size_t bitmap_decode_avx2(uint8_t *bitmapInPtr, size_t bitsin, uint32_t *out) {
uint32_t *initout = out;
__m256i baseVec = _mm256_set1_epi32(-1);
__m256i incVec = _mm256_set1_epi32(64);
__m256i add8 = _mm256_set1_epi32(8);
int sizeinwords = bitsin / 64;
uint64_t *array = (uint64_t *)bitmapInPtr;
for (int i = 0; i < sizeinwords; ++i) {
uint64_t w = array[i];
if (w == 0) {
baseVec = _mm256_add_epi32(baseVec, incVec);
} else {
for (int k = 0; k < 4; ++k) {
uint8_t byteA = (uint8_t)w;
uint8_t byteB = (uint8_t)(w >> 8);
w >>= 16;
__m256i vecA =
_mm256_load_si256((const __m256i *)vecDecodeTable[byteA]);
__m256i vecB =
_mm256_load_si256((const __m256i *)vecDecodeTable[byteB]);
uint8_t advanceA = lengthTable[byteA];
uint8_t advanceB = lengthTable[byteB];
vecA = _mm256_add_epi32(baseVec, vecA);
baseVec = _mm256_add_epi32(baseVec, add8);
vecB = _mm256_add_epi32(baseVec, vecB);
baseVec = _mm256_add_epi32(baseVec, add8);
_mm256_storeu_si256((__m256i *)out, vecA);
out += advanceA;
_mm256_storeu_si256((__m256i *)out, vecB);
out += advanceB;
}
}
}
if ((bitsin % 64) != 0) {
// finish off the work the slow way.
uint64_t bitset = 0;
memcpy(&bitset, bitmapInPtr, sizeof(bitset));
bitset = bitset & ((UINT64_C(1) << (bitsin % 64)) - 1);
while (bitset != 0) {
uint64_t t = bitset & -bitset;
int r = __builtin_ctzll(bitset);
*out = sizeinwords * 64 + r;
out++;
bitset ^= t;
}
}
return out - initout;
}
#endif

335
vecdecode.h Normal file
View File

@ -0,0 +1,335 @@
#ifndef VECDECODE_H
#define VECDECODE_H
#if defined(_MSC_VER)
#define ALIGNED(x) __declspec(align(x))
#else
#if defined(__GNUC__)
#define ALIGNED(x) __attribute__((aligned(x)))
#endif
#endif
static uint8_t lengthTable[256] = {
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4,
2, 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4,
2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6,
4, 5, 5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5,
3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6,
4, 5, 5, 6, 5, 6, 6, 7, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8};
static uint32_t vecDecodeTable[256][8] ALIGNED(16) = {
{0, 0, 0, 0, 0, 0, 0, 0}, /* 0x00 (00000000) */
{1, 0, 0, 0, 0, 0, 0, 0}, /* 0x01 (00000001) */
{2, 0, 0, 0, 0, 0, 0, 0}, /* 0x02 (00000010) */
{1, 2, 0, 0, 0, 0, 0, 0}, /* 0x03 (00000011) */
{3, 0, 0, 0, 0, 0, 0, 0}, /* 0x04 (00000100) */
{1, 3, 0, 0, 0, 0, 0, 0}, /* 0x05 (00000101) */
{2, 3, 0, 0, 0, 0, 0, 0}, /* 0x06 (00000110) */
{1, 2, 3, 0, 0, 0, 0, 0}, /* 0x07 (00000111) */
{4, 0, 0, 0, 0, 0, 0, 0}, /* 0x08 (00001000) */
{1, 4, 0, 0, 0, 0, 0, 0}, /* 0x09 (00001001) */
{2, 4, 0, 0, 0, 0, 0, 0}, /* 0x0A (00001010) */
{1, 2, 4, 0, 0, 0, 0, 0}, /* 0x0B (00001011) */
{3, 4, 0, 0, 0, 0, 0, 0}, /* 0x0C (00001100) */
{1, 3, 4, 0, 0, 0, 0, 0}, /* 0x0D (00001101) */
{2, 3, 4, 0, 0, 0, 0, 0}, /* 0x0E (00001110) */
{1, 2, 3, 4, 0, 0, 0, 0}, /* 0x0F (00001111) */
{5, 0, 0, 0, 0, 0, 0, 0}, /* 0x10 (00010000) */
{1, 5, 0, 0, 0, 0, 0, 0}, /* 0x11 (00010001) */
{2, 5, 0, 0, 0, 0, 0, 0}, /* 0x12 (00010010) */
{1, 2, 5, 0, 0, 0, 0, 0}, /* 0x13 (00010011) */
{3, 5, 0, 0, 0, 0, 0, 0}, /* 0x14 (00010100) */
{1, 3, 5, 0, 0, 0, 0, 0}, /* 0x15 (00010101) */
{2, 3, 5, 0, 0, 0, 0, 0}, /* 0x16 (00010110) */
{1, 2, 3, 5, 0, 0, 0, 0}, /* 0x17 (00010111) */
{4, 5, 0, 0, 0, 0, 0, 0}, /* 0x18 (00011000) */
{1, 4, 5, 0, 0, 0, 0, 0}, /* 0x19 (00011001) */
{2, 4, 5, 0, 0, 0, 0, 0}, /* 0x1A (00011010) */
{1, 2, 4, 5, 0, 0, 0, 0}, /* 0x1B (00011011) */
{3, 4, 5, 0, 0, 0, 0, 0}, /* 0x1C (00011100) */
{1, 3, 4, 5, 0, 0, 0, 0}, /* 0x1D (00011101) */
{2, 3, 4, 5, 0, 0, 0, 0}, /* 0x1E (00011110) */
{1, 2, 3, 4, 5, 0, 0, 0}, /* 0x1F (00011111) */
{6, 0, 0, 0, 0, 0, 0, 0}, /* 0x20 (00100000) */
{1, 6, 0, 0, 0, 0, 0, 0}, /* 0x21 (00100001) */
{2, 6, 0, 0, 0, 0, 0, 0}, /* 0x22 (00100010) */
{1, 2, 6, 0, 0, 0, 0, 0}, /* 0x23 (00100011) */
{3, 6, 0, 0, 0, 0, 0, 0}, /* 0x24 (00100100) */
{1, 3, 6, 0, 0, 0, 0, 0}, /* 0x25 (00100101) */
{2, 3, 6, 0, 0, 0, 0, 0}, /* 0x26 (00100110) */
{1, 2, 3, 6, 0, 0, 0, 0}, /* 0x27 (00100111) */
{4, 6, 0, 0, 0, 0, 0, 0}, /* 0x28 (00101000) */
{1, 4, 6, 0, 0, 0, 0, 0}, /* 0x29 (00101001) */
{2, 4, 6, 0, 0, 0, 0, 0}, /* 0x2A (00101010) */
{1, 2, 4, 6, 0, 0, 0, 0}, /* 0x2B (00101011) */
{3, 4, 6, 0, 0, 0, 0, 0}, /* 0x2C (00101100) */
{1, 3, 4, 6, 0, 0, 0, 0}, /* 0x2D (00101101) */
{2, 3, 4, 6, 0, 0, 0, 0}, /* 0x2E (00101110) */
{1, 2, 3, 4, 6, 0, 0, 0}, /* 0x2F (00101111) */
{5, 6, 0, 0, 0, 0, 0, 0}, /* 0x30 (00110000) */
{1, 5, 6, 0, 0, 0, 0, 0}, /* 0x31 (00110001) */
{2, 5, 6, 0, 0, 0, 0, 0}, /* 0x32 (00110010) */
{1, 2, 5, 6, 0, 0, 0, 0}, /* 0x33 (00110011) */
{3, 5, 6, 0, 0, 0, 0, 0}, /* 0x34 (00110100) */
{1, 3, 5, 6, 0, 0, 0, 0}, /* 0x35 (00110101) */
{2, 3, 5, 6, 0, 0, 0, 0}, /* 0x36 (00110110) */
{1, 2, 3, 5, 6, 0, 0, 0}, /* 0x37 (00110111) */
{4, 5, 6, 0, 0, 0, 0, 0}, /* 0x38 (00111000) */
{1, 4, 5, 6, 0, 0, 0, 0}, /* 0x39 (00111001) */
{2, 4, 5, 6, 0, 0, 0, 0}, /* 0x3A (00111010) */
{1, 2, 4, 5, 6, 0, 0, 0}, /* 0x3B (00111011) */
{3, 4, 5, 6, 0, 0, 0, 0}, /* 0x3C (00111100) */
{1, 3, 4, 5, 6, 0, 0, 0}, /* 0x3D (00111101) */
{2, 3, 4, 5, 6, 0, 0, 0}, /* 0x3E (00111110) */
{1, 2, 3, 4, 5, 6, 0, 0}, /* 0x3F (00111111) */
{7, 0, 0, 0, 0, 0, 0, 0}, /* 0x40 (01000000) */
{1, 7, 0, 0, 0, 0, 0, 0}, /* 0x41 (01000001) */
{2, 7, 0, 0, 0, 0, 0, 0}, /* 0x42 (01000010) */
{1, 2, 7, 0, 0, 0, 0, 0}, /* 0x43 (01000011) */
{3, 7, 0, 0, 0, 0, 0, 0}, /* 0x44 (01000100) */
{1, 3, 7, 0, 0, 0, 0, 0}, /* 0x45 (01000101) */
{2, 3, 7, 0, 0, 0, 0, 0}, /* 0x46 (01000110) */
{1, 2, 3, 7, 0, 0, 0, 0}, /* 0x47 (01000111) */
{4, 7, 0, 0, 0, 0, 0, 0}, /* 0x48 (01001000) */
{1, 4, 7, 0, 0, 0, 0, 0}, /* 0x49 (01001001) */
{2, 4, 7, 0, 0, 0, 0, 0}, /* 0x4A (01001010) */
{1, 2, 4, 7, 0, 0, 0, 0}, /* 0x4B (01001011) */
{3, 4, 7, 0, 0, 0, 0, 0}, /* 0x4C (01001100) */
{1, 3, 4, 7, 0, 0, 0, 0}, /* 0x4D (01001101) */
{2, 3, 4, 7, 0, 0, 0, 0}, /* 0x4E (01001110) */
{1, 2, 3, 4, 7, 0, 0, 0}, /* 0x4F (01001111) */
{5, 7, 0, 0, 0, 0, 0, 0}, /* 0x50 (01010000) */
{1, 5, 7, 0, 0, 0, 0, 0}, /* 0x51 (01010001) */
{2, 5, 7, 0, 0, 0, 0, 0}, /* 0x52 (01010010) */
{1, 2, 5, 7, 0, 0, 0, 0}, /* 0x53 (01010011) */
{3, 5, 7, 0, 0, 0, 0, 0}, /* 0x54 (01010100) */
{1, 3, 5, 7, 0, 0, 0, 0}, /* 0x55 (01010101) */
{2, 3, 5, 7, 0, 0, 0, 0}, /* 0x56 (01010110) */
{1, 2, 3, 5, 7, 0, 0, 0}, /* 0x57 (01010111) */
{4, 5, 7, 0, 0, 0, 0, 0}, /* 0x58 (01011000) */
{1, 4, 5, 7, 0, 0, 0, 0}, /* 0x59 (01011001) */
{2, 4, 5, 7, 0, 0, 0, 0}, /* 0x5A (01011010) */
{1, 2, 4, 5, 7, 0, 0, 0}, /* 0x5B (01011011) */
{3, 4, 5, 7, 0, 0, 0, 0}, /* 0x5C (01011100) */
{1, 3, 4, 5, 7, 0, 0, 0}, /* 0x5D (01011101) */
{2, 3, 4, 5, 7, 0, 0, 0}, /* 0x5E (01011110) */
{1, 2, 3, 4, 5, 7, 0, 0}, /* 0x5F (01011111) */
{6, 7, 0, 0, 0, 0, 0, 0}, /* 0x60 (01100000) */
{1, 6, 7, 0, 0, 0, 0, 0}, /* 0x61 (01100001) */
{2, 6, 7, 0, 0, 0, 0, 0}, /* 0x62 (01100010) */
{1, 2, 6, 7, 0, 0, 0, 0}, /* 0x63 (01100011) */
{3, 6, 7, 0, 0, 0, 0, 0}, /* 0x64 (01100100) */
{1, 3, 6, 7, 0, 0, 0, 0}, /* 0x65 (01100101) */
{2, 3, 6, 7, 0, 0, 0, 0}, /* 0x66 (01100110) */
{1, 2, 3, 6, 7, 0, 0, 0}, /* 0x67 (01100111) */
{4, 6, 7, 0, 0, 0, 0, 0}, /* 0x68 (01101000) */
{1, 4, 6, 7, 0, 0, 0, 0}, /* 0x69 (01101001) */
{2, 4, 6, 7, 0, 0, 0, 0}, /* 0x6A (01101010) */
{1, 2, 4, 6, 7, 0, 0, 0}, /* 0x6B (01101011) */
{3, 4, 6, 7, 0, 0, 0, 0}, /* 0x6C (01101100) */
{1, 3, 4, 6, 7, 0, 0, 0}, /* 0x6D (01101101) */
{2, 3, 4, 6, 7, 0, 0, 0}, /* 0x6E (01101110) */
{1, 2, 3, 4, 6, 7, 0, 0}, /* 0x6F (01101111) */
{5, 6, 7, 0, 0, 0, 0, 0}, /* 0x70 (01110000) */
{1, 5, 6, 7, 0, 0, 0, 0}, /* 0x71 (01110001) */
{2, 5, 6, 7, 0, 0, 0, 0}, /* 0x72 (01110010) */
{1, 2, 5, 6, 7, 0, 0, 0}, /* 0x73 (01110011) */
{3, 5, 6, 7, 0, 0, 0, 0}, /* 0x74 (01110100) */
{1, 3, 5, 6, 7, 0, 0, 0}, /* 0x75 (01110101) */
{2, 3, 5, 6, 7, 0, 0, 0}, /* 0x76 (01110110) */
{1, 2, 3, 5, 6, 7, 0, 0}, /* 0x77 (01110111) */
{4, 5, 6, 7, 0, 0, 0, 0}, /* 0x78 (01111000) */
{1, 4, 5, 6, 7, 0, 0, 0}, /* 0x79 (01111001) */
{2, 4, 5, 6, 7, 0, 0, 0}, /* 0x7A (01111010) */
{1, 2, 4, 5, 6, 7, 0, 0}, /* 0x7B (01111011) */
{3, 4, 5, 6, 7, 0, 0, 0}, /* 0x7C (01111100) */
{1, 3, 4, 5, 6, 7, 0, 0}, /* 0x7D (01111101) */
{2, 3, 4, 5, 6, 7, 0, 0}, /* 0x7E (01111110) */
{1, 2, 3, 4, 5, 6, 7, 0}, /* 0x7F (01111111) */
{8, 0, 0, 0, 0, 0, 0, 0}, /* 0x80 (10000000) */
{1, 8, 0, 0, 0, 0, 0, 0}, /* 0x81 (10000001) */
{2, 8, 0, 0, 0, 0, 0, 0}, /* 0x82 (10000010) */
{1, 2, 8, 0, 0, 0, 0, 0}, /* 0x83 (10000011) */
{3, 8, 0, 0, 0, 0, 0, 0}, /* 0x84 (10000100) */
{1, 3, 8, 0, 0, 0, 0, 0}, /* 0x85 (10000101) */
{2, 3, 8, 0, 0, 0, 0, 0}, /* 0x86 (10000110) */
{1, 2, 3, 8, 0, 0, 0, 0}, /* 0x87 (10000111) */
{4, 8, 0, 0, 0, 0, 0, 0}, /* 0x88 (10001000) */
{1, 4, 8, 0, 0, 0, 0, 0}, /* 0x89 (10001001) */
{2, 4, 8, 0, 0, 0, 0, 0}, /* 0x8A (10001010) */
{1, 2, 4, 8, 0, 0, 0, 0}, /* 0x8B (10001011) */
{3, 4, 8, 0, 0, 0, 0, 0}, /* 0x8C (10001100) */
{1, 3, 4, 8, 0, 0, 0, 0}, /* 0x8D (10001101) */
{2, 3, 4, 8, 0, 0, 0, 0}, /* 0x8E (10001110) */
{1, 2, 3, 4, 8, 0, 0, 0}, /* 0x8F (10001111) */
{5, 8, 0, 0, 0, 0, 0, 0}, /* 0x90 (10010000) */
{1, 5, 8, 0, 0, 0, 0, 0}, /* 0x91 (10010001) */
{2, 5, 8, 0, 0, 0, 0, 0}, /* 0x92 (10010010) */
{1, 2, 5, 8, 0, 0, 0, 0}, /* 0x93 (10010011) */
{3, 5, 8, 0, 0, 0, 0, 0}, /* 0x94 (10010100) */
{1, 3, 5, 8, 0, 0, 0, 0}, /* 0x95 (10010101) */
{2, 3, 5, 8, 0, 0, 0, 0}, /* 0x96 (10010110) */
{1, 2, 3, 5, 8, 0, 0, 0}, /* 0x97 (10010111) */
{4, 5, 8, 0, 0, 0, 0, 0}, /* 0x98 (10011000) */
{1, 4, 5, 8, 0, 0, 0, 0}, /* 0x99 (10011001) */
{2, 4, 5, 8, 0, 0, 0, 0}, /* 0x9A (10011010) */
{1, 2, 4, 5, 8, 0, 0, 0}, /* 0x9B (10011011) */
{3, 4, 5, 8, 0, 0, 0, 0}, /* 0x9C (10011100) */
{1, 3, 4, 5, 8, 0, 0, 0}, /* 0x9D (10011101) */
{2, 3, 4, 5, 8, 0, 0, 0}, /* 0x9E (10011110) */
{1, 2, 3, 4, 5, 8, 0, 0}, /* 0x9F (10011111) */
{6, 8, 0, 0, 0, 0, 0, 0}, /* 0xA0 (10100000) */
{1, 6, 8, 0, 0, 0, 0, 0}, /* 0xA1 (10100001) */
{2, 6, 8, 0, 0, 0, 0, 0}, /* 0xA2 (10100010) */
{1, 2, 6, 8, 0, 0, 0, 0}, /* 0xA3 (10100011) */
{3, 6, 8, 0, 0, 0, 0, 0}, /* 0xA4 (10100100) */
{1, 3, 6, 8, 0, 0, 0, 0}, /* 0xA5 (10100101) */
{2, 3, 6, 8, 0, 0, 0, 0}, /* 0xA6 (10100110) */
{1, 2, 3, 6, 8, 0, 0, 0}, /* 0xA7 (10100111) */
{4, 6, 8, 0, 0, 0, 0, 0}, /* 0xA8 (10101000) */
{1, 4, 6, 8, 0, 0, 0, 0}, /* 0xA9 (10101001) */
{2, 4, 6, 8, 0, 0, 0, 0}, /* 0xAA (10101010) */
{1, 2, 4, 6, 8, 0, 0, 0}, /* 0xAB (10101011) */
{3, 4, 6, 8, 0, 0, 0, 0}, /* 0xAC (10101100) */
{1, 3, 4, 6, 8, 0, 0, 0}, /* 0xAD (10101101) */
{2, 3, 4, 6, 8, 0, 0, 0}, /* 0xAE (10101110) */
{1, 2, 3, 4, 6, 8, 0, 0}, /* 0xAF (10101111) */
{5, 6, 8, 0, 0, 0, 0, 0}, /* 0xB0 (10110000) */
{1, 5, 6, 8, 0, 0, 0, 0}, /* 0xB1 (10110001) */
{2, 5, 6, 8, 0, 0, 0, 0}, /* 0xB2 (10110010) */
{1, 2, 5, 6, 8, 0, 0, 0}, /* 0xB3 (10110011) */
{3, 5, 6, 8, 0, 0, 0, 0}, /* 0xB4 (10110100) */
{1, 3, 5, 6, 8, 0, 0, 0}, /* 0xB5 (10110101) */
{2, 3, 5, 6, 8, 0, 0, 0}, /* 0xB6 (10110110) */
{1, 2, 3, 5, 6, 8, 0, 0}, /* 0xB7 (10110111) */
{4, 5, 6, 8, 0, 0, 0, 0}, /* 0xB8 (10111000) */
{1, 4, 5, 6, 8, 0, 0, 0}, /* 0xB9 (10111001) */
{2, 4, 5, 6, 8, 0, 0, 0}, /* 0xBA (10111010) */
{1, 2, 4, 5, 6, 8, 0, 0}, /* 0xBB (10111011) */
{3, 4, 5, 6, 8, 0, 0, 0}, /* 0xBC (10111100) */
{1, 3, 4, 5, 6, 8, 0, 0}, /* 0xBD (10111101) */
{2, 3, 4, 5, 6, 8, 0, 0}, /* 0xBE (10111110) */
{1, 2, 3, 4, 5, 6, 8, 0}, /* 0xBF (10111111) */
{7, 8, 0, 0, 0, 0, 0, 0}, /* 0xC0 (11000000) */
{1, 7, 8, 0, 0, 0, 0, 0}, /* 0xC1 (11000001) */
{2, 7, 8, 0, 0, 0, 0, 0}, /* 0xC2 (11000010) */
{1, 2, 7, 8, 0, 0, 0, 0}, /* 0xC3 (11000011) */
{3, 7, 8, 0, 0, 0, 0, 0}, /* 0xC4 (11000100) */
{1, 3, 7, 8, 0, 0, 0, 0}, /* 0xC5 (11000101) */
{2, 3, 7, 8, 0, 0, 0, 0}, /* 0xC6 (11000110) */
{1, 2, 3, 7, 8, 0, 0, 0}, /* 0xC7 (11000111) */
{4, 7, 8, 0, 0, 0, 0, 0}, /* 0xC8 (11001000) */
{1, 4, 7, 8, 0, 0, 0, 0}, /* 0xC9 (11001001) */
{2, 4, 7, 8, 0, 0, 0, 0}, /* 0xCA (11001010) */
{1, 2, 4, 7, 8, 0, 0, 0}, /* 0xCB (11001011) */
{3, 4, 7, 8, 0, 0, 0, 0}, /* 0xCC (11001100) */
{1, 3, 4, 7, 8, 0, 0, 0}, /* 0xCD (11001101) */
{2, 3, 4, 7, 8, 0, 0, 0}, /* 0xCE (11001110) */
{1, 2, 3, 4, 7, 8, 0, 0}, /* 0xCF (11001111) */
{5, 7, 8, 0, 0, 0, 0, 0}, /* 0xD0 (11010000) */
{1, 5, 7, 8, 0, 0, 0, 0}, /* 0xD1 (11010001) */
{2, 5, 7, 8, 0, 0, 0, 0}, /* 0xD2 (11010010) */
{1, 2, 5, 7, 8, 0, 0, 0}, /* 0xD3 (11010011) */
{3, 5, 7, 8, 0, 0, 0, 0}, /* 0xD4 (11010100) */
{1, 3, 5, 7, 8, 0, 0, 0}, /* 0xD5 (11010101) */
{2, 3, 5, 7, 8, 0, 0, 0}, /* 0xD6 (11010110) */
{1, 2, 3, 5, 7, 8, 0, 0}, /* 0xD7 (11010111) */
{4, 5, 7, 8, 0, 0, 0, 0}, /* 0xD8 (11011000) */
{1, 4, 5, 7, 8, 0, 0, 0}, /* 0xD9 (11011001) */
{2, 4, 5, 7, 8, 0, 0, 0}, /* 0xDA (11011010) */
{1, 2, 4, 5, 7, 8, 0, 0}, /* 0xDB (11011011) */
{3, 4, 5, 7, 8, 0, 0, 0}, /* 0xDC (11011100) */
{1, 3, 4, 5, 7, 8, 0, 0}, /* 0xDD (11011101) */
{2, 3, 4, 5, 7, 8, 0, 0}, /* 0xDE (11011110) */
{1, 2, 3, 4, 5, 7, 8, 0}, /* 0xDF (11011111) */
{6, 7, 8, 0, 0, 0, 0, 0}, /* 0xE0 (11100000) */
{1, 6, 7, 8, 0, 0, 0, 0}, /* 0xE1 (11100001) */
{2, 6, 7, 8, 0, 0, 0, 0}, /* 0xE2 (11100010) */
{1, 2, 6, 7, 8, 0, 0, 0}, /* 0xE3 (11100011) */
{3, 6, 7, 8, 0, 0, 0, 0}, /* 0xE4 (11100100) */
{1, 3, 6, 7, 8, 0, 0, 0}, /* 0xE5 (11100101) */
{2, 3, 6, 7, 8, 0, 0, 0}, /* 0xE6 (11100110) */
{1, 2, 3, 6, 7, 8, 0, 0}, /* 0xE7 (11100111) */
{4, 6, 7, 8, 0, 0, 0, 0}, /* 0xE8 (11101000) */
{1, 4, 6, 7, 8, 0, 0, 0}, /* 0xE9 (11101001) */
{2, 4, 6, 7, 8, 0, 0, 0}, /* 0xEA (11101010) */
{1, 2, 4, 6, 7, 8, 0, 0}, /* 0xEB (11101011) */
{3, 4, 6, 7, 8, 0, 0, 0}, /* 0xEC (11101100) */
{1, 3, 4, 6, 7, 8, 0, 0}, /* 0xED (11101101) */
{2, 3, 4, 6, 7, 8, 0, 0}, /* 0xEE (11101110) */
{1, 2, 3, 4, 6, 7, 8, 0}, /* 0xEF (11101111) */
{5, 6, 7, 8, 0, 0, 0, 0}, /* 0xF0 (11110000) */
{1, 5, 6, 7, 8, 0, 0, 0}, /* 0xF1 (11110001) */
{2, 5, 6, 7, 8, 0, 0, 0}, /* 0xF2 (11110010) */
{1, 2, 5, 6, 7, 8, 0, 0}, /* 0xF3 (11110011) */
{3, 5, 6, 7, 8, 0, 0, 0}, /* 0xF4 (11110100) */
{1, 3, 5, 6, 7, 8, 0, 0}, /* 0xF5 (11110101) */
{2, 3, 5, 6, 7, 8, 0, 0}, /* 0xF6 (11110110) */
{1, 2, 3, 5, 6, 7, 8, 0}, /* 0xF7 (11110111) */
{4, 5, 6, 7, 8, 0, 0, 0}, /* 0xF8 (11111000) */
{1, 4, 5, 6, 7, 8, 0, 0}, /* 0xF9 (11111001) */
{2, 4, 5, 6, 7, 8, 0, 0}, /* 0xFA (11111010) */
{1, 2, 4, 5, 6, 7, 8, 0}, /* 0xFB (11111011) */
{3, 4, 5, 6, 7, 8, 0, 0}, /* 0xFC (11111100) */
{1, 3, 4, 5, 6, 7, 8, 0}, /* 0xFD (11111101) */
{2, 3, 4, 5, 6, 7, 8, 0}, /* 0xFE (11111110) */
{1, 2, 3, 4, 5, 6, 7, 8} /* 0xFF (11111111) */
};
static size_t bitmap_decode_avx2(uint8_t *bitmapInPtr, size_t bitsin, uint32_t *out) {
uint32_t *initout = out;
__m256i baseVec = _mm256_set1_epi32(-1);
__m256i incVec = _mm256_set1_epi32(64);
__m256i add8 = _mm256_set1_epi32(8);
int sizeinwords = bitsin / 64;
uint64_t *array = (uint64_t *)bitmapInPtr;
for (int i = 0; i < sizeinwords; ++i) {
uint64_t w = array[i];
if (w == 0) {
baseVec = _mm256_add_epi32(baseVec, incVec);
} else {
for (int k = 0; k < 4; ++k) {
uint8_t byteA = (uint8_t)w;
uint8_t byteB = (uint8_t)(w >> 8);
w >>= 16;
__m256i vecA =
_mm256_load_si256((const __m256i *)vecDecodeTable[byteA]);
__m256i vecB =
_mm256_load_si256((const __m256i *)vecDecodeTable[byteB]);
uint8_t advanceA = lengthTable[byteA];
uint8_t advanceB = lengthTable[byteB];
vecA = _mm256_add_epi32(baseVec, vecA);
baseVec = _mm256_add_epi32(baseVec, add8);
vecB = _mm256_add_epi32(baseVec, vecB);
baseVec = _mm256_add_epi32(baseVec, add8);
_mm256_storeu_si256((__m256i *)out, vecA);
out += advanceA;
_mm256_storeu_si256((__m256i *)out, vecB);
out += advanceB;
}
}
}
if ((bitsin % 64) != 0) {
// finish off the work the slow way.
uint64_t bitset = 0;
memcpy(&bitset, bitmapInPtr, sizeof(bitset));
bitset = bitset & ((UINT64_C(1) << (bitsin % 64)) - 1);
while (bitset != 0) {
uint64_t t = bitset & -bitset;
int r = __builtin_ctzll(bitset);
*out = sizeinwords * 64 + r;
out++;
bitset ^= t;
}
}
return out - initout;
}
#endif