Remove old 4-stage path.

This commit is contained in:
Geoff Langdale 2018-09-26 15:22:55 +10:00
parent b9706d462c
commit 9f91650e72
9 changed files with 6 additions and 1080 deletions

View File

@ -8,11 +8,11 @@
CXXFLAGS = -std=c++11 -g2 -O2 -march=native -Wall -Wextra -Wshadow -Iinclude -Ibenchmark/linux -Idependencies/double-conversion -Idependencies/rapidjson/include -Ldependencies/double-conversion/release
LIBFLAGS = -ldouble-conversion
EXECUTABLES=parse jsoncheck minifiercompetition parsingcompetition parseunified
EXECUTABLES=parse jsoncheck minifiercompetition parsingcompetition
DOUBLEEXECUTABLES=parsedouble jsoncheckdouble parsingcompetitiondouble
HEADERS=include/jsonparser/jsonparser.h include/jsonparser/common_defs.h include/jsonparser/jsonioutil.h benchmark/benchmark.h benchmark/linux/linux-perf-events.h include/jsonparser/simdjson_internal.h include/jsonparser/stage1_find_marks.h include/jsonparser/stage2_flatten.h include/jsonparser/stage3_ape_machine.h include/jsonparser/stage4_shovel_machine.h include/jsonparser/stage34_unified.h
LIBFILES=src/jsonioutil.cpp src/jsonparser.cpp src/stage1_find_marks.cpp src/stage2_flatten.cpp src/stage3_ape_machine.cpp src/stage4_shovel_machine.cpp src/stage34_unified.cpp
HEADERS=include/jsonparser/jsonparser.h include/jsonparser/common_defs.h include/jsonparser/jsonioutil.h benchmark/benchmark.h benchmark/linux/linux-perf-events.h include/jsonparser/simdjson_internal.h include/jsonparser/stage1_find_marks.h include/jsonparser/stage2_flatten.h include/jsonparser/stage34_unified.h
LIBFILES=src/jsonioutil.cpp src/jsonparser.cpp src/stage1_find_marks.cpp src/stage2_flatten.cpp src/stage34_unified.cpp
MINIFIERHEADERS=include/jsonparser/jsonminifier.h include/jsonparser/simdprune_tables.h
MINIFIERLIBFILES=src/jsonminifier.cpp
@ -39,9 +39,6 @@ bench: benchmarks/bench.cpp $(RAPIDJSON_INCLUDE) $(HEADERS)
$(CXX) -std=c++11 -O3 -o $@ benchmarks/bench.cpp -I$(RAPIDJSON_INCLUDE) -Iinclude -march=native -lm -Wall -Wextra -Wno-narrowing
parseunified: benchmark/parse.cpp $(HEADERS) $(LIBFILES)
$(CXX) $(CXXFLAGS) -o parseunified $(LIBFILES) benchmark/parse.cpp $(LIBFLAGS) -DTEST_UNIFIED
parse: benchmark/parse.cpp $(HEADERS) $(LIBFILES)
$(CXX) $(CXXFLAGS) -o parse $(LIBFILES) benchmark/parse.cpp $(LIBFLAGS)

View File

@ -22,8 +22,6 @@
#include <vector>
#include <x86intrin.h>
//#define TEST_UNIFIED
/// Fixme: enable double conv
// #define DOUBLECONV
#ifdef DOUBLECONV
@ -39,8 +37,6 @@ using namespace double_conversion;
#include "jsonparser/simdjson_internal.h"
#include "jsonparser/stage1_find_marks.h"
#include "jsonparser/stage2_flatten.h"
#include "jsonparser/stage3_ape_machine.h"
#include "jsonparser/stage4_shovel_machine.h"
#include "jsonparser/stage34_unified.h"
using namespace std;
@ -129,7 +125,6 @@ int main(int argc, char *argv[]) {
cerr << "Currently only support JSON files < 16MB\n";
exit(1);
}
init_state_machine();
pj.n_structural_indexes = 0;
// we have potentially 1 structure per byte of input
@ -159,8 +154,8 @@ int main(int argc, char *argv[]) {
LinuxEvents<PERF_TYPE_HARDWARE> unified(evts);
vector<u64> results;
results.resize(evts.size());
unsigned long cy1 = 0, cy2 = 0, cy3 = 0, cy4 = 0;
unsigned long cl1 = 0, cl2 = 0, cl3 = 0, cl4 = 0;
unsigned long cy1 = 0, cy2 = 0, cy3 = 0;
unsigned long cl1 = 0, cl2 = 0, cl3 = 0;
#endif
bool isok = true;
for (u32 i = 0; i < iterations; i++) {
@ -191,31 +186,6 @@ int main(int argc, char *argv[]) {
unified.start();
#endif
#ifndef TEST_UNIFIED
isok = ape_machine(p.first, p.second, pj);
#ifndef SQUASH_COUNTERS
unified.end(results);
cy3 += results[0];
cl3 += results[1];
if (!isok) {
cout << "Failed out during stage 3\n";
break;
}
unified.start();
#endif
isok = shovel_machine(p.first, p.second, pj);
#ifndef SQUASH_COUNTERS
unified.end(results);
cy4 += results[0];
cl4 += results[1];
#endif
if (!isok) {
cout << "Failed out during stage 4\n";
break;
}
#else
isok = unified_machine(p.first, p.second, pj);
#ifndef SQUASH_COUNTERS
unified.end(results);
@ -227,7 +197,6 @@ int main(int argc, char *argv[]) {
}
#endif
#endif
auto end = std::chrono::steady_clock::now();
std::chrono::duration<double> secs = end - start;
res[i] = secs.count();
@ -237,7 +206,7 @@ int main(int argc, char *argv[]) {
printf("number of bytes %ld number of structural chars %d ratio %.3f\n",
p.second, pj.n_structural_indexes,
(double)pj.n_structural_indexes / p.second);
unsigned long total = cy1 + cy2 + cy3 + cy4;
unsigned long total = cy1 + cy2 + cy3;
printf(
"stage 1 instructions: %10lu cycles: %10lu (%.2f %%) ins/cycles: %.2f \n",
@ -261,14 +230,6 @@ int main(int argc, char *argv[]) {
printf("%.2f cycles per structural character.\n",
(double)cy3 / (iterations * pj.n_structural_indexes));
printf(
"stage 4 instructions: %10lu cycles: %10lu (%.2f %%) ins/cycles: %.2f \n",
cl4, cy4, 100. * cy4 / total, (double)cl4 / cy4);
printf(" stage 4 runs at %.2f cycles per input byte and ",
(double)cy4 / (iterations * p.second));
printf("%.2f cycles per structural character.\n",
(double)cy4 / (iterations * pj.n_structural_indexes));
printf(" all stages: %.2f cycles per input byte.\n",
(double)total / (iterations * p.second));
#endif

View File

@ -46,7 +46,6 @@ int main(int argc, char *argv[]) {
int repeat = 10;
int volume = p.second;
BEST_TIME(json_parse(p.first, p.second, pj), true, , repeat, volume, true);
BEST_TIME(json_parse_4stages(p.first, p.second, pj), true, , repeat, volume, true);
rapidjson::Document d;

View File

@ -5,8 +5,6 @@
#include "simdjson_internal.h"
#include "stage1_find_marks.h"
#include "stage2_flatten.h"
#include "stage3_ape_machine.h"
#include "stage4_shovel_machine.h"
#include "stage34_unified.h"
// Allocate a ParsedJson structure that can support document
@ -22,6 +20,3 @@ void deallocate_ParsedJson(ParsedJson *pj_ptr);
// Parse a document found in buf, need to preallocate ParsedJson.
// Return false in case of a failure.
bool json_parse(const u8 *buf, size_t len, ParsedJson &pj);
// like json_parse but uses 4 stages, slower.
bool json_parse_4stages(const u8 *buf, size_t len, ParsedJson &pj);

View File

@ -1,7 +0,0 @@
#pragma once
#include "common_defs.h"
#include "simdjson_internal.h"
// Writes the transition entries of the `trans` table that ape_machine
// consults while validating the structural characters.
void init_state_machine();
// Walks pj.structural_indexes over buf, writing one tape entry per structural
// character and advancing a per-depth state machine. Returns false when the
// final depth or any per-depth ending state is not acceptable.
bool ape_machine(const u8 *buf, size_t len, ParsedJson &pj);

View File

@ -1,6 +0,0 @@
#pragma once
#include "common_defs.h"
#include "simdjson_internal.h"
bool shovel_machine(const u8 *buf, size_t len, ParsedJson &pj);

View File

@ -44,27 +44,6 @@ void deallocate_ParsedJson(ParsedJson *pj_ptr) {
delete pj_ptr;
}
// Legacy entry point: parses the document in buf by running
// find_structural_bits, flatten_indexes, ape_machine and shovel_machine in
// that order, stopping at the first one that reports failure.
// pj must be preallocated by the caller; returns false on any failure.
bool json_parse_4stages(const u8 *buf, size_t len, ParsedJson &pj) {
  if (len > pj.bytecapacity) {
    std::cerr << "Your ParsedJson cannot support documents that big: " << len
              << std::endl;
    return false;
  }
  // && short-circuits, so a later stage only runs if every earlier one passed.
  return find_structural_bits(buf, len, pj) && flatten_indexes(len, pj) &&
         ape_machine(buf, len, pj) && shovel_machine(buf, len, pj);
}
// parse a document found in buf, need to preallocate ParsedJson.
bool json_parse(const u8 *buf, size_t len, ParsedJson &pj) {
if (pj.bytecapacity < len) {

View File

@ -1,338 +0,0 @@
#ifdef _MSC_VER
/* Microsoft C/C++-compatible compiler */
#include <intrin.h>
#else
#include <immintrin.h>
#include <x86intrin.h>
#endif
#include <cassert>
#include <cstring>
#include "jsonparser/common_defs.h"
#include "jsonparser/simdjson_internal.h"
// the ape machine consists of two parts:
//
// 1) The "state machine", which is a multiple channel per-level state machine
// It is a conventional DFA except in that it 'changes track' on {}[]
// characters
//
// 2) The "tape machine": this records offsets of various structures as they go
// by
// These structures are either u32 offsets of other tapes or u32 offsets into
// our input or structures.
//
// The state machine doesn't record output.
// The tape machine doesn't validate.
//
// The output of the tape machine is meaningful only if the state machine is in
// non-error states.
// depth adjustment is strictly based on whether we are {[ or }]
// depth adjustment is a pre-increment which, in effect, means that a {[
// contained in an object is in the level one deeper, while the corresponding }]
// is at the level of the containing object
// TAPE MACHINE DEFINITIONS
// A control word packs two fields: the top byte (bits 24-31) is a signed
// depth adjustment (read back by get_depth_adjust) and the low byte is the
// amount added to the current tape location (read back by get_write_size).
const u32 DEPTH_PLUS_ONE = 0x01000000; // top byte = +1
const u32 DEPTH_ZERO = 0x00000000; // top byte = 0
const u32 DEPTH_MINUS_ONE = 0xff000000; // top byte = 0xff, i.e. -1 as an s8
const u32 WRITE_ZERO = 0x0; // tape location is not advanced
// NOTE: despite the name the value is 1; ape_machine adds it to a tape index,
// so the tape location advances by one entry.
const u32 WRITE_FOUR = 0x1;
const u32 CDF = DEPTH_ZERO | WRITE_ZERO; // default 'control'
const u32 C04 = DEPTH_ZERO | WRITE_FOUR; // same depth, advance the tape
const u32 CP4 = DEPTH_PLUS_ONE | WRITE_FOUR; // depth + 1, advance the tape
const u32 CM4 = DEPTH_MINUS_ONE | WRITE_FOUR; // depth - 1, advance the tape
// Returns the signed depth adjustment held in the top byte of a control word.
inline s8 get_depth_adjust(u32 control) {
  const s32 signed_control = (s32)control;
  return (s8)(signed_control >> 24);
}
// Returns the low byte of a control word: the tape advance amount.
inline size_t get_write_size(u32 control) {
  const u32 low_byte = control & 0xff;
  return low_byte;
}
// Control word for every possible input byte, indexed by the byte value.
// CDF = no depth change, tape not advanced; C04 = no depth change, tape
// advanced; CP4 = depth + 1, tape advanced; CM4 = depth - 1, tape advanced.
const u32 char_control[256] = {
// nothing interesting from 0x00-0x1f
CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF,
CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF,
CDF, CDF,
// 0x20-0x2f: " is 0x22, - is 0x2d
CDF, CDF, C04, CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF, C04, CDF,
CDF,
// 0x30-0x3f: numbers are 0x30-0x39
C04, C04, C04, C04, C04, C04, C04, C04, C04, C04, CDF, CDF, CDF, CDF, CDF,
CDF,
// nothing interesting from 0x40-0x4f
CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF,
CDF,
// 0x50-0x5f: 0x5b/5d are []
CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF, CP4, CDF, CM4, CDF,
CDF,
// 0x60-0x6f: f is 0x66 n is 0x6e
CDF, CDF, CDF, CDF, CDF, CDF, C04, CDF, CDF, CDF, CDF, CDF, CDF, CDF, C04,
CDF,
// 0x70-0x7f: 0x7b/7d are {}, 74 is t
CDF, CDF, CDF, CDF, C04, CDF, CDF, CDF, CDF, CDF, CDF, CP4, CDF, CM4, CDF,
CDF,
// nothing interesting from 0x80-0xff
CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF,
CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF,
CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF,
CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF,
CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF,
CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF,
CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF,
CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF,
CDF, CDF, CDF, CDF, CDF, CDF, CDF, CDF};
// all of this stuff needs to get moved somewhere reasonable
// like our ParsedJson structure
/*
u64 tape[MAX_TAPE];
u32 tape_locs[MAX_DEPTH];
u8 string_buf[512*1024];
u8 * current_string_buf_loc;
u8 number_buf[512*1024]; // holds either doubles or longs, really
u8 * current_number_buf_loc;
*/
// STATE MACHINE DECLARATIONS
// Number of rows in valid_end_states (and, per the commented-out declaration
// below, in the trans table).
const u32 MAX_STATES = 16;
/**
* It is annoying to have to call init_state_machine each time.
* Better to precompute the (small) result into a header file.
*/
// u32 trans[MAX_STATES][256];
#include "jsonparser/transitions.h"
// Current state of the state machine at each depth; ape_machine resets every
// entry at the start of each call.
u32 states[MAX_DEPTH];
// Initial state for every depth strictly between START_DEPTH and
// REDLINE_DEPTH.
const int START_STATE = 1;
// Indexed by state number. A non-zero entry means ape_machine accepts a
// depth whose state machine is left in that state once all structural
// characters have been consumed.
u32 valid_end_states[MAX_STATES] = {
0, // 0 state is by definition an error
1, // ok to still be in start state
1, // state 2: we've seen an { - if we left this level it's ok
0, // state 3 is abolished, we shouldn't be in it
0, // state 4 means we saw a string inside an object. We can't end like
// this!
0, // similarly state 5 means we saw a string followed by a colon.
0, // state 6 is abolished
1, // it's ok to finish on 7
0, // state 8 we've seen a comma inside an object - can't finish here
1, // state 9 is like state 2 only for arrays, so ok
0, // state 10 abolished
1, // state 11 is ok to finish on, we just saw a unary inside an array
0, // state 12 we've just seen a comma inside an array - can't finish
0, // state 13 is our weird start state. I think we shouldn't end on it as
// we need to see something
1, // state 14 is ok. It's an error to see something *more* here but not to
// be in this state
0, // we don't use state 15
};
// weird sub-machine for starting depth only
// we start at 13 and go to 14 on a single UNARY
// 14 doesn't have to have any transitions. If anything
// else arrives after that single thing, it's an error
const int START_DEPTH_START_STATE = 13;
// ANYTHING_IS_ERROR_STATE is useful both as a target
// for a transition at the start depth and also as
// a good initial value for "red line" depths; that
// is, depths that are maintained strictly to avoid
// undefined behavior (e.g. depths below the starting
// depth).
const int ANYTHING_IS_ERROR_STATE = 14;
void init_state_machine() {
// states 10 and 6 eliminated
trans[1][(int)'{'] = 2;
trans[2][(int)'"'] = 4;
trans[4][(int)':'] = 5;
// 5->7 on all values ftn0123456789-"
trans[7][(int)','] = 8;
trans[8][(int)'"'] = 4;
trans[1][(int)'['] = 9;
// 9->11 on all values ftn0123456789-"
trans[11][(int)','] = 12;
// 12->11 on all values ftn0123456789-"
const char *UNARIES = "}]ftn0123456789-\"";
for (u32 i = 0; i < strlen(UNARIES); i++) {
trans[5][(u32)UNARIES[i]] = 7;
trans[9][(u32)UNARIES[i]] = 11;
trans[12][(u32)UNARIES[i]] = 11;
#ifdef PERMIT_RANDOM_UNARIES_AT_TOP_LEVEL
// NOTE: if we permit JSON documents that
// contain a single number or string, then we
// allow all the unaries at the top level
trans[13][(u32)UNARIES[i]] = 14;
#endif
}
#ifndef PERMIT_RANDOM_UNARIES_AT_TOP_LEVEL
// NOTE: if we don't permit JSON documents that
// that contain a single number or string, we must
// make sure we accept the top-level closing braces
// that are delivered to the start depth only
trans[13][(int)'}'] = 14;
trans[13][(int)']'] = 14;
#endif
// back transitions when new things are open
trans[2][(int)'{'] = 2;
trans[7][(int)'{'] = 2;
trans[9][(int)'{'] = 2;
trans[11][(int)'{'] = 2;
trans[2][(int)'['] = 9;
trans[7][(int)'['] = 9;
trans[9][(int)'['] = 9;
trans[11][(int)'['] = 9;
}
// Stage 3. For every structural index in pj.structural_indexes this
//  - adjusts the current depth according to char_control,
//  - advances the state machine of that depth through `trans`, and
//  - writes one entry to that depth's tape: the character in bits 56-63 plus
//    either the input offset or, for characters that change depth, the tape
//    location that was current before the depth change.
// Returns false if the final depth is not START_DEPTH, if any depth ends in a
// state rejected by valid_end_states, or if the periodic depth check tripped.
// len is unused.
// NOTE(review): the loop prefetches structural_indexes[i + 1], so it reads
// structural_indexes[n_structural_indexes] (and the buf byte it names) on the
// last iteration, and structural_indexes[0] even when there are no indexes -
// presumably the arrays are padded by the caller; confirm.
bool ape_machine(const u8 *buf, UNUSED size_t len, ParsedJson &pj) {
  // NOTE - our depth is used by both the tape machine and the state machine
  // Further, in production we will set it to a largish value in a generous
  // buffer as a rogue input could consist of many {[ characters or many }]
  // characters. We aren't busily checking errors (and in fact, a aggressive
  // sequence of [ characters is actually valid input!) so something that blows
  // out maximum depth will need to be periodically checked for, as will
  // something that tries to set depth very low. If we set our starting depth,
  // say, to 256, we can tolerate 256 bogus close brace characters without
  // aggressively going wrong and writing to bad memory Note that any specious
  // depth can have a specious tape associated with and all these specious
  // depths can share a region of the tape - it's harmless. Since tape is
  // one-way, any movement in a specious tape is an error (so we can detect
  // max_depth violations by making sure that specious tape locations haven't
  // moved from their starting values)
  u32 depth = START_DEPTH;
  for (u32 i = 0; i < MAX_DEPTH; i++) {
    pj.tape_locs[i] = i * MAX_TAPE_ENTRIES;
    if (i == START_DEPTH) {
      states[i] = START_DEPTH_START_STATE;
    } else if ((i < START_DEPTH) || (i >= REDLINE_DEPTH)) {
      states[i] = ANYTHING_IS_ERROR_STATE;
    } else {
      states[i] = START_STATE;
    }
  }

  pj.current_string_buf_loc = pj.string_buf;
  pj.current_number_buf_loc = pj.number_buf;

  u32 error_sump = 0;
  u32 old_tape_loc = pj.tape_locs[depth]; // need to initialize for first write

  // The index, character and control word of the next structural character
  // are always loaded one iteration ahead of their use.
  u32 next_idx = pj.structural_indexes[0];
  u8 next_c = buf[next_idx];
  u32 next_control = char_control[next_c];

  for (u32 i = 0; i < pj.n_structural_indexes; i++) {
    // very periodic safety checking. This does NOT guarantee that we
    // haven't been in our dangerous zones above or below our normal
    // depths. It ONLY checks to be sure that we don't manage to leave
    // these zones and write completely off our tape.
    if (!(i % DEPTH_SAFETY_MARGIN)) {
      if (depth < START_DEPTH || depth >= REDLINE_DEPTH) {
        error_sump |= 1;
        break;
      }
    }
    u32 idx = next_idx;
    u8 c = next_c;
    u32 control = next_control;

    next_idx = pj.structural_indexes[i + 1];
    next_c = buf[next_idx];
    next_control = char_control[next_c];

    // TAPE MACHINE
    s8 depth_adjust = get_depth_adjust(control);
    u8 write_size = get_write_size(control);
    // Depth-changing characters record the previous tape location; all others
    // record their offset in the input.
    u32 write_val = (depth_adjust != 0) ? old_tape_loc : idx;
    depth += depth_adjust;
#ifdef DEBUG
    cout << "i: " << i << " idx: " << idx << " c " << c << "\n";
    cout << "TAPE MACHINE: depth change " << (s32)depth_adjust << " write_size "
         << (u32)write_size << " current_depth: " << depth << "\n";
#endif

    // STATE MACHINE - hoisted here to fill in during the tape machine's
    // latencies
#ifdef DEBUG
    cout << "STATE MACHINE: state[depth] pre " << states[depth] << " ";
#endif
    states[depth] = trans[states[depth]][c];
#ifdef DEBUG
    cout << "post " << states[depth] << "\n";
#endif

    // TAPE MACHINE, again
    pj.tape[pj.tape_locs[depth]] = write_val | (((u64)c) << 56);
    old_tape_loc = pj.tape_locs[depth] += write_size;
  }

  if (depth != START_DEPTH) {
    // We haven't returned to our start depth, so our braces can't possibly
    // match Note this doesn't exclude the possibility that we have improperly
    // matched { } or [] pairs
    return false;
  }

  for (u32 i = 0; i < MAX_DEPTH; i++) {
    if (!valid_end_states[states[i]]) {
#ifdef DEBUG
      // Both the depth and its state are passed; the format has two
      // conversions.
      printf("Invalid ending state: states[%u] == %u\n", (unsigned)i,
             (unsigned)states[i]);
#endif
      return false;
    }
  }

#define DUMP_TAPES
#ifdef DEBUG
  for (u32 i = 0; i < MAX_DEPTH; i++) {
    u32 start_loc = i * MAX_TAPE_ENTRIES;
    cout << " tape section i " << i;
    if (i == START_DEPTH) {
      cout << " (START) ";
    } else if ((i < START_DEPTH) || (i >= REDLINE_DEPTH)) {
      cout << " (REDLINE) ";
    } else {
      cout << " (NORMAL) ";
    }

    // The tapes live in pj, so they must be reached through it.
    cout << " from: " << start_loc << " to: " << pj.tape_locs[i] << " "
         << " size: " << (pj.tape_locs[i] - start_loc) << "\n";
    cout << " state: " << states[i] << "\n";
#ifdef DUMP_TAPES
    for (u32 j = start_loc; j < pj.tape_locs[i]; j++) {
      if (pj.tape[j]) {
        cout << "j: " << j << " tape[j] char " << (char)(pj.tape[j] >> 56)
             << " tape[j][0..55]: " << (pj.tape[j] & 0xffffffffffffffULL)
             << "\n";
      }
    }
#endif
  }
#endif

  if (error_sump) {
    return false;
  }
  return true;
}

View File

@ -1,654 +0,0 @@
#ifdef _MSC_VER
/* Microsoft C/C++-compatible compiler */
#include <intrin.h>
#else
#include <immintrin.h>
#include <x86intrin.h>
#endif
#include <cassert>
#include <cstring>
#include "jsonparser/common_defs.h"
#include "jsonparser/simdjson_internal.h"
// Lookup table indexed by byte value: 0 for the structural characters and the
// whitespace characters, 1 for every other byte.
// The structural characters are { 0x7b } 0x7d : 0x3a [ 0x5b ] 0x5d , 0x2c
// these go into the first 3 buckets of the comparison (1/2/4)
// we are also interested in the four whitespace characters
// space 0x20, linefeed 0x0a, horizontal tab 0x09 and carriage return 0x0d
const u32 structural_or_whitespace_negated[256] = {
1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
// Table lookup on structural_or_whitespace_negated: the result is zero when c
// is a structural or whitespace character and non-zero for any other byte.
really_inline u32 is_not_structural_or_whitespace(u8 c) {
  const u32 not_structural_or_ws = structural_or_whitespace_negated[c];
  return not_structural_or_ws;
}
// Maps the character following a backslash to the byte it stands for;
// 0 marks an invalid escape.
// These chars yield themselves: " \ /
// b -> backspace (0x08), f -> formfeed (0x0c), n -> newline (0x0a),
// r -> cr (0x0d), t -> horizontal tab (0x09)
// u not handled in this table as it's complex
const u8 escape_map[256] = {
    0, 0, 0,    0, 0,    0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0, // 0x0.
    0, 0, 0,    0, 0,    0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0, // 0x1.
    0, 0, 0x22, 0, 0,    0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0x2f, // 0x2.
    0, 0, 0,    0, 0,    0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0, // 0x3.

    0, 0, 0,    0, 0,    0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0, // 0x4.
    0, 0, 0,    0, 0,    0, 0,    0, 0, 0, 0, 0, 0x5c, 0, 0,    0, // 0x5.
    // 'f' (0x66) must yield form feed, which is 0x0c (0x12 is DC2).
    0, 0, 0x08, 0, 0,    0, 0x0c, 0, 0, 0, 0, 0, 0,    0, 0x0a, 0, // 0x6.
    0, 0, 0x0d, 0, 0x09, 0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0, // 0x7.

    0, 0, 0,    0, 0,    0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0, // 0x8.
    0, 0, 0,    0, 0,    0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0, // 0x9.
    0, 0, 0,    0, 0,    0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0, // 0xa.
    0, 0, 0,    0, 0,    0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0, // 0xb.

    0, 0, 0,    0, 0,    0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0, // 0xc.
    0, 0, 0,    0, 0,    0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0, // 0xd.
    0, 0, 0,    0, 0,    0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0, // 0xe.
    0, 0, 0,    0, 0,    0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0, // 0xf.
};
// Number of UTF-8 bytes needed for a code point, 0 meaning "not encodable".
// NOTE(review): despite the name, the entries are laid out by number of
// significant bits (index 0-7 -> 1 byte, 8-11 -> 2, 12-16 -> 3, 17-21 -> 4,
// 22-32 -> error), not by leading-zero count - confirm against the lookup
// site.
const u32 leading_zeros_to_utf_bytes[33] = {
1, 1, 1, 1, 1, 1, 1, 1, // 7 bits for first one
2, 2, 2, 2, // 11 bits for next
3, 3, 3, 3, 3, // 16 bits for next
4, 4, 4, 4, 4, // 21 bits for next
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; // error
// Indexed by UTF-8 byte count: the bit positions (passed to _pdep_u32) that
// receive the code point's payload bits within the encoded sequence.
const u32 UTF_PDEP_MASK[5] = {0x00, // error
0x7f, 0x1f3f, 0x0f3f3f, 0x073f3f3f};
// Indexed by UTF-8 byte count: the fixed lead-byte / continuation-byte marker
// bits OR-ed onto the deposited payload.
const u32 UTF_OR_MASK[5] = {0x00, // error
0x00, 0xc080, 0xe08080, 0xf0808080};
// Returns true when v is an ASCII hexadecimal digit (0-9, a-f or A-F).
bool is_hex_digit(u8 v) {
  const bool is_decimal = (v >= '0') && (v <= '9');
  // Clearing bit 5 folds lower-case ASCII letters onto upper case.
  const u8 folded = v & 0xdf;
  const bool is_alpha_hex = (folded >= 'A') && (folded <= 'F');
  return is_decimal || is_alpha_hex;
}
// Converts a hexadecimal digit character to its numeric value. The input is
// not validated: anything that is not 0-9 is treated as a letter.
u8 digit_to_val(u8 v) {
  const bool is_decimal = (v >= '0') && (v <= '9');
  // For letters, clear bit 5 so that 'a'-'f' and 'A'-'F' give the same value.
  return is_decimal ? (v - '0') : ((v & 0xdf) - 'A' + 10);
}
// Parses exactly four hexadecimal digits starting at src into *res
// (e.g. "00e9" -> 0xe9). Returns false, leaving *res untouched, if any of
// the four characters is not a hexadecimal digit.
bool hex_to_u32(const u8 *src, u32 *res) {
  u8 v1 = src[0];
  u8 v2 = src[1];
  u8 v3 = src[2];
  u8 v4 = src[3];
  if (!is_hex_digit(v1) || !is_hex_digit(v2) || !is_hex_digit(v3) ||
      !is_hex_digit(v4)) {
    return false;
  }
  // Each hex digit contributes 4 bits, so successive digits are 4 bits apart
  // (shifting by whole bytes would scatter the nibbles across the word).
  *res = digit_to_val(v1) << 12 | digit_to_val(v2) << 8 |
         digit_to_val(v3) << 4 | digit_to_val(v4);
  return true;
}
// handle a unicode codepoint given as a \uXXXX escape (or as a \uXXXX\uXXXX
// surrogate pair) and write its UTF-8 encoding into dest
// src advances 6 bytes, or 12 bytes when a surrogate pair was consumed
// dest advances by the number of UTF-8 bytes produced (1-4), although 4 bytes
// are always stored, so dest needs at least 4 writable bytes
// return true if the unicode codepoint was valid
// We work in little-endian then swap at write time
really_inline bool handle_unicode_codepoint(const u8 **src_ptr, u8 **dst_ptr) {
  u32 code_point = 0; // read the hex, potentially reading another \u beyond if
                      // it's a wacky one
  if (!hex_to_u32(*src_ptr + 2, &code_point)) {
    return false;
  }
  *src_ptr += 6;
  // check for the weirdo double-UTF-16 nonsense for things outside Basic
  // Multilingual Plane.
  if (code_point >= 0xd800 && code_point < 0xdc00) {
    // TODO: sanity check and clean up; snippeted from RapidJSON and poorly
    // understood at the moment
    if (((*src_ptr)[0] != '\\') || (*src_ptr)[1] != 'u') {
      return false;
    }
    u32 code_point_2 = 0;
    if (!hex_to_u32(*src_ptr + 2, &code_point_2)) {
      return false;
    }
    if (code_point_2 < 0xdc00 || code_point_2 > 0xdfff) {
      return false;
    }
    code_point =
        (((code_point - 0xd800) << 10) | (code_point_2 - 0xdc00)) + 0x10000;
    *src_ptr += 6;
  }
  // leading_zeros_to_utf_bytes is laid out by number of significant bits
  // (entries 0-7 -> 1 byte, ..., 22-32 -> error), so convert the leading-zero
  // count before the lookup. __builtin_clz(0) is undefined, hence the explicit
  // case for U+0000.
  u32 significant_bits =
      code_point ? 32 - (u32)__builtin_clz(code_point) : 0;
  u32 utf_bytes = leading_zeros_to_utf_bytes[significant_bits];
  if (utf_bytes == 0) {
    return false; // more than 21 bits: not encodable as UTF-8
  }
  u32 tmp =
      _pdep_u32(code_point, UTF_PDEP_MASK[utf_bytes]) | UTF_OR_MASK[utf_bytes];
  // swap and move to the other side of the register, so that the lead byte
  // ends up in the lowest byte and is stored first
  tmp = __builtin_bswap32(tmp);
  tmp >>= ((4 - utf_bytes) * 8) & 31;
  // use memcpy to avoid undefined behavior from an unaligned/aliasing store
  std::memcpy(*dst_ptr, &tmp, sizeof(u32));
  *dst_ptr += utf_bytes;
  return true;
}
// Unescapes the string whose opening quote sits at the input offset stored in
// the low 24 bits of pj.tape[tape_loc]. The input is copied 32 bytes at a
// time to pj.current_string_buf_loc; scanning continues chunk by chunk (and
// past each escape sequence) until the closing quote is found, at which point
// the output is null-terminated and the tape entry is rewritten as
// '"' << 24 | offset into pj.string_buf. Returns false on an invalid escape.
// len is unused.
// NOTE(review): every step loads and stores a full 32 bytes, so both buf and
// the string buffer presumably need 32 bytes of slack - confirm.
// NOTE(review): the offset written to the tape is taken after
// current_string_buf_loc has been moved past this string, i.e. it is the end
// of the string rather than its start - confirm this is what readers expect.
really_inline bool parse_string(const u8 *buf, UNUSED size_t len,
                                ParsedJson &pj, u32 tape_loc) {
  u32 offset = pj.tape[tape_loc] & 0xffffff;
  const u8 *src = &buf[offset + 1]; // we know that buf at offset is a "
  u8 *dst = pj.current_string_buf_loc;
#ifdef DEBUG
  cout << "Entering parse string with offset " << offset << "\n";
#endif
  // basic non-sexy parsing code
  while (1) {
#ifdef DEBUG
    for (u32 j = 0; j < 32; j++) {
      char c = *(src + j);
      if (isprint(c)) {
        cout << c;
      } else {
        cout << '_';
      }
    }
    cout << "| ... string handling input\n";
#endif
    m256 v = _mm256_loadu_si256((const m256 *)(src));
    u32 bs_bits =
        (u32)_mm256_movemask_epi8(_mm256_cmpeq_epi8(v, _mm256_set1_epi8('\\')));
    dumpbits32(bs_bits, "backslash bits 2");
    u32 quote_bits =
        (u32)_mm256_movemask_epi8(_mm256_cmpeq_epi8(v, _mm256_set1_epi8('"')));
    dumpbits32(quote_bits, "quote_bits");
    // __builtin_ctz(0) is undefined; report "not found in this chunk" as 32
    // so that the comparisons below are well defined.
    u32 quote_dist = quote_bits ? (u32)__builtin_ctz(quote_bits) : 32;
    u32 bs_dist = bs_bits ? (u32)__builtin_ctz(bs_bits) : 32;
    // store to dest unconditionally - we can overwrite the bits we don't like
    // later
    _mm256_storeu_si256((m256 *)(dst), v);
#ifdef DEBUG
    cout << "quote dist: " << quote_dist << " bs dist: " << bs_dist << "\n";
#endif

    if (quote_dist < bs_dist) {
#ifdef DEBUG
      cout << "Found end, leaving!\n";
#endif
      // we encountered quotes first. Move dst to point to quotes and exit
      dst[quote_dist] = 0; // null terminate and get out
      pj.current_string_buf_loc = dst + quote_dist + 1;
      pj.tape[tape_loc] =
          ((u32)'"') << 24 |
          (pj.current_string_buf_loc -
           pj.string_buf); // assume 2^24 will hold all strings for now
      return true;
    } else if (quote_dist > bs_dist) {
      u8 escape_char = src[bs_dist + 1];
#ifdef DEBUG
      cout << "Found escape char: " << escape_char << "\n";
#endif
      // we encountered backslash first. Handle backslash
      if (escape_char == 'u') {
        // move src/dst up to the start; they will be further adjusted
        // within the unicode codepoint handling code.
        src += bs_dist;
        dst += bs_dist;
        if (!handle_unicode_codepoint(&src, &dst)) {
          return false;
        }
        // keep scanning: the closing quote has not been seen yet
      } else {
        // simple 1:1 conversion. Will eat bs_dist+2 characters in input and
        // write bs_dist+1 characters to output
        // note this may reach beyond the part of the buffer we've actually
        // seen. I think this is ok
        u8 escape_result = escape_map[escape_char];
        if (!escape_result)
          return false; // bogus escape value is an error
        dst[bs_dist] = escape_result;
        src += bs_dist + 2;
        dst += bs_dist + 1;
      }
    } else {
      // they are the same. Since they can't co-occur, it means we encountered
      // neither.
      src += 32;
      dst += 32;
    }
    // later extensions -
    // if \\ we could detect whether it's a substantial run of \ or just eat 2
    // chars and write 1 handle anything short of \u or \\\ (as a prefix) with
    // clever PSHUFB stuff and don't leave SIMD
  }
}
#ifdef DOUBLECONV
#include "double-conversion/double-conversion.h"
#include "double-conversion/ieee.h"
using namespace double_conversion;
static StringToDoubleConverter
converter(StringToDoubleConverter::ALLOW_TRAILING_JUNK, 2000000.0,
Double::NaN(), NULL, NULL);
#endif
// Converts the characters in [p, end) to an unsigned integer, base 10.
// Performs no validation whatsoever: every character is assumed to be a
// decimal digit and overflow is not detected. An empty range yields 0.
u64 naivestrtoll(const char *p, const char *end) {
  if (p == end) {
    return 0; // should be an error?
  }
  // this code could get a whole lot smarter if we have many long ints:
  // e.g., see http://0x80.pl/articles/simd-parsing-int-sequences.html
  u64 value = *p - '0';
  const char *cursor = p + 1;
  while (cursor < end) {
    value = (value * 10) + (*cursor - '0');
    ++cursor;
  }
  return value;
}
// put a parsed version of number (either as a double or a signed long) into the
// number buffer, put a 'tag' indicating which type and where it is back onto
// the tape at that location return false if we can't parse the number which
// means either (a) the number isn't valid, or (b) the number is followed by
// something that isn't whitespace, comma or a close }] character which are the
// only things that should follow a number at this stage bools to detect what we
// found in our initial character already here - we are already switching on 0
// vs 1-9 vs - so we may as well keep separate paths where that's useful
// TODO: see if we really need a separate number_buf or whether we should just
// have a generic scratch - would need to align before using for this
really_inline bool parse_number(const u8 *buf, UNUSED size_t len,
UNUSED ParsedJson &pj, u32 tape_loc,
UNUSED bool found_zero, bool found_minus) {
u32 offset = pj.tape[tape_loc] & 0xffffff;
////////////////
// This is temporary... but it illustrates how one could use Google's double
// conv.
///
#ifdef DOUBLECONV
// Maybe surprisingly, StringToDouble does not parse according to the JSON
// spec (e.g., it will happily parse 012 as 12).
int processed_characters_count;
double result_double_conv = converter.StringToDouble(
(const char *)(buf + offset), 10, &processed_characters_count);
*((double *)pj.current_number_buf_loc) = result_double_conv;
pj.tape[tape_loc] =
((u32)'d') << 24 |
(pj.current_number_buf_loc -
pj.number_buf); // assume 2^24 will hold all numbers for now
pj.current_number_buf_loc += 8;
return result_double_conv == result_double_conv;
#endif
////////////////
// end of double conv temporary stuff.
////////////////
if (found_minus) {
offset++;
}
const u8 *src = &buf[offset];
m256 v = _mm256_loadu_si256((const m256 *)(src));
u64 error_sump = 0;
#ifdef DEBUG
for (u32 j = 0; j < 32; j++) {
char c = *(src + j);
if (isprint(c)) {
cout << c;
} else {
cout << '_';
}
}
cout << "| ... number handling input\n";
#endif
// categories to extract
// Digits:
// 0 (0x30) - bucket 0
// 1-9 (never any distinction except if we didn't get the free kick at 0 due
// to the leading minus) (0x31-0x39) - bucket 1
// . (0x2e) - bucket 2
// E or e - no distinction (0x45/0x65) - bucket 3
// + (0x2b) - bucket 4
// - (0x2d) - bucket 4
// Terminators
// Whitespace: 0x20, 0x09, 0x0a, 0x0d - bucket 5+6
// Comma and the closes: 0x2c is comma, } is 0x5d, ] is 0x7d - bucket 5+7
// Another shufti - also a bit hand-hacked. Need to make a better construction
const m256 low_nibble_mask = _mm256_setr_epi8(
// 0 1 2 3 4 5 6 7 8 9 a b c d e f
33, 2, 2, 2, 2, 10, 2, 2, 2, 66, 64, 16, 32, 0xd0, 4, 0, 33, 2, 2, 2, 2,
10, 2, 2, 2, 66, 64, 16, 32, 0xd0, 4, 0);
const m256 high_nibble_mask = _mm256_setr_epi8(
// 0 1 2 3 4 5 6 7 8 9 a b c d e f
64, 0, 52, 3, 8, -128, 8, 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 64, 0, 52, 3, 8,
-128, 8, 0x80, 0, 0, 0, 0, 0, 0, 0, 0);
m256 tmp = _mm256_and_si256(
_mm256_shuffle_epi8(low_nibble_mask, v),
_mm256_shuffle_epi8(
high_nibble_mask,
_mm256_and_si256(_mm256_srli_epi32(v, 4), _mm256_set1_epi8(0x7f))));
#ifdef DEBUG
// let us print out the magic:
uint8_t buffer[32];
_mm256_storeu_si256((__m256i *)buffer,tmp);
for(int k = 0; k < 32; k++)
printf("%.2x ",buffer[k]);
printf("\n");
#endif
m256 enders_mask = _mm256_set1_epi8(0xe0);
m256 tmp_enders = _mm256_cmpeq_epi8(_mm256_and_si256(tmp, enders_mask),
_mm256_set1_epi8(0));
u32 enders = ~(u32)_mm256_movemask_epi8(tmp_enders);
dumpbits32(enders, "ender characters");
//dumpbits32_always(enders, "ender characters");
if (enders == 0) {
error_sump = 1;
// if enders == 0 we have
// a heroically long number string or some garbage
}
// TODO: make a mask that indicates where our digits are // DANIEL: Isn't that digit_characters?
u32 number_mask = ~enders & (enders - 1);
dumpbits32(number_mask, "number mask");
//dumpbits32_always(number_mask, "number mask");
m256 n_mask = _mm256_set1_epi8(0x1f);
m256 tmp_n =
_mm256_cmpeq_epi8(_mm256_and_si256(tmp, n_mask), _mm256_set1_epi8(0));
u32 number_characters = ~(u32)_mm256_movemask_epi8(tmp_n);
// put something into our error sump if we have something
// before our ending characters that isn't a valid character
// for the inside of our JSON
number_characters &= number_mask;
error_sump |= number_characters ^ number_mask;
dumpbits32(number_characters, "number characters");
m256 d_mask = _mm256_set1_epi8(0x03);
m256 tmp_d =
_mm256_cmpeq_epi8(_mm256_and_si256(tmp, d_mask), _mm256_set1_epi8(0));
u32 digit_characters = ~(u32)_mm256_movemask_epi8(tmp_d);
digit_characters &= number_mask;
dumpbits32(digit_characters, "digit characters");
// dumpbits32_always(digit_characters, "digit characters");
m256 p_mask = _mm256_set1_epi8(0x04);
m256 tmp_p =
_mm256_cmpeq_epi8(_mm256_and_si256(tmp, p_mask), _mm256_set1_epi8(0));
u32 decimal_characters = ~(u32)_mm256_movemask_epi8(tmp_p);
decimal_characters &= number_mask;
dumpbits32(decimal_characters, "decimal characters");
m256 e_mask = _mm256_set1_epi8(0x08);
m256 tmp_e =
_mm256_cmpeq_epi8(_mm256_and_si256(tmp, e_mask), _mm256_set1_epi8(0));
u32 exponent_characters = ~(u32)_mm256_movemask_epi8(tmp_e);
exponent_characters &= number_mask;
dumpbits32(exponent_characters, "exponent characters");
m256 zero_mask = _mm256_set1_epi8(0x1);
m256 tmp_zero =
_mm256_cmpeq_epi8(tmp, zero_mask);
u32 zero_characters = (u32)_mm256_movemask_epi8(tmp_zero);
dumpbits32(zero_characters, "zero characters");
// if the zero character is in first position, it
// needs to be followed by decimal or exponent or ender (note: we
// handle found_minus separately)
u32 expo_or_decimal_or_ender = exponent_characters | decimal_characters | enders;
error_sump |= zero_characters & 0x01 & (~(expo_or_decimal_or_ender >> 1));
m256 s_mask = _mm256_set1_epi8(0x10);
m256 tmp_s =
_mm256_cmpeq_epi8(_mm256_and_si256(tmp, s_mask), _mm256_set1_epi8(0));
u32 sign_characters = ~(u32)_mm256_movemask_epi8(tmp_s);
sign_characters &= number_mask;
dumpbits32(sign_characters, "sign characters");
u32 digit_edges = ~(digit_characters << 1) & digit_characters;
dumpbits32(digit_edges, "digit_edges");
// check that we have 1-3 'edges' only
u32 t = digit_edges;
t &= t - 1;
t &= t - 1;
t &= t - 1;
error_sump |= t;
// check that we start with a digit
error_sump |= ~digit_characters & 0x1;
// having done some checks, get lazy and fall back
// to strtoll or strtod
// TODO: handle the easy cases ourselves; these are
// expensive and we've done a lot of the prepwork.
// return errors if strto* fail, otherwise fill in a code on the tape
// 'd' for floating point and 'l' for long and put a pointer to the
// spot in the buffer.
if ( digit_edges == 1) {
//if (__builtin_popcount(digit_edges) == 1) { // DANIEL : shouldn't we have digit_edges == 1
#define NAIVEINTPARSING // naive means "faster" in this case
#ifdef NAIVEINTPARSING
// this is faster, maybe, because we use a naive strtoll
// should be all digits?
error_sump |= number_characters ^ digit_characters;
int stringlength = __builtin_ctz(~digit_characters);
const char *end = (const char *)src + stringlength;
u64 result = naivestrtoll((const char *)src,end);
if (found_minus) { // unfortunate that it is a branch?
result = -result;
}
#else
// try a strtoll (this is likely slower because it revalidates)
char *end;
u64 result = strtoll((const char *)src, &end, 10);
if ((errno != 0) || (end == (const char *)src)) {
error_sump |= 1;
}
error_sump |= is_not_structural_or_whitespace(*end);
if (found_minus) {
result = -result;
}
#endif
#ifdef DEBUG
cout << "Found number " << result << "\n";
#endif
*((u64 *)pj.current_number_buf_loc) = result;
pj.tape[tape_loc] =
((u32)'l') << 24 |
(pj.current_number_buf_loc -
pj.number_buf); // assume 2^24 will hold all numbers for now
pj.current_number_buf_loc += 8;
} else {
// try a strtod
char *end;
double result = strtod((const char *)src, &end);
if ((errno != 0) || (end == (const char *)src)) {
error_sump |= 1;
}
error_sump |= is_not_structural_or_whitespace(*end);
if (found_minus) {
result = -result;
}
#ifdef DEBUG
cout << "Found number " << result << "\n";
#endif
*((double *)pj.current_number_buf_loc) = result;
pj.tape[tape_loc] =
((u32)'d') << 24 |
(pj.current_number_buf_loc -
pj.number_buf); // assume 2^24 will hold all numbers for now
pj.current_number_buf_loc += 8;
}
// TODO: check the MSB element is a digit
// TODO: a whole bunch of checks
// TODO: <=1 decimal point, eE mark, +- construct
// TODO: first and last character in mask region must be
// digit
// TODO: if it exists,
// Decimal point is after the first cluster of numbers only
// and before the second cluster of numbers only. It must
// be digit_or_zero . digit_or_zero strictly
// TODO: eE mark and +- construct are adjacent with eE first
// eE mark precedes final cluster of numbers only
// and immediately follows second-last cluster of numbers only (not
// necessarily second, as we may have 4e10).
// it may suffice to insist that eE is preceded immediately
// by a digit of any kind and that it's followed locally by
// a digit immediately or a +- construct then a digit.
// TODO: if we have both . and the eE mark then the . must
// precede the eE mark
if (error_sump)
return false;
return true;
}
// Reports whether tape number i holds any entries.
// Tape i starts at entry i * MAX_TAPE_ENTRIES; it counts as disturbed when
// the end location recorded in pj.tape_locs[i] has moved away from that
// starting entry.
bool tape_disturbed(u32 i, ParsedJson &pj) {
  const u32 recorded_end = pj.tape_locs[i];
  const u32 pristine_end = i * MAX_TAPE_ENTRIES;
  if (recorded_end == pristine_end) {
    return false; // nothing was ever appended to this tape
  }
  return true;
}
bool shovel_machine(const u8 *buf, size_t len, ParsedJson &pj) {
// fixup the mess made by the ape_machine
// as such it does a bunch of miscellaneous things on the tapes
u32 error_sump = 0;
u64 tv = *(const u64 *)"true ";
u64 nv = *(const u64 *)"null ";
u64 fv = *(const u64 *)"false ";
u64 mask4 = 0x00000000ffffffff;
u64 mask5 = 0x000000ffffffffff;
// if the tape has been touched at all at the depths outside the safe
// zone we need to quit. Note that our periodic checks to see that we're
// inside our safe zone in stage 3 don't guarantee that the system did
// not get into the danger area briefly.
if (tape_disturbed(START_DEPTH - 1, pj) ||
tape_disturbed(REDLINE_DEPTH, pj)) {
return false;
}
// walk over each tape
for (u32 i = START_DEPTH; i < MAX_DEPTH; i++) {
u32 start_loc = i * MAX_TAPE_ENTRIES;
u32 end_loc = pj.tape_locs[i];
if (start_loc == end_loc) {
break;
}
for (u32 j = start_loc; j < end_loc; j++) {
switch (pj.tape[j] >> 56) {
case '{':
case '[': {
// pivot our tapes
// point the enclosing structural char (}]) to the head marker ({[) and
// put the end of the sequence on the tape at the head marker
// we start with head marker pointing at the enclosing structural char
// and the enclosing structural char pointing at the end. Just swap
// them. also check the balanced-{} or [] property here
u8 head_marker_c = pj.tape[j] >> 56;
u32 head_marker_loc = pj.tape[j] & 0xffffffffffffffULL;
u64 tape_enclosing = pj.tape[head_marker_loc];
u8 enclosing_c = tape_enclosing >> 56;
pj.tape[head_marker_loc] = pj.tape[j];
pj.tape[j] = tape_enclosing;
error_sump |= (enclosing_c - head_marker_c -
2); // [] and {} only differ by 2 chars
break;
}
case '"': {
error_sump |= !parse_string(buf, len, pj, j);
break;
}
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
error_sump |= !parse_number(buf, len, pj, j, false, false);
break;
case '0':
error_sump |= !parse_number(buf, len, pj, j, true, false);
break;
case '-':
error_sump |= !parse_number(buf, len, pj, j, false, true);
break;
case 't': {
u32 offset = pj.tape[j] & 0xffffffffffffffULL;
const u8 *loc = buf + offset;
u64 locval; // we want to avoid unaligned 64-bit loads (undefined in
// C/C++)
std::memcpy(&locval, loc, sizeof(u64));
error_sump |= (locval & mask4) ^ tv;
error_sump |= is_not_structural_or_whitespace(loc[4]);
break;
}
case 'f': {
u32 offset = pj.tape[j] & 0xffffffffffffffULL;
const u8 *loc = buf + offset;
u64 locval; // we want to avoid unaligned 64-bit loads (undefined in
// C/C++)
std::memcpy(&locval, loc, sizeof(u64));
error_sump |= (locval & mask5) ^ fv;
error_sump |= is_not_structural_or_whitespace(loc[5]);
break;
}
case 'n': {
u32 offset = pj.tape[j] & 0xffffffffffffffULL;
const u8 *loc = buf + offset;
u64 locval; // we want to avoid unaligned 64-bit loads (undefined in
// C/C++)
std::memcpy(&locval, loc, sizeof(u64));
error_sump |= (locval & mask4) ^ nv;
error_sump |= is_not_structural_or_whitespace(loc[4]);
break;
}
default:
break;
}
}
}
/*
if (error_sump) {
return false;
}
*/
return true;
}