Lots and lots of cleaning.

This commit is contained in:
Daniel Lemire 2018-11-27 14:37:59 -05:00
parent 5fae7b2100
commit a43b0772e1
15 changed files with 521 additions and 465 deletions

View File

@ -7,7 +7,7 @@
.PHONY: clean cleandist
DEPSINCLUDE = -Idependencies/rapidjson/include -Idependencies/sajson/include -Idependencies/json11 -Idependencies/fastjson/src -Idependencies/fastjson/include -Idependencies/gason/src -Idependencies/ujson4c/3rdparty -Idependencies/ujson4c/src
CXXFLAGS = -std=c++11 -march=native -Wall -Wextra -Wshadow -Iinclude -Ibenchmark/linux $(DEPSINCLUDE)
CXXFLAGS = -std=c++11 -march=native -Wall -Wextra -Wshadow -Iinclude -Ibenchmark/linux $(DEPSINCLUDE)
CFLAGS = -march=native -Idependencies/ujson4c/3rdparty -Idependencies/ujson4c/src
ifeq ($(SANITIZE),1)
CXXFLAGS += -g3 -O0 -fsanitize=address -fno-omit-frame-pointer -fsanitize=undefined

View File

@ -21,7 +21,7 @@ template <int TYPE = PERF_TYPE_HARDWARE> class LinuxEvents {
std::vector<uint64_t> ids;
public:
LinuxEvents(std::vector<int> config_vec) : fd(0) {
explicit LinuxEvents(std::vector<int> config_vec) : fd(0) {
memset(&attribs, 0, sizeof(attribs));
attribs.type = TYPE;
attribs.size = sizeof(attribs);

View File

@ -1,3 +1,4 @@
#include <unistd.h>
#include <iostream>
#include "benchmark.h"
@ -13,6 +14,7 @@
#include "rapidjson/writer.h"
#include "sajson.h"
using namespace rapidjson;
using namespace std;
@ -43,17 +45,29 @@ std::string rapidstringme(char *json) {
}
int main(int argc, char *argv[]) {
if (argc < 2) {
cerr << "Usage: " << argv[0] << " <jsonfile>\n";
cerr << "Or " << argv[0] << " -v <jsonfile>\n";
int c;
bool verbose = false;
while ((c = getopt (argc, argv, "v")) != -1)
switch (c)
{
case 'v':
verbose = true;
break;
default:
abort ();
}
if (optind >= argc) {
cerr << "Usage: " << argv[0] << " <jsonfile>" << endl;
exit(1);
}
bool verbose = false;
if (argc > 2) {
if (strcmp(argv[1], "-v"))
verbose = true;
const char * filename = argv[optind];
pair<u8 *, size_t> p;
try {
p = get_corpus(filename);
} catch (const std::exception& e) { // caught by reference to base
std::cout << "Could not load the file " << filename << std::endl;
return EXIT_FAILURE;
}
pair<u8 *, size_t> p = get_corpus(argv[argc - 1]);
if (verbose) {
std::cout << "Input has ";
if (p.second > 1024 * 1024)

View File

@ -31,79 +31,14 @@
#include "jsonparser/stage34_unified.h"
using namespace std;
// https://stackoverflow.com/questions/2616906/how-do-i-output-coloured-text-to-a-linux-terminal
// Terminal color support: numeric codes plus a small stream manipulator that
// emits them inside a "\033[<code>m" escape sequence.
namespace Color {
// Numeric color codes; FG_* entries are foreground colors, BG_* entries are
// background colors. The value is what gets printed inside the escape sequence.
enum Code {
FG_DEFAULT = 39,
FG_BLACK = 30,
FG_RED = 31,
FG_GREEN = 32,
FG_YELLOW = 33,
FG_BLUE = 34,
FG_MAGENTA = 35,
FG_CYAN = 36,
FG_LIGHT_GRAY = 37,
FG_DARK_GRAY = 90,
FG_LIGHT_RED = 91,
FG_LIGHT_GREEN = 92,
FG_LIGHT_YELLOW = 93,
FG_LIGHT_BLUE = 94,
FG_LIGHT_MAGENTA = 95,
FG_LIGHT_CYAN = 96,
FG_WHITE = 97,
BG_RED = 41,
BG_GREEN = 42,
BG_BLUE = 44,
BG_DEFAULT = 49
};
// Holds one Code so that it can be streamed into a std::ostream.
class Modifier {
Code code;
public:
// Not explicit: a Code converts implicitly to a Modifier.
Modifier(Code pCode) : code(pCode) {}
// Writes "\033[", the numeric value of the stored code, then "m" to os.
friend std::ostream &operator<<(std::ostream &os, const Modifier &mod) {
return os << "\033[" << mod.code << "m";
}
};
} // namespace Color
// Prints buf to std::cout, coloring the byte found at each structural index
// recorded in pj: '{', '[', '}' and ']' are printed in green, every other
// structural byte in yellow. Bytes between two consecutive structural indexes
// are printed uncolored. Bytes before the first index that is printed and
// bytes after the last structural index are not printed at all.
// Output ends with std::endl (newline plus flush).
void colorfuldisplay(ParsedJson &pj, const u8 *buf) {
Color::Modifier greenfg(Color::FG_GREEN);
Color::Modifier yellowfg(Color::FG_YELLOW);
Color::Modifier deffg(Color::FG_DEFAULT);
size_t i = 0;
// skip initial fluff
// (i.e. advance while two consecutive structural indexes hold the same value)
while ((i + 1 < pj.n_structural_indexes) &&
(pj.structural_indexes[i] == pj.structural_indexes[i + 1])) {
i++;
}
for (; i < pj.n_structural_indexes; i++) {
u32 idx = pj.structural_indexes[i];
u8 c = buf[idx];
// Masking with 0xdf clears bit 0x20, so '{' (0x7b) and '[' (0x5b) compare
// equal, as do '}' (0x7d) and ']' (0x5d).
if (((c & 0xdf) == 0x5b)) { // meaning 7b or 5b, { or [
std::cout << greenfg << buf[idx] << deffg;
} else if (((c & 0xdf) == 0x5d)) { // meaning 7d or 5d, } or ]
std::cout << greenfg << buf[idx] << deffg;
} else {
std::cout << yellowfg << buf[idx] << deffg;
}
// Echo the bytes strictly between this structural index and the next one.
if (i + 1 < pj.n_structural_indexes) {
u32 nextidx = pj.structural_indexes[i + 1];
for (u32 pos = idx + 1; pos < nextidx; pos++) {
std::cout << buf[pos];
}
}
}
std::cout << std::endl;
}
int main(int argc, char *argv[]) {
bool verbose = false;
bool dump = false;
bool forceoneiteration = false;
int c;
while ((c = getopt (argc, argv, "vd")) != -1)
while ((c = getopt (argc, argv, "1vd")) != -1)
switch (c)
{
case 'v':
@ -112,6 +47,9 @@ int main(int argc, char *argv[]) {
case 'd':
dump = true;
break;
case '1':
forceoneiteration = true;
break;
default:
abort ();
}
@ -124,7 +62,13 @@ int main(int argc, char *argv[]) {
cerr << "warning: ignoring everything after " << argv[optind + 1] << endl;
}
if(verbose) cout << "[verbose] loading " << filename << endl;
pair<u8 *, size_t> p = get_corpus(filename);
pair<u8 *, size_t> p;
try {
p = get_corpus(filename);
} catch (const std::exception& e) { // caught by reference to base
std::cout << "Could not load the file " << filename << std::endl;
return EXIT_FAILURE;
}
if(verbose) cout << "[verbose] loaded " << filename << " ("<< p.second << " bytes)" << endl;
ParsedJson *pj_ptr = allocate_ParsedJson(p.second, 1024);
ParsedJson &pj(*pj_ptr);
@ -133,7 +77,7 @@ int main(int argc, char *argv[]) {
#if defined(DEBUG)
const u32 iterations = 1;
#else
const u32 iterations = p.second < 1 * 1000 * 1000? 1000 : 10;
const u32 iterations = forceoneiteration ? 1 : ( p.second < 1 * 1000 * 1000? 1000 : 10);
#endif
vector<double> res;
res.resize(iterations);
@ -174,7 +118,7 @@ int main(int argc, char *argv[]) {
}
unified.start();
#endif
isok = flatten_indexes(p.second, pj);
isok = isok && flatten_indexes(p.second, pj);
#ifndef SQUASH_COUNTERS
unified.end(results);
cy2 += results[0];
@ -187,7 +131,7 @@ int main(int argc, char *argv[]) {
unified.start();
#endif
isok = unified_machine(p.first, p.second, pj);
isok = isok && unified_machine(p.first, p.second, pj);
#ifndef SQUASH_COUNTERS
unified.end(results);
cy3 += results[0];

View File

@ -31,7 +31,6 @@ void on_json_error( void *, const fastjson::ErrorContext& ec) {
// Runs fastjson::dom::parse_string on `input`, using a local Token and Chunk
// as the parse targets and on_json_error as the error callback, and returns
// whatever parse_string returns. The Token and Chunk are locals, so nothing
// parsed is handed back to the caller.
bool fastjson_parse(const char *input) {
fastjson::Token token;
fastjson::dom::Chunk chunk;
std::string error_message; // NOTE(review): never read or written in this function
return fastjson::dom::parse_string(input, &token, &chunk, 0, &on_json_error, NULL);
}
// end of fastjson stuff
@ -62,7 +61,14 @@ int main(int argc, char *argv[]) {
if(optind + 1 < argc) {
cerr << "warning: ignoring everything after " << argv[optind + 1] << endl;
}
pair<u8 *, size_t> p = get_corpus(filename);
pair<u8 *, size_t> p;
try {
p = get_corpus(filename);
} catch (const std::exception& e) { // caught by reference to base
std::cout << "Could not load the file " << filename << std::endl;
return EXIT_FAILURE;
}
if (verbose) {
std::cout << "Input has ";
if (p.second > 1024 * 1024)

View File

@ -20,6 +20,13 @@ char * allocate_aligned_buffer(size_t length);
// first element of the pair is a string (null terminated)
// whereas the second element is the length.
// caller is responsible to free (free std::pair<u8 *, size_t>.first)
//
// throws an exception if the file cannot be opened, use try/catch
// try {
// p = get_corpus(filename);
// } catch (const std::exception& e) {
// std::cout << "Could not load the file " << filename << std::endl;
// }
std::pair<u8 *, size_t> get_corpus(std::string filename);
#endif

View File

@ -128,7 +128,7 @@ static inline uint32_t parse_eight_digits_unrolled(const char *chars) {
const __m128i mul_1_100 = _mm_setr_epi16(100, 1, 100, 1, 100, 1, 100, 1);
const __m128i mul_1_10000 =
_mm_setr_epi16(10000, 1, 10000, 1, 10000, 1, 10000, 1);
const __m128i input = _mm_sub_epi8(_mm_loadu_si128((__m128i *)chars), ascii0);
const __m128i input = _mm_sub_epi8(_mm_loadu_si128((const __m128i *)chars), ascii0);
const __m128i t1 = _mm_maddubs_epi16(input, mul_1_10);
const __m128i t2 = _mm_madd_epi16(t1, mul_1_100);
const __m128i t3 = _mm_packus_epi32(t2, t2);
@ -149,7 +149,7 @@ static inline uint32_t parse_eight_digits_unrolled(const char *chars) {
//
static never_inline bool
parse_highprecision_float(const u8 *const buf, UNUSED size_t len,
ParsedJson &pj, const u32 depth, const u32 offset,
ParsedJson &pj, UNUSED const u32 depth, const u32 offset,
UNUSED bool found_zero, bool found_minus) {
const char *p = (const char *)(buf + offset);
@ -193,7 +193,6 @@ parse_highprecision_float(const u8 *const buf, UNUSED size_t len,
}
exponent = firstafterperiod - p;
}
int64_t expnumber = 0; // exponential part
if (('e' == *p) || ('E' == *p)) {
++p;
bool negexp = false;
@ -210,7 +209,7 @@ parse_highprecision_float(const u8 *const buf, UNUSED size_t len,
return false;
}
unsigned char digit = *p - '0';
expnumber = digit;
int64_t expnumber = digit; // exponential part
p++;
if (is_integer(*p)) {
digit = *p - '0';
@ -270,7 +269,7 @@ parse_highprecision_float(const u8 *const buf, UNUSED size_t len,
//
static never_inline bool parse_large_integer(const u8 *const buf,
UNUSED size_t len, ParsedJson &pj,
const u32 depth, const u32 offset,
UNUSED const u32 depth, const u32 offset,
UNUSED bool found_zero,
bool found_minus) {
const char *p = (const char *)(buf + offset);
@ -340,10 +339,12 @@ static never_inline bool parse_large_integer(const u8 *const buf,
#define unlikely(x) __builtin_expect(!!(x), 0)
#endif
// parse the number at buf + offset
// define JSON_TEST_NUMBERS for unit testing
static really_inline bool parse_number(const u8 *const buf, UNUSED size_t len,
ParsedJson &pj, const u32 depth,
ParsedJson &pj, UNUSED const u32 depth,
const u32 offset, UNUSED bool found_zero,
bool found_minus) {
const char *p = (const char *)(buf + offset);

View File

@ -105,14 +105,14 @@ public:
void write_tape_s64(s64 i) {
*((s64 *)current_number_buf_loc) = i;
current_number_buf_loc += 8;
*((s64 *)current_number_buf_loc) = i;// safe because array will be 8-byte aligned, could use memcpy
current_number_buf_loc += sizeof(s64);
write_tape(current_number_buf_loc - number_buf, 'l');
}
void write_tape_double(double d) {
*((double *)current_number_buf_loc) = d;
current_number_buf_loc += 8;
*((double *)current_number_buf_loc) = d;// safe because array will be 8-byte aligned, could use memcpy
current_number_buf_loc += sizeof(double);
write_tape(current_number_buf_loc - number_buf, 'd');
}
@ -137,7 +137,7 @@ public:
u32 scope_header; // the start of our current scope that contains our current location
u32 location; // our current location on a tape
ParsedJsonHandle(ParsedJson & pj_) : pj(pj_), depth(0), scope_header(0), location(0) {}
explicit ParsedJsonHandle(ParsedJson & pj_) : pj(pj_), depth(0), scope_header(0), location(0) {}
// OK with default copy constructor as the way to clone the POD structure
// some placeholder navigation. Will convert over to a more native C++-ish way of doing
@ -167,7 +167,7 @@ public:
#ifdef DEBUG
inline void dump256(m256 d, std::string msg) {
inline void dump256(m256 d, const std::string msg) {
for (u32 i = 0; i < 32; i++) {
std::cout << std::setw(3) << (int)*(((u8 *)(&d)) + i);
if (!((i + 1) % 8))
@ -181,14 +181,14 @@ inline void dump256(m256 d, std::string msg) {
}
// dump bits low to high
inline void dumpbits(u64 v, std::string msg) {
inline void dumpbits(u64 v, const std::string msg) {
for (u32 i = 0; i < 64; i++) {
std::cout << (((v >> (u64)i) & 0x1ULL) ? "1" : "_");
}
std::cout << " " << msg << "\n";
}
inline void dumpbits32(u32 v, std::string msg) {
inline void dumpbits32(u32 v, const std::string msg) {
for (u32 i = 0; i < 32; i++) {
std::cout << (((v >> (u32)i) & 0x1ULL) ? "1" : "_");
}
@ -201,14 +201,14 @@ inline void dumpbits32(u32 v, std::string msg) {
#endif
// dump bits low to high
inline void dumpbits_always(u64 v, std::string msg) {
inline void dumpbits_always(u64 v, const std::string msg) {
for (u32 i = 0; i < 64; i++) {
std::cout << (((v >> (u64)i) & 0x1ULL) ? "1" : "_");
}
std::cout << " " << msg << "\n";
}
inline void dumpbits32_always(u32 v, std::string msg) {
inline void dumpbits32_always(u32 v, const std::string msg) {
for (u32 i = 0; i < 32; i++) {
std::cout << (((v >> (u32)i) & 0x1ULL) ? "1" : "_");
}

View File

@ -58,7 +58,7 @@ really_inline bool handle_unicode_codepoint(const u8 **src_ptr, u8 **dst_ptr) {
}
really_inline bool parse_string(const u8 *buf, UNUSED size_t len,
ParsedJson &pj, u32 depth, u32 offset) {
ParsedJson &pj, UNUSED const u32 depth, u32 offset) {
using namespace std;
const u8 *src = &buf[offset + 1]; // we know that buf at offset is a "
u8 *dst = pj.current_string_buf_loc;

View File

@ -137,7 +137,7 @@ size_t jsonminify(const uint8_t *buf, size_t len, uint8_t *out) {
uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128(
_mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFF), 0));
quote_mask ^= prev_iter_inside_quote;
prev_iter_inside_quote = (uint64_t)((int64_t)quote_mask >> 63);
prev_iter_inside_quote = (uint64_t)((int64_t)quote_mask >> 63);// might be undefined behavior
const __m256i low_nibble_mask = _mm256_setr_epi8(
// 0 9 a b c d
16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0, 16, 0, 0, 0, 0, 0,
@ -220,7 +220,7 @@ size_t jsonminify(const uint8_t *buf, size_t len, uint8_t *out) {
uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128(
_mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFF), 0));
quote_mask ^= prev_iter_inside_quote;
prev_iter_inside_quote = (uint64_t)((int64_t)quote_mask >> 63);
// prev_iter_inside_quote = (uint64_t)((int64_t)quote_mask >> 63);// we don't need this anymore
__m256i mask_20 = _mm256_set1_epi8(0x20); // c==32
__m256i mask_70 =

View File

@ -10,9 +10,9 @@
#include <cstring>
#include "jsonparser/common_defs.h"
#include "jsonparser/simdjson_internal.h"
#include "jsonparser/jsoncharutils.h"
#include "jsonparser/numberparsing.h"
#include "jsonparser/simdjson_internal.h"
#include "jsonparser/stringparsing.h"
#include <iostream>
@ -20,390 +20,430 @@
#define PATH_SEP '/'
#if defined(DEBUG) && !defined(DEBUG_PRINTF)
#include <string.h>
#include <stdio.h>
#define DEBUG_PRINTF(format, ...) printf("%s:%s:%d:" format, \
strrchr(__FILE__, PATH_SEP) + 1, \
__func__, __LINE__, ## __VA_ARGS__)
#include <string.h>
#define DEBUG_PRINTF(format, ...) \
printf("%s:%s:%d:" format, strrchr(__FILE__, PATH_SEP) + 1, __func__, \
__LINE__, ##__VA_ARGS__)
#elif !defined(DEBUG_PRINTF)
#define DEBUG_PRINTF(format, ...) do { } while(0)
#define DEBUG_PRINTF(format, ...) \
do { \
} while (0)
#endif
using namespace std;
WARN_UNUSED
really_inline bool is_valid_true_atom(const u8 * loc) {
u64 tv = *(const u64 *)"true ";
u64 mask4 = 0x00000000ffffffff;
u32 error = 0;
u64 locval; // we want to avoid unaligned 64-bit loads (undefined in C/C++)
std::memcpy(&locval, loc, sizeof(u64));
error = (locval & mask4) ^ tv;
error |= is_not_structural_or_whitespace(loc[4]);
return error == 0;
really_inline bool is_valid_true_atom(const u8 *loc) {
u64 tv = *(const u64 *)"true ";
u64 mask4 = 0x00000000ffffffff;
u32 error = 0;
u64 locval; // we want to avoid unaligned 64-bit loads (undefined in C/C++)
std::memcpy(&locval, loc, sizeof(u64));
error = (locval & mask4) ^ tv;
error |= is_not_structural_or_whitespace(loc[4]);
return error == 0;
}
WARN_UNUSED
really_inline bool is_valid_false_atom(const u8 * loc) {
u64 fv = *(const u64 *)"false ";
u64 mask5 = 0x000000ffffffffff;
u32 error = 0;
u64 locval; // we want to avoid unaligned 64-bit loads (undefined in C/C++)
std::memcpy(&locval, loc, sizeof(u64));
error = (locval & mask5) ^ fv;
error |= is_not_structural_or_whitespace(loc[5]);
return error == 0;
really_inline bool is_valid_false_atom(const u8 *loc) {
u64 fv = *(const u64 *)"false ";
u64 mask5 = 0x000000ffffffffff;
u32 error = 0;
u64 locval; // we want to avoid unaligned 64-bit loads (undefined in C/C++)
std::memcpy(&locval, loc, sizeof(u64));
error = (locval & mask5) ^ fv;
error |= is_not_structural_or_whitespace(loc[5]);
return error == 0;
}
WARN_UNUSED
really_inline bool is_valid_null_atom(const u8 * loc) {
u64 nv = *(const u64 *)"null ";
u64 mask4 = 0x00000000ffffffff;
u32 error = 0;
u64 locval; // we want to avoid unaligned 64-bit loads (undefined in C/C++)
std::memcpy(&locval, loc, sizeof(u64));
error = (locval & mask4) ^ nv;
error |= is_not_structural_or_whitespace(loc[4]);
return error == 0;
really_inline bool is_valid_null_atom(const u8 *loc) {
u64 nv = *(const u64 *)"null ";
u64 mask4 = 0x00000000ffffffff;
u32 error = 0;
u64 locval; // we want to avoid unaligned 64-bit loads (undefined in C/C++)
std::memcpy(&locval, loc, sizeof(u64));
error = (locval & mask4) ^ nv;
error |= is_not_structural_or_whitespace(loc[4]);
return error == 0;
}
// Implemented using Labels as Values which works in GCC and CLANG (and maybe also in Intel's compiler),
// but won't work in MSVC. This would need to be reimplemented differently
// if one wants to be standard compliant.
// Implemented using Labels as Values which works in GCC and CLANG (and maybe
// also in Intel's compiler), but won't work in MSVC. This would need to be
// reimplemented differently if one wants to be standard compliant.
WARN_UNUSED
bool unified_machine(const u8 *buf, size_t len, ParsedJson &pj) {
u32 i = 0; // index of the structural character (0,1,2,3...)
u32 idx; // location of the structural character in the input (buf)
u8 c; // used to track the (structural) character we are looking at, updated by UPDATE_CHAR macro
u32 depth = 0;//START_DEPTH; // an arbitrary starting depth
//void * ret_address[MAX_DEPTH]; // used to store "labels as value" (non-standard compiler extension)
// a call site is the start of either an object or an array ('[' or '{')
// this is the location of the previous call site
// (in the tape, at the given depth);
// we only need one.
// We should also track the tape address of our containing
// scope for two reasons. First, we will need to put an
// up pointer there at each call site so we can navigate
// upwards. Second, when we encounter the end of the scope
// we can put the current offset into a record for the
// scope so we know where it is
//u32 containing_scope_offset[MAX_DEPTH];
pj.init();
// add a sentinel to the end to avoid premature exit
// need to be able to find the \0 at the 'padded length' end of the buffer
// FIXME: TERRIFYING!
//size_t j;
//for (j = len; buf[j] != 0; j++)
// ;
//pj.structural_indexes[pj.n_structural_indexes++] = j;
u32 i = 0; // index of the structural character (0,1,2,3...)
u32 idx; // location of the structural character in the input (buf)
u8 c; // used to track the (structural) character we are looking at, updated
// by UPDATE_CHAR macro
u32 depth = 0; // could have an arbitrary starting depth
pj.init();
// this macro reads the next structural character, updating idx, i and c.
#define UPDATE_CHAR() { idx = pj.structural_indexes[i++]; c = buf[idx]; DEBUG_PRINTF("Got %c at %d (%d offset)\n", c, idx, i-1);}
#define UPDATE_CHAR() \
{ \
idx = pj.structural_indexes[i++]; \
c = buf[idx]; \
DEBUG_PRINTF("Got %c at %d (%d offset) (depth %d)\n", c, idx, i - 1, \
depth); \
}
////////////////////////////// START STATE /////////////////////////////
printf("at start\n");
DEBUG_PRINTF("at start\n");
pj.ret_address[depth] = &&start_continue;
pj.containing_scope_offset[depth] = pj.get_current_loc();
pj.write_tape(0, 'r'); // r for root, 0 is going to get overwritten
depth++;// everything starts at depth = 1, depth = 0 is just for the root
if(depth > pj.depthcapacity) {
goto fail;
}
printf("got char %c \n",c);
UPDATE_CHAR();
switch (c) {
case '{': goto object_begin;
case '[': goto array_begin;
////////////////////////////// START STATE /////////////////////////////
DEBUG_PRINTF("at start\n");
pj.ret_address[depth] = &&start_continue;
pj.containing_scope_offset[depth] = pj.get_current_loc();
pj.write_tape(0, 'r'); // r for root, 0 is going to get overwritten
depth++; // everything starts at depth = 1, depth = 0 is just for the root
if (depth > pj.depthcapacity) {
goto fail;
}
UPDATE_CHAR();
switch (c) {
case '{':
goto object_begin;
case '[':
goto array_begin;
#define SIMDJSON_ALLOWANYTHINGINROOT
// A JSON text is a serialized value. Note that certain previous
// specifications of JSON constrained a JSON text to be an object or an
// array. Implementations that generate only objects or arrays where a
// JSON text is called for will be interoperable in the sense that all
// implementations will accept these as conforming JSON texts.
// https://tools.ietf.org/html/rfc8259
// A JSON text is a serialized value. Note that certain previous
// specifications of JSON constrained a JSON text to be an object or an
// array. Implementations that generate only objects or arrays where a
// JSON text is called for will be interoperable in the sense that all
// implementations will accept these as conforming JSON texts.
// https://tools.ietf.org/html/rfc8259
#ifdef SIMDJSON_ALLOWANYTHINGINROOT
case '"': {
if (!parse_string(buf, len, pj, depth, idx)) {
goto fail;
}
goto start_continue;
}
case 't':
if (!is_valid_true_atom(buf + idx)) {
goto fail;
}
pj.write_tape(0, c);
goto start_continue;
case 'f':
if (!is_valid_false_atom(buf + idx)) {
goto fail;
}
pj.write_tape(0, c);
goto start_continue;
case 'n':
if (!is_valid_null_atom(buf + idx)) {
goto fail;
}
pj.write_tape(0, c);
goto start_continue;
case '0': {
if (!parse_number(buf, len, pj, depth, idx, true, false)) {
goto fail;
}
goto start_continue;
}
case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': {
if (!parse_number(buf, len, pj, depth, idx, false, false)) {
goto fail;
}
goto start_continue;
}
case '-': {
if (!parse_number(buf, len, pj, depth, idx, false, true)) {
goto fail;
}
goto start_continue;
}
#endif // ALLOWANYTHINGINROOT
default: goto fail;
case '"': {
if (!parse_string(buf, len, pj, depth, idx)) {
goto fail;
}
break;
}
case 't':
if (!is_valid_true_atom(buf + idx)) {
goto fail;
}
pj.write_tape(0, c);
break;
case 'f':
if (!is_valid_false_atom(buf + idx)) {
goto fail;
}
pj.write_tape(0, c);
break;
case 'n':
if (!is_valid_null_atom(buf + idx)) {
goto fail;
}
pj.write_tape(0, c);
break;
case '0': {
if (!parse_number(buf, len, pj, depth, idx, true, false)) {
goto fail;
}
break;
}
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9': {
if (!parse_number(buf, len, pj, depth, idx, false, false)) {
goto fail;
}
break;
}
case '-': {
if (!parse_number(buf, len, pj, depth, idx, false, true)) {
goto fail;
}
break;
}
#endif // ALLOWANYTHINGINROOT
default:
goto fail;
}
#ifdef SIMDJSON_ALLOWANYTHINGINROOT
depth--; // for fall-through cases (e.g., documents containing just a string)
#endif // ALLOWANYTHINGINROOT
start_continue:
DEBUG_PRINTF("in start_object_close\n");
UPDATE_CHAR();
switch (c) {
case 0: goto succeed;
default: goto fail;
}
DEBUG_PRINTF("in start_object_close\n");
UPDATE_CHAR();
switch (c) {
case 0:
goto succeed;
default:
goto fail;
}
////////////////////////////// OBJECT STATES /////////////////////////////
////////////////////////////// OBJECT STATES /////////////////////////////
object_begin:
printf("in object_begin %c \n",c);
DEBUG_PRINTF("in object_begin\n");
pj.containing_scope_offset[depth] = pj.get_current_loc();
pj.write_tape(0, c);
depth ++;
if(depth > pj.depthcapacity) {
goto fail;
}
UPDATE_CHAR();
switch (c) {
case '"': {
if (!parse_string(buf, len, pj, depth, idx)) {
goto fail;
}
goto object_key_state;
}
case '}': goto scope_end;
default: goto fail;
DEBUG_PRINTF("in object_begin\n");
pj.containing_scope_offset[depth] = pj.get_current_loc();
pj.write_tape(0, c);
UPDATE_CHAR();
switch (c) {
case '"': {
if (!parse_string(buf, len, pj, depth, idx)) {
goto fail;
}
goto object_key_state;
}
case '}':
goto scope_end; // could also go to object_continue
default:
goto fail;
}
object_key_state:
printf("in object_key_state %c \n",c);
DEBUG_PRINTF("in object_key_state\n");
UPDATE_CHAR();
if (c != ':') {
goto fail;
}
UPDATE_CHAR();
switch (c) {
case '"': {
if (!parse_string(buf, len, pj, depth, idx)) {
goto fail;
}
break;
}
case 't':
if (!is_valid_true_atom(buf + idx)) {
goto fail;
}
pj.write_tape(0, c);
break;
case 'f':
if (!is_valid_false_atom(buf + idx)) {
goto fail;
}
pj.write_tape(0, c);
break;
case 'n':
if (!is_valid_null_atom(buf + idx)) {
goto fail;
}
pj.write_tape(0, c);
break;
case '0': {
if (!parse_number(buf, len, pj, depth, idx, true, false)) {
goto fail;
}
break;
}
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9': {
if (!parse_number(buf, len, pj, depth, idx, false, false)) {
goto fail;
}
break;
}
case '-': {
if (!parse_number(buf, len, pj, depth, idx, false, true)) {
goto fail;
}
break;
}
case '{': {
// we have not yet encountered } so we need to come back for it
pj.ret_address[depth] = &&object_continue;
// we found an object inside an object, so we need to increment the depth
depth++;
if (depth > pj.depthcapacity) {
goto fail;
}
DEBUG_PRINTF("in object_key_state\n");
UPDATE_CHAR();
if (c != ':') {
goto fail;
}
UPDATE_CHAR();
switch (c) {
case '"': {
if (!parse_string(buf, len, pj, depth, idx)) {
goto fail;
}
break;
}
case 't': if (!is_valid_true_atom(buf + idx)) {
goto fail;
}
pj.write_tape(0, c);
break;
case 'f': if (!is_valid_false_atom(buf + idx)) {
goto fail;
}
pj.write_tape(0, c);
break;
case 'n': if (!is_valid_null_atom(buf + idx)) {
goto fail;
}
pj.write_tape(0, c);
break;
case '0': {
if (!parse_number(buf, len, pj, depth, idx, true, false)) {
goto fail;
}
break;
}
case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': {
if (!parse_number(buf, len, pj, depth, idx, false, false)) {
goto fail;
}
break;
}
case '-': {
if (!parse_number(buf, len, pj, depth, idx, false, true)) {
goto fail;
}
break;
}
case '{': {
pj.ret_address[depth] = &&object_continue;
goto object_begin;
}
case '[': {
pj.ret_address[depth] = &&object_continue;
goto array_begin;
}
default: goto fail;
goto object_begin;
}
case '[': {
// we have not yet encountered } so we need to come back for it
pj.ret_address[depth] = &&object_continue;
// we found an array inside an object, so we need to increment the depth
depth++;
if (depth > pj.depthcapacity) {
goto fail;
}
goto array_begin;
}
default:
goto fail;
}
object_continue:
printf("in object_continue %c \n",c);
DEBUG_PRINTF("in object_continue\n");
DEBUG_PRINTF("in object_continue\n");
UPDATE_CHAR();
switch (c) {
case ',':
UPDATE_CHAR();
switch (c) {
case ',':
UPDATE_CHAR();
if (c != '"') {
goto fail;
} else {
if (!parse_string(buf, len, pj, depth, idx)) {
goto fail;
}
goto object_key_state;
}
case '}': goto scope_end;
default: goto fail;
}
////////////////////////////// COMMON STATE /////////////////////////////
scope_end:
// write our tape location to the header scope
depth--;
pj.write_tape(pj.containing_scope_offset[depth], c);
pj.annotate_previousloc(pj.containing_scope_offset[depth], pj.get_current_loc());
// goto saved_state
goto *pj.ret_address[depth];
////////////////////////////// ARRAY STATES /////////////////////////////
array_begin:
printf("in array_begin %c \n",c);
DEBUG_PRINTF("in array_begin\n");
pj.containing_scope_offset[depth] = pj.get_current_loc();
pj.write_tape(0, c);
depth ++;
if(depth > pj.depthcapacity) {
if (c != '"') {
goto fail;
} else {
if (!parse_string(buf, len, pj, depth, idx)) {
goto fail;
}
goto object_key_state;
}
UPDATE_CHAR();
if (c == ']') {
goto scope_end;
}
case '}':
goto scope_end;
default:
goto fail;
}
////////////////////////////// COMMON STATE /////////////////////////////
scope_end:
// write our tape location to the header scope
depth--;
pj.write_tape(pj.containing_scope_offset[depth], c);
pj.annotate_previousloc(pj.containing_scope_offset[depth],
pj.get_current_loc());
// goto saved_state
goto *pj.ret_address[depth];
////////////////////////////// ARRAY STATES /////////////////////////////
array_begin:
DEBUG_PRINTF("in array_begin\n");
pj.containing_scope_offset[depth] = pj.get_current_loc();
pj.write_tape(0, c);
UPDATE_CHAR();
if (c == ']') {
goto scope_end; // could also go to array_continue
}
main_array_switch:
// we call update char on all paths in, so we can peek at c on the
// on paths that can accept a close square brace (post-, and at start)
switch (c) {
case '"': {
if (!parse_string(buf, len, pj, depth, idx)) {
goto fail;
}
goto array_continue;
}
case 't': if (!is_valid_true_atom(buf + idx)) {
goto fail;
}
pj.write_tape(0, c);
break;
case 'f': if (!is_valid_false_atom(buf + idx)) {
goto fail;
}
pj.write_tape(0, c);
break;
case 'n': if (!is_valid_null_atom(buf + idx)) {
goto fail;
}
pj.write_tape(0, c);
break;
case '0': {
if (!parse_number(buf, len, pj, depth, idx, true, false)) {
goto fail;
}
break;
}
case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': {
if (!parse_number(buf, len, pj, depth, idx, false, false)) {
goto fail;
}
break;
}
case '-': {
if (!parse_number(buf, len, pj, depth, idx, false, true)) {
goto fail;
}
break;
}
case '{': {
pj.ret_address[depth] = &&array_continue;
goto object_begin;
}
case '[': {
pj.ret_address[depth] = &&array_continue;
goto array_begin;
}
default: goto fail;
// we call update char on all paths in, so we can peek at c on the
// paths that can accept a close square brace (post-, and at start)
switch (c) {
case '"': {
if (!parse_string(buf, len, pj, depth, idx)) {
goto fail;
}
break;
}
case 't':
if (!is_valid_true_atom(buf + idx)) {
goto fail;
}
pj.write_tape(0, c);
break;
case 'f':
if (!is_valid_false_atom(buf + idx)) {
goto fail;
}
pj.write_tape(0, c);
break;
case 'n':
if (!is_valid_null_atom(buf + idx)) {
goto fail;
}
pj.write_tape(0, c);
break; // goto array_continue;
case '0': {
if (!parse_number(buf, len, pj, depth, idx, true, false)) {
goto fail;
}
break; // goto array_continue;
}
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9': {
if (!parse_number(buf, len, pj, depth, idx, false, false)) {
goto fail;
}
break; // goto array_continue;
}
case '-': {
if (!parse_number(buf, len, pj, depth, idx, false, true)) {
goto fail;
}
break; // goto array_continue;
}
case '{': {
// we have not yet encountered ] so we need to come back for it
pj.ret_address[depth] = &&array_continue;
// we found an object inside an array, so we need to increment the depth
depth++;
if (depth > pj.depthcapacity) {
goto fail;
}
goto object_begin;
}
case '[': {
// we have not yet encountered ] so we need to come back for it
pj.ret_address[depth] = &&array_continue;
// we found an array inside an array, so we need to increment the depth
depth++;
if (depth > pj.depthcapacity) {
goto fail;
}
goto array_begin;
}
default:
goto fail;
}
array_continue:
printf("in array_begin %c \n",c);
DEBUG_PRINTF("in array_continue\n");
DEBUG_PRINTF("in array_continue\n");
UPDATE_CHAR();
switch (c) {
case ',':
UPDATE_CHAR();
switch (c) {
case ',': UPDATE_CHAR(); goto main_array_switch;
case ']': goto scope_end;
default: goto fail;
}
goto main_array_switch;
case ']':
goto scope_end;
default:
goto fail;
}
////////////////////////////// FINAL STATES /////////////////////////////
////////////////////////////// FINAL STATES /////////////////////////////
succeed:
DEBUG_PRINTF("in succeed\n");
// we annotate the root node
depth--;
// next line allows us to go back to the start
pj.write_tape(pj.containing_scope_offset[depth], 'r');// r is root
// next line tells the root node how to go to the end
pj.annotate_previousloc(pj.containing_scope_offset[depth], pj.get_current_loc());
DEBUG_PRINTF("in succeed, depth = %d \n", depth);
// we annotate the root node
// depth--;
// next line allows us to go back to the start
pj.write_tape(pj.containing_scope_offset[depth], 'r'); // r is root
// next line tells the root node how to go to the end
pj.annotate_previousloc(pj.containing_scope_offset[depth],
pj.get_current_loc());
#ifdef DEBUG
pj.dump_tapes();
pj.dump_tapes();
#endif
return true;
return true;
fail:
DEBUG_PRINTF("in fail\n");
DEBUG_PRINTF("in fail\n");
#ifdef DEBUG
pj.dump_tapes();
pj.dump_tapes();
#endif
return false;
return false;
}

View File

@ -1,3 +1,4 @@
#include <unistd.h>
#include "jsonparser/jsonparser.h"
@ -30,7 +31,6 @@ void on_json_error( void *, const fastjson::ErrorContext& ec) {
// Runs fastjson::dom::parse_string on `input`, using a local Token and Chunk
// as the parse targets and on_json_error as the error callback, and returns
// whatever parse_string returns. The Token and Chunk are locals, so nothing
// parsed is handed back to the caller.
bool fastjson_parse(const char *input) {
fastjson::Token token;
fastjson::dom::Chunk chunk;
std::string error_message; // NOTE(review): never read or written in this function
return fastjson::dom::parse_string(input, &token, &chunk, 0, &on_json_error, NULL);
}
// end of fastjson stuff
@ -41,17 +41,30 @@ using namespace rapidjson;
using namespace std;
int main(int argc, char *argv[]) {
if (argc < 2) {
bool verbose = false;
int c;
while ((c = getopt (argc, argv, "v")) != -1)
switch (c)
{
case 'v':
verbose = true;
break;
default:
abort ();
}
if (optind >= argc) {
cerr << "Usage: " << argv[0] << " <jsonfile>\n";
cerr << "Or " << argv[0] << " -v <jsonfile>\n";
exit(1);
}
bool verbose = false;
if (argc > 2) {
if (strcmp(argv[1], "-v"))
verbose = true;
const char * filename = argv[optind];
std::pair<u8 *, size_t> p;
try {
p = get_corpus(filename);
} catch (const std::exception& e) { // caught by reference to base
std::cout << "Could not load the file " << filename << std::endl;
return EXIT_FAILURE;
}
pair<u8 *, size_t> p = get_corpus(argv[argc - 1]);
if (verbose) {
std::cout << "Input has ";
if (p.second > 1024 * 1024)

View File

@ -5,6 +5,7 @@
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include "jsonparser/jsonparser.h"
@ -41,6 +42,8 @@ bool validate(const char *dirname) {
printf("nothing in dir %s \n", dirname);
return false;
}
bool * isfileasexpected = new bool[c];
for(int i = 0; i < c; i++) isfileasexpected[i] = true;
size_t howmany = 0;
bool needsep = (strlen(dirname) > 1) && (dirname[strlen(dirname) - 1] != '/');
for (int i = 0; i < c; i++) {
@ -56,7 +59,13 @@ bool validate(const char *dirname) {
} else {
strcpy(fullpath + dirlen, name);
}
std::pair<u8 *, size_t> p = get_corpus(fullpath);
std::pair<u8 *, size_t> p;
try {
p = get_corpus(fullpath);
} catch (const std::exception& e) {
std::cout << "Could not load the file " << fullpath << std::endl;
return EXIT_FAILURE;
}
ParsedJson *pj_ptr = allocate_ParsedJson(p.second, 1024);
if(pj_ptr == NULL) {
std::cerr<< "can't allocate memory"<<std::endl;
@ -70,11 +79,13 @@ bool validate(const char *dirname) {
howmany--;
} else if (startsWith("pass", name)) {
if (!isok) {
isfileasexpected[i] = false;
printf("warning: file %s should pass but it fails.\n", name);
everythingfine = false;
}
} else if (startsWith("fail", name)) {
if (isok) {
isfileasexpected[i] = false;
printf("warning: file %s should fail but it passes.\n", name);
everythingfine = false;
}
@ -87,11 +98,20 @@ bool validate(const char *dirname) {
deallocate_ParsedJson(pj_ptr);
}
}
printf("%zu files checked.\n", howmany);
if(everythingfine) {
printf("All ok!\n");
} else {
printf("There were problems! Consider reviewing the following files:\n");
for(int i = 0; i < c; i++) {
if(!isfileasexpected[i]) printf("%s \n", entry_list[i]->d_name);
}
}
for (int i = 0; i < c; ++i)
free(entry_list[i]);
free(entry_list);
printf("%zu files checked.\n", howmany);
if(everythingfine) printf("All ok!\n");
delete[] isfileasexpected;
return everythingfine;
}

View File

@ -28,7 +28,7 @@ bool startsWith(const char *pre, const char *str) {
size_t lenpre = strlen(pre), lenstr = strlen(str);
return lenstr < lenpre ? false : strncmp(pre, str, lenpre) == 0;
}
bool is_in_bad_list(char *buf) {
bool is_in_bad_list(const char *buf) {
for (size_t i = 0; i < sizeof(really_bad) / sizeof(really_bad[0]); i++)
if (startsWith(really_bad[i], buf))
return true;
@ -38,9 +38,9 @@ bool is_in_bad_list(char *buf) {
inline void foundInvalidNumber(const u8 *buf) {
invalid_count++;
char *endptr;
double expected = strtod((char *)buf, &endptr);
if (endptr != (char *)buf) {
if (!is_in_bad_list((char *)buf)) {
double expected = strtod((const char *)buf, &endptr);
if (endptr != (const char *)buf) {
if (!is_in_bad_list((const char *)buf)) {
printf(
"Warning: foundInvalidNumber %.32s whereas strtod parses it to %f, ",
buf, expected);
@ -53,8 +53,8 @@ inline void foundInvalidNumber(const u8 *buf) {
inline void foundInteger(int64_t result, const u8 *buf) {
int_count++;
char *endptr;
long long expected = strtoll((char *)buf, &endptr, 10);
if ((endptr == (char *)buf) || (expected != result)) {
long long expected = strtoll((const char *)buf, &endptr, 10);
if ((endptr == (const char *)buf) || (expected != result)) {
printf("Error: parsed %" PRId64 " out of %.32s, ", result, buf);
printf(" while parsing %s \n", fullpath);
parse_error |= PARSE_ERROR;
@ -64,8 +64,8 @@ inline void foundInteger(int64_t result, const u8 *buf) {
inline void foundFloat(double result, const u8 *buf) {
char *endptr;
float_count++;
double expected = strtod((char *)buf, &endptr);
if (endptr == (char *)buf) {
double expected = strtod((const char *)buf, &endptr);
if (endptr == (const char *)buf) {
printf("parsed %f from %.32s whereas strtod refuses to parse a float, ",
result, buf);
printf(" while parsing %s \n", fullpath);
@ -123,7 +123,13 @@ bool validate(const char *dirname) {
} else {
strcpy(fullpath + dirlen, name);
}
std::pair<u8 *, size_t> p = get_corpus(fullpath);
std::pair<u8 *, size_t> p;
try {
p = get_corpus(fullpath);
} catch (const std::exception& e) {
std::cout << "Could not load the file " << fullpath << std::endl;
return EXIT_FAILURE;
}
// terrible hack but just to get it working
ParsedJson *pj_ptr = allocate_ParsedJson(p.second, 1024);
if (pj_ptr == NULL) {

View File

@ -241,7 +241,7 @@ inline void foundString(const u8 *buf, const u8 *parsed_begin,
// we have a zero-length string
if (parsed_begin != parsed_end) {
printf("WARNING: We have a zero-length but gap is %zu \n",
parsed_end - parsed_begin);
(size_t)(parsed_end - parsed_begin));
probable_bug = true;
}
empty_string++;
@ -252,12 +252,12 @@ inline void foundString(const u8 *buf, const u8 *parsed_begin,
printf("WARNING: lengths on parsed strings disagree %zu %zu \n", thislen,
len);
printf("\nour parsed string : '%*s'\n\n", (int)thislen,
(char *)parsed_begin);
print_hex((char *)parsed_begin, thislen);
(const char *)parsed_begin);
print_hex((const char *)parsed_begin, thislen);
printf("\n");
printf("reference parsing :'%*s'\n\n", (int)len, bigbuffer);
print_hex((char *)bigbuffer, len);
print_hex((const char *)bigbuffer, len);
printf("\n");
probable_bug = true;
@ -267,15 +267,15 @@ inline void foundString(const u8 *buf, const u8 *parsed_begin,
printf("Lengths %zu %zu \n", thislen, len);
printf("\nour parsed string : '%*s'\n", (int)thislen,
(char *)parsed_begin);
print_hex((char *)parsed_begin, thislen);
(const char *)parsed_begin);
print_hex((const char *)parsed_begin, thislen);
printf("\n");
printf("reference parsing :'%*s'\n", (int)len, bigbuffer);
print_hex((char *)bigbuffer, len);
print_hex((const char *)bigbuffer, len);
printf("\n");
print_cmp_hex((char *)parsed_begin, bigbuffer, thislen);
print_cmp_hex((const char *)parsed_begin, bigbuffer, thislen);
probable_bug = true;
}
@ -325,8 +325,13 @@ bool validate(const char *dirname) {
} else {
strcpy(fullpath + dirlen, name);
}
std::pair<u8 *, size_t> p = get_corpus(fullpath);
// terrible hack but just to get it working
std::pair<u8 *, size_t> p;
try {
p = get_corpus(fullpath);
} catch (const std::exception& e) {
std::cout << "Could not load the file " << fullpath << std::endl;
return EXIT_FAILURE;
}
ParsedJson *pj_ptr = allocate_ParsedJson(p.second, 1024);
if (pj_ptr == NULL) {
std::cerr << "can't allocate memory" << std::endl;