From a43b0772e17212dc70943373748b27d73408ac62 Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Tue, 27 Nov 2018 14:37:59 -0500 Subject: [PATCH] Lots and lots of cleaning. --- Makefile | 2 +- benchmark/linux/linux-perf-events.h | 2 +- benchmark/minifiercompetition.cpp | 30 +- benchmark/parse.cpp | 86 +-- benchmark/parsingcompetition.cpp | 10 +- include/jsonparser/jsonioutil.h | 7 + include/jsonparser/numberparsing.h | 13 +- include/jsonparser/simdjson_internal.h | 20 +- include/jsonparser/stringparsing.h | 2 +- src/jsonminifier.cpp | 4 +- src/stage34_unified.cpp | 708 +++++++++++++------------ tests/allparserscheckfile.cpp | 27 +- tests/jsoncheck.cpp | 26 +- tests/numberparsingcheck.cpp | 24 +- tests/stringparsingcheck.cpp | 25 +- 15 files changed, 521 insertions(+), 465 deletions(-) diff --git a/Makefile b/Makefile index e6ceba29..166a845c 100644 --- a/Makefile +++ b/Makefile @@ -7,7 +7,7 @@ .PHONY: clean cleandist DEPSINCLUDE = -Idependencies/rapidjson/include -Idependencies/sajson/include -Idependencies/json11 -Idependencies/fastjson/src -Idependencies/fastjson/include -Idependencies/gason/src -Idependencies/ujson4c/3rdparty -Idependencies/ujson4c/src -CXXFLAGS = -std=c++11 -march=native -Wall -Wextra -Wshadow -Iinclude -Ibenchmark/linux $(DEPSINCLUDE) +CXXFLAGS = -std=c++11 -march=native -Wall -Wextra -Wshadow -Iinclude -Ibenchmark/linux $(DEPSINCLUDE) CFLAGS = -march=native -Idependencies/ujson4c/3rdparty -Idependencies/ujson4c/src ifeq ($(SANITIZE),1) CXXFLAGS += -g3 -O0 -fsanitize=address -fno-omit-frame-pointer -fsanitize=undefined diff --git a/benchmark/linux/linux-perf-events.h b/benchmark/linux/linux-perf-events.h index 49346263..55a1a707 100644 --- a/benchmark/linux/linux-perf-events.h +++ b/benchmark/linux/linux-perf-events.h @@ -21,7 +21,7 @@ template class LinuxEvents { std::vector ids; public: - LinuxEvents(std::vector config_vec) : fd(0) { + explicit LinuxEvents(std::vector config_vec) : fd(0) { memset(&attribs, 0, sizeof(attribs)); attribs.type = TYPE; attribs.size = sizeof(attribs); diff --git a/benchmark/minifiercompetition.cpp b/benchmark/minifiercompetition.cpp index 441f4c50..d0a4a0f7 100644 --- a/benchmark/minifiercompetition.cpp +++ b/benchmark/minifiercompetition.cpp @@ -1,3 +1,4 @@ +#include #include #include "benchmark.h" @@ -13,6 +14,7 @@ #include "rapidjson/writer.h" #include "sajson.h" + using namespace rapidjson; using namespace std; @@ -43,17 +45,29 @@ std::string rapidstringme(char *json) { } int main(int argc, char *argv[]) { - if (argc < 2) { - cerr << "Usage: " << argv[0] << " \n"; - cerr << "Or " << argv[0] << " -v \n"; + int c; + bool verbose = false; + while ((c = getopt (argc, argv, "v")) != -1) + switch (c) + { + case 'v': + verbose = true; + break; + default: + abort (); + } + if (optind >= argc) { + cerr << "Usage: " << argv[0] << " " << endl; exit(1); } - bool verbose = false; - if (argc > 2) { - if (strcmp(argv[1], "-v")) - verbose = true; + const char * filename = argv[optind]; + pair p; + try { + p = get_corpus(filename); + } catch (const std::exception& e) { // caught by reference to base + std::cout << "Could not load the file " << filename << std::endl; + return EXIT_FAILURE; } - pair p = get_corpus(argv[argc - 1]); if (verbose) { std::cout << "Input has "; if (p.second > 1024 * 1024) diff --git a/benchmark/parse.cpp b/benchmark/parse.cpp index d874bbfd..2f61572f 100644 --- a/benchmark/parse.cpp +++ b/benchmark/parse.cpp @@ -31,79 +31,14 @@ #include "jsonparser/stage34_unified.h" using namespace std; -// https://stackoverflow.com/questions/2616906/how-do-i-output-coloured-text-to-a-linux-terminal -namespace Color { -enum Code { - FG_DEFAULT = 39, - FG_BLACK = 30, - FG_RED = 31, - FG_GREEN = 32, - FG_YELLOW = 33, - FG_BLUE = 34, - FG_MAGENTA = 35, - FG_CYAN = 36, - FG_LIGHT_GRAY = 37, - FG_DARK_GRAY = 90, - FG_LIGHT_RED = 91, - FG_LIGHT_GREEN = 92, - FG_LIGHT_YELLOW = 93, - FG_LIGHT_BLUE = 94, - FG_LIGHT_MAGENTA = 95, - FG_LIGHT_CYAN = 96, - FG_WHITE = 97, - BG_RED = 41, - BG_GREEN = 42, - BG_BLUE = 44, - BG_DEFAULT = 49 -}; -class Modifier { - Code code; - -public: - Modifier(Code pCode) : code(pCode) {} - friend std::ostream &operator<<(std::ostream &os, const Modifier &mod) { - return os << "\033[" << mod.code << "m"; - } -}; -} // namespace Color - -void colorfuldisplay(ParsedJson &pj, const u8 *buf) { - Color::Modifier greenfg(Color::FG_GREEN); - Color::Modifier yellowfg(Color::FG_YELLOW); - Color::Modifier deffg(Color::FG_DEFAULT); - size_t i = 0; - // skip initial fluff - while ((i + 1 < pj.n_structural_indexes) && - (pj.structural_indexes[i] == pj.structural_indexes[i + 1])) { - i++; - } - for (; i < pj.n_structural_indexes; i++) { - u32 idx = pj.structural_indexes[i]; - u8 c = buf[idx]; - if (((c & 0xdf) == 0x5b)) { // meaning 7b or 5b, { or [ - std::cout << greenfg << buf[idx] << deffg; - } else if (((c & 0xdf) == 0x5d)) { // meaning 7d or 5d, } or ] - std::cout << greenfg << buf[idx] << deffg; - } else { - std::cout << yellowfg << buf[idx] << deffg; - } - if (i + 1 < pj.n_structural_indexes) { - u32 nextidx = pj.structural_indexes[i + 1]; - for (u32 pos = idx + 1; pos < nextidx; pos++) { - std::cout << buf[pos]; - } - } - } - std::cout << std::endl; -} - int main(int argc, char *argv[]) { bool verbose = false; bool dump = false; + bool forceoneiteration = false; int c; - while ((c = getopt (argc, argv, "vd")) != -1) + while ((c = getopt (argc, argv, "1vd")) != -1) switch (c) { case 'v': @@ -112,6 +47,9 @@ int main(int argc, char *argv[]) { case 'd': dump = true; break; + case '1': + forceoneiteration = true; + break; default: abort (); } @@ -124,7 +62,13 @@ int main(int argc, char *argv[]) { cerr << "warning: ignoring everything after " << argv[optind + 1] << endl; } if(verbose) cout << "[verbose] loading " << filename << endl; - pair p = get_corpus(filename); + pair p; + try { + p = get_corpus(filename); + } catch (const std::exception& e) { // caught by reference to base + std::cout << "Could not load the file " << filename << std::endl; + return EXIT_FAILURE; + } if(verbose) cout << "[verbose] loaded " << filename << " ("<< p.second << " bytes)" << endl; ParsedJson *pj_ptr = allocate_ParsedJson(p.second, 1024); ParsedJson &pj(*pj_ptr); @@ -133,7 +77,7 @@ int main(int argc, char *argv[]) { #if defined(DEBUG) const u32 iterations = 1; #else - const u32 iterations = p.second < 1 * 1000 * 1000? 1000 : 10; + const u32 iterations = forceoneiteration ? 1 : ( p.second < 1 * 1000 * 1000? 1000 : 10); #endif vector res; res.resize(iterations); @@ -174,7 +118,7 @@ int main(int argc, char *argv[]) { } unified.start(); #endif - isok = flatten_indexes(p.second, pj); + isok = isok && flatten_indexes(p.second, pj); #ifndef SQUASH_COUNTERS unified.end(results); cy2 += results[0]; @@ -187,7 +131,7 @@ int main(int argc, char *argv[]) { unified.start(); #endif - isok = unified_machine(p.first, p.second, pj); + isok = isok && unified_machine(p.first, p.second, pj); #ifndef SQUASH_COUNTERS unified.end(results); cy3 += results[0]; diff --git a/benchmark/parsingcompetition.cpp b/benchmark/parsingcompetition.cpp index c9922224..3dd2efdf 100644 --- a/benchmark/parsingcompetition.cpp +++ b/benchmark/parsingcompetition.cpp @@ -31,7 +31,6 @@ void on_json_error( void *, const fastjson::ErrorContext& ec) { bool fastjson_parse(const char *input) { fastjson::Token token; fastjson::dom::Chunk chunk; - std::string error_message; return fastjson::dom::parse_string(input, &token, &chunk, 0, &on_json_error, NULL); } // end of fastjson stuff @@ -62,7 +61,14 @@ int main(int argc, char *argv[]) { if(optind + 1 < argc) { cerr << "warning: ignoring everything after " << argv[optind + 1] << endl; } - pair p = get_corpus(filename); + pair p; + try { + p = get_corpus(filename); + } catch (const std::exception& e) { // caught by reference to base + std::cout << "Could not load the file " << filename << std::endl; + return EXIT_FAILURE; + } + if (verbose) { std::cout << "Input has "; if (p.second > 1024 * 1024) diff --git a/include/jsonparser/jsonioutil.h b/include/jsonparser/jsonioutil.h index 5ea01711..62289fba 100644 --- a/include/jsonparser/jsonioutil.h +++ b/include/jsonparser/jsonioutil.h @@ -20,6 +20,13 @@ char * allocate_aligned_buffer(size_t length); // first element of the pair is a string (null terminated) // whereas the second element is the length. // caller is responsible to free (free std::pair.first) +// +// throws an exception if the file cannot be opened, use try/catch +// try { +// p = get_corpus(filename); +// } catch (const std::exception& e) { +// std::cout << "Could not load the file " << filename << std::endl; +// } std::pair get_corpus(std::string filename); #endif diff --git a/include/jsonparser/numberparsing.h b/include/jsonparser/numberparsing.h index 39d71bae..ec58a022 100644 --- a/include/jsonparser/numberparsing.h +++ b/include/jsonparser/numberparsing.h @@ -128,7 +128,7 @@ static inline uint32_t parse_eight_digits_unrolled(const char *chars) { const __m128i mul_1_100 = _mm_setr_epi16(100, 1, 100, 1, 100, 1, 100, 1); const __m128i mul_1_10000 = _mm_setr_epi16(10000, 1, 10000, 1, 10000, 1, 10000, 1); - const __m128i input = _mm_sub_epi8(_mm_loadu_si128((__m128i *)chars), ascii0); + const __m128i input = _mm_sub_epi8(_mm_loadu_si128((const __m128i *)chars), ascii0); const __m128i t1 = _mm_maddubs_epi16(input, mul_1_10); const __m128i t2 = _mm_madd_epi16(t1, mul_1_100); const __m128i t3 = _mm_packus_epi32(t2, t2); @@ -149,7 +149,7 @@ static inline uint32_t parse_eight_digits_unrolled(const char *chars) { // static never_inline bool parse_highprecision_float(const u8 *const buf, UNUSED size_t len, - ParsedJson &pj, const u32 depth, const u32 offset, + ParsedJson &pj, UNUSED const u32 depth, const u32 offset, UNUSED bool found_zero, bool found_minus) { const char *p = (const char *)(buf + offset); @@ -193,7 +193,6 @@ parse_highprecision_float(const u8 *const buf, UNUSED size_t len, } exponent = firstafterperiod - p; } - int64_t expnumber = 0; // exponential part if (('e' == *p) || ('E' == *p)) { ++p; bool negexp = false; @@ -210,7 +209,7 @@ parse_highprecision_float(const u8 *const buf, UNUSED size_t len, return false; } unsigned char digit = *p - '0'; - expnumber = digit; + int64_t expnumber = digit; // exponential part p++; if (is_integer(*p)) { digit = *p - '0'; @@ -270,7 +269,7 @@ parse_highprecision_float(const u8 *const buf, UNUSED size_t len, // static never_inline bool parse_large_integer(const u8 *const buf, UNUSED size_t len, ParsedJson &pj, - const u32 depth, const u32 offset, + UNUSED const u32 depth, const u32 offset, UNUSED bool found_zero, bool found_minus) { const char *p = (const char *)(buf + offset); @@ -340,10 +339,12 @@ static never_inline bool parse_large_integer(const u8 *const buf, #define unlikely(x) __builtin_expect(!!(x), 0) #endif + + // parse the number at buf + offset // define JSON_TEST_NUMBERS for unit testing static really_inline bool parse_number(const u8 *const buf, UNUSED size_t len, - ParsedJson &pj, const u32 depth, + ParsedJson &pj, UNUSED const u32 depth, const u32 offset, UNUSED bool found_zero, bool found_minus) { const char *p = (const char *)(buf + offset); diff --git a/include/jsonparser/simdjson_internal.h b/include/jsonparser/simdjson_internal.h index d8af4465..60558761 100644 --- a/include/jsonparser/simdjson_internal.h +++ b/include/jsonparser/simdjson_internal.h @@ -105,14 +105,14 @@ public: void write_tape_s64(s64 i) { - *((s64 *)current_number_buf_loc) = i; - current_number_buf_loc += 8; + *((s64 *)current_number_buf_loc) = i;// safe because array will be 8-byte aligned, could use memcpy + current_number_buf_loc += sizeof(s64); write_tape(current_number_buf_loc - number_buf, 'l'); } void write_tape_double(double d) { - *((double *)current_number_buf_loc) = d; - current_number_buf_loc += 8; + *((double *)current_number_buf_loc) = d;// safe because array will be 8-byte aligned, could use memcpy + current_number_buf_loc += sizeof(double); write_tape(current_number_buf_loc - number_buf, 'd'); } @@ -137,7 +137,7 @@ public: u32 scope_header; // the start of our current scope that contains our current location u32 location; // our current location on a tape - ParsedJsonHandle(ParsedJson & pj_) : pj(pj_), depth(0), scope_header(0), location(0) {} + explicit ParsedJsonHandle(ParsedJson & pj_) : pj(pj_), depth(0), scope_header(0), location(0) {} // OK with default copy constructor as the way to clone the POD structure // some placeholder navigation. Will convert over to a more native C++-ish way of doing @@ -167,7 +167,7 @@ public: #ifdef DEBUG -inline void dump256(m256 d, std::string msg) { +inline void dump256(m256 d, const std::string msg) { for (u32 i = 0; i < 32; i++) { std::cout << std::setw(3) << (int)*(((u8 *)(&d)) + i); if (!((i + 1) % 8)) @@ -181,14 +181,14 @@ inline void dump256(m256 d, std::string msg) { } // dump bits low to high -inline void dumpbits(u64 v, std::string msg) { +inline void dumpbits(u64 v, const std::string msg) { for (u32 i = 0; i < 64; i++) { std::cout << (((v >> (u64)i) & 0x1ULL) ? "1" : "_"); } std::cout << " " << msg << "\n"; } -inline void dumpbits32(u32 v, std::string msg) { +inline void dumpbits32(u32 v, const std::string msg) { for (u32 i = 0; i < 32; i++) { std::cout << (((v >> (u32)i) & 0x1ULL) ? "1" : "_"); } @@ -201,14 +201,14 @@ inline void dumpbits32(u32 v, std::string msg) { #endif // dump bits low to high -inline void dumpbits_always(u64 v, std::string msg) { +inline void dumpbits_always(u64 v, const std::string msg) { for (u32 i = 0; i < 64; i++) { std::cout << (((v >> (u64)i) & 0x1ULL) ? "1" : "_"); } std::cout << " " << msg << "\n"; } -inline void dumpbits32_always(u32 v, std::string msg) { +inline void dumpbits32_always(u32 v, const std::string msg) { for (u32 i = 0; i < 32; i++) { std::cout << (((v >> (u32)i) & 0x1ULL) ? "1" : "_"); } diff --git a/include/jsonparser/stringparsing.h b/include/jsonparser/stringparsing.h index 093f9617..ede34b67 100644 --- a/include/jsonparser/stringparsing.h +++ b/include/jsonparser/stringparsing.h @@ -58,7 +58,7 @@ really_inline bool handle_unicode_codepoint(const u8 **src_ptr, u8 **dst_ptr) { } really_inline bool parse_string(const u8 *buf, UNUSED size_t len, - ParsedJson &pj, u32 depth, u32 offset) { + ParsedJson &pj, UNUSED const u32 depth, u32 offset) { using namespace std; const u8 *src = &buf[offset + 1]; // we know that buf at offset is a " u8 *dst = pj.current_string_buf_loc; diff --git a/src/jsonminifier.cpp b/src/jsonminifier.cpp index d2165758..8642f9dd 100644 --- a/src/jsonminifier.cpp +++ b/src/jsonminifier.cpp @@ -137,7 +137,7 @@ size_t jsonminify(const uint8_t *buf, size_t len, uint8_t *out) { uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128( _mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFF), 0)); quote_mask ^= prev_iter_inside_quote; - prev_iter_inside_quote = (uint64_t)((int64_t)quote_mask >> 63); + prev_iter_inside_quote = (uint64_t)((int64_t)quote_mask >> 63);// might be undefined behavior const __m256i low_nibble_mask = _mm256_setr_epi8( // 0 9 a b c d 16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0, 16, 0, 0, 0, 0, 0, @@ -220,7 +220,7 @@ size_t jsonminify(const uint8_t *buf, size_t len, uint8_t *out) { uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128( _mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFF), 0)); quote_mask ^= prev_iter_inside_quote; - prev_iter_inside_quote = (uint64_t)((int64_t)quote_mask >> 63); + // prev_iter_inside_quote = (uint64_t)((int64_t)quote_mask >> 63);// we don't need this anymore __m256i mask_20 = _mm256_set1_epi8(0x20); // c==32 __m256i mask_70 = diff --git a/src/stage34_unified.cpp b/src/stage34_unified.cpp index 976c76e8..2699d1d2 100644 --- a/src/stage34_unified.cpp +++ b/src/stage34_unified.cpp @@ -10,9 +10,9 @@ #include #include "jsonparser/common_defs.h" -#include "jsonparser/simdjson_internal.h" #include "jsonparser/jsoncharutils.h" #include "jsonparser/numberparsing.h" +#include "jsonparser/simdjson_internal.h" #include "jsonparser/stringparsing.h" #include @@ -20,390 +20,430 @@ #define PATH_SEP '/' #if defined(DEBUG) && !defined(DEBUG_PRINTF) -#include #include -#define DEBUG_PRINTF(format, ...) printf("%s:%s:%d:" format, \ - strrchr(__FILE__, PATH_SEP) + 1, \ - __func__, __LINE__, ## __VA_ARGS__) +#include +#define DEBUG_PRINTF(format, ...) \ + printf("%s:%s:%d:" format, strrchr(__FILE__, PATH_SEP) + 1, __func__, \ + __LINE__, ##__VA_ARGS__) #elif !defined(DEBUG_PRINTF) -#define DEBUG_PRINTF(format, ...) do { } while(0) +#define DEBUG_PRINTF(format, ...) \ + do { \ + } while (0) #endif using namespace std; WARN_UNUSED -really_inline bool is_valid_true_atom(const u8 * loc) { - u64 tv = *(const u64 *)"true "; - u64 mask4 = 0x00000000ffffffff; - u32 error = 0; - u64 locval; // we want to avoid unaligned 64-bit loads (undefined in C/C++) - std::memcpy(&locval, loc, sizeof(u64)); - error = (locval & mask4) ^ tv; - error |= is_not_structural_or_whitespace(loc[4]); - return error == 0; +really_inline bool is_valid_true_atom(const u8 *loc) { + u64 tv = *(const u64 *)"true "; + u64 mask4 = 0x00000000ffffffff; + u32 error = 0; + u64 locval; // we want to avoid unaligned 64-bit loads (undefined in C/C++) + std::memcpy(&locval, loc, sizeof(u64)); + error = (locval & mask4) ^ tv; + error |= is_not_structural_or_whitespace(loc[4]); + return error == 0; } WARN_UNUSED -really_inline bool is_valid_false_atom(const u8 * loc) { - u64 fv = *(const u64 *)"false "; - u64 mask5 = 0x000000ffffffffff; - u32 error = 0; - u64 locval; // we want to avoid unaligned 64-bit loads (undefined in C/C++) - std::memcpy(&locval, loc, sizeof(u64)); - error = (locval & mask5) ^ fv; - error |= is_not_structural_or_whitespace(loc[5]); - return error == 0; +really_inline bool is_valid_false_atom(const u8 *loc) { + u64 fv = *(const u64 *)"false "; + u64 mask5 = 0x000000ffffffffff; + u32 error = 0; + u64 locval; // we want to avoid unaligned 64-bit loads (undefined in C/C++) + std::memcpy(&locval, loc, sizeof(u64)); + error = (locval & mask5) ^ fv; + error |= is_not_structural_or_whitespace(loc[5]); + return error == 0; } WARN_UNUSED -really_inline bool is_valid_null_atom(const u8 * loc) { - u64 nv = *(const u64 *)"null "; - u64 mask4 = 0x00000000ffffffff; - u32 error = 0; - u64 locval; // we want to avoid unaligned 64-bit loads (undefined in C/C++) - std::memcpy(&locval, loc, sizeof(u64)); - error = (locval & mask4) ^ nv; - error |= is_not_structural_or_whitespace(loc[4]); - return error == 0; +really_inline bool is_valid_null_atom(const u8 *loc) { + u64 nv = *(const u64 *)"null "; + u64 mask4 = 0x00000000ffffffff; + u32 error = 0; + u64 locval; // we want to avoid unaligned 64-bit loads (undefined in C/C++) + std::memcpy(&locval, loc, sizeof(u64)); + error = (locval & mask4) ^ nv; + error |= is_not_structural_or_whitespace(loc[4]); + return error == 0; } -// Implemented using Labels as Values which works in GCC and CLANG (and maybe also in Intel's compiler), -// but won't work in MSVC. This would need to be reimplemented differently -// if one wants to be standard compliant. +// Implemented using Labels as Values which works in GCC and CLANG (and maybe +// also in Intel's compiler), but won't work in MSVC. This would need to be +// reimplemented differently if one wants to be standard compliant. WARN_UNUSED bool unified_machine(const u8 *buf, size_t len, ParsedJson &pj) { - u32 i = 0; // index of the structural character (0,1,2,3...) - u32 idx; // location of the structural character in the input (buf) - u8 c; // used to track the (structural) character we are looking at, updated by UPDATE_CHAR macro - u32 depth = 0;//START_DEPTH; // an arbitrary starting depth - //void * ret_address[MAX_DEPTH]; // used to store "labels as value" (non-standard compiler extension) - - // a call site is the start of either an object or an array ('[' or '{') - // this is the location of the previous call site - // (in the tape, at the given depth); - // we only need one. - - // We should also track the tape address of our containing - // scope for two reasons. First, we will need to put an - // up pointer there at each call site so we can navigate - // upwards. Second, when we encounter the end of the scope - // we can put the current offset into a record for the - // scope so we know where it is - - //u32 containing_scope_offset[MAX_DEPTH]; - - pj.init(); - - // add a sentinel to the end to avoid premature exit - // need to be able to find the \0 at the 'padded length' end of the buffer - // FIXME: TERRIFYING! - //size_t j; - //for (j = len; buf[j] != 0; j++) - // ; - //pj.structural_indexes[pj.n_structural_indexes++] = j; - + u32 i = 0; // index of the structural character (0,1,2,3...) + u32 idx; // location of the structural character in the input (buf) + u8 c; // used to track the (structural) character we are looking at, updated + // by UPDATE_CHAR macro + u32 depth = 0; // could have an arbitrary starting depth + pj.init(); // this macro reads the next structural character, updating idx, i and c. -#define UPDATE_CHAR() { idx = pj.structural_indexes[i++]; c = buf[idx]; DEBUG_PRINTF("Got %c at %d (%d offset)\n", c, idx, i-1);} +#define UPDATE_CHAR() \ + { \ + idx = pj.structural_indexes[i++]; \ + c = buf[idx]; \ + DEBUG_PRINTF("Got %c at %d (%d offset) (depth %d)\n", c, idx, i - 1, \ + depth); \ + } - - - - -////////////////////////////// START STATE ///////////////////////////// -printf("at start\n"); - DEBUG_PRINTF("at start\n"); - pj.ret_address[depth] = &&start_continue; - pj.containing_scope_offset[depth] = pj.get_current_loc(); - pj.write_tape(0, 'r'); // r for root, 0 is going to get overwritten - depth++;// everything starts at depth = 1, depth = 0 is just for the root - if(depth > pj.depthcapacity) { - goto fail; - } - printf("got char %c \n",c); - UPDATE_CHAR(); - switch (c) { - case '{': goto object_begin; - case '[': goto array_begin; + ////////////////////////////// START STATE ///////////////////////////// + DEBUG_PRINTF("at start\n"); + pj.ret_address[depth] = &&start_continue; + pj.containing_scope_offset[depth] = pj.get_current_loc(); + pj.write_tape(0, 'r'); // r for root, 0 is going to get overwritten + depth++; // everything starts at depth = 1, depth = 0 is just for the root + if (depth > pj.depthcapacity) { + goto fail; + } + UPDATE_CHAR(); + switch (c) { + case '{': + goto object_begin; + case '[': + goto array_begin; #define SIMDJSON_ALLOWANYTHINGINROOT - // A JSON text is a serialized value. Note that certain previous - // specifications of JSON constrained a JSON text to be an object or an - // array. Implementations that generate only objects or arrays where a - // JSON text is called for will be interoperable in the sense that all - // implementations will accept these as conforming JSON texts. - // https://tools.ietf.org/html/rfc8259 + // A JSON text is a serialized value. Note that certain previous + // specifications of JSON constrained a JSON text to be an object or an + // array. Implementations that generate only objects or arrays where a + // JSON text is called for will be interoperable in the sense that all + // implementations will accept these as conforming JSON texts. + // https://tools.ietf.org/html/rfc8259 #ifdef SIMDJSON_ALLOWANYTHINGINROOT - case '"': { - if (!parse_string(buf, len, pj, depth, idx)) { - goto fail; - } - goto start_continue; - } - case 't': - if (!is_valid_true_atom(buf + idx)) { - goto fail; - } - pj.write_tape(0, c); - goto start_continue; - case 'f': - if (!is_valid_false_atom(buf + idx)) { - goto fail; - } - pj.write_tape(0, c); - goto start_continue; - case 'n': - if (!is_valid_null_atom(buf + idx)) { - goto fail; - } - pj.write_tape(0, c); - goto start_continue; - case '0': { - if (!parse_number(buf, len, pj, depth, idx, true, false)) { - goto fail; - } - goto start_continue; - } - case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': { - if (!parse_number(buf, len, pj, depth, idx, false, false)) { - goto fail; - } - goto start_continue; - } - case '-': { - if (!parse_number(buf, len, pj, depth, idx, false, true)) { - goto fail; - } - goto start_continue; - } -#endif // ALLOWANYTHINGINROOT - default: goto fail; + case '"': { + if (!parse_string(buf, len, pj, depth, idx)) { + goto fail; } + break; + } + case 't': + if (!is_valid_true_atom(buf + idx)) { + goto fail; + } + pj.write_tape(0, c); + break; + case 'f': + if (!is_valid_false_atom(buf + idx)) { + goto fail; + } + pj.write_tape(0, c); + break; + case 'n': + if (!is_valid_null_atom(buf + idx)) { + goto fail; + } + pj.write_tape(0, c); + break; + case '0': { + if (!parse_number(buf, len, pj, depth, idx, true, false)) { + goto fail; + } + + break; + } + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': { + if (!parse_number(buf, len, pj, depth, idx, false, false)) { + goto fail; + } + break; + } + case '-': { + if (!parse_number(buf, len, pj, depth, idx, false, true)) { + goto fail; + } + break; + } +#endif // ALLOWANYTHINGINROOT + default: + goto fail; + } +#ifdef SIMDJSON_ALLOWANYTHINGINROOT + depth--; // for fall-through cases (e.g., documents containing just a string) +#endif // ALLOWANYTHINGINROOT start_continue: - DEBUG_PRINTF("in start_object_close\n"); - UPDATE_CHAR(); - switch (c) { - case 0: goto succeed; - default: goto fail; - } + DEBUG_PRINTF("in start_object_close\n"); + UPDATE_CHAR(); + switch (c) { + case 0: + goto succeed; + default: + goto fail; + } -////////////////////////////// OBJECT STATES ///////////////////////////// + ////////////////////////////// OBJECT STATES ///////////////////////////// object_begin: - printf("in object_begin %c \n",c); - DEBUG_PRINTF("in object_begin\n"); - pj.containing_scope_offset[depth] = pj.get_current_loc(); - pj.write_tape(0, c); - depth ++; - if(depth > pj.depthcapacity) { - goto fail; - } - UPDATE_CHAR(); - switch (c) { - case '"': { - if (!parse_string(buf, len, pj, depth, idx)) { - goto fail; - } - goto object_key_state; - } - case '}': goto scope_end; - default: goto fail; + DEBUG_PRINTF("in object_begin\n"); + pj.containing_scope_offset[depth] = pj.get_current_loc(); + pj.write_tape(0, c); + + UPDATE_CHAR(); + switch (c) { + case '"': { + if (!parse_string(buf, len, pj, depth, idx)) { + goto fail; } + goto object_key_state; + } + case '}': + goto scope_end; // could also go to object_continue + default: + goto fail; + } object_key_state: - printf("in object_key_state %c \n",c); + DEBUG_PRINTF("in object_key_state\n"); + UPDATE_CHAR(); + if (c != ':') { + goto fail; + } + UPDATE_CHAR(); + switch (c) { + case '"': { + if (!parse_string(buf, len, pj, depth, idx)) { + goto fail; + } + break; + } + case 't': + if (!is_valid_true_atom(buf + idx)) { + goto fail; + } + pj.write_tape(0, c); + break; + case 'f': + if (!is_valid_false_atom(buf + idx)) { + goto fail; + } + pj.write_tape(0, c); + break; + case 'n': + if (!is_valid_null_atom(buf + idx)) { + goto fail; + } + pj.write_tape(0, c); + break; + case '0': { + if (!parse_number(buf, len, pj, depth, idx, true, false)) { + goto fail; + } + break; + } + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': { + if (!parse_number(buf, len, pj, depth, idx, false, false)) { + goto fail; + } + break; + } + case '-': { + if (!parse_number(buf, len, pj, depth, idx, false, true)) { + goto fail; + } + break; + } + case '{': { + // we have not yet encountered } so we need to come back for it + pj.ret_address[depth] = &&object_continue; + // we found an object inside an object, so we need to increment the depth + depth++; + if (depth > pj.depthcapacity) { + goto fail; + } - DEBUG_PRINTF("in object_key_state\n"); - UPDATE_CHAR(); - if (c != ':') { - goto fail; - } - UPDATE_CHAR(); - switch (c) { - case '"': { - if (!parse_string(buf, len, pj, depth, idx)) { - goto fail; - } - break; - } - case 't': if (!is_valid_true_atom(buf + idx)) { - goto fail; - } - pj.write_tape(0, c); - break; - case 'f': if (!is_valid_false_atom(buf + idx)) { - goto fail; - } - pj.write_tape(0, c); - break; - case 'n': if (!is_valid_null_atom(buf + idx)) { - goto fail; - } - pj.write_tape(0, c); - break; - case '0': { - if (!parse_number(buf, len, pj, depth, idx, true, false)) { - goto fail; - } - break; - } - case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': { - if (!parse_number(buf, len, pj, depth, idx, false, false)) { - goto fail; - } - break; - } - case '-': { - if (!parse_number(buf, len, pj, depth, idx, false, true)) { - goto fail; - } - break; - } - case '{': { - pj.ret_address[depth] = &&object_continue; - goto object_begin; - } - case '[': { - pj.ret_address[depth] = &&object_continue; - goto array_begin; - } - default: goto fail; + goto object_begin; + } + case '[': { + // we have not yet encountered } so we need to come back for it + pj.ret_address[depth] = &&object_continue; + // we found an array inside an object, so we need to increment the depth + depth++; + if (depth > pj.depthcapacity) { + goto fail; } + goto array_begin; + } + default: + goto fail; + } object_continue: - printf("in object_continue %c \n",c); - - DEBUG_PRINTF("in object_continue\n"); + DEBUG_PRINTF("in object_continue\n"); + UPDATE_CHAR(); + switch (c) { + case ',': UPDATE_CHAR(); - switch (c) { - case ',': - UPDATE_CHAR(); - if (c != '"') { - goto fail; - } else { - if (!parse_string(buf, len, pj, depth, idx)) { - goto fail; - } - goto object_key_state; - } - case '}': goto scope_end; - default: goto fail; - } - -////////////////////////////// COMMON STATE ///////////////////////////// - -scope_end: - // write our tape location to the header scope - depth--; - pj.write_tape(pj.containing_scope_offset[depth], c); - pj.annotate_previousloc(pj.containing_scope_offset[depth], pj.get_current_loc()); - // goto saved_state - goto *pj.ret_address[depth]; - - -////////////////////////////// ARRAY STATES ///////////////////////////// - -array_begin: - printf("in array_begin %c \n",c); - - DEBUG_PRINTF("in array_begin\n"); - pj.containing_scope_offset[depth] = pj.get_current_loc(); - pj.write_tape(0, c); - depth ++; - if(depth > pj.depthcapacity) { + if (c != '"') { + goto fail; + } else { + if (!parse_string(buf, len, pj, depth, idx)) { goto fail; + } + goto object_key_state; } - UPDATE_CHAR(); - if (c == ']') { - goto scope_end; - } + case '}': + goto scope_end; + default: + goto fail; + } + + ////////////////////////////// COMMON STATE ///////////////////////////// + +scope_end: + // write our tape location to the header scope + depth--; + pj.write_tape(pj.containing_scope_offset[depth], c); + pj.annotate_previousloc(pj.containing_scope_offset[depth], + pj.get_current_loc()); + // goto saved_state + goto *pj.ret_address[depth]; + + ////////////////////////////// ARRAY STATES ///////////////////////////// +array_begin: + DEBUG_PRINTF("in array_begin\n"); + pj.containing_scope_offset[depth] = pj.get_current_loc(); + pj.write_tape(0, c); + UPDATE_CHAR(); + if (c == ']') { + goto scope_end; // could also go to array_continue + } main_array_switch: - // we call update char on all paths in, so we can peek at c on the - // on paths that can accept a close square brace (post-, and at start) - switch (c) { - case '"': { - if (!parse_string(buf, len, pj, depth, idx)) { - goto fail; - } - goto array_continue; - } - case 't': if (!is_valid_true_atom(buf + idx)) { - goto fail; - } - pj.write_tape(0, c); - break; - case 'f': if (!is_valid_false_atom(buf + idx)) { - goto fail; - } - pj.write_tape(0, c); - break; - case 'n': if (!is_valid_null_atom(buf + idx)) { - goto fail; - } - pj.write_tape(0, c); - break; - - case '0': { - if (!parse_number(buf, len, pj, depth, idx, true, false)) { - goto fail; - } - break; - } - case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': { - if (!parse_number(buf, len, pj, depth, idx, false, false)) { - goto fail; - } - break; - } - case '-': { - if (!parse_number(buf, len, pj, depth, idx, false, true)) { - goto fail; - } - break; - } - case '{': { - pj.ret_address[depth] = &&array_continue; - goto object_begin; - } - case '[': { - pj.ret_address[depth] = &&array_continue; - goto array_begin; - } - default: goto fail; + // we call update char on all paths in, so we can peek at c on the + // on paths that can accept a close square brace (post-, and at start) + switch (c) { + case '"': { + if (!parse_string(buf, len, pj, depth, idx)) { + goto fail; } + break; + } + case 't': + if (!is_valid_true_atom(buf + idx)) { + goto fail; + } + pj.write_tape(0, c); + break; + case 'f': + if (!is_valid_false_atom(buf + idx)) { + goto fail; + } + pj.write_tape(0, c); + break; + case 'n': + if (!is_valid_null_atom(buf + idx)) { + goto fail; + } + pj.write_tape(0, c); + break; // goto array_continue; + + case '0': { + if (!parse_number(buf, len, pj, depth, idx, true, false)) { + goto fail; + } + break; // goto array_continue; + } + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': { + if (!parse_number(buf, len, pj, depth, idx, false, false)) { + goto fail; + } + break; // goto array_continue; + } + case '-': { + if (!parse_number(buf, len, pj, depth, idx, false, true)) { + goto fail; + } + break; // goto array_continue; + } + case '{': { + // we have not yet encountered ] so we need to come back for it + pj.ret_address[depth] = &&array_continue; + + // we found an object inside an array, so we need to increment the depth + depth++; + if (depth > pj.depthcapacity) { + goto fail; + } + + goto object_begin; + } + case '[': { + // we have not yet encountered ] so we need to come back for it + pj.ret_address[depth] = &&array_continue; + + // we found an array inside an array, so we need to increment the depth + depth++; + if (depth > pj.depthcapacity) { + goto fail; + } + + goto array_begin; + } + default: + goto fail; + } array_continue: - printf("in array_begin %c \n",c); - - DEBUG_PRINTF("in array_continue\n"); + DEBUG_PRINTF("in array_continue\n"); + UPDATE_CHAR(); + switch (c) { + case ',': UPDATE_CHAR(); - switch (c) { - case ',': UPDATE_CHAR(); goto main_array_switch; - case ']': goto scope_end; - default: goto fail; - } + goto main_array_switch; + case ']': + goto scope_end; + default: + goto fail; + } -////////////////////////////// FINAL STATES ///////////////////////////// + ////////////////////////////// FINAL STATES ///////////////////////////// succeed: - DEBUG_PRINTF("in succeed\n"); - // we annotate the root node - depth--; - // next line allows us to go back to the start - pj.write_tape(pj.containing_scope_offset[depth], 'r');// r is root - // next line tells the root node how to go to the end - pj.annotate_previousloc(pj.containing_scope_offset[depth], pj.get_current_loc()); + DEBUG_PRINTF("in succeed, depth = %d \n", depth); + // we annotate the root node + // depth--; + // next line allows us to go back to the start + pj.write_tape(pj.containing_scope_offset[depth], 'r'); // r is root + // next line tells the root node how to go to the end + pj.annotate_previousloc(pj.containing_scope_offset[depth], + pj.get_current_loc()); #ifdef DEBUG - pj.dump_tapes(); + pj.dump_tapes(); #endif - return true; - + return true; + fail: - DEBUG_PRINTF("in fail\n"); + DEBUG_PRINTF("in fail\n"); #ifdef DEBUG - pj.dump_tapes(); + pj.dump_tapes(); #endif - return false; + return false; } diff --git a/tests/allparserscheckfile.cpp b/tests/allparserscheckfile.cpp index 4dc5c6c2..660a024e 100644 --- a/tests/allparserscheckfile.cpp +++ b/tests/allparserscheckfile.cpp @@ -1,3 +1,4 @@ +#include #include "jsonparser/jsonparser.h" @@ -30,7 +31,6 @@ void on_json_error( void *, const fastjson::ErrorContext& ec) { bool fastjson_parse(const char *input) { fastjson::Token token; fastjson::dom::Chunk chunk; - std::string error_message; return fastjson::dom::parse_string(input, &token, &chunk, 0, &on_json_error, NULL); } // end of fastjson stuff @@ -41,17 +41,30 @@ using namespace rapidjson; using namespace std; int main(int argc, char *argv[]) { - if (argc < 2) { + bool verbose = false; + int c; + while ((c = getopt (argc, argv, "v")) != -1) + switch (c) + { + case 'v': + verbose = true; + break; + default: + abort (); + } + if (optind >= argc) { cerr << "Usage: " << argv[0] << " \n"; cerr << "Or " << argv[0] << " -v \n"; exit(1); } - bool verbose = false; - if (argc > 2) { - if (strcmp(argv[1], "-v")) - verbose = true; + const char * filename = argv[optind]; + std::pair p; + try { + p = get_corpus(filename); + } catch (const std::exception& e) { // caught by reference to base + std::cout << "Could not load the file " << filename << std::endl; + return EXIT_FAILURE; } - pair p = get_corpus(argv[argc - 1]); if (verbose) { std::cout << "Input has "; if (p.second > 1024 * 1024) diff --git a/tests/jsoncheck.cpp b/tests/jsoncheck.cpp index 2b9b2eec..a9487f81 100644 --- a/tests/jsoncheck.cpp +++ b/tests/jsoncheck.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include "jsonparser/jsonparser.h" @@ -41,6 +42,8 @@ bool validate(const char *dirname) { printf("nothing in dir %s \n", dirname); return false; } + bool * isfileasexpected = new bool[c]; + for(int i = 0; i < c; i++) isfileasexpected[i] = true; size_t howmany = 0; bool needsep = (strlen(dirname) > 1) && (dirname[strlen(dirname) - 1] != '/'); for (int i = 0; i < c; i++) { @@ -56,7 +59,13 @@ bool validate(const char *dirname) { } else { strcpy(fullpath + dirlen, name); } - std::pair p = get_corpus(fullpath); + std::pair p; + try { + p = get_corpus(fullpath); + } catch (const std::exception& e) { + std::cout << "Could not load the file " << fullpath << std::endl; + return EXIT_FAILURE; + } ParsedJson *pj_ptr = allocate_ParsedJson(p.second, 1024); if(pj_ptr == NULL) { std::cerr<< "can't allocate memory"<d_name); + } + } for (int i = 0; i < c; ++i) free(entry_list[i]); free(entry_list); - printf("%zu files checked.\n", howmany); - if(everythingfine) printf("All ok!\n"); + delete[] isfileasexpected; + return everythingfine; } diff --git a/tests/numberparsingcheck.cpp b/tests/numberparsingcheck.cpp index 1bc2544f..017f5ee3 100644 --- a/tests/numberparsingcheck.cpp +++ b/tests/numberparsingcheck.cpp @@ -28,7 +28,7 @@ bool startsWith(const char *pre, const char *str) { size_t lenpre = strlen(pre), lenstr = strlen(str); return lenstr < lenpre ? false : strncmp(pre, str, lenpre) == 0; } -bool is_in_bad_list(char *buf) { +bool is_in_bad_list(const char *buf) { for (size_t i = 0; i < sizeof(really_bad) / sizeof(really_bad[0]); i++) if (startsWith(really_bad[i], buf)) return true; @@ -38,9 +38,9 @@ bool is_in_bad_list(char *buf) { inline void foundInvalidNumber(const u8 *buf) { invalid_count++; char *endptr; - double expected = strtod((char *)buf, &endptr); - if (endptr != (char *)buf) { - if (!is_in_bad_list((char *)buf)) { + double expected = strtod((const char *)buf, &endptr); + if (endptr != (const char *)buf) { + if (!is_in_bad_list((const char *)buf)) { printf( "Warning: foundInvalidNumber %.32s whereas strtod parses it to %f, ", buf, expected); @@ -53,8 +53,8 @@ inline void foundInvalidNumber(const u8 *buf) { inline void foundInteger(int64_t result, const u8 *buf) { int_count++; char *endptr; - long long expected = strtoll((char *)buf, &endptr, 10); - if ((endptr == (char *)buf) || (expected != result)) { + long long expected = strtoll((const char *)buf, &endptr, 10); + if ((endptr == (const char *)buf) || (expected != result)) { printf("Error: parsed %" PRId64 " out of %.32s, ", result, buf); printf(" while parsing %s \n", fullpath); parse_error |= PARSE_ERROR; @@ -64,8 +64,8 @@ inline void foundInteger(int64_t result, const u8 *buf) { inline void foundFloat(double result, const u8 *buf) { char *endptr; float_count++; - double expected = strtod((char *)buf, &endptr); - if (endptr == (char *)buf) { + double expected = strtod((const char *)buf, &endptr); + if (endptr == (const char *)buf) { printf("parsed %f from %.32s whereas strtod refuses to parse a float, ", result, buf); printf(" while parsing %s \n", fullpath); @@ -123,7 +123,13 @@ bool validate(const char *dirname) { } else { strcpy(fullpath + dirlen, name); } - std::pair p = get_corpus(fullpath); + std::pair p; + try { + p = get_corpus(fullpath); + } catch (const std::exception& e) { + std::cout << "Could not load the file " << fullpath << std::endl; + return EXIT_FAILURE; + } // terrible hack but just to get it working ParsedJson *pj_ptr = allocate_ParsedJson(p.second, 1024); if (pj_ptr == NULL) { diff --git a/tests/stringparsingcheck.cpp b/tests/stringparsingcheck.cpp index e7f02771..3805ce53 100644 --- a/tests/stringparsingcheck.cpp +++ b/tests/stringparsingcheck.cpp @@ -241,7 +241,7 @@ inline void foundString(const u8 *buf, const u8 *parsed_begin, // we have a zero-length string if (parsed_begin != parsed_end) { printf("WARNING: We have a zero-length but gap is %zu \n", - parsed_end - parsed_begin); + (size_t)(parsed_end - parsed_begin)); probable_bug = true; } empty_string++; @@ -252,12 +252,12 @@ inline void foundString(const u8 *buf, const u8 *parsed_begin, printf("WARNING: lengths on parsed strings disagree %zu %zu \n", thislen, len); printf("\nour parsed string : '%*s'\n\n", (int)thislen, - (char *)parsed_begin); - print_hex((char *)parsed_begin, thislen); + (const char *)parsed_begin); + print_hex((const char *)parsed_begin, thislen); printf("\n"); printf("reference parsing :'%*s'\n\n", (int)len, bigbuffer); - print_hex((char *)bigbuffer, len); + print_hex((const char *)bigbuffer, len); printf("\n"); probable_bug = true; @@ -267,15 +267,15 @@ inline void foundString(const u8 *buf, const u8 *parsed_begin, printf("Lengths %zu %zu \n", thislen, len); printf("\nour parsed string : '%*s'\n", (int)thislen, - (char *)parsed_begin); - print_hex((char *)parsed_begin, thislen); + (const char *)parsed_begin); + print_hex((const char *)parsed_begin, thislen); printf("\n"); printf("reference parsing :'%*s'\n", (int)len, bigbuffer); - print_hex((char *)bigbuffer, len); + print_hex((const char *)bigbuffer, len); printf("\n"); - print_cmp_hex((char *)parsed_begin, bigbuffer, thislen); + print_cmp_hex((const char *)parsed_begin, bigbuffer, thislen); probable_bug = true; } @@ -325,8 +325,13 @@ bool validate(const char *dirname) { } else { strcpy(fullpath + dirlen, name); } - std::pair p = get_corpus(fullpath); - // terrible hack but just to get it working + std::pair p; + try { + p = get_corpus(fullpath); + } catch (const std::exception& e) { + std::cout << "Could not load the file " << fullpath << std::endl; + return EXIT_FAILURE; + } ParsedJson *pj_ptr = allocate_ParsedJson(p.second, 1024); if (pj_ptr == NULL) { std::cerr << "can't allocate memory" << std::endl;