From 8648c4108e9f4546e00e1940a81eb7c61015cbd6 Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Tue, 27 Nov 2018 20:42:35 -0500 Subject: [PATCH] MOre cleaning. --- benchmark/parse.cpp | 2 +- include/jsonparser/jsoncharutils.h | 1 + include/jsonparser/simdjson_internal.h | 340 ++++++++++++++++--------- src/stage34_unified.cpp | 13 +- 4 files changed, 232 insertions(+), 124 deletions(-) diff --git a/benchmark/parse.cpp b/benchmark/parse.cpp index db42fc35..430b4f6c 100644 --- a/benchmark/parse.cpp +++ b/benchmark/parse.cpp @@ -184,7 +184,7 @@ int main(int argc, char *argv[]) { cout << "Min: " << min_result << " bytes read: " << p.second << " Gigabytes/second: " << (p.second) / (min_result * 1000000000.0) << "\n"; - if(dump) pj_ptr->dump_tapes(); + if(dump) pj_ptr->printjson(); free(p.first); deallocate_ParsedJson(pj_ptr); if (!isok) { diff --git a/include/jsonparser/jsoncharutils.h b/include/jsonparser/jsoncharutils.h index d8725cd0..3eb529b7 100644 --- a/include/jsonparser/jsoncharutils.h +++ b/include/jsonparser/jsoncharutils.h @@ -109,3 +109,4 @@ inline size_t codepoint_to_utf8(uint32_t cp, u8 *c) { return 0; // bad r } + diff --git a/include/jsonparser/simdjson_internal.h b/include/jsonparser/simdjson_internal.h index d44c41dd..ad3a7aae 100644 --- a/include/jsonparser/simdjson_internal.h +++ b/include/jsonparser/simdjson_internal.h @@ -1,5 +1,7 @@ #pragma once +#include + #ifdef _MSC_VER /* Microsoft C/C++-compatible compiler */ #include @@ -8,29 +10,59 @@ #include #endif -#include #include +#include +#define JSONVALUEMASK 0xFFFFFFFFFFFFFF; +static inline void print_with_escapes(const unsigned char *src) { + while (*src) { + switch (*src) { + case '\n': + putchar('\\'); + putchar('n'); + break; + case '\"': + putchar('\\'); + putchar('"'); + break; + case '\t': + putchar('\\'); + putchar('t'); + break; + case '\\': + putchar('\\'); + putchar('\\'); + break; + default: + if (*src <= 0x1F) { + printf("\\u%x", *src); + } else + putchar(*src); + } + src++; + } +} -//const u32 MAX_DEPTH = 2048; -//const u32 DEPTH_SAFETY_MARGIN = 32; // should be power-of-2 as we check this - // with a modulo in our hot stage 3 loop -//const u32 START_DEPTH = DEPTH_SAFETY_MARGIN; -//const u32 REDLINE_DEPTH = MAX_DEPTH - DEPTH_SAFETY_MARGIN; -//const size_t MAX_TAPE_ENTRIES = 127 * 1024; -//const size_t MAX_TAPE = MAX_DEPTH * MAX_TAPE_ENTRIES; +// const u32 MAX_DEPTH = 2048; +// const u32 DEPTH_SAFETY_MARGIN = 32; // should be power-of-2 as we check this +// with a modulo in our hot stage 3 loop +// const u32 START_DEPTH = DEPTH_SAFETY_MARGIN; +// const u32 REDLINE_DEPTH = MAX_DEPTH - DEPTH_SAFETY_MARGIN; +// const size_t MAX_TAPE_ENTRIES = 127 * 1024; +// const size_t MAX_TAPE = MAX_DEPTH * MAX_TAPE_ENTRIES; ///////////// // TODO: move this to be more like a real class // currently, you need to create it like so... -// ParsedJson *pj_ptr = allocate_ParsedJson(numberofbytes); // allocate memory for parsing up to numberofbytes -// and we clear it like so deallocate_ParsedJson(pj_ptr); -// That's obviously not very C++-ish. It should be trivial to add a constructor and a destructor. +// ParsedJson *pj_ptr = allocate_ParsedJson(numberofbytes); // allocate memory +// for parsing up to numberofbytes and we clear it like so +// deallocate_ParsedJson(pj_ptr); That's obviously not very C++-ish. It should +// be trivial to add a constructor and a destructor. //////////// struct ParsedJson { public: - size_t bytecapacity; // indicates how many bits are meant to be supported by - // structurals + size_t bytecapacity; // indicates how many bits are meant to be supported by + // structurals size_t depthcapacity; // how deep we can go u32 current_loc; @@ -38,133 +70,199 @@ public: u32 n_structural_indexes; u32 *structural_indexes; - u64 * tape;//[MAX_TAPE]; - u32 * containing_scope_offset; - void * * ret_address; - - u8 * string_buf;// should be at least bytecapacity + u64 *tape; + u32 *containing_scope_offset; + void **ret_address; + u8 *string_buf; // should be at least bytecapacity u8 *current_string_buf_loc; - u8 * number_buf;// holds either doubles or longs, really // should be at least 4 * bytecapacity + + u8 *number_buf; // holds either doubles or longs, really // should be at least + // 4 * bytecapacity u8 *current_number_buf_loc; - - void init() { - current_string_buf_loc = string_buf; - current_number_buf_loc = number_buf; - current_loc = 0; - //for (u32 i = 0; i < MAX_DEPTH; i++) { - // tape_locs[i] = i * MAX_TAPE_ENTRIES; - //} - //tap_locs will be unitialized by design + // this should be called when parsing (right before writing the tapes) + void init() { + current_string_buf_loc = string_buf; + current_number_buf_loc = number_buf; + current_loc = 0; + } + + // print the json to stdout (should be valid) + void printjson() { + size_t tapeidx = 0; + u64 tape_val = tape[tapeidx]; + u8 type = (tape_val >> 56); + size_t howmany = 0; + if (type == 'r') { + howmany = tape_val & JSONVALUEMASK; + } else { + printf("Error: no starting root node?"); + abort(); } - - void dump_tapes() { - /*for (u32 i = 0; i < MAX_DEPTH; i++) { - u32 start_loc = i * MAX_TAPE_ENTRIES; - std::cout << " tape section i " << i; - if (i == START_DEPTH) { - std::cout << " (START) "; - } else if ((i < START_DEPTH) || (i >= REDLINE_DEPTH)) { - std::cout << " (REDLINE) "; - } else { - std::cout << " (NORMAL) "; - } - - std::cout << " from: " << start_loc << " to: " << tape_locs[i] << " " - << " size: " << (tape_locs[i] - start_loc) << "\n"; - for (u32 j = start_loc; j < tape_locs[i]; j++) { - if (tape[j]) { - std::cout << "j: " << j << " tape[j] char " << (char)(tape[j] >> 56) - << " tape[j][0..55]: " << (tape[j] & 0xffffffffffffffULL) << "\n"; - } - } - }*/ + tapeidx++; + bool *inobject = new bool[depthcapacity]; + size_t *inobjectidx = new size_t[depthcapacity]; + int depth = 1; // only root at level 0 + inobjectidx[depth] = 0; + int64_t intval; + double doubleval; + for (; tapeidx < howmany; tapeidx++) { + // printf("\ncounter: %d\n", tapeidx); + tape_val = tape[tapeidx]; + u64 payload = tape_val & JSONVALUEMASK; + type = (tape_val >> 56); + if (!inobject[depth]) { + if ((inobjectidx[depth] > 0) && (type != ']')) + printf(", "); + inobjectidx[depth]++; + } else if (inobject) { + if ((inobjectidx[depth] > 0) && ((inobjectidx[depth] & 1) == 0) && + (type != '}')) + printf(", "); + if (((inobjectidx[depth] & 1) == 1)) + printf(" : "); + inobjectidx[depth]++; + } + switch (type) { + case '"': // we have a string + putchar('"'); + print_with_escapes((const unsigned char *)(string_buf + payload)); + putchar('"'); + break; + case 'l': // we have a long int + memcpy(&intval, number_buf + payload, sizeof(intval)); + printf("%" PRId64, intval); + break; + case 'd': // we have a double + memcpy(&doubleval, number_buf + payload, sizeof(doubleval)); + printf("%f", doubleval); + break; + case 'n': // we have a null + printf("null"); + break; + case 't': // we have a true + printf("true"); + break; + case 'f': // we have a false + printf("false"); + break; + case '{': // we have an object + printf("\n"); + printf("%*s\n%*s", depth, "{", depth + 1, ""); + depth++; + inobject[depth] = true; + inobjectidx[depth] = 0; + break; + case '}': // we end an object + depth--; + printf("\n%*s}\n%*s", depth - 1, "", depth, ""); + break; + case '[': // we start an array + printf("\n"); + printf("%*s\n%*s", depth, "[", depth + 1, ""); + depth++; + inobject[depth] = false; + inobjectidx[depth] = 0; + break; + case ']': // we end an array + depth--; + printf("\n%*s]\n%*s", depth - 1, "", depth, ""); + break; + case 'r': // we start and end with the root node + printf("should we be hitting the root node?\n"); + default: + printf("bug %c\n", type); + abort(); + } } + } - // all elements are stored on the tape using a 64-bit word. - // - // strings, double and ints are stored as - // a 64-bit word with a pointer to the actual value - // - // - // - // for objects or arrays, store [ or { at the beginning and } and ] at the end. - // For the openings ([ or {), we annotate them with a reference to the location on the tape of - // the end, and for then closings (} and ]), we annotate them with a reference to the - // location of the opening - // - // + // all elements are stored on the tape using a 64-bit word. + // + // strings, double and ints are stored as + // a 64-bit word with a pointer to the actual value + // + // + // + // for objects or arrays, store [ or { at the beginning and } and ] at the + // end. For the openings ([ or {), we annotate them with a reference to the + // location on the tape of the end, and for then closings (} and ]), we + // annotate them with a reference to the location of the opening + // + // - // this should be considered a private function - really_inline void write_tape(u64 val, u8 c) { - tape[current_loc++] = val | (((u64)c) << 56); - //tape[tape_locs[depth]] = val | (((u64)c) << 56); - //tape_locs[depth]++; - } + // this should be considered a private function + really_inline void write_tape(u64 val, u8 c) { + tape[current_loc++] = val | (((u64)c) << 56); + } + really_inline void write_tape_s64(s64 i) { + write_tape(current_number_buf_loc - number_buf, 'l'); + memcpy(current_number_buf_loc, &i, sizeof(s64)); + current_number_buf_loc += sizeof(s64); + } - really_inline void write_tape_s64(s64 i) { - write_tape(current_number_buf_loc - number_buf, 'l'); - memcpy(current_number_buf_loc, &i, sizeof(s64)); - current_number_buf_loc += sizeof(s64); - } + really_inline void write_tape_double(double d) { + write_tape(current_number_buf_loc - number_buf, 'd'); + memcpy(current_number_buf_loc, &d, sizeof(double)); + current_number_buf_loc += sizeof(double); + } - really_inline void write_tape_double(double d) { - write_tape(current_number_buf_loc - number_buf, 'd'); - memcpy(current_number_buf_loc, &d, sizeof(double)); - current_number_buf_loc += sizeof(double); - } + really_inline u32 get_current_loc() { return current_loc; } - really_inline u32 get_current_loc() { - return current_loc; - } + really_inline void annotate_previousloc(u32 saved_loc, u64 val) { + tape[saved_loc] |= val; + } - really_inline void annotate_previousloc(u32 saved_loc,u64 val) { - tape[saved_loc] |= val; - } - - - // public interface + // public interface #if 1 - - struct ParsedJsonHandle { - ParsedJson & pj; - u32 depth; - u32 scope_header; // the start of our current scope that contains our current location - u32 location; // our current location on a tape - explicit ParsedJsonHandle(ParsedJson & pj_) : pj(pj_), depth(0), scope_header(0), location(0) {} - // OK with default copy constructor as the way to clone the POD structure + struct ParsedJsonHandle { + ParsedJson &pj; + u32 depth; + u32 scope_header; // the start of our current scope that contains our + // current location + u32 location; // our current location on a tape - // some placeholder navigation. Will convert over to a more native C++-ish way of doing - // things once it's working (i.e. ++ and -- operators and get start/end iterators) - // return true if we can do the navigation, false otherwise - bool next(); // valid if we're not at the end of a scope - bool prev(); // valid if we're not at the start of a scope - bool up(); // valid if we are at depth != 0 - bool down(); // valid if we're at a [ or { call site; moves us to header of that scope - //void to_start_scope(); // move us to the start of our current scope; always succeeds - //void to_end_scope(); // move us to the start of our current scope; always succeeds + explicit ParsedJsonHandle(ParsedJson &pj_) + : pj(pj_), depth(0), scope_header(0), location(0) {} + // OK with default copy constructor as the way to clone the POD structure - // these navigation elements move us across scope if need be, so allow us to iterate over - // everything at a given depth - //bool next_flat(); // valid if we're not at the end of a tape - //bool prev_flat(); // valid if we're not at the start of a tape + // some placeholder navigation. Will convert over to a more native C++-ish + // way of doing things once it's working (i.e. ++ and -- operators and get + // start/end iterators) return true if we can do the navigation, false + // otherwise + bool next(); // valid if we're not at the end of a scope + bool prev(); // valid if we're not at the start of a scope + bool up(); // valid if we are at depth != 0 + bool down(); // valid if we're at a [ or { call site; moves us to header of + // that scope + // void to_start_scope(); // move us to the start of our current + // scope; always succeeds void to_end_scope(); // move us to the + // start of our current scope; always succeeds - void print(std::ostream & os); // print the thing we're currently pointing at - u8 get_type(); // retrieve the character code of what we're looking at: [{"sltfn are the possibilities - s64 get_s64(); // get the s64 value at this node; valid only if we're at "s" - double get_double(); // get the double value at this node; valid only if we're at "d" - char * get_string(); // get the string value at this node; valid only if we're at " - }; + // these navigation elements move us across scope if need be, so allow us to + // iterate over everything at a given depth + // bool next_flat(); // valid if we're not at the end of a + // tape bool prev_flat(); // valid if we're not at the start + // of a tape + + void print(std::ostream &os); // print the thing we're currently pointing at + u8 get_type(); // retrieve the character code of what we're looking at: + // [{"sltfn are the possibilities + s64 get_s64(); // get the s64 value at this node; valid only if we're at "s" + double get_double(); // get the double value at this node; valid only if + // we're at "d" + char * + get_string(); // get the string value at this node; valid only if we're at " + }; #endif }; - #ifdef DEBUG -inline void dump256(m256 d, const std::string& msg) { +inline void dump256(m256 d, const std::string &msg) { for (u32 i = 0; i < 32; i++) { std::cout << std::setw(3) << (int)*(((u8 *)(&d)) + i); if (!((i + 1) % 8)) @@ -178,14 +276,14 @@ inline void dump256(m256 d, const std::string& msg) { } // dump bits low to high -inline void dumpbits(u64 v, const std::string& msg) { +inline void dumpbits(u64 v, const std::string &msg) { for (u32 i = 0; i < 64; i++) { std::cout << (((v >> (u64)i) & 0x1ULL) ? "1" : "_"); } std::cout << " " << msg << "\n"; } -inline void dumpbits32(u32 v, const std::string& msg) { +inline void dumpbits32(u32 v, const std::string &msg) { for (u32 i = 0; i < 32; i++) { std::cout << (((v >> (u32)i) & 0x1ULL) ? "1" : "_"); } @@ -198,14 +296,14 @@ inline void dumpbits32(u32 v, const std::string& msg) { #endif // dump bits low to high -inline void dumpbits_always(u64 v, const std::string& msg) { +inline void dumpbits_always(u64 v, const std::string &msg) { for (u32 i = 0; i < 64; i++) { std::cout << (((v >> (u64)i) & 0x1ULL) ? "1" : "_"); } std::cout << " " << msg << "\n"; } -inline void dumpbits32_always(u32 v, const std::string& msg) { +inline void dumpbits32_always(u32 v, const std::string &msg) { for (u32 i = 0; i < 32; i++) { std::cout << (((v >> (u32)i) & 0x1ULL) ? "1" : "_"); } diff --git a/src/stage34_unified.cpp b/src/stage34_unified.cpp index 2699d1d2..ba45bdf5 100644 --- a/src/stage34_unified.cpp +++ b/src/stage34_unified.cpp @@ -429,11 +429,20 @@ succeed: DEBUG_PRINTF("in succeed, depth = %d \n", depth); // we annotate the root node // depth--; - // next line allows us to go back to the start - pj.write_tape(pj.containing_scope_offset[depth], 'r'); // r is root // next line tells the root node how to go to the end pj.annotate_previousloc(pj.containing_scope_offset[depth], pj.get_current_loc()); + // next line allows us to go back to the start + if(depth != 0) { + printf("internal bug\n"); + abort(); + } + if(pj.containing_scope_offset[depth] != 0) { + printf("internal bug\n"); + abort(); + } + pj.write_tape(pj.containing_scope_offset[depth], 'r'); // r is root + #ifdef DEBUG pj.dump_tapes();