More cleaning.

This commit is contained in:
Daniel Lemire 2018-11-27 20:42:35 -05:00
parent ba0f6fea51
commit 8648c4108e
4 changed files with 232 additions and 124 deletions

View File

@ -184,7 +184,7 @@ int main(int argc, char *argv[]) {
cout << "Min: " << min_result << " bytes read: " << p.second cout << "Min: " << min_result << " bytes read: " << p.second
<< " Gigabytes/second: " << (p.second) / (min_result * 1000000000.0) << " Gigabytes/second: " << (p.second) / (min_result * 1000000000.0)
<< "\n"; << "\n";
if(dump) pj_ptr->dump_tapes(); if(dump) pj_ptr->printjson();
free(p.first); free(p.first);
deallocate_ParsedJson(pj_ptr); deallocate_ParsedJson(pj_ptr);
if (!isok) { if (!isok) {

View File

@ -109,3 +109,4 @@ inline size_t codepoint_to_utf8(uint32_t cp, u8 *c) {
return 0; // bad r return 0; // bad r
} }

View File

@ -1,5 +1,7 @@
#pragma once #pragma once
#include <inttypes.h>
#ifdef _MSC_VER #ifdef _MSC_VER
/* Microsoft C/C++-compatible compiler */ /* Microsoft C/C++-compatible compiler */
#include <intrin.h> #include <intrin.h>
@ -8,29 +10,59 @@
#include <x86intrin.h> #include <x86intrin.h>
#endif #endif
#include <iostream>
#include <iomanip> #include <iomanip>
#include <iostream>
// Mask selecting the low 56 bits of a 64-bit tape word. The top 8 bits of a
// tape word carry the one-character type tag (extracted with `>> 56`), the
// remaining bits carry the payload. No trailing semicolon, so the macro can be
// used inside larger expressions; the ULL suffix keeps the mask unsigned 64-bit.
#define JSONVALUEMASK 0xFFFFFFFFFFFFFFULL
// Writes the NUL-terminated byte string `src` to stdout as the contents of a
// JSON string (without the surrounding quotes).
//
// The quote, the backslash and the common control characters are written with
// their two-character escapes. Every other byte <= 0x1F is written as \u00XX:
// JSON requires exactly four hex digits after \u, so the value is zero-padded.
// All remaining bytes are copied through unchanged.
static inline void print_with_escapes(const unsigned char *src) {
  while (*src) {
    switch (*src) {
    case '\b':
      putchar('\\');
      putchar('b');
      break;
    case '\f':
      putchar('\\');
      putchar('f');
      break;
    case '\n':
      putchar('\\');
      putchar('n');
      break;
    case '\r':
      putchar('\\');
      putchar('r');
      break;
    case '\"':
      putchar('\\');
      putchar('"');
      break;
    case '\t':
      putchar('\\');
      putchar('t');
      break;
    case '\\':
      putchar('\\');
      putchar('\\');
      break;
    default:
      if (*src <= 0x1F) {
        // zero-pad to four digits: "\u1" is not a valid JSON escape
        printf("\\u%04x", *src);
      } else {
        putchar(*src);
      }
    }
    src++;
  }
}
//const u32 MAX_DEPTH = 2048; // const u32 MAX_DEPTH = 2048;
//const u32 DEPTH_SAFETY_MARGIN = 32; // should be power-of-2 as we check this // const u32 DEPTH_SAFETY_MARGIN = 32; // should be power-of-2 as we check this
// with a modulo in our hot stage 3 loop // with a modulo in our hot stage 3 loop
//const u32 START_DEPTH = DEPTH_SAFETY_MARGIN; // const u32 START_DEPTH = DEPTH_SAFETY_MARGIN;
//const u32 REDLINE_DEPTH = MAX_DEPTH - DEPTH_SAFETY_MARGIN; // const u32 REDLINE_DEPTH = MAX_DEPTH - DEPTH_SAFETY_MARGIN;
//const size_t MAX_TAPE_ENTRIES = 127 * 1024; // const size_t MAX_TAPE_ENTRIES = 127 * 1024;
//const size_t MAX_TAPE = MAX_DEPTH * MAX_TAPE_ENTRIES; // const size_t MAX_TAPE = MAX_DEPTH * MAX_TAPE_ENTRIES;
///////////// /////////////
// TODO: move this to be more like a real class // TODO: move this to be more like a real class
// currently, you need to create it like so... // currently, you need to create it like so...
// ParsedJson *pj_ptr = allocate_ParsedJson(numberofbytes); // allocate memory for parsing up to numberofbytes // ParsedJson *pj_ptr = allocate_ParsedJson(numberofbytes); // allocate memory
// and we clear it like so deallocate_ParsedJson(pj_ptr); // for parsing up to numberofbytes and we clear it like so
// That's obviously not very C++-ish. It should be trivial to add a constructor and a destructor. // deallocate_ParsedJson(pj_ptr); That's obviously not very C++-ish. It should
// be trivial to add a constructor and a destructor.
//////////// ////////////
struct ParsedJson { struct ParsedJson {
public: public:
size_t bytecapacity; // indicates how many bits are meant to be supported by size_t bytecapacity; // indicates how many bits are meant to be supported by
// structurals // structurals
size_t depthcapacity; // how deep we can go size_t depthcapacity; // how deep we can go
u32 current_loc; u32 current_loc;
@ -38,133 +70,199 @@ public:
u32 n_structural_indexes; u32 n_structural_indexes;
u32 *structural_indexes; u32 *structural_indexes;
u64 * tape;//[MAX_TAPE]; u64 *tape;
u32 * containing_scope_offset; u32 *containing_scope_offset;
void * * ret_address; void **ret_address;
u8 * string_buf;// should be at least bytecapacity
u8 *string_buf; // should be at least bytecapacity
u8 *current_string_buf_loc; u8 *current_string_buf_loc;
u8 * number_buf;// holds either doubles or longs, really // should be at least 4 * bytecapacity
u8 *number_buf; // holds either doubles or longs, really // should be at least
// 4 * bytecapacity
u8 *current_number_buf_loc; u8 *current_number_buf_loc;
void init() {
current_string_buf_loc = string_buf;
current_number_buf_loc = number_buf;
current_loc = 0;
//for (u32 i = 0; i < MAX_DEPTH; i++) { // this should be called when parsing (right before writing the tapes)
// tape_locs[i] = i * MAX_TAPE_ENTRIES; void init() {
//} current_string_buf_loc = string_buf;
//tap_locs will be unitialized by design current_number_buf_loc = number_buf;
current_loc = 0;
}
// print the json to stdout (should be valid)
void printjson() {
size_t tapeidx = 0;
u64 tape_val = tape[tapeidx];
u8 type = (tape_val >> 56);
size_t howmany = 0;
if (type == 'r') {
howmany = tape_val & JSONVALUEMASK;
} else {
printf("Error: no starting root node?");
abort();
} }
tapeidx++;
void dump_tapes() { bool *inobject = new bool[depthcapacity];
/*for (u32 i = 0; i < MAX_DEPTH; i++) { size_t *inobjectidx = new size_t[depthcapacity];
u32 start_loc = i * MAX_TAPE_ENTRIES; int depth = 1; // only root at level 0
std::cout << " tape section i " << i; inobjectidx[depth] = 0;
if (i == START_DEPTH) { int64_t intval;
std::cout << " (START) "; double doubleval;
} else if ((i < START_DEPTH) || (i >= REDLINE_DEPTH)) { for (; tapeidx < howmany; tapeidx++) {
std::cout << " (REDLINE) "; // printf("\ncounter: %d\n", tapeidx);
} else { tape_val = tape[tapeidx];
std::cout << " (NORMAL) "; u64 payload = tape_val & JSONVALUEMASK;
} type = (tape_val >> 56);
if (!inobject[depth]) {
std::cout << " from: " << start_loc << " to: " << tape_locs[i] << " " if ((inobjectidx[depth] > 0) && (type != ']'))
<< " size: " << (tape_locs[i] - start_loc) << "\n"; printf(", ");
for (u32 j = start_loc; j < tape_locs[i]; j++) { inobjectidx[depth]++;
if (tape[j]) { } else if (inobject) {
std::cout << "j: " << j << " tape[j] char " << (char)(tape[j] >> 56) if ((inobjectidx[depth] > 0) && ((inobjectidx[depth] & 1) == 0) &&
<< " tape[j][0..55]: " << (tape[j] & 0xffffffffffffffULL) << "\n"; (type != '}'))
} printf(", ");
} if (((inobjectidx[depth] & 1) == 1))
}*/ printf(" : ");
inobjectidx[depth]++;
}
switch (type) {
case '"': // we have a string
putchar('"');
print_with_escapes((const unsigned char *)(string_buf + payload));
putchar('"');
break;
case 'l': // we have a long int
memcpy(&intval, number_buf + payload, sizeof(intval));
printf("%" PRId64, intval);
break;
case 'd': // we have a double
memcpy(&doubleval, number_buf + payload, sizeof(doubleval));
printf("%f", doubleval);
break;
case 'n': // we have a null
printf("null");
break;
case 't': // we have a true
printf("true");
break;
case 'f': // we have a false
printf("false");
break;
case '{': // we have an object
printf("\n");
printf("%*s\n%*s", depth, "{", depth + 1, "");
depth++;
inobject[depth] = true;
inobjectidx[depth] = 0;
break;
case '}': // we end an object
depth--;
printf("\n%*s}\n%*s", depth - 1, "", depth, "");
break;
case '[': // we start an array
printf("\n");
printf("%*s\n%*s", depth, "[", depth + 1, "");
depth++;
inobject[depth] = false;
inobjectidx[depth] = 0;
break;
case ']': // we end an array
depth--;
printf("\n%*s]\n%*s", depth - 1, "", depth, "");
break;
case 'r': // we start and end with the root node
printf("should we be hitting the root node?\n");
default:
printf("bug %c\n", type);
abort();
}
} }
}
// all elements are stored on the tape using a 64-bit word. // all elements are stored on the tape using a 64-bit word.
// //
// strings, double and ints are stored as // strings, double and ints are stored as
// a 64-bit word with a pointer to the actual value // a 64-bit word with a pointer to the actual value
// //
// //
// //
// for objects or arrays, store [ or { at the beginning and } and ] at the end. // for objects or arrays, store [ or { at the beginning and } and ] at the
// For the openings ([ or {), we annotate them with a reference to the location on the tape of // end. For the openings ([ or {), we annotate them with a reference to the
// the end, and for then closings (} and ]), we annotate them with a reference to the // location on the tape of the end, and for then closings (} and ]), we
// location of the opening // annotate them with a reference to the location of the opening
// //
// //
// this should be considered a private function // this should be considered a private function
really_inline void write_tape(u64 val, u8 c) { really_inline void write_tape(u64 val, u8 c) {
tape[current_loc++] = val | (((u64)c) << 56); tape[current_loc++] = val | (((u64)c) << 56);
//tape[tape_locs[depth]] = val | (((u64)c) << 56); }
//tape_locs[depth]++;
}
really_inline void write_tape_s64(s64 i) {
write_tape(current_number_buf_loc - number_buf, 'l');
memcpy(current_number_buf_loc, &i, sizeof(s64));
current_number_buf_loc += sizeof(s64);
}
really_inline void write_tape_s64(s64 i) { really_inline void write_tape_double(double d) {
write_tape(current_number_buf_loc - number_buf, 'l'); write_tape(current_number_buf_loc - number_buf, 'd');
memcpy(current_number_buf_loc, &i, sizeof(s64)); memcpy(current_number_buf_loc, &d, sizeof(double));
current_number_buf_loc += sizeof(s64); current_number_buf_loc += sizeof(double);
} }
really_inline void write_tape_double(double d) { really_inline u32 get_current_loc() { return current_loc; }
write_tape(current_number_buf_loc - number_buf, 'd');
memcpy(current_number_buf_loc, &d, sizeof(double));
current_number_buf_loc += sizeof(double);
}
really_inline u32 get_current_loc() { really_inline void annotate_previousloc(u32 saved_loc, u64 val) {
return current_loc; tape[saved_loc] |= val;
} }
really_inline void annotate_previousloc(u32 saved_loc,u64 val) { // public interface
tape[saved_loc] |= val;
}
// public interface
#if 1 #if 1
struct ParsedJsonHandle {
ParsedJson & pj;
u32 depth;
u32 scope_header; // the start of our current scope that contains our current location
u32 location; // our current location on a tape
explicit ParsedJsonHandle(ParsedJson & pj_) : pj(pj_), depth(0), scope_header(0), location(0) {} struct ParsedJsonHandle {
// OK with default copy constructor as the way to clone the POD structure ParsedJson &pj;
u32 depth;
u32 scope_header; // the start of our current scope that contains our
// current location
u32 location; // our current location on a tape
// some placeholder navigation. Will convert over to a more native C++-ish way of doing explicit ParsedJsonHandle(ParsedJson &pj_)
// things once it's working (i.e. ++ and -- operators and get start/end iterators) : pj(pj_), depth(0), scope_header(0), location(0) {}
// return true if we can do the navigation, false otherwise // OK with default copy constructor as the way to clone the POD structure
bool next(); // valid if we're not at the end of a scope
bool prev(); // valid if we're not at the start of a scope
bool up(); // valid if we are at depth != 0
bool down(); // valid if we're at a [ or { call site; moves us to header of that scope
//void to_start_scope(); // move us to the start of our current scope; always succeeds
//void to_end_scope(); // move us to the start of our current scope; always succeeds
// these navigation elements move us across scope if need be, so allow us to iterate over // some placeholder navigation. Will convert over to a more native C++-ish
// everything at a given depth // way of doing things once it's working (i.e. ++ and -- operators and get
//bool next_flat(); // valid if we're not at the end of a tape // start/end iterators) return true if we can do the navigation, false
//bool prev_flat(); // valid if we're not at the start of a tape // otherwise
bool next(); // valid if we're not at the end of a scope
bool prev(); // valid if we're not at the start of a scope
bool up(); // valid if we are at depth != 0
bool down(); // valid if we're at a [ or { call site; moves us to header of
// that scope
// void to_start_scope(); // move us to the start of our current
// scope; always succeeds void to_end_scope(); // move us to the
// start of our current scope; always succeeds
void print(std::ostream & os); // print the thing we're currently pointing at // these navigation elements move us across scope if need be, so allow us to
u8 get_type(); // retrieve the character code of what we're looking at: [{"sltfn are the possibilities // iterate over everything at a given depth
s64 get_s64(); // get the s64 value at this node; valid only if we're at "s" // bool next_flat(); // valid if we're not at the end of a
double get_double(); // get the double value at this node; valid only if we're at "d" // tape bool prev_flat(); // valid if we're not at the start
char * get_string(); // get the string value at this node; valid only if we're at " // of a tape
};
void print(std::ostream &os); // print the thing we're currently pointing at
u8 get_type(); // retrieve the character code of what we're looking at:
// [{"sltfn are the possibilities
s64 get_s64(); // get the s64 value at this node; valid only if we're at "s"
double get_double(); // get the double value at this node; valid only if
// we're at "d"
char *
get_string(); // get the string value at this node; valid only if we're at "
};
#endif #endif
}; };
#ifdef DEBUG #ifdef DEBUG
inline void dump256(m256 d, const std::string& msg) { inline void dump256(m256 d, const std::string &msg) {
for (u32 i = 0; i < 32; i++) { for (u32 i = 0; i < 32; i++) {
std::cout << std::setw(3) << (int)*(((u8 *)(&d)) + i); std::cout << std::setw(3) << (int)*(((u8 *)(&d)) + i);
if (!((i + 1) % 8)) if (!((i + 1) % 8))
@ -178,14 +276,14 @@ inline void dump256(m256 d, const std::string& msg) {
} }
// dump bits low to high // dump bits low to high
inline void dumpbits(u64 v, const std::string& msg) { inline void dumpbits(u64 v, const std::string &msg) {
for (u32 i = 0; i < 64; i++) { for (u32 i = 0; i < 64; i++) {
std::cout << (((v >> (u64)i) & 0x1ULL) ? "1" : "_"); std::cout << (((v >> (u64)i) & 0x1ULL) ? "1" : "_");
} }
std::cout << " " << msg << "\n"; std::cout << " " << msg << "\n";
} }
inline void dumpbits32(u32 v, const std::string& msg) { inline void dumpbits32(u32 v, const std::string &msg) {
for (u32 i = 0; i < 32; i++) { for (u32 i = 0; i < 32; i++) {
std::cout << (((v >> (u32)i) & 0x1ULL) ? "1" : "_"); std::cout << (((v >> (u32)i) & 0x1ULL) ? "1" : "_");
} }
@ -198,14 +296,14 @@ inline void dumpbits32(u32 v, const std::string& msg) {
#endif #endif
// dump bits low to high // dump bits low to high
inline void dumpbits_always(u64 v, const std::string& msg) { inline void dumpbits_always(u64 v, const std::string &msg) {
for (u32 i = 0; i < 64; i++) { for (u32 i = 0; i < 64; i++) {
std::cout << (((v >> (u64)i) & 0x1ULL) ? "1" : "_"); std::cout << (((v >> (u64)i) & 0x1ULL) ? "1" : "_");
} }
std::cout << " " << msg << "\n"; std::cout << " " << msg << "\n";
} }
inline void dumpbits32_always(u32 v, const std::string& msg) { inline void dumpbits32_always(u32 v, const std::string &msg) {
for (u32 i = 0; i < 32; i++) { for (u32 i = 0; i < 32; i++) {
std::cout << (((v >> (u32)i) & 0x1ULL) ? "1" : "_"); std::cout << (((v >> (u32)i) & 0x1ULL) ? "1" : "_");
} }

View File

@ -429,11 +429,20 @@ succeed:
DEBUG_PRINTF("in succeed, depth = %d \n", depth); DEBUG_PRINTF("in succeed, depth = %d \n", depth);
// we annotate the root node // we annotate the root node
// depth--; // depth--;
// next line allows us to go back to the start
pj.write_tape(pj.containing_scope_offset[depth], 'r'); // r is root
// next line tells the root node how to go to the end // next line tells the root node how to go to the end
pj.annotate_previousloc(pj.containing_scope_offset[depth], pj.annotate_previousloc(pj.containing_scope_offset[depth],
pj.get_current_loc()); pj.get_current_loc());
// next line allows us to go back to the start
if(depth != 0) {
printf("internal bug\n");
abort();
}
if(pj.containing_scope_offset[depth] != 0) {
printf("internal bug\n");
abort();
}
pj.write_tape(pj.containing_scope_offset[depth], 'r'); // r is root
#ifdef DEBUG #ifdef DEBUG
pj.dump_tapes(); pj.dump_tapes();