Getting there slowly.

This commit is contained in:
Daniel Lemire 2018-12-11 22:39:39 -05:00
parent f983703a2e
commit 751dce98f5
6 changed files with 327 additions and 51 deletions

View File

@ -24,7 +24,7 @@ endif
MAINEXECUTABLES=parse minify json2json MAINEXECUTABLES=parse minify json2json
TESTEXECUTABLES=jsoncheck numberparsingcheck stringparsingcheck TESTEXECUTABLES=jsoncheck numberparsingcheck stringparsingcheck
COMPARISONEXECUTABLES=minifiercompetition parsingcompetition allparserscheckfile COMPARISONEXECUTABLES=minifiercompetition parsingcompetition parseandstatcompetition allparserscheckfile
HEADERS= include/simdjson/simdutf8check.h include/simdjson/stringparsing.h include/simdjson/numberparsing.h include/simdjson/jsonparser.h include/simdjson/common_defs.h include/simdjson/jsonioutil.h benchmark/benchmark.h benchmark/linux/linux-perf-events.h include/simdjson/parsedjson.h include/simdjson/stage1_find_marks.h include/simdjson/stage2_flatten.h include/simdjson/stage34_unified.h include/simdjson/jsoncharutils.h include/simdjson/jsonformatutils.h HEADERS= include/simdjson/simdutf8check.h include/simdjson/stringparsing.h include/simdjson/numberparsing.h include/simdjson/jsonparser.h include/simdjson/common_defs.h include/simdjson/jsonioutil.h benchmark/benchmark.h benchmark/linux/linux-perf-events.h include/simdjson/parsedjson.h include/simdjson/stage1_find_marks.h include/simdjson/stage2_flatten.h include/simdjson/stage34_unified.h include/simdjson/jsoncharutils.h include/simdjson/jsonformatutils.h
LIBFILES=src/jsonioutil.cpp src/jsonparser.cpp src/stage1_find_marks.cpp src/stage2_flatten.cpp src/stage34_unified.cpp LIBFILES=src/jsonioutil.cpp src/jsonparser.cpp src/stage1_find_marks.cpp src/stage2_flatten.cpp src/stage34_unified.cpp
@ -103,6 +103,10 @@ json2json: tools/json2json.cpp $(HEADERS) $(LIBFILES)
ujdecode.o: $(UJSON4C_INCLUDE) ujdecode.o: $(UJSON4C_INCLUDE)
$(CC) $(CFLAGS) -c dependencies/ujson4c/src/ujdecode.c $(CC) $(CFLAGS) -c dependencies/ujson4c/src/ujdecode.c
parseandstatcompetition: benchmark/parseandstatcompetition.cpp $(HEADERS) $(LIBFILES) $(OBJECTS)
$(CXX) $(CXXFLAGS) -o parseandstatcompetition $(LIBFILES) benchmark/parseandstatcompetition.cpp $(OBJECTS) -I. $(LIBFLAGS)
parsingcompetition: benchmark/parsingcompetition.cpp $(HEADERS) $(LIBFILES) $(OBJECTS) parsingcompetition: benchmark/parsingcompetition.cpp $(HEADERS) $(LIBFILES) $(OBJECTS)
$(CXX) $(CXXFLAGS) -o parsingcompetition $(LIBFILES) benchmark/parsingcompetition.cpp $(OBJECTS) -I. $(LIBFLAGS) $(CXX) $(CXXFLAGS) -o parsingcompetition $(LIBFILES) benchmark/parsingcompetition.cpp $(OBJECTS) -I. $(LIBFLAGS)

View File

@ -69,7 +69,10 @@ make parsingcompetition
``` ```
## Limitations ## Scope
We provide a fast parser. It fully validates the input according to the various specifications.
The parser builds a useful immutable (read-only) DOM (document-object model) which can be later accessed.
To simplify the engineering, we make some assumptions. To simplify the engineering, we make some assumptions.
@ -78,6 +81,9 @@ To simplify the engineering, we make some assumptions.
- We only support GNU GCC and LLVM Clang at this time. There is no support for Microsoft Visual Studio, though it should not be difficult (help is invited). - We only support GNU GCC and LLVM Clang at this time. There is no support for Microsoft Visual Studio, though it should not be difficult (help is invited).
- In cases of failure, we just report a failure without any indication as to the nature of the problem. (This can be easily improved without affecting performance.) - In cases of failure, we just report a failure without any indication as to the nature of the problem. (This can be easily improved without affecting performance.)
*We do not aim to provide a general-purpose JSON library.*
## Features ## Features
- The input string is unmodified. (Parsers like sajson and RapidJSON use the input string as a buffer.) - The input string is unmodified. (Parsers like sajson and RapidJSON use the input string as a buffer.)

View File

@ -0,0 +1,196 @@
#include <unistd.h>

#include <string>

#include "simdjson/jsonparser.h"
#include "benchmark.h"
// #define RAPIDJSON_SSE2 // bad for performance
// #define RAPIDJSON_SSE42 // bad for performance
#include "rapidjson/document.h"
#include "rapidjson/reader.h"
#include "rapidjson/stringbuffer.h"
#include "rapidjson/writer.h"
#include "sajson.h"
using namespace rapidjson;
using namespace std;
// Per-document statistics gathered by the *_computestats functions below:
// one counter per kind of JSON node, plus a flag telling whether the
// document parsed successfully.
//
// Every member carries a default initializer so that a freshly declared
// stat_t (including one returned early on a parse failure) never exposes
// indeterminate values.
struct stat_s {
  size_t number_count = 0; // integer and floating-point nodes
  size_t object_count = 0; // '{' nodes
  size_t array_count = 0;  // '[' nodes
  size_t null_count = 0;   // null literals
  size_t true_count = 0;   // true literals
  size_t false_count = 0;  // false literals
  bool valid = false;      // set to true only once parsing has succeeded
};
typedef struct stat_s stat_t;
// Parses `p` with simdjson and walks the resulting tape once, counting the
// nodes of each kind.
//
// Returns a stat_t whose `valid` flag is false (and whose counters are all
// zero) when the document does not parse. `valid` is also cleared if the walk
// meets a tape word whose type byte is not one of the cases handled below.
stat_t simdjson_computestats(const std::string_view & p) {
  stat_t answer;
  // Zero the counters before anything can return, so that callers never read
  // indeterminate values on the failure path.
  answer.number_count = 0;
  answer.object_count = 0;
  answer.array_count = 0;
  answer.null_count = 0;
  answer.true_count = 0;
  answer.false_count = 0;
  ParsedJson pj = build_parsed_json(p);
  answer.valid = pj.isValid();
  if (!answer.valid) {
    return answer;
  }
  // Tape word 0 is the root node: its top byte is the type code and its low
  // bits (JSONVALUEMASK) give the upper bound used for the walk.
  size_t tapeidx = 0;
  u64 tape_val = pj.tape[tapeidx++];
  u8 type = (tape_val >> 56);
  assert(type == 'r');
  const size_t howmany = tape_val & JSONVALUEMASK;
  // tapeidx is already 1 thanks to the post-increment above; incrementing it
  // again would skip the first real node of the document.
  for (; tapeidx < howmany; tapeidx++) {
    tape_val = pj.tape[tapeidx];
    type = (tape_val >> 56);
    switch (type) {
    case 'l': // we have a long int
      answer.number_count++;
      tapeidx++; // the next tape word is the raw integer, not a node
      break;
    case 'd': // we have a double
      answer.number_count++;
      tapeidx++; // the next tape word is the raw double, not a node
      break;
    case 'n': // we have a null
      answer.null_count++;
      break;
    case 't': // we have a true
      answer.true_count++;
      break;
    case 'f': // we have a false
      answer.false_count++;
      break;
    case '"': // we have a string: a legitimate node, but not counted here
      break;
    case '{': // we start an object
      answer.object_count++;
      break;
    case '}': // we end an object
      break;
    case '[': // we start an array
      answer.array_count++;
      break;
    case ']': // we end an array
      break;
    default: // unknown type byte: report the tape as unusable
      answer.valid = false;
      return answer;
    }
  }
  return answer;
}
// Parses `p` with RapidJSON (in-situ, with encoding validation) and reports
// whether the document is valid.
//
// NOTE: the traversal that would fill in the per-type counters is not
// implemented yet, so on success every counter is still zero and only
// `valid` is meaningful.
stat_t rapid_computestats(const std::string_view & p) {
  stat_t answer;
  // Zero the counters before anything can return, so that callers never read
  // indeterminate values.
  answer.number_count = 0;
  answer.object_count = 0;
  answer.array_count = 0;
  answer.null_count = 0;
  answer.true_count = 0;
  answer.false_count = 0;
  // In-situ parsing writes into its input and expects a NUL-terminated
  // buffer; a string_view offers neither, so parse a private mutable copy.
  // The copy must outlive `d`, which may point into it.
  std::string buffer(p);
  rapidjson::Document d;
  d.ParseInsitu<kParseValidateEncodingFlag>(&buffer[0]);
  answer.valid = ! d.HasParseError();
  if(!answer.valid) {
    return answer;
  }
  // TODO: walk `d` and populate the counters.
  return answer;
}
// Benchmark driver: loads the JSON file named on the command line and times
// several parsers on it with BEST_TIME.
//
// Usage: <program> [-v] <jsonfile>
//   -v  also print the size of the input.
// Returns EXIT_FAILURE when the file cannot be loaded or memory cannot be
// allocated, EXIT_SUCCESS otherwise.
int main(int argc, char *argv[]) {
  bool verbose = false;
  // No command-line option sets this flag here, so the `if(all)` benchmarks
  // below never run in this program.
  bool all = false;
  int c;
  while ((c = getopt (argc, argv, "v")) != -1) {
    switch (c)
    {
    case 'v':
      verbose = true;
      break;
    default:
      abort ();
    }
  }
  if (optind >= argc) {
    cerr << "Usage: " << argv[0] << " <jsonfile>\n";
    cerr << "Or " << argv[0] << " -v <jsonfile>\n";
    exit(1);
  }
  const char * filename = argv[optind];
  if(optind + 1 < argc) {
    cerr << "warning: ignoring everything after " << argv[optind + 1] << endl;
  }
  std::string_view p;
  try {
    p = get_corpus(filename);
  } catch (const std::exception& e) { // caught by reference to base
    std::cout << "Could not load the file " << filename << std::endl;
    return EXIT_FAILURE;
  }
  if (verbose) {
    std::cout << "Input has ";
    if (p.size() > 1024 * 1024)
      std::cout << p.size() / (1024 * 1024) << " MB ";
    else if (p.size() > 1024)
      std::cout << p.size() / 1024 << " KB ";
    else
      std::cout << p.size() << " B ";
    std::cout << std::endl;
  }
  ParsedJson pj;
  bool allocok = pj.allocateCapacity(p.size(), 1024);
  if (!allocok) {
    std::cerr << "can't allocate memory" << std::endl;
    free((void*)p.data()); // the corpus is released with free() on every exit path
    return EXIT_FAILURE;
  }
  int repeat = 10;
  int volume = p.size();
  BEST_TIME("simdjson (dynamic mem) ", build_parsed_json(p).isValid(), true, , repeat, volume, true);
  BEST_TIME("simdjson (static alloc) ", json_parse(p, pj), true, , repeat, volume, true);
  rapidjson::Document d;
  // Scratch copy of the input: the in-situ parsers overwrite it, so each
  // BEST_TIME below restores it with memcpy before every run.
  char *buffer = (char *)malloc(p.size() + 1);
  if (buffer == nullptr) {
    std::cerr << "can't allocate memory" << std::endl;
    free((void*)p.data());
    return EXIT_FAILURE;
  }
  memcpy(buffer, p.data(), p.size());
  buffer[p.size()] = '\0';
  BEST_TIME("RapidJSON",
            d.Parse<kParseValidateEncodingFlag>((const char *)buffer).HasParseError(),
            false, memcpy(buffer, p.data(), p.size()), repeat, volume, true);
  BEST_TIME("RapidJSON (insitu)", d.ParseInsitu<kParseValidateEncodingFlag>(buffer).HasParseError(), false,
            memcpy(buffer, p.data(), p.size()), repeat, volume, true);
  BEST_TIME("sajson (dynamic mem, insitu)", sajson::parse(sajson::dynamic_allocation(), sajson::mutable_string_view(p.size(), buffer)).is_valid(), true, memcpy(buffer, p.data(), p.size()), repeat, volume, true);
  size_t astbuffersize = p.size();
  size_t * ast_buffer = (size_t *) malloc(astbuffersize * sizeof(size_t));
  if (ast_buffer == nullptr) {
    std::cerr << "can't allocate memory" << std::endl;
    free(buffer);
    free((void*)p.data());
    return EXIT_FAILURE;
  }
  BEST_TIME("sajson (static alloc, insitu)", sajson::parse(sajson::bounded_allocation(ast_buffer, astbuffersize), sajson::mutable_string_view(p.size(), buffer)).is_valid(), true, memcpy(buffer, p.data(), p.size()), repeat, volume, true);
  std::string json11err;
  if(all) BEST_TIME("dropbox (json11) ", (( json11::Json::parse(buffer,json11err).is_null() ) || ( ! json11err.empty() )), false, memcpy(buffer, p.data(), p.size()), repeat, volume, true);
  if(all) BEST_TIME("fastjson ", fastjson_parse(buffer), true, memcpy(buffer, p.data(), p.size()), repeat, volume, true);
  JsonValue value;
  JsonAllocator allocator;
  char *endptr;
  if(all) BEST_TIME("gason ", jsonParse(buffer, &endptr, &value, allocator), JSON_OK, memcpy(buffer, p.data(), p.size()), repeat, volume, true);
  void *state;
  if(all) BEST_TIME("ultrajson ", (UJDecode(buffer, p.size(), NULL, &state) == NULL), false, memcpy(buffer, p.data(), p.size()), repeat, volume, true);
  BEST_TIME("memcpy ", (memcpy(buffer, p.data(), p.size()) == buffer), true, , repeat, volume, true);
  free((void*)p.data());
  free(ast_buffer);
  free(buffer);
  return EXIT_SUCCESS;
}

View File

@ -97,7 +97,8 @@ int main(int argc, char *argv[]) {
char *buffer = (char *)malloc(p.size() + 1); char *buffer = (char *)malloc(p.size() + 1);
memcpy(buffer, p.data(), p.size()); memcpy(buffer, p.data(), p.size());
buffer[p.size()] = '\0'; buffer[p.size()] = '\0';
//
// Todo: It is possible to preallocate a block of memory with RapidJSON using a MemoryAllocator.
BEST_TIME("RapidJSON", BEST_TIME("RapidJSON",
d.Parse<kParseValidateEncodingFlag>((const char *)buffer).HasParseError(), d.Parse<kParseValidateEncodingFlag>((const char *)buffer).HasParseError(),
false, memcpy(buffer, p.data(), p.size()), repeat, volume, true); false, memcpy(buffer, p.data(), p.size()), repeat, volume, true);

View File

@ -283,7 +283,7 @@ public:
} }
// all elements are stored on the tape using a 64-bit word. // all nodes are stored on the tape using a 64-bit word.
// //
// strings, double and ints are stored as // strings, double and ints are stored as
// a 64-bit word with a pointer to the actual value // a 64-bit word with a pointer to the actual value
@ -327,18 +327,20 @@ public:
explicit iterator(ParsedJson &pj_) explicit iterator(ParsedJson &pj_)
: pj(pj_), depth(0), location(0), tape_length(0), depthindex(NULL) { : pj(pj_), depth(0), location(0), tape_length(0), depthindex(NULL) {
if(pj.isValid()) { if(pj.isValid()) {
depthindex = new size_t[pj.depthcapacity]; depthindex = new scopeindex_t[pj.depthcapacity];
if(depthindex == NULL) return; if(depthindex == NULL) return;
depthindex[0] = 0; depthindex[0].start_of_scope = location;
current_val = pj.tape[location++]; current_val = pj.tape[location++];
current_type = (current_val >> 56); current_type = (current_val >> 56);
depthindex[0].scope_type = current_type;
if (current_type == 'r') { if (current_type == 'r') {
tape_length = current_val & JSONVALUEMASK; tape_length = current_val & JSONVALUEMASK;
if(location < tape_length) { if(location < tape_length) {
current_val = pj.tape[location]; current_val = pj.tape[location];
current_type = (current_val >> 56); current_type = (current_val >> 56);
depth++; depth++;
depthindex[depth] = location; depthindex[depth].start_of_scope = location;
depthindex[depth].scope_type = current_type;
} }
} }
} }
@ -351,7 +353,7 @@ public:
pj(o.pj), depth(o.depth), location(o.location), pj(o.pj), depth(o.depth), location(o.location),
tape_length(o.tape_length), current_type(o.current_type), tape_length(o.tape_length), current_type(o.current_type),
current_val(o.current_val), depthindex(NULL) { current_val(o.current_val), depthindex(NULL) {
depthindex = new size_t[pj.depthcapacity]; depthindex = new scopeindex_t[pj.depthcapacity];
if(depthindex != NULL) { if(depthindex != NULL) {
memcpy(o.depthindex, depthindex, pj.depthcapacity * sizeof(depthindex[0])); memcpy(o.depthindex, depthindex, pj.depthcapacity * sizeof(depthindex[0]));
} else { } else {
@ -365,24 +367,104 @@ public:
current_val(o.current_val), depthindex(o.depthindex) { current_val(o.current_val), depthindex(o.depthindex) {
o.depthindex = NULL;// we take ownship o.depthindex = NULL;// we take ownship
} }
WARN_UNUSED WARN_UNUSED
bool isOk() const { bool isOk() const {
return location < tape_length; return location < tape_length;
} }
// useful for debugging purposes
size_t get_tape_location() const { size_t get_tape_location() const {
return location; return location;
} }
size_t get_tape_lenght() const { // useful for debuging purposes
size_t get_tape_length() const {
return tape_length; return tape_length;
} }
// return true if we can do the navigation, false // returns the current depth (start at 1 with 0 reserved for the fictitious root node)
// Returns the iterator's current nesting depth (the `depth` member).
// The constructor starts it at 0 and bumps it to 1 when it steps onto the
// first node below the root; entering a '[' or '{' scope increments it.
size_t get_depth() const {
return depth;
}
// Reports the type code recorded for the scope the iterator is currently in.
// A scope is a run of nodes sharing one depth, normally the content of an
// object ('{') or of an array ('['); the root scope carries the type 'r'.
u8 get_scope_type() const {
  const auto & current_scope = depthindex[depth];
  return current_scope.scope_type;
}
// Moves to the next node in document order, descending into arrays and
// objects and climbing back out of them as their closing markers are met.
//
// Returns true when the iterator now sits on a node, false when the end of
// the tape has been reached (or the walk climbed back to depth 0). When false
// is returned from inside the closing-marker loop, the iterator has already
// been advanced onto a closing marker.
WARN_UNUSED
bool move_forward() {
  // Integer ('l') and double ('d') nodes take two tape words: the node itself
  // followed by its raw 64-bit payload (see get_integer/get_double, which read
  // location + 1). Step over the payload so it is never decoded as a node.
  const size_t step = ((current_type == 'l') || (current_type == 'd')) ? 2 : 1;
  if(location + step >= tape_length) {
    return false; // we are at the end!
  }
  // we are entering a new scope
  if ((current_type == '[') || (current_type == '{')){
    depth++;
    depthindex[depth].start_of_scope = location;
    depthindex[depth].scope_type = current_type;
  }
  location = location + step;
  current_val = pj.tape[location];
  current_type = (current_val >> 56);
  // if we encounter a scope closure, we need to move up
  while ((current_type == ']') || (current_type == '}')) {
    if(location + 1 >= tape_length) {
      return false; // we are at the end!
    }
    depth--;
    if(depth == 0) {
      return false; // should not be necessary
    }
    location = location + 1;
    current_val = pj.tape[location];
    current_type = (current_val >> 56);
  }
  return true;
}
// retrieve the character code of what we're looking at:
// [{"sltfn are the possibilities
// The code is the top byte of the current tape word (current_val >> 56), as
// cached in current_type whenever the iterator moves. Other code in this
// class also tests for 'd' (double), ']' and '}' (scope ends) and 'r' (root).
WARN_UNUSED
really_inline u8 get_type() const {
return current_type;
}
// Reads the signed 64-bit integer stored in the tape word right after the
// current node. Only meaningful when the current node has type "l".
// Falls back to 0 when that word would lie beyond the end of the tape.
WARN_UNUSED
really_inline s64 get_integer() const {
  const size_t payload_index = location + 1;
  if (payload_index < tape_length) {
    return (s64) pj.tape[payload_index];
  }
  return 0; // default value in case of error
}
// Reads the double stored in the tape word right after the current node.
// Only meaningful when the current node has type "d".
// Falls back to NAN when that word would lie beyond the end of the tape.
WARN_UNUSED
really_inline double get_double() const {
  const size_t payload_index = location + 1;
  if (payload_index >= tape_length) {
    return NAN; // default value in case of error
  }
  // Copy the bits out of the 64-bit tape word rather than casting pointers.
  double decoded;
  memcpy(&decoded, &pj.tape[payload_index], sizeof(decoded));
  return decoded;
}
// Returns the NUL-terminated string attached to this node; only meaningful
// when the current node has type ".
// The low bits of the tape word (JSONVALUEMASK) are used as an offset into
// pj.string_buf.
// Note that tabs and line endings are escaped in the returned value
// (see print_with_escapes); the returned value is valid UTF-8.
WARN_UNUSED
really_inline const char * get_string() const {
  const u64 string_offset = current_val & JSONVALUEMASK;
  return (const char *)(pj.string_buf + string_offset);
}
// throughout return true if we can do the navigation, false
// otherwise // otherwise
// withing a give scope, we move forward // Withing a given scope (series of nodes at the same depth within either an
// valid if we're not at the end of a scope (returns true) // array or an object), we move forward.
// Thus, given [true, null, {"a":1}, [1,2]], we would visit true, null, { and [.
// At the object ({) or at the array ([), you can issue a "down" to visit their content.
// valid if we're not at the end of a scope (returns true).
WARN_UNUSED WARN_UNUSED
really_inline bool next() { really_inline bool next() {
if ((current_type == '[') || (current_type == '{')){ if ((current_type == '[') || (current_type == '{')){
@ -415,17 +497,22 @@ public:
} }
} }
// valid if we're not at the start of a scope
// Within a given scope (series of nodes at the same depth within either an
// array or an object), we move backward.
// Thus, given [true, null, {"a":1}, [1,2]], we would visit ], }, null, true when starting at the end
// of the scope.
// At the object ({) or at the array ([), you can issue a "down" to visit their content.
WARN_UNUSED WARN_UNUSED
really_inline bool prev() { really_inline bool prev() {
if(location - 1 < depthindex[depth]) return false; if(location - 1 < depthindex[depth].start_of_scope) return false;
location -= 1; location -= 1;
current_val = pj.tape[location]; current_val = pj.tape[location];
current_type = (current_val >> 56); current_type = (current_val >> 56);
if ((current_type == ']') || (current_type == '}')){ if ((current_type == ']') || (current_type == '}')){
// we need to jump // we need to jump
size_t new_location = ( current_val & JSONVALUEMASK); size_t new_location = ( current_val & JSONVALUEMASK);
if(new_location < depthindex[depth]) { if(new_location < depthindex[depth].start_of_scope) {
return false; // shoud never happen return false; // shoud never happen
} }
location = new_location; location = new_location;
@ -435,8 +522,10 @@ public:
return true; return true;
} }
// Moves back to either the containing array or object (type { or [) from
// valid unless we are at the first level of the document // within a contained scope.
// Valid unless we are at the first level of the document
//
WARN_UNUSED WARN_UNUSED
really_inline bool up() { really_inline bool up() {
if(depth == 1) { if(depth == 1) {
@ -452,8 +541,10 @@ public:
} }
// valid if we're at a [ or { and it starts a non-empty scope; moves us to start of // Valid if we're at a [ or { and it starts a non-empty scope; moves us to start of
// that deeper scope if it not empty // that deeper scope if it not empty.
// Thus, given [true, null, {"a":1}, [1,2]], if we are at the { node, we would move to the
// "a" node.
WARN_UNUSED WARN_UNUSED
really_inline bool down() { really_inline bool down() {
if(location + 1 >= tape_length) return false; if(location + 1 >= tape_length) return false;
@ -464,7 +555,8 @@ public:
} }
depth++; depth++;
location = location + 1; location = location + 1;
depthindex[depth] = location; depthindex[depth].start_of_scope = location;
depthindex[depth].scope_type = current_type;
current_val = pj.tape[location]; current_val = pj.tape[location];
current_type = (current_val >> 56); current_type = (current_val >> 56);
return true; return true;
@ -472,9 +564,10 @@ public:
return false; return false;
} }
// move us to the start of our current scope // move us to the start of our current scope,
// a scope is a series of nodes at the same level
void to_start_scope() { void to_start_scope() {
location = depthindex[depth]; location = depthindex[depth].start_of_scope;
current_val = pj.tape[location]; current_val = pj.tape[location];
current_type = (current_val >> 56); current_type = (current_val >> 56);
} }
@ -522,33 +615,7 @@ public:
return true; return true;
} }
// retrieve the character code of what we're looking at: typedef struct {size_t start_of_scope; u8 scope_type;} scopeindex_t;
// [{"sltfn are the possibilities
really_inline u8 get_type() const {
return current_type;
}
// get the s64 value at this node; valid only if we're at "l"
really_inline s64 get_integer() const {
if(location + 1 >= tape_length) return 0;// default value in case of error
return (s64) pj.tape[location + 1];
}
// get the double value at this node; valid only if
// we're at "d"
really_inline double get_double() const {
if(location + 1 >= tape_length) return NAN;// default value in case of error
double answer;
memcpy(&answer, & pj.tape[location + 1], sizeof(answer));
return answer;
}
// get the string value at this node (NULL ended); valid only if we're at "
// note that tabs, and line endings are escaped in the returned value (see print_with_escapes)
// return value is valid UTF-8
really_inline const char * get_string() const {
return (const char *)(pj.string_buf + (current_val & JSONVALUEMASK)) ;
}
private: private:
@ -560,7 +627,7 @@ private:
size_t tape_length; size_t tape_length;
u8 current_type; u8 current_type;
u64 current_val; u64 current_val;
size_t *depthindex; scopeindex_t *depthindex;
}; };

View File

@ -23,6 +23,7 @@ void compute_dump(ParsedJson::iterator &pjh) {
} }
// we have a non-empty scope and we are at the beginning of it // we have a non-empty scope and we are at the beginning of it
if (inobject) { if (inobject) {
assert(pjh.get_scope_type() == '{');
std::cout << "{"; std::cout << "{";
assert(pjh.get_type() == '"'); assert(pjh.get_type() == '"');
pjh.print(std::cout); // must be a string pjh.print(std::cout); // must be a string
@ -39,6 +40,7 @@ void compute_dump(ParsedJson::iterator &pjh) {
} }
std::cout << "}"; std::cout << "}";
} else { } else {
assert(pjh.get_scope_type() == '[');
std::cout << "["; std::cout << "[";
compute_dump(pjh); // let us recurse compute_dump(pjh); // let us recurse
while (pjh.next()) { while (pjh.next()) {