From e979a0c93f69c28b1981e90c8116ac1a934d0e65 Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Wed, 19 Dec 2018 00:40:04 -0500 Subject: [PATCH] Simplifying the build --- Makefile | 6 ++-- README.md | 2 +- benchmark/parseandstatcompetition.cpp | 5 ++- benchmark/parsingcompetition.cpp | 52 +++++++++------------------ scripts/bar.gnuplot | 2 +- tape.md | 35 +++++++++--------- 6 files changed, 45 insertions(+), 57 deletions(-) diff --git a/Makefile b/Makefile index cb37f5bc..ff5d8b52 100644 --- a/Makefile +++ b/Makefile @@ -116,8 +116,10 @@ distinctuseridcompetition: benchmark/distinctuseridcompetition.cpp $(HEADERS) $( $(CXX) $(CXXFLAGS) -o distinctuseridcompetition $(LIBFILES) benchmark/distinctuseridcompetition.cpp -I. $(LIBFLAGS) $(COREDEPSINCLUDE) -parsingcompetition: benchmark/parsingcompetition.cpp $(HEADERS) $(LIBFILES) $(EXTRAOBJECTS) - $(CXX) $(CXXFLAGS) -o parsingcompetition $(LIBFILES) benchmark/parsingcompetition.cpp $(EXTRAOBJECTS) -I. $(LIBFLAGS) $(COREDEPSINCLUDE) $(EXTRADEPSINCLUDE) +parsingcompetition: benchmark/parsingcompetition.cpp $(HEADERS) $(LIBFILES) #$(EXTRAOBJECTS) + $(CXX) $(CXXFLAGS) -o parsingcompetition $(LIBFILES) benchmark/parsingcompetition.cpp -I. $(LIBFLAGS) $(COREDEPSINCLUDE) +#$(EXTRADEPSINCLUDE) +#$(EXTRAOBJECTS) allparserscheckfile: tests/allparserscheckfile.cpp $(HEADERS) $(LIBFILES) $(EXTRAOBJECTS) $(CXX) $(CXXFLAGS) -o allparserscheckfile $(LIBFILES) tests/allparserscheckfile.cpp $(EXTRAOBJECTS) -I. $(LIBFLAGS) $(COREDEPSINCLUDE) $(EXTRADEPSINCLUDE) diff --git a/README.md b/README.md index 91f9a766..72b95936 100644 --- a/README.md +++ b/README.md @@ -78,7 +78,7 @@ make benchmark ## Tools - `json2json mydoc.json` parses the document, constructs a model and then dumps back the result to standard output. -- `json2json -d mydoc.json` parses the document, constructs a model and then dumps model (as a tape) to standard output. The tape format is described in the accompanying file tape.md. 
+- `json2json -d mydoc.json` parses the document, constructs a model and then dumps model (as a tape) to standard output. The tape format is described in the accompanying file `tape.md`. - `minify mydoc.json` minifies the JSON document, outputting the result to standard output. Minifying means to remove the unneeded white space charaters. ## Scope diff --git a/benchmark/parseandstatcompetition.cpp b/benchmark/parseandstatcompetition.cpp index bd40be4a..800dd178 100644 --- a/benchmark/parseandstatcompetition.cpp +++ b/benchmark/parseandstatcompetition.cpp @@ -44,6 +44,7 @@ void print_stat(const stat_t &s) { s.true_count, s.false_count); } +__attribute__ ((noinline)) stat_t simdjson_computestats(const std::string_view &p) { stat_t answer; ParsedJson pj = build_parsed_json(p); @@ -145,6 +146,7 @@ void sajson_traverse(stat_t &stats, const sajson::value &node) { } } +__attribute__ ((noinline)) stat_t sasjon_computestats(const std::string_view &p) { stat_t answer; char *buffer = (char *)malloc(p.size()); @@ -202,6 +204,7 @@ void rapid_traverse(stat_t &stats, const rapidjson::Value &v) { } } +__attribute__ ((noinline)) stat_t rapid_computestats(const std::string_view &p) { stat_t answer; char *buffer = (char *)malloc(p.size() + 1); @@ -286,7 +289,7 @@ int main(int argc, char *argv[]) { } assert(stat_equal(s1, s2)); assert(stat_equal(s1, s3)); - int repeat = 10; + int repeat = 50; int volume = p.size(); BEST_TIME("simdjson ", simdjson_computestats(p).valid, true, , repeat, volume, !justdata); diff --git a/benchmark/parsingcompetition.cpp b/benchmark/parsingcompetition.cpp index eca35424..d15edb10 100644 --- a/benchmark/parsingcompetition.cpp +++ b/benchmark/parsingcompetition.cpp @@ -10,18 +10,24 @@ #include "rapidjson/stringbuffer.h" #include "rapidjson/writer.h" +#include "sajson.h" + +#ifdef ALLPARSER #include "fastjson.cpp" #include "fastjson_dom.cpp" #include "gason.cpp" #include "json11.cpp" -#include "sajson.h" extern "C" { #include "ujdecode.h" #include 
"ultrajsondec.c" } +#endif + using namespace rapidjson; using namespace std; + +#ifdef ALLPARSER // fastjson has a tricky interface void on_json_error(void *, const fastjson::ErrorContext &ec) { // std::cerr<<"ERROR: "<((const char *)buffer).HasParseError(), false, memcpy(buffer, p.data(), p.size()), repeat, volume, !justdata); - if(!justdata) BEST_TIME("RapidJSON (insitu)", + BEST_TIME("RapidJSON", d.ParseInsitu(buffer).HasParseError(), - false, memcpy(buffer, p.data(), p.size()), repeat, volume, !justdata); - typedef rapidjson::GenericDocument, rapidjson::MemoryPoolAllocator<>, - rapidjson::MemoryPoolAllocator<>> - RapidDocumentType; - size_t rapidvaallocsize = p.size() * 128; // allocate plenty of memory - size_t rapidallocsize = p.size() * 4096; // allocate plenty of memory - char *rapidvalueBuffer = (char *)malloc(rapidvaallocsize); - char *rapidparseBuffer = (char *)malloc(rapidallocsize); - if ((rapidvalueBuffer != NULL) && (rapidvalueBuffer != NULL)) { - rapidjson::MemoryPoolAllocator<> valueAllocator(rapidvalueBuffer, - rapidvaallocsize); - rapidjson::MemoryPoolAllocator<> parseAllocator(rapidparseBuffer, - rapidallocsize); - RapidDocumentType preallocedd(&valueAllocator, rapidvaallocsize, - &parseAllocator); - - if(!justdata) BEST_TIME( - "RapidJSON (static alloc)", - preallocedd.Parse((const char *)buffer) - .HasParseError(), - false, memcpy(buffer, p.data(), p.size()), repeat, volume, !justdata); - // (static alloc, insitu) - BEST_TIME("RapidJSON", - preallocedd.ParseInsitu(buffer) - .HasParseError(), - false, memcpy(buffer, p.data(), p.size()), repeat, volume, !justdata); - assert(valueAllocator.Size() <= rapidvaallocsize); - assert(parseAllocator.Size() <= rapidallocsize); - } - free(rapidvalueBuffer); - free(rapidparseBuffer); + false, memcpy(buffer, p.data(), p.size()) && (buffer[p.size()] = '\0'), repeat, volume, !justdata); if(!justdata) BEST_TIME("sajson (dynamic mem, insitu)", sajson::parse(sajson::dynamic_allocation(), 
sajson::mutable_string_view(p.size(), buffer)) @@ -154,6 +131,8 @@ int main(int argc, char *argv[]) { sajson::mutable_string_view(p.size(), buffer)) .is_valid(), true, memcpy(buffer, p.data(), p.size()), repeat, volume, !justdata); +#ifdef ALLPARSER + std::string json11err; if (all) BEST_TIME("dropbox (json11) ", @@ -176,6 +155,7 @@ int main(int argc, char *argv[]) { BEST_TIME("ultrajson ", (UJDecode(buffer, p.size(), NULL, &state) == NULL), false, memcpy(buffer, p.data(), p.size()), repeat, volume, !justdata); +#endif if(!justdata) BEST_TIME("memcpy ", (memcpy(buffer, p.data(), p.size()) == buffer), true, , repeat, volume, !justdata); diff --git a/scripts/bar.gnuplot b/scripts/bar.gnuplot index 9634c10b..aa5f941b 100644 --- a/scripts/bar.gnuplot +++ b/scripts/bar.gnuplot @@ -24,4 +24,4 @@ set format y "%0.1f"; set style line 1 lt rgb "#A0A0A0" lw 1 pt 1 ps 1 -plot filename using 0:2:xtic(1) with boxes notitle ls 1, '' using 0:(1):(sprintf("%.1f", $2)) with labels notitle +plot filename using 0:2:xtic(1) with boxes notitle ls 1, '' using 0:(1):(sprintf("%.2g", $2)) with labels notitle diff --git a/tape.md b/tape.md index 3bc2f65d..f673de14 100644 --- a/tape.md +++ b/tape.md @@ -1,13 +1,15 @@ # Tape structure in simdjson -We parse a JSON document to a tape. A tape is an array of 64-bit values. Each node encountered in the JSON document is written to the tape using one or more 64-bit tape elements; the layout of the tape is in "document order". Throughout, little endian encoding is assumed. The tape is indexed starting at 0 (the first element is at index 0). +We parse a JSON document to a tape. A tape is an array of 64-bit values. Each node encountered in the JSON document is written to the tape using one or more 64-bit tape elements; the layout of the tape is in "document order": elements are stored as they are encountered in the JSON document. + +Throughout, little endian encoding is assumed. The tape is indexed starting at 0 (the first element is at index 0). 
## Example It is sometimes useful to start with an example. Consider the following JSON document: -``` +```json { "Image": { "Width": 800, @@ -26,7 +28,7 @@ It is sometimes useful to start with an example. Consider the following JSON doc The following is a dump of the content of the tape, with the first number of each line representing the index of a tape element. -``` +```bash $ ./json2json -d jsonexamples/small/demo.json 0 : r // pointing to 38 (right after last node) 1 : { // pointing to next tape location 38 (first node after the scope) @@ -64,34 +66,35 @@ $ ./json2json -d jsonexamples/small/demo.json ## General formal of the tape elements -Most tape elements are written as ('c' << 56) + x where 'c' is some ASCII character determining the type of the element and where x is a 56-bit value called the payload. +Most tape elements are written as `('c' << 56) + x` where `'c'` is some ASCII character determining the type of the element (out of 't', 'f', 'n', 'l', 'd', '"', '{', '}', '[', ']' ,'r') and where `x` is a 56-bit value called the payload. The payload is normally interpreted as an unsigned 56-bit integer. Note that 56-bit integers can be quite large. +Performance consideration: We believe that accessing the tape in regular units of 64 bits is more important for performance than saving memory. + ## Simple JSON values Simple JSON nodes are represented with one tape element: -- null is represented as the 64-bit value ('n' << 56) where 'n' is the 8-bit code point values (in ASCII) corresponding to the letter 'n'. -- true is represented as the 64-bit value ('t' << 56). -- false is represented as the 64-bit value ('f' << 56). +- null is represented as the 64-bit value `('n' << 56)` where `'n'` is the 8-bit code point values (in ASCII) corresponding to the letter `'n'`. +- true is represented as the 64-bit value `('t' << 56)`. +- false is represented as the 64-bit value `('f' << 56)`. 
-Performance consideration: It is somewhat wasteful to use 64-bit tape elements to store values that would require far less storage. However, we believe that this has no significant performance impact in most practical applications. ## Integer and Double values Integer values are represented as two 64-bit tape elements: -- The 64-bit value ('l' << 56) followed by the 64-bit integer value litterally. Integer values are assumed to be signed 64-bit values, using two's complement notation. +- The 64-bit value `('l' << 56)` followed by the 64-bit integer value literally. Integer values are assumed to be signed 64-bit values, using two's complement notation. Float values are represented as two 64-bit tape elements: -- The 64-bit value ('d' << 56) followed by the 64-bit double value litterally in standard IEEE 754 notation. +- The 64-bit value `('d' << 56)` followed by the 64-bit double value literally in standard IEEE 754 notation. -Performance consideration: We store numbers of the main tape because we believe that locality of reference is helpful for performance. The format is somewhat storage wasteful as 56 bits are ignored. +Performance consideration: We store numbers on the main tape because we believe that locality of reference is helpful for performance. ## Root node -Each JSON document will have two special 64-bit tape element representing a root node, one at the beginning and one at the end. +Each JSON document will have two special 64-bit tape elements representing a root node, one at the beginning and one at the end. -- The first 64-bit tape element contains the value ('r'<<56) + x where x is the location on the tape of the last root element. +- The first 64-bit tape element contains the value `('r'<<56) + x` where `x` is the location on the tape of the last root element. - The last 64-bit tape element contains the value ('r'<< 56). All of the parsed document is located between these two 64-bit tape elements. 
@@ -101,7 +104,7 @@ Hint: we can read the first tape element to determine the length of the tape. ## Strings -We store string values using UTF-8 encoding with null termination on a separate tape. A string value is represented on the main tape as the 64-bit tape element ('"'<< 56) + x where x is the location on the string tape of the null-terminated string. +We store string values using UTF-8 encoding with null termination on a separate tape. A string value is represented on the main tape as the 64-bit tape element `('"'<< 56) + x` where the payload `x` is the location on the string tape of the null-terminated string. ## Arrays @@ -118,8 +121,8 @@ Performance consideration: We can skip the content of an array entirely by acces JSON objects are represented using two 64-bit tape elements. -- The first 64-bit tape element contains the value ('{' << 56) + x where the payload x is 1 + the index of the second 64-bit tape element on the tape. -- The second 64-bit tape element contains the value ('{' << 56) + x where the payload x contains the index of the first 64-bit tape element on the tape. +- The first 64-bit tape element contains the value `('{' << 56) + x` where the payload `x` is 1 + the index of the second 64-bit tape element on the tape. +- The second 64-bit tape element contains the value `('}' << 56) + x` where the payload `x` contains the index of the first 64-bit tape element on the tape. In-between these two tape elements, we alternate between key (which must strings) and values. A value could be an object or an array.