Simplifying the build

This commit is contained in:
Daniel Lemire 2018-12-19 00:40:04 -05:00
parent ea8000501b
commit e979a0c93f
6 changed files with 45 additions and 57 deletions

View File

@ -116,8 +116,10 @@ distinctuseridcompetition: benchmark/distinctuseridcompetition.cpp $(HEADERS) $(
$(CXX) $(CXXFLAGS) -o distinctuseridcompetition $(LIBFILES) benchmark/distinctuseridcompetition.cpp -I. $(LIBFLAGS) $(COREDEPSINCLUDE) $(CXX) $(CXXFLAGS) -o distinctuseridcompetition $(LIBFILES) benchmark/distinctuseridcompetition.cpp -I. $(LIBFLAGS) $(COREDEPSINCLUDE)
parsingcompetition: benchmark/parsingcompetition.cpp $(HEADERS) $(LIBFILES) $(EXTRAOBJECTS) parsingcompetition: benchmark/parsingcompetition.cpp $(HEADERS) $(LIBFILES) #$(EXTRAOBJECTS)
$(CXX) $(CXXFLAGS) -o parsingcompetition $(LIBFILES) benchmark/parsingcompetition.cpp $(EXTRAOBJECTS) -I. $(LIBFLAGS) $(COREDEPSINCLUDE) $(EXTRADEPSINCLUDE) $(CXX) $(CXXFLAGS) -o parsingcompetition $(LIBFILES) benchmark/parsingcompetition.cpp -I. $(LIBFLAGS) $(COREDEPSINCLUDE)
#$(EXTRADEPSINCLUDE)
#$(EXTRAOBJECTS)
allparserscheckfile: tests/allparserscheckfile.cpp $(HEADERS) $(LIBFILES) $(EXTRAOBJECTS) allparserscheckfile: tests/allparserscheckfile.cpp $(HEADERS) $(LIBFILES) $(EXTRAOBJECTS)
$(CXX) $(CXXFLAGS) -o allparserscheckfile $(LIBFILES) tests/allparserscheckfile.cpp $(EXTRAOBJECTS) -I. $(LIBFLAGS) $(COREDEPSINCLUDE) $(EXTRADEPSINCLUDE) $(CXX) $(CXXFLAGS) -o allparserscheckfile $(LIBFILES) tests/allparserscheckfile.cpp $(EXTRAOBJECTS) -I. $(LIBFLAGS) $(COREDEPSINCLUDE) $(EXTRADEPSINCLUDE)

View File

@ -78,7 +78,7 @@ make benchmark
## Tools ## Tools
- `json2json mydoc.json` parses the document, constructs a model and then dumps back the result to standard output. - `json2json mydoc.json` parses the document, constructs a model and then dumps back the result to standard output.
- `json2json -d mydoc.json` parses the document, constructs a model and then dumps model (as a tape) to standard output. The tape format is described in the accompanying file tape.md. - `json2json -d mydoc.json` parses the document, constructs a model and then dumps model (as a tape) to standard output. The tape format is described in the accompanying file `tape.md`.
- `minify mydoc.json` minifies the JSON document, outputting the result to standard output. Minifying means to remove the unneeded white space characters. - `minify mydoc.json` minifies the JSON document, outputting the result to standard output. Minifying means to remove the unneeded white space characters.
## Scope ## Scope

View File

@ -44,6 +44,7 @@ void print_stat(const stat_t &s) {
s.true_count, s.false_count); s.true_count, s.false_count);
} }
__attribute__ ((noinline))
stat_t simdjson_computestats(const std::string_view &p) { stat_t simdjson_computestats(const std::string_view &p) {
stat_t answer; stat_t answer;
ParsedJson pj = build_parsed_json(p); ParsedJson pj = build_parsed_json(p);
@ -145,6 +146,7 @@ void sajson_traverse(stat_t &stats, const sajson::value &node) {
} }
} }
__attribute__ ((noinline))
stat_t sasjon_computestats(const std::string_view &p) { stat_t sasjon_computestats(const std::string_view &p) {
stat_t answer; stat_t answer;
char *buffer = (char *)malloc(p.size()); char *buffer = (char *)malloc(p.size());
@ -202,6 +204,7 @@ void rapid_traverse(stat_t &stats, const rapidjson::Value &v) {
} }
} }
__attribute__ ((noinline))
stat_t rapid_computestats(const std::string_view &p) { stat_t rapid_computestats(const std::string_view &p) {
stat_t answer; stat_t answer;
char *buffer = (char *)malloc(p.size() + 1); char *buffer = (char *)malloc(p.size() + 1);
@ -286,7 +289,7 @@ int main(int argc, char *argv[]) {
} }
assert(stat_equal(s1, s2)); assert(stat_equal(s1, s2));
assert(stat_equal(s1, s3)); assert(stat_equal(s1, s3));
int repeat = 10; int repeat = 50;
int volume = p.size(); int volume = p.size();
BEST_TIME("simdjson ", simdjson_computestats(p).valid, true, , repeat, BEST_TIME("simdjson ", simdjson_computestats(p).valid, true, , repeat,
volume, !justdata); volume, !justdata);

View File

@ -10,18 +10,24 @@
#include "rapidjson/stringbuffer.h" #include "rapidjson/stringbuffer.h"
#include "rapidjson/writer.h" #include "rapidjson/writer.h"
#include "sajson.h"
#ifdef ALLPARSER
#include "fastjson.cpp" #include "fastjson.cpp"
#include "fastjson_dom.cpp" #include "fastjson_dom.cpp"
#include "gason.cpp" #include "gason.cpp"
#include "json11.cpp" #include "json11.cpp"
#include "sajson.h"
extern "C" { extern "C" {
#include "ujdecode.h" #include "ujdecode.h"
#include "ultrajsondec.c" #include "ultrajsondec.c"
} }
#endif
using namespace rapidjson; using namespace rapidjson;
using namespace std; using namespace std;
#ifdef ALLPARSER
// fastjson has a tricky interface // fastjson has a tricky interface
void on_json_error(void *, const fastjson::ErrorContext &ec) { void on_json_error(void *, const fastjson::ErrorContext &ec) {
// std::cerr<<"ERROR: "<<ec.mesg<<std::endl; // std::cerr<<"ERROR: "<<ec.mesg<<std::endl;
@ -33,6 +39,7 @@ bool fastjson_parse(const char *input) {
NULL); NULL);
} }
// end of fastjson stuff // end of fastjson stuff
#endif
int main(int argc, char *argv[]) { int main(int argc, char *argv[]) {
bool verbose = false; bool verbose = false;
@ -89,7 +96,7 @@ int main(int argc, char *argv[]) {
std::cerr << "can't allocate memory" << std::endl; std::cerr << "can't allocate memory" << std::endl;
return EXIT_FAILURE; return EXIT_FAILURE;
} }
int repeat = 10; int repeat = 50;
int volume = p.size(); int volume = p.size();
if(!justdata) BEST_TIME("simdjson (dynamic mem) ", build_parsed_json(p).isValid(), true, , if(!justdata) BEST_TIME("simdjson (dynamic mem) ", build_parsed_json(p).isValid(), true, ,
repeat, volume, !justdata); repeat, volume, !justdata);
@ -97,49 +104,19 @@ int main(int argc, char *argv[]) {
BEST_TIME("simdjson ", json_parse(p, pj), true, , repeat, BEST_TIME("simdjson ", json_parse(p, pj), true, , repeat,
volume, !justdata); volume, !justdata);
rapidjson::Document d; rapidjson::Document d;
char *buffer = (char *)malloc(p.size() + 1); char *buffer = (char *)malloc(p.size() + 1);
memcpy(buffer, p.data(), p.size()); memcpy(buffer, p.data(), p.size());
buffer[p.size()] = '\0'; buffer[p.size()] = '\0';
if(!justdata) BEST_TIME( if(!justdata) BEST_TIME(
"RapidJSON", "RapidJSON (doc reused) ",
d.Parse<kParseValidateEncodingFlag>((const char *)buffer).HasParseError(), d.Parse<kParseValidateEncodingFlag>((const char *)buffer).HasParseError(),
false, memcpy(buffer, p.data(), p.size()), repeat, volume, !justdata); false, memcpy(buffer, p.data(), p.size()), repeat, volume, !justdata);
if(!justdata) BEST_TIME("RapidJSON (insitu)", BEST_TIME("RapidJSON",
d.ParseInsitu<kParseValidateEncodingFlag>(buffer).HasParseError(), d.ParseInsitu<kParseValidateEncodingFlag>(buffer).HasParseError(),
false, memcpy(buffer, p.data(), p.size()), repeat, volume, !justdata); false, memcpy(buffer, p.data(), p.size()) && (buffer[p.size()] = '\0'), repeat, volume, !justdata);
typedef rapidjson::GenericDocument<UTF8<>, rapidjson::MemoryPoolAllocator<>,
rapidjson::MemoryPoolAllocator<>>
RapidDocumentType;
size_t rapidvaallocsize = p.size() * 128; // allocate plenty of memory
size_t rapidallocsize = p.size() * 4096; // allocate plenty of memory
char *rapidvalueBuffer = (char *)malloc(rapidvaallocsize);
char *rapidparseBuffer = (char *)malloc(rapidallocsize);
if ((rapidvalueBuffer != NULL) && (rapidvalueBuffer != NULL)) {
rapidjson::MemoryPoolAllocator<> valueAllocator(rapidvalueBuffer,
rapidvaallocsize);
rapidjson::MemoryPoolAllocator<> parseAllocator(rapidparseBuffer,
rapidallocsize);
RapidDocumentType preallocedd(&valueAllocator, rapidvaallocsize,
&parseAllocator);
if(!justdata) BEST_TIME(
"RapidJSON (static alloc)",
preallocedd.Parse<kParseValidateEncodingFlag>((const char *)buffer)
.HasParseError(),
false, memcpy(buffer, p.data(), p.size()), repeat, volume, !justdata);
// (static alloc, insitu)
BEST_TIME("RapidJSON",
preallocedd.ParseInsitu<kParseValidateEncodingFlag>(buffer)
.HasParseError(),
false, memcpy(buffer, p.data(), p.size()), repeat, volume, !justdata);
assert(valueAllocator.Size() <= rapidvaallocsize);
assert(parseAllocator.Size() <= rapidallocsize);
}
free(rapidvalueBuffer);
free(rapidparseBuffer);
if(!justdata) BEST_TIME("sajson (dynamic mem, insitu)", if(!justdata) BEST_TIME("sajson (dynamic mem, insitu)",
sajson::parse(sajson::dynamic_allocation(), sajson::parse(sajson::dynamic_allocation(),
sajson::mutable_string_view(p.size(), buffer)) sajson::mutable_string_view(p.size(), buffer))
@ -154,6 +131,8 @@ int main(int argc, char *argv[]) {
sajson::mutable_string_view(p.size(), buffer)) sajson::mutable_string_view(p.size(), buffer))
.is_valid(), .is_valid(),
true, memcpy(buffer, p.data(), p.size()), repeat, volume, !justdata); true, memcpy(buffer, p.data(), p.size()), repeat, volume, !justdata);
#ifdef ALLPARSER
std::string json11err; std::string json11err;
if (all) if (all)
BEST_TIME("dropbox (json11) ", BEST_TIME("dropbox (json11) ",
@ -176,6 +155,7 @@ int main(int argc, char *argv[]) {
BEST_TIME("ultrajson ", BEST_TIME("ultrajson ",
(UJDecode(buffer, p.size(), NULL, &state) == NULL), false, (UJDecode(buffer, p.size(), NULL, &state) == NULL), false,
memcpy(buffer, p.data(), p.size()), repeat, volume, !justdata); memcpy(buffer, p.data(), p.size()), repeat, volume, !justdata);
#endif
if(!justdata) BEST_TIME("memcpy ", if(!justdata) BEST_TIME("memcpy ",
(memcpy(buffer, p.data(), p.size()) == buffer), true, , repeat, (memcpy(buffer, p.data(), p.size()) == buffer), true, , repeat,
volume, !justdata); volume, !justdata);

View File

@ -24,4 +24,4 @@ set format y "%0.1f";
set style line 1 lt rgb "#A0A0A0" lw 1 pt 1 ps 1 set style line 1 lt rgb "#A0A0A0" lw 1 pt 1 ps 1
plot filename using 0:2:xtic(1) with boxes notitle ls 1, '' using 0:(1):(sprintf("%.1f", $2)) with labels notitle plot filename using 0:2:xtic(1) with boxes notitle ls 1, '' using 0:(1):(sprintf("%.2g", $2)) with labels notitle

35
tape.md
View File

@ -1,13 +1,15 @@
# Tape structure in simdjson # Tape structure in simdjson
We parse a JSON document to a tape. A tape is an array of 64-bit values. Each node encountered in the JSON document is written to the tape using one or more 64-bit tape elements; the layout of the tape is in "document order". Throughout, little endian encoding is assumed. The tape is indexed starting at 0 (the first element is at index 0). We parse a JSON document to a tape. A tape is an array of 64-bit values. Each node encountered in the JSON document is written to the tape using one or more 64-bit tape elements; the layout of the tape is in "document order": elements are stored as they are encountered in the JSON document.
Throughout, little endian encoding is assumed. The tape is indexed starting at 0 (the first element is at index 0).
## Example ## Example
It is sometimes useful to start with an example. Consider the following JSON document: It is sometimes useful to start with an example. Consider the following JSON document:
``` ```json
{ {
"Image": { "Image": {
"Width": 800, "Width": 800,
@ -26,7 +28,7 @@ It is sometimes useful to start with an example. Consider the following JSON doc
The following is a dump of the content of the tape, with the first number of each line representing the index of a tape element. The following is a dump of the content of the tape, with the first number of each line representing the index of a tape element.
``` ```bash
$ ./json2json -d jsonexamples/small/demo.json $ ./json2json -d jsonexamples/small/demo.json
0 : r // pointing to 38 (right after last node) 0 : r // pointing to 38 (right after last node)
1 : { // pointing to next tape location 38 (first node after the scope) 1 : { // pointing to next tape location 38 (first node after the scope)
@ -64,34 +66,35 @@ $ ./json2json -d jsonexamples/small/demo.json
## General format of the tape elements ## General format of the tape elements
Most tape elements are written as ('c' << 56) + x where 'c' is some ASCII character determining the type of the element and where x is a 56-bit value called the payload. Most tape elements are written as `('c' << 56) + x` where `'c'` is some ASCII character determining the type of the element (out of 't', 'f', 'n', 'l', 'd', '"', '{', '}', '[', ']' ,'r') and where `x` is a 56-bit value called the payload. The payload is normally interpreted as an unsigned 56-bit integer. Note that 56-bit integers can be quite large.
Performance consideration: We believe that accessing the tape in regular units of 64 bits is more important for performance than saving memory.
## Simple JSON values ## Simple JSON values
Simple JSON nodes are represented with one tape element: Simple JSON nodes are represented with one tape element:
- null is represented as the 64-bit value ('n' << 56) where 'n' is the 8-bit code point value (in ASCII) corresponding to the letter 'n'. - null is represented as the 64-bit value `('n' << 56)` where `'n'` is the 8-bit code point value (in ASCII) corresponding to the letter `'n'`.
- true is represented as the 64-bit value ('t' << 56). - true is represented as the 64-bit value `('t' << 56)`.
- false is represented as the 64-bit value ('f' << 56). - false is represented as the 64-bit value `('f' << 56)`.
Performance consideration: It is somewhat wasteful to use 64-bit tape elements to store values that would require far less storage. However, we believe that this has no significant performance impact in most practical applications.
## Integer and Double values ## Integer and Double values
Integer values are represented as two 64-bit tape elements: Integer values are represented as two 64-bit tape elements:
- The 64-bit value ('l' << 56) followed by the 64-bit integer value literally. Integer values are assumed to be signed 64-bit values, using two's complement notation. - The 64-bit value `('l' << 56)` followed by the 64-bit integer value literally. Integer values are assumed to be signed 64-bit values, using two's complement notation.
Float values are represented as two 64-bit tape elements: Float values are represented as two 64-bit tape elements:
- The 64-bit value ('d' << 56) followed by the 64-bit double value literally in standard IEEE 754 notation. - The 64-bit value `('d' << 56)` followed by the 64-bit double value literally in standard IEEE 754 notation.
Performance consideration: We store numbers on the main tape because we believe that locality of reference is helpful for performance. The format is somewhat storage wasteful as 56 bits are ignored. Performance consideration: We store numbers on the main tape because we believe that locality of reference is helpful for performance.
## Root node ## Root node
Each JSON document will have two special 64-bit tape element representing a root node, one at the beginning and one at the end. Each JSON document will have two special 64-bit tape elements representing a root node, one at the beginning and one at the end.
- The first 64-bit tape element contains the value ('r'<<56) + x where x is the location on the tape of the last root element. - The first 64-bit tape element contains the value `('r'<<56) + x` where `x` is the location on the tape of the last root element.
- The last 64-bit tape element contains the value ('r'<< 56). - The last 64-bit tape element contains the value ('r'<< 56).
All of the parsed document is located between these two 64-bit tape elements. All of the parsed document is located between these two 64-bit tape elements.
@ -101,7 +104,7 @@ Hint: we can read the first tape element to determine the length of the tape.
## Strings ## Strings
We store string values using UTF-8 encoding with null termination on a separate tape. A string value is represented on the main tape as the 64-bit tape element ('"'<< 56) + x where x is the location on the string tape of the null-terminated string. We store string values using UTF-8 encoding with null termination on a separate tape. A string value is represented on the main tape as the 64-bit tape element `('"'<< 56) + x` where the payload `x` is the location on the string tape of the null-terminated string.
## Arrays ## Arrays
@ -118,8 +121,8 @@ Performance consideration: We can skip the content of an array entirely by acces
JSON objects are represented using two 64-bit tape elements. JSON objects are represented using two 64-bit tape elements.
- The first 64-bit tape element contains the value ('{' << 56) + x where the payload x is 1 + the index of the second 64-bit tape element on the tape. - The first 64-bit tape element contains the value `('{' << 56) + x` where the payload `x` is 1 + the index of the second 64-bit tape element on the tape.
- The second 64-bit tape element contains the value ('}' << 56) + x where the payload x contains the index of the first 64-bit tape element on the tape. - The second 64-bit tape element contains the value `('}' << 56) + x` where the payload `x` contains the index of the first 64-bit tape element on the tape.
In-between these two tape elements, we alternate between keys (which must be strings) and values. A value could be an object or an array. In-between these two tape elements, we alternate between keys (which must be strings) and values. A value could be an object or an array.