Simplifying the build

2018-12-19 00:40:04 -05:00 · 2018-12-19 00:40:04 -05:00 · e979a0c93f
parent ea8000501b
commit e979a0c93f
6 changed files with 45 additions and 57 deletions
--- a/6
+++ b/6
@ -116,8 +116,10 @@ distinctuseridcompetition: benchmark/distinctuseridcompetition.cpp $(HEADERS) $(
 	$(CXX) $(CXXFLAGS)  -o distinctuseridcompetition $(LIBFILES) benchmark/distinctuseridcompetition.cpp  -I. $(LIBFLAGS) $(COREDEPSINCLUDE)


-parsingcompetition: benchmark/parsingcompetition.cpp $(HEADERS) $(LIBFILES) $(EXTRAOBJECTS) 
-	$(CXX) $(CXXFLAGS)  -o parsingcompetition $(LIBFILES) benchmark/parsingcompetition.cpp $(EXTRAOBJECTS) -I. $(LIBFLAGS) $(COREDEPSINCLUDE) $(EXTRADEPSINCLUDE)
+parsingcompetition: benchmark/parsingcompetition.cpp $(HEADERS) $(LIBFILES) #$(EXTRAOBJECTS) 
+	$(CXX) $(CXXFLAGS)  -o parsingcompetition $(LIBFILES) benchmark/parsingcompetition.cpp -I. $(LIBFLAGS) $(COREDEPSINCLUDE) 
+#$(EXTRADEPSINCLUDE)
+#$(EXTRAOBJECTS) 

 allparserscheckfile: tests/allparserscheckfile.cpp $(HEADERS) $(LIBFILES) $(EXTRAOBJECTS) 
 	$(CXX) $(CXXFLAGS) -o allparserscheckfile $(LIBFILES) tests/allparserscheckfile.cpp $(EXTRAOBJECTS) -I. $(LIBFLAGS) $(COREDEPSINCLUDE) $(EXTRADEPSINCLUDE)
--- a/README.md
+++ b/README.md
@ -78,7 +78,7 @@ make benchmark
 ## Tools

 - `json2json mydoc.json` parses the document, constructs a model and then dumps back the result to standard output. 
- `json2json -d mydoc.json` parses the document, constructs a model and then dumps model (as a tape) to standard output. The tape format is described in the accompanying file tape.md.
+- `json2json -d mydoc.json` parses the document, constructs a model and then dumps model (as a tape) to standard output. The tape format is described in the accompanying file `tape.md`.
 - `minify mydoc.json` minifies the JSON document, outputting the result to standard output. Minifying means to remove the unneeded white space characters. 

 ## Scope
--- a/benchmark/parseandstatcompetition.cpp
+++ b/benchmark/parseandstatcompetition.cpp
@ -44,6 +44,7 @@ void print_stat(const stat_t &s) {
         s.true_count, s.false_count);
 }

+__attribute__ ((noinline))
 stat_t simdjson_computestats(const std::string_view &p) {
  stat_t answer;
  ParsedJson pj = build_parsed_json(p);
@ -145,6 +146,7 @@ void sajson_traverse(stat_t &stats, const sajson::value &node) {
  }
 }

+__attribute__ ((noinline))
 stat_t sasjon_computestats(const std::string_view &p) {
  stat_t answer;
  char *buffer = (char *)malloc(p.size());
@ -202,6 +204,7 @@ void rapid_traverse(stat_t &stats, const rapidjson::Value &v) {
  }
 }

+__attribute__ ((noinline))
 stat_t rapid_computestats(const std::string_view &p) {
  stat_t answer;
  char *buffer = (char *)malloc(p.size() + 1);
@ -286,7 +289,7 @@ int main(int argc, char *argv[]) {
  }
  assert(stat_equal(s1, s2));
  assert(stat_equal(s1, s3));
-  int repeat = 10;
+  int repeat = 50;
  int volume = p.size();
  BEST_TIME("simdjson  ", simdjson_computestats(p).valid, true, , repeat,
            volume, !justdata);
--- a/benchmark/parsingcompetition.cpp
+++ b/benchmark/parsingcompetition.cpp
@ -10,18 +10,24 @@
 #include "rapidjson/stringbuffer.h"
 #include "rapidjson/writer.h"

+#include "sajson.h"
+
+#ifdef ALLPARSER
 #include "fastjson.cpp"
 #include "fastjson_dom.cpp"
 #include "gason.cpp"
 #include "json11.cpp"
-#include "sajson.h"
 extern "C" {
 #include "ujdecode.h"
 #include "ultrajsondec.c"
 }
+#endif 
+
 using namespace rapidjson;
 using namespace std;

+
+#ifdef ALLPARSER
 // fastjson has a tricky interface
 void on_json_error(void *, const fastjson::ErrorContext &ec) {
  // std::cerr<<"ERROR: "<<ec.mesg<<std::endl;
@ -33,6 +39,7 @@ bool fastjson_parse(const char *input) {
                                     NULL);
 }
 // end of fastjson stuff
+#endif

 int main(int argc, char *argv[]) {
  bool verbose = false;
@ -89,7 +96,7 @@ int main(int argc, char *argv[]) {
    std::cerr << "can't allocate memory" << std::endl;
    return EXIT_FAILURE;
  }
-  int repeat = 10;
+  int repeat = 50;
  int volume = p.size();
  if(!justdata) BEST_TIME("simdjson (dynamic mem) ", build_parsed_json(p).isValid(), true, ,
            repeat, volume, !justdata);
@ -97,49 +104,19 @@ int main(int argc, char *argv[]) {
  BEST_TIME("simdjson ", json_parse(p, pj), true, , repeat,
            volume, !justdata);

+ 
  rapidjson::Document d;

  char *buffer = (char *)malloc(p.size() + 1);
  memcpy(buffer, p.data(), p.size());
  buffer[p.size()] = '\0';
-
  if(!justdata) BEST_TIME(
-      "RapidJSON",
+      "RapidJSON (doc reused) ",
      d.Parse<kParseValidateEncodingFlag>((const char *)buffer).HasParseError(),
      false, memcpy(buffer, p.data(), p.size()), repeat, volume, !justdata);
-  if(!justdata) BEST_TIME("RapidJSON (insitu)",
+  BEST_TIME("RapidJSON",
            d.ParseInsitu<kParseValidateEncodingFlag>(buffer).HasParseError(),
-            false, memcpy(buffer, p.data(), p.size()), repeat, volume, !justdata);
-  typedef rapidjson::GenericDocument<UTF8<>, rapidjson::MemoryPoolAllocator<>,
-                                     rapidjson::MemoryPoolAllocator<>>
-      RapidDocumentType;
-  size_t rapidvaallocsize = p.size() * 128; // allocate plenty of memory
-  size_t rapidallocsize = p.size() * 4096;  // allocate plenty of memory
-  char *rapidvalueBuffer = (char *)malloc(rapidvaallocsize);
-  char *rapidparseBuffer = (char *)malloc(rapidallocsize);
-  if ((rapidvalueBuffer != NULL) && (rapidvalueBuffer != NULL)) {
-    rapidjson::MemoryPoolAllocator<> valueAllocator(rapidvalueBuffer,
-                                                    rapidvaallocsize);
-    rapidjson::MemoryPoolAllocator<> parseAllocator(rapidparseBuffer,
-                                                    rapidallocsize);
-    RapidDocumentType preallocedd(&valueAllocator, rapidvaallocsize,
-                                  &parseAllocator);
-
-    if(!justdata) BEST_TIME(
-        "RapidJSON (static alloc)",
-        preallocedd.Parse<kParseValidateEncodingFlag>((const char *)buffer)
-            .HasParseError(),
-        false, memcpy(buffer, p.data(), p.size()), repeat, volume, !justdata);
-    //  (static alloc, insitu)
-    BEST_TIME("RapidJSON",
-              preallocedd.ParseInsitu<kParseValidateEncodingFlag>(buffer)
-                  .HasParseError(),
-              false, memcpy(buffer, p.data(), p.size()), repeat, volume, !justdata);
-    assert(valueAllocator.Size() <= rapidvaallocsize);
-    assert(parseAllocator.Size() <= rapidallocsize);
-  }
-  free(rapidvalueBuffer);
-  free(rapidparseBuffer);
+            false, memcpy(buffer, p.data(), p.size()) && (buffer[p.size()] = '\0'), repeat, volume, !justdata);
  if(!justdata) BEST_TIME("sajson (dynamic mem, insitu)",
            sajson::parse(sajson::dynamic_allocation(),
                          sajson::mutable_string_view(p.size(), buffer))
@ -154,6 +131,8 @@ int main(int argc, char *argv[]) {
                          sajson::mutable_string_view(p.size(), buffer))
                .is_valid(),
            true, memcpy(buffer, p.data(), p.size()), repeat, volume, !justdata);
+#ifdef ALLPARSER
+
  std::string json11err;
  if (all)
    BEST_TIME("dropbox (json11)     ",
@ -176,6 +155,7 @@ int main(int argc, char *argv[]) {
    BEST_TIME("ultrajson         ",
              (UJDecode(buffer, p.size(), NULL, &state) == NULL), false,
              memcpy(buffer, p.data(), p.size()), repeat, volume, !justdata);
+#endif
  if(!justdata) BEST_TIME("memcpy            ",
            (memcpy(buffer, p.data(), p.size()) == buffer), true, , repeat,
            volume, !justdata);
--- a/scripts/bar.gnuplot
+++ b/scripts/bar.gnuplot
@ -24,4 +24,4 @@ set format y "%0.1f";

 set style line 1 lt rgb "#A0A0A0" lw 1 pt 1 ps 1

-plot filename using 0:2:xtic(1) with boxes notitle ls 1, '' using 0:(1):(sprintf("%.1f", $2)) with labels notitle
+plot filename using 0:2:xtic(1) with boxes notitle ls 1, '' using 0:(1):(sprintf("%.2g", $2)) with labels notitle
--- a/tape.md
+++ b/tape.md
@ -1,13 +1,15 @@

 # Tape structure in simdjson 

-We parse a JSON document to a tape. A tape is an array of 64-bit values. Each node encountered in the JSON document is written to the tape using one or more 64-bit tape elements; the layout of the tape is in "document order". Throughout, little endian encoding is assumed. The tape is indexed starting at 0 (the first element is at index 0).
+We parse a JSON document to a tape. A tape is an array of 64-bit values. Each node encountered in the JSON document is written to the tape using one or more 64-bit tape elements; the layout of the tape is in "document order": elements are stored as they are encountered in the JSON document. 
+
+Throughout, little endian encoding is assumed. The tape is indexed starting at 0 (the first element is at index 0).

 ## Example

 It is sometimes useful to start with an example. Consider the following JSON document:

-```
+```json
 {
 	"Image": {
 		"Width": 800,
@ -26,7 +28,7 @@ It is sometimes useful to start with an example. Consider the following JSON doc

 The following is a dump of the content of the tape, with the first number of each line representing the index of a tape element.

-```
+```bash
 $ ./json2json -d jsonexamples/small/demo.json
 0 : r	// pointing to 38 (right after last node)
 1 : {	// pointing to next tape location 38 (first node after the scope)
@ -64,34 +66,35 @@ $ ./json2json -d jsonexamples/small/demo.json

 ## General format of the tape elements

-Most tape elements  are written as ('c' << 56) + x where 'c' is some ASCII character determining the type of the element and where x is a 56-bit value called the payload.
+Most tape elements  are written as `('c' << 56) + x` where `'c'` is some ASCII character determining the type of the element (out of 't', 'f', 'n', 'l', 'd', '"', '{', '}', '[', ']' ,'r') and where `x` is a 56-bit value called the payload. The payload is normally interpreted as an unsigned 56-bit integer. Note that 56-bit integers can be quite large.


+Performance consideration: We believe that  accessing the tape in regular units of 64 bits is more important for performance than saving memory. 
+
 ## Simple JSON values

 Simple JSON nodes are represented with one tape element:

- null is  represented as the 64-bit value ('n' << 56) where 'n' is the 8-bit code point values (in ASCII) corresponding to the letter 'n'.
- true is  represented as the 64-bit value ('t' << 56).
- false is  represented as the 64-bit value ('f' << 56).
+- null is  represented as the 64-bit value `('n' << 56)` where `'n'` is the 8-bit code point value (in ASCII) corresponding to the letter `'n'`.
+- true is  represented as the 64-bit value `('t' << 56)`.
+- false is  represented as the 64-bit value `('f' << 56)`.

-Performance consideration: It is somewhat wasteful to use 64-bit tape elements to store values that would require far less storage. However, we believe that this has no significant performance impact in most practical applications.

 ## Integer and Double values

 Integer values are represented as two 64-bit tape elements:
- The 64-bit value ('l' << 56) followed by the 64-bit integer value litterally. Integer values are assumed to be signed 64-bit values, using two's complement notation.
+- The 64-bit value `('l' << 56)` followed by the 64-bit integer value literally. Integer values are assumed to be signed 64-bit values, using two's complement notation.

 Float values are represented as two 64-bit tape elements:
- The 64-bit value ('d' << 56) followed by the 64-bit double value litterally in standard IEEE 754 notation.
+- The 64-bit value `('d' << 56)` followed by the 64-bit double value literally in standard IEEE 754 notation.

-Performance consideration: We store numbers of the main tape because we believe that locality of reference is helpful for performance. The format is somewhat storage wasteful as 56 bits are ignored.
+Performance consideration: We store numbers on the main tape because we believe that locality of reference is helpful for performance. 

 ## Root node

-Each JSON document will have two special 64-bit tape element representing a root node, one at the beginning and one at the end.
+Each JSON document will have two special 64-bit tape elements representing a root node, one at the beginning and one at the end.

- The first 64-bit tape element contains the value ('r'<<56) + x where x is the location on the tape of the last root element.
+- The first 64-bit tape element contains the value `('r'<<56) + x` where `x` is the location on the tape of the last root element.
 - The last 64-bit tape element contains the value ('r'<< 56).

 All of the parsed document is located between these two 64-bit tape elements.
@ -101,7 +104,7 @@ Hint: we can read the first tape element to determine the length of the tape.

 ## Strings

-We store string values using UTF-8 encoding with null termination on a separate tape. A string value is represented on the main tape as the 64-bit tape element ('"'<< 56) + x where x is the location on the string tape of the null-terminated string.
+We store string values using UTF-8 encoding with null termination on a separate tape. A string value is represented on the main tape as the 64-bit tape element `('"'<< 56) + x` where the payload `x` is the location on the string tape of the null-terminated string. 

 ## Arrays 

@ -118,8 +121,8 @@ Performance consideration: We can skip the content of an array entirely by acces

 JSON objects are represented using two 64-bit tape elements.

- The first  64-bit tape element contains the value ('{' << 56) + x where  the payload  x is 1 + the index of the second 64-bit tape element on the tape.
- The second 64-bit tape element contains the value ('{' << 56) + x where   the payload x contains the index of the first 64-bit tape element on the tape.
+- The first  64-bit tape element contains the value `('{' << 56) + x` where  the payload  `x` is 1 + the index of the second 64-bit tape element on the tape.
+- The second 64-bit tape element contains the value `('}' << 56) + x` where   the payload `x` contains the index of the first 64-bit tape element on the tape.

 In-between these two tape elements, we alternate between keys (which must be strings) and values. A value could be an object or an array.