Ok. Looks complete.

This commit is contained in:
Daniel Lemire 2018-12-14 21:32:42 -05:00
parent c127570c83
commit 0769c39e27
8 changed files with 321 additions and 191 deletions

View File

@ -5,9 +5,9 @@
.PHONY: clean cleandist .PHONY: clean cleandist
COREDEPSINCLUDE = -Idependencies/rapidjson/include -Idependencies/sajson/include
DEPSINCLUDE = -Idependencies/rapidjson/include -Idependencies/sajson/include -Idependencies/json11 -Idependencies/fastjson/src -Idependencies/fastjson/include -Idependencies/gason/src -Idependencies/ujson4c/3rdparty -Idependencies/ujson4c/src EXTRADEPSINCLUDE = -Idependencies/json11 -Idependencies/fastjson/src -Idependencies/fastjson/include -Idependencies/gason/src -Idependencies/ujson4c/3rdparty -Idependencies/ujson4c/src
CXXFLAGS = -std=c++17 -march=native -Wall -Wextra -Wshadow -Iinclude -Ibenchmark/linux $(DEPSINCLUDE) CXXFLAGS = -std=c++17 -march=native -Wall -Wextra -Wshadow -Iinclude -Ibenchmark/linux
CFLAGS = -march=native -Idependencies/ujson4c/3rdparty -Idependencies/ujson4c/src CFLAGS = -march=native -Idependencies/ujson4c/3rdparty -Idependencies/ujson4c/src
ifeq ($(SANITIZE),1) ifeq ($(SANITIZE),1)
CXXFLAGS += -g3 -O0 -fsanitize=address -fno-omit-frame-pointer -fsanitize=undefined CXXFLAGS += -g3 -O0 -fsanitize=address -fno-omit-frame-pointer -fsanitize=undefined
@ -24,7 +24,7 @@ endif
MAINEXECUTABLES=parse minify json2json MAINEXECUTABLES=parse minify json2json
TESTEXECUTABLES=jsoncheck numberparsingcheck stringparsingcheck TESTEXECUTABLES=jsoncheck numberparsingcheck stringparsingcheck
COMPARISONEXECUTABLES=minifiercompetition parsingcompetition parseandstatcompetition allparserscheckfile COMPARISONEXECUTABLES=minifiercompetition parsingcompetition parseandstatcompetition distinctuseridcompetition allparserscheckfile
HEADERS= include/simdjson/simdutf8check.h include/simdjson/stringparsing.h include/simdjson/numberparsing.h include/simdjson/jsonparser.h include/simdjson/common_defs.h include/simdjson/jsonioutil.h benchmark/benchmark.h benchmark/linux/linux-perf-events.h include/simdjson/parsedjson.h include/simdjson/stage1_find_marks.h include/simdjson/stage2_flatten.h include/simdjson/stage34_unified.h include/simdjson/jsoncharutils.h include/simdjson/jsonformatutils.h HEADERS= include/simdjson/simdutf8check.h include/simdjson/stringparsing.h include/simdjson/numberparsing.h include/simdjson/jsonparser.h include/simdjson/common_defs.h include/simdjson/jsonioutil.h benchmark/benchmark.h benchmark/linux/linux-perf-events.h include/simdjson/parsedjson.h include/simdjson/stage1_find_marks.h include/simdjson/stage2_flatten.h include/simdjson/stage34_unified.h include/simdjson/jsoncharutils.h include/simdjson/jsonformatutils.h
LIBFILES=src/jsonioutil.cpp src/jsonparser.cpp src/stage1_find_marks.cpp src/stage2_flatten.cpp src/stage34_unified.cpp LIBFILES=src/jsonioutil.cpp src/jsonparser.cpp src/stage1_find_marks.cpp src/stage2_flatten.cpp src/stage34_unified.cpp
@ -40,9 +40,11 @@ GASON_INCLUDE:=dependencies/gason/src/gason.h
UJSON4C_INCLUDE:=dependencies/ujson4c/src/ujdecode.c UJSON4C_INCLUDE:=dependencies/ujson4c/src/ujdecode.c
LIBS=$(RAPIDJSON_INCLUDE) $(SAJSON_INCLUDE) $(JSON11_INCLUDE) $(FASTJSON_INCLUDE) $(GASON_INCLUDE) $(UJSON4C_INCLUDE) LIBS=$(RAPIDJSON_INCLUDE) $(SAJSON_INCLUDE) $(JSON11_INCLUDE) $(FASTJSON_INCLUDE) $(GASON_INCLUDE) $(UJSON4C_INCLUDE)
OBJECTS=ujdecode.o EXTRAOBJECTS=ujdecode.o
all: $(MAINEXECUTABLES) all: $(MAINEXECUTABLES)
competition: $(COMPARISONEXECUTABLES)
test: jsoncheck numberparsingcheck stringparsingcheck test: jsoncheck numberparsingcheck stringparsingcheck
./numberparsingcheck ./numberparsingcheck
./stringparsingcheck ./stringparsingcheck
@ -91,7 +93,7 @@ stringparsingcheck:tests/stringparsingcheck.cpp $(HEADERS) $(LIBFILES)
minifiercompetition: benchmark/minifiercompetition.cpp $(HEADERS) $(MINIFIERHEADERS) $(LIBFILES) $(MINIFIERLIBFILES) minifiercompetition: benchmark/minifiercompetition.cpp $(HEADERS) $(MINIFIERHEADERS) $(LIBFILES) $(MINIFIERLIBFILES)
$(CXX) $(CXXFLAGS) -o minifiercompetition $(LIBFILES) $(MINIFIERLIBFILES) benchmark/minifiercompetition.cpp -I. $(LIBFLAGS) $(CXX) $(CXXFLAGS) -o minifiercompetition $(LIBFILES) $(MINIFIERLIBFILES) benchmark/minifiercompetition.cpp -I. $(LIBFLAGS) $(COREDEPSINCLUDE)
minify: tools/minify.cpp $(HEADERS) $(MINIFIERHEADERS) $(LIBFILES) $(MINIFIERLIBFILES) minify: tools/minify.cpp $(HEADERS) $(MINIFIERHEADERS) $(LIBFILES) $(MINIFIERLIBFILES)
$(CXX) $(CXXFLAGS) -o minify $(MINIFIERLIBFILES) $(LIBFILES) tools/minify.cpp -I. $(CXX) $(CXXFLAGS) -o minify $(MINIFIERLIBFILES) $(LIBFILES) tools/minify.cpp -I.
@ -103,15 +105,18 @@ json2json: tools/json2json.cpp $(HEADERS) $(LIBFILES)
ujdecode.o: $(UJSON4C_INCLUDE) ujdecode.o: $(UJSON4C_INCLUDE)
$(CC) $(CFLAGS) -c dependencies/ujson4c/src/ujdecode.c $(CC) $(CFLAGS) -c dependencies/ujson4c/src/ujdecode.c
parseandstatcompetition: benchmark/parseandstatcompetition.cpp $(HEADERS) $(LIBFILES) $(OBJECTS) parseandstatcompetition: benchmark/parseandstatcompetition.cpp $(HEADERS) $(LIBFILES)
$(CXX) $(CXXFLAGS) -o parseandstatcompetition $(LIBFILES) benchmark/parseandstatcompetition.cpp $(OBJECTS) -I. $(LIBFLAGS) $(CXX) $(CXXFLAGS) -o parseandstatcompetition $(LIBFILES) benchmark/parseandstatcompetition.cpp -I. $(LIBFLAGS) $(COREDEPSINCLUDE)
distinctuseridcompetition: benchmark/distinctuseridcompetition.cpp $(HEADERS) $(LIBFILES)
$(CXX) $(CXXFLAGS) -o distinctuseridcompetition $(LIBFILES) benchmark/distinctuseridcompetition.cpp -I. $(LIBFLAGS) $(COREDEPSINCLUDE)
parsingcompetition: benchmark/parsingcompetition.cpp $(HEADERS) $(LIBFILES) $(OBJECTS) parsingcompetition: benchmark/parsingcompetition.cpp $(HEADERS) $(LIBFILES) $(EXTRAOBJECTS)
$(CXX) $(CXXFLAGS) -o parsingcompetition $(LIBFILES) benchmark/parsingcompetition.cpp $(OBJECTS) -I. $(LIBFLAGS) $(CXX) $(CXXFLAGS) -o parsingcompetition $(LIBFILES) benchmark/parsingcompetition.cpp $(EXTRAOBJECTS) -I. $(LIBFLAGS) $(COREDEPSINCLUDE) $(EXTRADEPSINCLUDE)
allparserscheckfile: tests/allparserscheckfile.cpp $(HEADERS) $(LIBFILES) $(OBJECTS) allparserscheckfile: tests/allparserscheckfile.cpp $(HEADERS) $(LIBFILES) $(EXTRAOBJECTS)
$(CXX) $(CXXFLAGS) -o allparserscheckfile $(LIBFILES) tests/allparserscheckfile.cpp $(OBJECTS) -I. $(LIBFLAGS) $(CXX) $(CXXFLAGS) -o allparserscheckfile $(LIBFILES) tests/allparserscheckfile.cpp $(EXTRAOBJECTS) -I. $(LIBFLAGS) $(COREDEPSINCLUDE) $(EXTRADEPSINCLUDE)
parsehisto: benchmark/parse.cpp $(HEADERS) $(LIBFILES) parsehisto: benchmark/parse.cpp $(HEADERS) $(LIBFILES)
$(CXX) $(CXXFLAGS) -o parsehisto benchmark/parse.cpp $(LIBFILES) $(LIBFLAGS) -DBUILDHISTOGRAM $(CXX) $(CXXFLAGS) -o parsehisto benchmark/parse.cpp $(LIBFILES) $(LIBFLAGS) -DBUILDHISTOGRAM
@ -121,7 +126,7 @@ cppcheck:
clean: clean:
rm -f $(OBJECTS) $(MAINEXECUTABLES) $(EXTRA_EXECUTABLES) $(TESTEXECUTABLES) $(COMPARISONEXECUTABLES) rm -f $(EXTRAOBJECTS) $(MAINEXECUTABLES) $(EXTRA_EXECUTABLES) $(TESTEXECUTABLES) $(COMPARISONEXECUTABLES)
cleandist: cleandist:
rm -f $(OBJECTS) $(MAINEXECUTABLES) $(EXTRA_EXECUTABLES) $(TESTEXECUTABLES) $(COMPARISONEXECUTABLES) rm -f $(EXTRAOBJECTS) $(MAINEXECUTABLES) $(EXTRA_EXECUTABLES) $(TESTEXECUTABLES) $(COMPARISONEXECUTABLES)

View File

@ -90,6 +90,7 @@ To simplify the engineering, we make some assumptions.
- We assume AVX2 support which is available in all recent mainstream x86 processors produced by AMD and Intel. No support for non-x86 processors is included though it can be done. We plan to support ARM processors (help is invited). - We assume AVX2 support which is available in all recent mainstream x86 processors produced by AMD and Intel. No support for non-x86 processors is included though it can be done. We plan to support ARM processors (help is invited).
- We only support GNU GCC and LLVM Clang at this time. There is no support for Microsoft Visual Studio, though it should not be difficult (help is invited). - We only support GNU GCC and LLVM Clang at this time. There is no support for Microsoft Visual Studio, though it should not be difficult (help is invited).
- In cases of failure, we just report a failure without any indication as to the nature of the problem. (This can be easily improved without affecting performance.) - In cases of failure, we just report a failure without any indication as to the nature of the problem. (This can be easily improved without affecting performance.)
- As allowed by the specification, we allow repeated keys within an object (other parsers like sajson do the same).
*We do not aim to provide a general-purpose JSON library.* A library like RapidJSON offers much more than just parsing, it helps you generate JSON and offers various other convenient functions. We merely parse the document. *We do not aim to provide a general-purpose JSON library.* A library like RapidJSON offers much more than just parsing, it helps you generate JSON and offers various other convenient functions. We merely parse the document.
@ -97,7 +98,7 @@ To simplify the engineering, we make some assumptions.
## Features ## Features
- The input string is unmodified. (Parsers like sajson and RapidJSON use the input string as a buffer.) - The input string is unmodified. (Parsers like sajson and RapidJSON use the input string as a buffer.)
- We parse integers and floating-point numbers as separate types which allows us to support large 64-bit integers. - We parse integers and floating-point numbers as separate types which allows us to support large 64-bit integers in [-9223372036854775808,9223372036854775808). Among the parsers that differentiate between integers and floating-point numbers, not all support 64-bit integers. (For example, sajson stores integers larger than 2147483648 as floating-point numbers.)
- We do full UTF-8 validation as part of the parsing. (Parsers like fastjson, gason and dropbox json11 do not do UTF-8 validation.) - We do full UTF-8 validation as part of the parsing. (Parsers like fastjson, gason and dropbox json11 do not do UTF-8 validation.)
- We fully validate the numbers. (Parsers like gason and ultrajson will accept `[0e+]` as valid JSON.) - We fully validate the numbers. (Parsers like gason and ultrajson will accept `[0e+]` as valid JSON.)
- We validate string content for unescaped characters. (Parsers like fastjson and ultrajson accept unescaped line breaks and tags in strings.) - We validate string content for unescaped characters. (Parsers like fastjson and ultrajson accept unescaped line breaks and tags in strings.)
@ -111,6 +112,102 @@ The parser works in three stages:
- Stage 3. (Structure building) Involves constructing a "tree" of sort to navigate through the data. Strings and numbers are parsed at this stage. - Stage 3. (Structure building) Involves constructing a "tree" of sort to navigate through the data. Strings and numbers are parsed at this stage.
## Navigating the parsed document
Here is a code sample to dump back the parsed JSON to a string:
```c
ParsedJson::iterator pjh(pj);
if (!pjh.isOk()) {
std::cerr << " Could not iterate parsed result. " << std::endl;
return EXIT_FAILURE;
}
compute_dump(pjh);
//
// where compute_dump is :
void compute_dump(ParsedJson::iterator &pjh) {
if (pjh.is_object()) {
std::cout << "{";
if (pjh.down()) {
pjh.print(std::cout); // must be a string
std::cout << ":";
pjh.next();
compute_dump(pjh); // let us recurse
while (pjh.next()) {
std::cout << ",";
pjh.print(std::cout);
std::cout << ":";
pjh.next();
compute_dump(pjh); // let us recurse
}
pjh.up();
}
std::cout << "}";
} else if (pjh.is_array()) {
std::cout << "[";
if (pjh.down()) {
compute_dump(pjh); // let us recurse
while (pjh.next()) {
std::cout << ",";
compute_dump(pjh); // let us recurse
}
pjh.up();
}
std::cout << "]";
} else {
pjh.print(std::cout); // just print the lone value
}
}
```
The following function will find all user.id integers:
```C
void simdjson_traverse(std::vector<int64_t> &answer, ParsedJson::iterator &i) {
switch (i.get_type()) {
case '{':
if (i.down()) {
do {
bool founduser = equals(i.get_string(), "user");
i.next(); // move to value
if (i.is_object()) {
if (founduser && i.move_to_key("id")) {
if (i.is_integer()) {
answer.push_back(i.get_integer());
}
i.up();
}
simdjson_traverse(answer, i);
} else if (i.is_array()) {
simdjson_traverse(answer, i);
}
} while (i.next());
i.up();
}
break;
case '[':
if (i.down()) {
do {
if (i.is_object_or_array()) {
simdjson_traverse(answer, i);
}
} while (i.next());
i.up();
}
break;
case 'l':
case 'd':
case 'n':
case 't':
case 'f':
default:
break;
}
}
```
## Various References ## Various References
- [Google double-conv](https://github.com/google/double-conversion/) - [Google double-conv](https://github.com/google/double-conversion/)

View File

@ -1,5 +1,7 @@
#include "simdjson/jsonparser.h" #include "simdjson/jsonparser.h"
#include <algorithm>
#include <unistd.h> #include <unistd.h>
#include <vector>
#include "benchmark.h" #include "benchmark.h"
@ -26,108 +28,112 @@ name;
#include "sajson.h" #include "sajson.h"
#include "fastjson.cpp"
#include "fastjson_dom.cpp"
#include "gason.cpp"
#include "json11.cpp"
#include "sajson.h"
extern "C" {
#include "ujdecode.h"
#include "ultrajsondec.c"
}
using namespace rapidjson; using namespace rapidjson;
using namespace std; using namespace std;
// Returns true when the two NUL-terminated C strings compare equal
// byte for byte (strcmp reports no difference).
bool equals(const char *s1, const char *s2) {
  const int difference = strcmp(s1, s2);
  return difference == 0;
}
// Sorts v in ascending order and removes repeated values in place, so that
// every distinct value remains exactly once.
void remove_duplicates(std::vector<int64_t> &v) {
  std::sort(v.begin(), v.end());
  // std::unique only compacts adjacent equal values, hence the sort above;
  // erase then trims the leftover tail.
  v.erase(std::unique(v.begin(), v.end()), v.end());
}
// Writes every element of v to std::cout, each followed by a single space,
// then ends the line (and flushes) with std::endl.
void print_vec(std::vector<int64_t> &v) {
  for (auto it = v.begin(); it != v.end(); ++it) {
    std::cout << *it << " ";
  }
  std::cout << std::endl;
}
// Walks the document under iterator i and appends to answer every integer
// stored under the key "id" of an object that is itself the value of a key
// named "user". Nested objects and arrays are visited recursively; any other
// node type is ignored. The iterator is moved back up to the container it
// started on before returning.
void simdjson_traverse(std::vector<int64_t> &answer, ParsedJson::iterator &i) {
  if (i.is_object()) {
    if (!i.down()) {
      return; // could not step inside the object: nothing to visit
    }
    do {
      // We are on a key; remember whether it is "user" before leaving it.
      const bool founduser = equals(i.get_string(), "user");
      i.next(); // move to value
      if (founduser && i.is_object() && i.move_to_key("id")) {
        // move_to_key left us on the value of "id"
        if (i.is_integer()) {
          answer.push_back(i.get_integer());
        }
        i.up(); // return to the "user" object itself
      }
      // Whatever the key was, nested containers may hold more users.
      if (i.is_object_or_array()) {
        simdjson_traverse(answer, i);
      }
    } while (i.next());
    i.up();
  } else if (i.is_array()) {
    if (!i.down()) {
      return; // could not step inside the array: nothing to visit
    }
    do {
      if (i.is_object_or_array()) {
        simdjson_traverse(answer, i);
      }
    } while (i.next());
    i.up();
  }
  // integers, doubles, null, true, false, strings: nothing to collect
}
std::vector<int64_t> simdjson_computestats(const std::string_view &p) { std::vector<int64_t> simdjson_computestats(const std::string_view &p) {
std::vector<int64_t> answer; std::vector<int64_t> answer;
ParsedJson pj = build_parsed_json(p); ParsedJson pj = build_parsed_json(p);
answer.valid = pj.isValid(); if (!pj.isValid()) {
if (!answer.valid) {
return answer; return answer;
} }
answer.number_count = 0; ParsedJson::iterator i(pj);
answer.object_count = 0;
answer.array_count = 0; simdjson_traverse(answer, i);
answer.null_count = 0; remove_duplicates(answer);
answer.true_count = 0;
answer.false_count = 0;
size_t tapeidx = 0;
u64 tape_val = pj.tape[tapeidx++];
u8 type = (tape_val >> 56);
size_t howmany = 0;
assert(type == 'r');
howmany = tape_val & JSONVALUEMASK;
for (; tapeidx < howmany; tapeidx++) {
tape_val = pj.tape[tapeidx];
// u64 payload = tape_val & JSONVALUEMASK;
type = (tape_val >> 56);
switch (type) {
case 'l': // we have a long int
answer.number_count++;
tapeidx++; // skipping the integer
break;
case 'd': // we have a double
answer.number_count++;
tapeidx++; // skipping the double
break;
case 'n': // we have a null
answer.null_count++;
break;
case 't': // we have a true
answer.true_count++;
break;
case 'f': // we have a false
answer.false_count++;
break;
case '{': // we have an object
answer.object_count++;
break;
case '}': // we end an object
break;
case '[': // we start an array
answer.array_count++;
break;
case ']': // we end an array
break;
default:
break; // ignore
}
}
return answer; return answer;
} }
void sajson_traverse(std::vector<int64_t> &answer, const sajson::value &node) {
void sajson_traverse(stat_t &stats, const sajson::value &node) {
using namespace sajson; using namespace sajson;
switch (node.get_type()) { switch (node.get_type()) {
case TYPE_ARRAY: { case TYPE_ARRAY: {
stats.array_count++;
auto length = node.get_length(); auto length = node.get_length();
for (size_t i = 0; i < length; ++i) { for (size_t i = 0; i < length; ++i) {
sajson_traverse(stats, node.get_array_element(i)); sajson_traverse(answer, node.get_array_element(i));
} }
break; break;
} }
case TYPE_OBJECT: { case TYPE_OBJECT: {
stats.object_count++;
auto length = node.get_length(); auto length = node.get_length();
for (auto i = 0u; i < length; ++i) { for (auto i = 0u; i < length; ++i) {
if(strcmp(node.get_object_key(i), "user") == 0) { if (equals(node.get_object_key(i).data(), "user")) { // found a user!!!
auto child = node.get_object_value(i); auto uservalue = node.get_object_value(i); // get the value
if(child.get_type() == TYPE_OBJECT) { if (uservalue.get_type() ==
for (auto j = 0u; j < length; ++j) { TYPE_OBJECT) { // the value should be an object
if(strcmp(node.get_object_key(i), "user") == 0) { auto uservaluelength = uservalue.get_length();
} for (auto j = 0u; j < uservaluelength;
++j) { // go through the children
if (equals(uservalue.get_object_key(j).data(),
"id")) { // ah ah found id
auto v = uservalue.get_object_value(j);
if (v.get_type() == TYPE_INTEGER) { // check that it is an integer
answer.push_back(v.get_integer_value()); // record it!
} else if (v.get_type() == TYPE_DOUBLE) {
answer.push_back((int64_t)v.get_double_value()); // record it!
} }
}
} }
}
} }
sajson_traverse(stats, node.get_object_value(i)); sajson_traverse(answer, node.get_object_value(i));
} }
break; break;
} }
@ -143,82 +149,72 @@ void sajson_traverse(stat_t &stats, const sajson::value &node) {
} }
} }
stat_t sasjon_computestats(const std::string_view &p) { std::vector<int64_t> sasjon_computestats(const std::string_view &p) {
stat_t answer; std::vector<int64_t> answer;
char *buffer = (char *)malloc(p.size()); char *buffer = (char *)malloc(p.size());
memcpy(buffer, p.data(), p.size()); memcpy(buffer, p.data(), p.size());
auto d = sajson::parse(sajson::dynamic_allocation(), auto d = sajson::parse(sajson::dynamic_allocation(),
sajson::mutable_string_view(p.size(), buffer)); sajson::mutable_string_view(p.size(), buffer));
answer.valid = d.is_valid(); if (!d.is_valid()) {
if (!answer.valid) {
return answer; return answer;
} }
answer.number_count = 0;
answer.object_count = 0;
answer.array_count = 0;
answer.null_count = 0;
answer.true_count = 0;
answer.false_count = 0;
sajson_traverse(answer, d.get_root()); sajson_traverse(answer, d.get_root());
free(buffer); free(buffer);
remove_duplicates(answer);
return answer; return answer;
} }
void rapid_traverse(stat_t &stats, const rapidjson::Value &v) { void rapid_traverse(std::vector<int64_t> &answer, const rapidjson::Value &v) {
switch (v.GetType()) { switch (v.GetType()) {
case kNullType:
stats.null_count++;
break;
case kFalseType:
stats.false_count++;
break;
case kTrueType:
stats.true_count++;
break;
case kObjectType: case kObjectType:
for (Value::ConstMemberIterator m = v.MemberBegin(); m != v.MemberEnd(); for (Value::ConstMemberIterator m = v.MemberBegin(); m != v.MemberEnd();
++m) { ++m) {
rapid_traverse(stats, m->value); if (equals(m->name.GetString(), "user")) {
const rapidjson::Value &child = m->value;
if (child.GetType() == kObjectType) {
for (Value::ConstMemberIterator k = child.MemberBegin();
k != child.MemberEnd(); ++k) {
if (equals(k->name.GetString(), "id")) {
const rapidjson::Value &val = k->value;
if (val.GetType() == kNumberType) {
answer.push_back(val.GetInt64());
}
}
}
}
}
rapid_traverse(answer, m->value);
} }
stats.object_count++;
break; break;
case kArrayType: case kArrayType:
for (Value::ConstValueIterator i = v.Begin(); i != v.End(); for (Value::ConstValueIterator i = v.Begin(); i != v.End();
++i) { // v.Size(); ++i) { // v.Size();
rapid_traverse(stats, *i); rapid_traverse(answer, *i);
} }
stats.array_count++;
break; break;
case kNullType:
case kFalseType:
case kTrueType:
case kStringType: case kStringType:
break;
case kNumberType: case kNumberType:
stats.number_count++; default:
break; break;
} }
} }
stat_t rapid_computestats(const std::string_view &p) { std::vector<int64_t> rapid_computestats(const std::string_view &p) {
stat_t answer; std::vector<int64_t> answer;
char *buffer = (char *)malloc(p.size() + 1); char *buffer = (char *)malloc(p.size() + 1);
memcpy(buffer, p.data(), p.size()); memcpy(buffer, p.data(), p.size());
buffer[p.size()] = '\0'; buffer[p.size()] = '\0';
rapidjson::Document d; rapidjson::Document d;
d.ParseInsitu<kParseValidateEncodingFlag>(buffer); d.ParseInsitu<kParseValidateEncodingFlag>(buffer);
answer.valid = !d.HasParseError(); if (d.HasParseError()) {
if (!answer.valid) {
return answer; return answer;
} }
answer.number_count = 0;
answer.object_count = 0;
answer.array_count = 0;
answer.null_count = 0;
answer.true_count = 0;
answer.false_count = 0;
rapid_traverse(answer, d); rapid_traverse(answer, d);
free(buffer); free(buffer);
remove_duplicates(answer);
return answer; return answer;
} }
@ -262,29 +258,32 @@ int main(int argc, char *argv[]) {
std::cout << p.size() << " B "; std::cout << p.size() << " B ";
std::cout << std::endl; std::cout << std::endl;
} }
stat_t s1 = simdjson_computestats(p); std::vector<int64_t> s1 = simdjson_computestats(p);
if (verbose) { if (verbose) {
printf("simdjson: "); printf("simdjson: ");
print_stat(s1); print_vec(s1);
} }
stat_t s2 = rapid_computestats(p); std::vector<int64_t> s2 = rapid_computestats(p);
if (verbose) { if (verbose) {
printf("rapid: "); printf("rapid: ");
print_stat(s2); print_vec(s2);
} }
stat_t s3 = sasjon_computestats(p); std::vector<int64_t> s3 = sasjon_computestats(p);
if (verbose) { if (verbose) {
printf("sasjon: "); printf("sasjon: ");
print_stat(s3); print_vec(s3);
} }
assert(stat_equal(s1, s2)); assert(s1 == s2);
assert(stat_equal(s1, s3)); assert(s1 == s3);
size_t size = s1.size();
int repeat = 10; int repeat = 10;
int volume = p.size(); int volume = p.size();
BEST_TIME("simdjson ", simdjson_computestats(p).valid, true, , repeat, BEST_TIME("simdjson ", simdjson_computestats(p).size(), size, , repeat,
volume, true); volume, true);
BEST_TIME("rapid ", rapid_computestats(p).valid, true, , repeat, volume,
BEST_TIME("rapid ", rapid_computestats(p).size(), size, , repeat, volume,
true); true);
BEST_TIME("sasjon ", sasjon_computestats(p).valid, true, , repeat, volume, BEST_TIME("sasjon ", sasjon_computestats(p).size(), size, , repeat, volume,
true); true);
} }

View File

@ -26,16 +26,6 @@ name;
#include "sajson.h" #include "sajson.h"
#include "fastjson.cpp"
#include "fastjson_dom.cpp"
#include "gason.cpp"
#include "json11.cpp"
#include "sajson.h"
extern "C" {
#include "ujdecode.h"
#include "ultrajsondec.c"
}
using namespace rapidjson; using namespace rapidjson;
using namespace std; using namespace std;

View File

@ -395,7 +395,6 @@ public:
} }
// move forward in document order // move forward in document order
WARN_UNUSED
bool move_forward() { bool move_forward() {
if(location + 1 >= tape_length) { if(location + 1 >= tape_length) {
return false; // we are at the end! return false; // we are at the end!
@ -427,13 +426,11 @@ public:
// retrieve the character code of what we're looking at: // retrieve the character code of what we're looking at:
// [{"sltfn are the possibilities // [{"sltfn are the possibilities
WARN_UNUSED
really_inline u8 get_type() const { really_inline u8 get_type() const {
return current_type; return current_type;
} }
// get the s64 value at this node; valid only if we're at "l" // get the s64 value at this node; valid only if we're at "l"
WARN_UNUSED
really_inline s64 get_integer() const { really_inline s64 get_integer() const {
if(location + 1 >= tape_length) return 0;// default value in case of error if(location + 1 >= tape_length) return 0;// default value in case of error
return (s64) pj.tape[location + 1]; return (s64) pj.tape[location + 1];
@ -441,7 +438,6 @@ public:
// get the double value at this node; valid only if // get the double value at this node; valid only if
// we're at "d" // we're at "d"
WARN_UNUSED
really_inline double get_double() const { really_inline double get_double() const {
if(location + 1 >= tape_length) return NAN;// default value in case of error if(location + 1 >= tape_length) return NAN;// default value in case of error
double answer; double answer;
@ -449,10 +445,54 @@ public:
return answer; return answer;
} }
bool is_object_or_array() const {
return is_object_or_array(get_type());
}
bool is_object() const {
return get_type() == '{';
}
bool is_array() const {
return get_type() == '[';
}
bool is_string() const {
return get_type() == '"';
}
bool is_integer() const {
return get_type() == 'l';
}
bool is_double() const {
return get_type() == 'd';
}
// true when the given type code marks the start of an object ({) or of an
// array ([)
static bool is_object_or_array(u8 type) {
  if (type == '{') {
    return true;
  }
  return type == '[';
}
// when at {, go one level deep, looking for a given key
// if successful, we are left pointing at the value,
// if not, we are still pointing at the object ({)
// (in case of repeated keys, this only finds the first one)
//
// key: NUL-terminated name to look for, compared with strcmp.
// returns true if the key was found, false otherwise (including when
// down() fails, in which case the iterator has not moved).
bool move_to_key(const char * key) {
  if(down()) {
    do {
      assert(is_string());
      // compare before advancing: next() moves us off the key, onto its value
      const bool rightkey = (strcmp(get_string(),key)==0);
      next();
      if(rightkey) return true;
    } while(next());
    // not found: go back to the object. The call must live outside of
    // assert(), otherwise it is compiled out when NDEBUG is defined and we
    // would be left inside the object, breaking the contract stated above.
    const bool wentup = up();
    assert(wentup);
    (void) wentup; // silence the unused-variable warning in NDEBUG builds
  }
  return false;
}
// get the string value at this node (NULL ended); valid only if we're at " // get the string value at this node (NULL ended); valid only if we're at "
// note that tabs, and line endings are escaped in the returned value (see print_with_escapes) // note that tabs, and line endings are escaped in the returned value (see print_with_escapes)
// return value is valid UTF-8 // return value is valid UTF-8
WARN_UNUSED
really_inline const char * get_string() const { really_inline const char * get_string() const {
return (const char *)(pj.string_buf + (current_val & JSONVALUEMASK)) ; return (const char *)(pj.string_buf + (current_val & JSONVALUEMASK)) ;
} }
@ -465,7 +505,6 @@ public:
// Thus, given [true, null, {"a":1}, [1,2]], we would visit true, null, { and [. // Thus, given [true, null, {"a":1}, [1,2]], we would visit true, null, { and [.
// At the object ({) or at the array ([), you can issue a "down" to visit their content. // At the object ({) or at the array ([), you can issue a "down" to visit their content.
// valid if we're not at the end of a scope (returns true). // valid if we're not at the end of a scope (returns true).
WARN_UNUSED
really_inline bool next() { really_inline bool next() {
if ((current_type == '[') || (current_type == '{')){ if ((current_type == '[') || (current_type == '{')){
// we need to jump // we need to jump
@ -498,12 +537,12 @@ public:
} }
// Within a given scope (series of nodes at the same depth within either an // Within a given scope (series of nodes at the same depth within either an
// array or an object), we move backward. // array or an object), we move backward.
// Thus, given [true, null, {"a":1}, [1,2]], we would visit ], }, null, true when starting at the end // Thus, given [true, null, {"a":1}, [1,2]], we would visit ], }, null, true when starting at the end
// of the scope. // of the scope.
// At the object ({) or at the array ([), you can issue a "down" to visit their content. // At the object ({) or at the array ([), you can issue a "down" to visit their content.
WARN_UNUSED
really_inline bool prev() { really_inline bool prev() {
if(location - 1 < depthindex[depth].start_of_scope) return false; if(location - 1 < depthindex[depth].start_of_scope) return false;
location -= 1; location -= 1;
@ -526,7 +565,6 @@ public:
// within a contained scope. // within a contained scope.
// Valid unless we are at the first level of the document // Valid unless we are at the first level of the document
// //
WARN_UNUSED
really_inline bool up() { really_inline bool up() {
if(depth == 1) { if(depth == 1) {
return false; // don't allow moving back to root return false; // don't allow moving back to root
@ -545,7 +583,6 @@ public:
// that deeper scope if it is not empty. // that deeper scope if it is not empty.
// Thus, given [true, null, {"a":1}, [1,2]], if we are at the { node, we would move to the // Thus, given [true, null, {"a":1}, [1,2]], if we are at the { node, we would move to the
// "a" node. // "a" node.
WARN_UNUSED
really_inline bool down() { really_inline bool down() {
if(location + 1 >= tape_length) return false; if(location + 1 >= tape_length) return false;
if ((current_type == '[') || (current_type == '{')) { if ((current_type == '[') || (current_type == '{')) {

View File

@ -0,0 +1 @@
{"name":1,"name":2, "this is allowable as per the json spec": true}

View File

@ -2,6 +2,7 @@
SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )" SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )"
cd $SCRIPTPATH/.. cd $SCRIPTPATH/..
make parseandstatcompetition make parseandstatcompetition
echo "parsing and collecting basic stats on json documents as quickly as possible"
echo echo
for i in $SCRIPTPATH/../jsonexamples/*.json; do for i in $SCRIPTPATH/../jsonexamples/*.json; do
[ -f "$i" ] || break [ -f "$i" ] || break
@ -9,3 +10,15 @@ for i in $SCRIPTPATH/../jsonexamples/*.json; do
$SCRIPTPATH/../parseandstatcompetition $i $SCRIPTPATH/../parseandstatcompetition $i
echo echo
done done
# Build the user.id benchmark, then run it on the twitter sample document.
make distinctuseridcompetition
echo "parsing and finding all user.id"
echo
for i in "$SCRIPTPATH"/../jsonexamples/twitter.json; do
  # Stop if the sample file is missing.
  [ -f "$i" ] || break
  echo "$i"
  # Run on the path that was just checked and printed, rather than on a
  # hard-coded copy of it that only resolves from one working directory.
  "$SCRIPTPATH"/../distinctuseridcompetition "$i"
  echo
done

View File

@ -7,49 +7,37 @@
using namespace std; using namespace std;
void compute_dump(ParsedJson::iterator &pjh) { void compute_dump(ParsedJson::iterator &pjh) {
bool inobject = (pjh.get_type() == '{'); if (pjh.is_object()) {
bool inarray = (pjh.get_type() == '[');
if ((!inobject) && (!inarray)) {
pjh.print(std::cout); // just print the lone value
return; // we are done
}
// we have either an array or an object
bool goingdown = pjh.down();
if(!goingdown) {
// we have an empty scope
if(inobject) std::cout<<"{}";
else std::cout<<"[]";
return;
}
// we have a non-empty scope and we are at the beginning of it
if (inobject) {
assert(pjh.get_scope_type() == '{');
std::cout << "{"; std::cout << "{";
assert(pjh.get_type() == '"'); if (pjh.down()) {
pjh.print(std::cout); // must be a string pjh.print(std::cout); // must be a string
std::cout << ":";
assert(pjh.next());
compute_dump(pjh); // let us recurse
while (pjh.next()) {
std::cout << ",";
assert(pjh.get_type() == '"');
pjh.print(std::cout);
std::cout << ":"; std::cout << ":";
assert(pjh.next()); pjh.next();
compute_dump(pjh); // let us recurse compute_dump(pjh); // let us recurse
while (pjh.next()) {
std::cout << ",";
pjh.print(std::cout);
std::cout << ":";
pjh.next();
compute_dump(pjh); // let us recurse
}
pjh.up();
} }
std::cout << "}"; std::cout << "}";
} else { } else if (pjh.is_array()) {
assert(pjh.get_scope_type() == '[');
std::cout << "["; std::cout << "[";
compute_dump(pjh); // let us recurse if (pjh.down()) {
while (pjh.next()) {
std::cout << ",";
compute_dump(pjh); // let us recurse compute_dump(pjh); // let us recurse
while (pjh.next()) {
std::cout << ",";
compute_dump(pjh); // let us recurse
}
pjh.up();
} }
std::cout << "]"; std::cout << "]";
} else {
pjh.print(std::cout); // just print the lone value
} }
assert(pjh.up());
} }
int main(int argc, char *argv[]) { int main(int argc, char *argv[]) {
@ -93,7 +81,7 @@ int main(int argc, char *argv[]) {
return EXIT_FAILURE; return EXIT_FAILURE;
} }
bool is_ok = json_parse(p, pj); // do the parsing, return false on error bool is_ok = json_parse(p, pj); // do the parsing, return false on error
free((void*)p.data()); free((void *)p.data());
if (!is_ok) { if (!is_ok) {
std::cerr << " Parsing failed. " << std::endl; std::cerr << " Parsing failed. " << std::endl;
return EXIT_FAILURE; return EXIT_FAILURE;