From 0769c39e270254c39b410f9f4077c66f6910cf5e Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Fri, 14 Dec 2018 21:32:42 -0500 Subject: [PATCH] Ok. Looks complete. --- Makefile | 33 ++-- README.md | 99 +++++++++- benchmark/distinctuseridcompetition.cpp | 245 ++++++++++++------------ benchmark/parseandstatcompetition.cpp | 10 - include/simdjson/parsedjson.h | 55 +++++- jsonchecker/fail39_EXCLUDE.json | 1 + scripts/parseandstat.sh | 13 ++ tools/json2json.cpp | 56 +++--- 8 files changed, 321 insertions(+), 191 deletions(-) create mode 100644 jsonchecker/fail39_EXCLUDE.json diff --git a/Makefile b/Makefile index 0933e10b..792a22a3 100644 --- a/Makefile +++ b/Makefile @@ -5,9 +5,9 @@ .PHONY: clean cleandist - -DEPSINCLUDE = -Idependencies/rapidjson/include -Idependencies/sajson/include -Idependencies/json11 -Idependencies/fastjson/src -Idependencies/fastjson/include -Idependencies/gason/src -Idependencies/ujson4c/3rdparty -Idependencies/ujson4c/src -CXXFLAGS = -std=c++17 -march=native -Wall -Wextra -Wshadow -Iinclude -Ibenchmark/linux $(DEPSINCLUDE) +COREDEPSINCLUDE = -Idependencies/rapidjson/include -Idependencies/sajson/include +EXTRADEPSINCLUDE = -Idependencies/json11 -Idependencies/fastjson/src -Idependencies/fastjson/include -Idependencies/gason/src -Idependencies/ujson4c/3rdparty -Idependencies/ujson4c/src +CXXFLAGS = -std=c++17 -march=native -Wall -Wextra -Wshadow -Iinclude -Ibenchmark/linux CFLAGS = -march=native -Idependencies/ujson4c/3rdparty -Idependencies/ujson4c/src ifeq ($(SANITIZE),1) CXXFLAGS += -g3 -O0 -fsanitize=address -fno-omit-frame-pointer -fsanitize=undefined @@ -24,7 +24,7 @@ endif MAINEXECUTABLES=parse minify json2json TESTEXECUTABLES=jsoncheck numberparsingcheck stringparsingcheck -COMPARISONEXECUTABLES=minifiercompetition parsingcompetition parseandstatcompetition allparserscheckfile +COMPARISONEXECUTABLES=minifiercompetition parsingcompetition parseandstatcompetition distinctuseridcompetition allparserscheckfile HEADERS= 
include/simdjson/simdutf8check.h include/simdjson/stringparsing.h include/simdjson/numberparsing.h include/simdjson/jsonparser.h include/simdjson/common_defs.h include/simdjson/jsonioutil.h benchmark/benchmark.h benchmark/linux/linux-perf-events.h include/simdjson/parsedjson.h include/simdjson/stage1_find_marks.h include/simdjson/stage2_flatten.h include/simdjson/stage34_unified.h include/simdjson/jsoncharutils.h include/simdjson/jsonformatutils.h LIBFILES=src/jsonioutil.cpp src/jsonparser.cpp src/stage1_find_marks.cpp src/stage2_flatten.cpp src/stage34_unified.cpp @@ -40,9 +40,11 @@ GASON_INCLUDE:=dependencies/gason/src/gason.h UJSON4C_INCLUDE:=dependencies/ujson4c/src/ujdecode.c LIBS=$(RAPIDJSON_INCLUDE) $(SAJSON_INCLUDE) $(JSON11_INCLUDE) $(FASTJSON_INCLUDE) $(GASON_INCLUDE) $(UJSON4C_INCLUDE) -OBJECTS=ujdecode.o +EXTRAOBJECTS=ujdecode.o all: $(MAINEXECUTABLES) +competition: $(COMPARISONEXECUTABLES) + test: jsoncheck numberparsingcheck stringparsingcheck ./numberparsingcheck ./stringparsingcheck @@ -91,7 +93,7 @@ stringparsingcheck:tests/stringparsingcheck.cpp $(HEADERS) $(LIBFILES) minifiercompetition: benchmark/minifiercompetition.cpp $(HEADERS) $(MINIFIERHEADERS) $(LIBFILES) $(MINIFIERLIBFILES) - $(CXX) $(CXXFLAGS) -o minifiercompetition $(LIBFILES) $(MINIFIERLIBFILES) benchmark/minifiercompetition.cpp -I. $(LIBFLAGS) + $(CXX) $(CXXFLAGS) -o minifiercompetition $(LIBFILES) $(MINIFIERLIBFILES) benchmark/minifiercompetition.cpp -I. $(LIBFLAGS) $(COREDEPSINCLUDE) minify: tools/minify.cpp $(HEADERS) $(MINIFIERHEADERS) $(LIBFILES) $(MINIFIERLIBFILES) $(CXX) $(CXXFLAGS) -o minify $(MINIFIERLIBFILES) $(LIBFILES) tools/minify.cpp -I. 
@@ -103,15 +105,18 @@ json2json: tools/json2json.cpp $(HEADERS) $(LIBFILES) ujdecode.o: $(UJSON4C_INCLUDE) $(CC) $(CFLAGS) -c dependencies/ujson4c/src/ujdecode.c -parseandstatcompetition: benchmark/parseandstatcompetition.cpp $(HEADERS) $(LIBFILES) $(OBJECTS) - $(CXX) $(CXXFLAGS) -o parseandstatcompetition $(LIBFILES) benchmark/parseandstatcompetition.cpp $(OBJECTS) -I. $(LIBFLAGS) +parseandstatcompetition: benchmark/parseandstatcompetition.cpp $(HEADERS) $(LIBFILES) + $(CXX) $(CXXFLAGS) -o parseandstatcompetition $(LIBFILES) benchmark/parseandstatcompetition.cpp -I. $(LIBFLAGS) $(COREDEPSINCLUDE) + +distinctuseridcompetition: benchmark/distinctuseridcompetition.cpp $(HEADERS) $(LIBFILES) + $(CXX) $(CXXFLAGS) -o distinctuseridcompetition $(LIBFILES) benchmark/distinctuseridcompetition.cpp -I. $(LIBFLAGS) $(COREDEPSINCLUDE) -parsingcompetition: benchmark/parsingcompetition.cpp $(HEADERS) $(LIBFILES) $(OBJECTS) - $(CXX) $(CXXFLAGS) -o parsingcompetition $(LIBFILES) benchmark/parsingcompetition.cpp $(OBJECTS) -I. $(LIBFLAGS) +parsingcompetition: benchmark/parsingcompetition.cpp $(HEADERS) $(LIBFILES) $(EXTRAOBJECTS) + $(CXX) $(CXXFLAGS) -o parsingcompetition $(LIBFILES) benchmark/parsingcompetition.cpp $(EXTRAOBJECTS) -I. $(LIBFLAGS) $(COREDEPSINCLUDE) $(EXTRADEPSINCLUDE) -allparserscheckfile: tests/allparserscheckfile.cpp $(HEADERS) $(LIBFILES) $(OBJECTS) - $(CXX) $(CXXFLAGS) -o allparserscheckfile $(LIBFILES) tests/allparserscheckfile.cpp $(OBJECTS) -I. $(LIBFLAGS) +allparserscheckfile: tests/allparserscheckfile.cpp $(HEADERS) $(LIBFILES) $(EXTRAOBJECTS) + $(CXX) $(CXXFLAGS) -o allparserscheckfile $(LIBFILES) tests/allparserscheckfile.cpp $(EXTRAOBJECTS) -I. 
$(LIBFLAGS) $(COREDEPSINCLUDE) $(EXTRADEPSINCLUDE) parsehisto: benchmark/parse.cpp $(HEADERS) $(LIBFILES) $(CXX) $(CXXFLAGS) -o parsehisto benchmark/parse.cpp $(LIBFILES) $(LIBFLAGS) -DBUILDHISTOGRAM @@ -121,7 +126,7 @@ cppcheck: clean: - rm -f $(OBJECTS) $(MAINEXECUTABLES) $(EXTRA_EXECUTABLES) $(TESTEXECUTABLES) $(COMPARISONEXECUTABLES) + rm -f $(EXTRAOBJECTS) $(MAINEXECUTABLES) $(EXTRA_EXECUTABLES) $(TESTEXECUTABLES) $(COMPARISONEXECUTABLES) cleandist: - rm -f $(OBJECTS) $(MAINEXECUTABLES) $(EXTRA_EXECUTABLES) $(TESTEXECUTABLES) $(COMPARISONEXECUTABLES) + rm -f $(EXTRAOBJECTS) $(MAINEXECUTABLES) $(EXTRA_EXECUTABLES) $(TESTEXECUTABLES) $(COMPARISONEXECUTABLES) diff --git a/README.md b/README.md index 9e723054..4f2c961d 100644 --- a/README.md +++ b/README.md @@ -90,6 +90,7 @@ To simplify the engineering, we make some assumptions. - We assume AVX2 support which is available in all recent mainstream x86 processors produced by AMD and Intel. No support for non-x86 processors is included though it can be done. We plan to support ARM processors (help is invited). - We only support GNU GCC and LLVM Clang at this time. There is no support for Microsoft Visual Studio, though it should not be difficult (help is invited). - In cases of failure, we just report a failure without any indication as to the nature of the problem. (This can be easily improved without affecting performance.) +- As allowed by the specification, we allow repeated keys within an object (other parsers like sajson do the same). *We do not aim to provide a general-purpose JSON library.* A library like RapidJSON offers much more than just parsing, it helps you generate JSON and offers various other convenient functions. We merely parse the document. @@ -97,7 +98,7 @@ To simplify the engineering, we make some assumptions. ## Features - The input string is unmodified. (Parsers like sajson and RapidJSON use the input string as a buffer.) 
-- We parse integers and floating-point numbers as separate types which allows us to support large 64-bit integers. +- We parse integers and floating-point numbers as separate types which allows us to support large 64-bit integers in [-9223372036854775808,9223372036854775808). Among the parsers that differentiate between integers and floating-point numbers, not all support 64-bit integers. (For example, sajson stores integers larger than 2147483648 as floating-point numbers.) - We do full UTF-8 validation as part of the parsing. (Parsers like fastjson, gason and dropbox json11 do not do UTF-8 validation.) - We fully validate the numbers. (Parsers like gason and ultranjson will accept `[0e+]` as valid JSON.) - We validate string content for unescaped characters. (Parsers like fastjson and ultrajson accept unescaped line breaks and tags in strings.) @@ -111,6 +112,102 @@ The parser works in three stages: - Stage 3. (Structure building) Involves constructing a "tree" of sort to navigate through the data. Strings and numbers are parsed at this stage. +## Navigating the parsed document + +Here is a code sample to dump back the parsed JSON to a string: + +```c + ParsedJson::iterator pjh(pj); + if (!pjh.isOk()) { + std::cerr << " Could not iterate parsed result. 
" << std::endl; + return EXIT_FAILURE; + } + compute_dump(pjh); + // + // where compute_dump is : + +void compute_dump(ParsedJson::iterator &pjh) { + if (pjh.is_object()) { + std::cout << "{"; + if (pjh.down()) { + pjh.print(std::cout); // must be a string + std::cout << ":"; + pjh.next(); + compute_dump(pjh); // let us recurse + while (pjh.next()) { + std::cout << ","; + pjh.print(std::cout); + std::cout << ":"; + pjh.next(); + compute_dump(pjh); // let us recurse + } + pjh.up(); + } + std::cout << "}"; + } else if (pjh.is_array()) { + std::cout << "["; + if (pjh.down()) { + compute_dump(pjh); // let us recurse + while (pjh.next()) { + std::cout << ","; + compute_dump(pjh); // let us recurse + } + pjh.up(); + } + std::cout << "]"; + } else { + pjh.print(std::cout); // just print the lone value + } +} +``` + +The following function will find all user.id integers: + +```C +void simdjson_traverse(std::vector &answer, ParsedJson::iterator &i) { + switch (i.get_type()) { + case '{': + if (i.down()) { + do { + bool founduser = equals(i.get_string(), "user"); + i.next(); // move to value + if (i.is_object()) { + if (founduser && i.move_to_key("id")) { + if (i.is_integer()) { + answer.push_back(i.get_integer()); + } + i.up(); + } + simdjson_traverse(answer, i); + } else if (i.is_array()) { + simdjson_traverse(answer, i); + } + } while (i.next()); + i.up(); + } + break; + case '[': + if (i.down()) { + do { + if (i.is_object_or_array()) { + simdjson_traverse(answer, i); + } + } while (i.next()); + i.up(); + } + break; + case 'l': + case 'd': + case 'n': + case 't': + case 'f': + default: + break; + } +} +``` + + ## Various References - [Google double-conv](https://github.com/google/double-conversion/) diff --git a/benchmark/distinctuseridcompetition.cpp b/benchmark/distinctuseridcompetition.cpp index c9c39d6b..32121f0d 100644 --- a/benchmark/distinctuseridcompetition.cpp +++ b/benchmark/distinctuseridcompetition.cpp @@ -1,5 +1,7 @@ #include "simdjson/jsonparser.h" +#include 
#include +#include #include "benchmark.h" @@ -26,108 +28,112 @@ name; #include "sajson.h" -#include "fastjson.cpp" -#include "fastjson_dom.cpp" -#include "gason.cpp" -#include "json11.cpp" -#include "sajson.h" -extern "C" { -#include "ujdecode.h" -#include "ultrajsondec.c" -} - using namespace rapidjson; using namespace std; +bool equals(const char *s1, const char *s2) { return strcmp(s1, s2) == 0; } +void remove_duplicates(vector &v) { + std::sort(v.begin(), v.end()); + auto last = std::unique(v.begin(), v.end()); + v.erase(last, v.end()); +} +void print_vec(vector &v) { + for (auto i : v) { + std::cout << i << " "; + } + std::cout << std::endl; +} +void simdjson_traverse(std::vector &answer, ParsedJson::iterator &i) { + switch (i.get_type()) { + case '{': + if (i.down()) { + do { + bool founduser = equals(i.get_string(), "user"); + i.next(); // move to value + if (i.is_object()) { + if (founduser && i.move_to_key("id")) { + if (i.is_integer()) { + answer.push_back(i.get_integer()); + } + i.up(); + } + simdjson_traverse(answer, i); + } else if (i.is_array()) { + simdjson_traverse(answer, i); + } + } while (i.next()); + i.up(); + } + break; + case '[': + if (i.down()) { + do { + if (i.is_object_or_array()) { + simdjson_traverse(answer, i); + } + } while (i.next()); + i.up(); + } + break; + case 'l': + case 'd': + case 'n': + case 't': + case 'f': + default: + break; + } +} std::vector simdjson_computestats(const std::string_view &p) { std::vector answer; ParsedJson pj = build_parsed_json(p); - answer.valid = pj.isValid(); - if (!answer.valid) { + if (!pj.isValid()) { return answer; } - answer.number_count = 0; - answer.object_count = 0; - answer.array_count = 0; - answer.null_count = 0; - answer.true_count = 0; - answer.false_count = 0; - size_t tapeidx = 0; - u64 tape_val = pj.tape[tapeidx++]; - u8 type = (tape_val >> 56); - size_t howmany = 0; - assert(type == 'r'); - howmany = tape_val & JSONVALUEMASK; - for (; tapeidx < howmany; tapeidx++) { - tape_val = 
pj.tape[tapeidx]; - // u64 payload = tape_val & JSONVALUEMASK; - type = (tape_val >> 56); - switch (type) { - case 'l': // we have a long int - answer.number_count++; - tapeidx++; // skipping the integer - break; - case 'd': // we have a double - answer.number_count++; - tapeidx++; // skipping the double - break; - case 'n': // we have a null - answer.null_count++; - break; - case 't': // we have a true - answer.true_count++; - break; - case 'f': // we have a false - answer.false_count++; - break; - case '{': // we have an object - answer.object_count++; - break; - case '}': // we end an object - break; - case '[': // we start an array - answer.array_count++; - break; - case ']': // we end an array - break; - default: - break; // ignore - } - } + ParsedJson::iterator i(pj); + + simdjson_traverse(answer, i); + remove_duplicates(answer); return answer; } - -void sajson_traverse(stat_t &stats, const sajson::value &node) { +void sajson_traverse(std::vector &answer, const sajson::value &node) { using namespace sajson; switch (node.get_type()) { case TYPE_ARRAY: { - stats.array_count++; auto length = node.get_length(); for (size_t i = 0; i < length; ++i) { - sajson_traverse(stats, node.get_array_element(i)); + sajson_traverse(answer, node.get_array_element(i)); } break; } case TYPE_OBJECT: { - stats.object_count++; auto length = node.get_length(); for (auto i = 0u; i < length; ++i) { - if(strcmp(node.get_object_key(i), "user") == 0) { - auto child = node.get_object_value(i); - if(child.get_type() == TYPE_OBJECT) { - for (auto j = 0u; j < length; ++j) { - if(strcmp(node.get_object_key(i), "user") == 0) { - } - + if (equals(node.get_object_key(i).data(), "user")) { // found a user!!! 
+ auto uservalue = node.get_object_value(i); // get the value + if (uservalue.get_type() == + TYPE_OBJECT) { // the value should be an object + auto uservaluelength = uservalue.get_length(); + for (auto j = 0u; j < uservaluelength; + ++j) { // go through the children + if (equals(uservalue.get_object_key(j).data(), + "id")) { // ah ah found id + auto v = uservalue.get_object_value(j); + if (v.get_type() == TYPE_INTEGER) { // check that it is an integer + answer.push_back(v.get_integer_value()); // record it! + } else if (v.get_type() == TYPE_DOUBLE) { + answer.push_back((int64_t)v.get_double_value()); // record it! } + } } + } } - sajson_traverse(stats, node.get_object_value(i)); + sajson_traverse(answer, node.get_object_value(i)); } break; } @@ -143,82 +149,72 @@ void sajson_traverse(stat_t &stats, const sajson::value &node) { } } -stat_t sasjon_computestats(const std::string_view &p) { - stat_t answer; +std::vector sasjon_computestats(const std::string_view &p) { + std::vector answer; char *buffer = (char *)malloc(p.size()); memcpy(buffer, p.data(), p.size()); auto d = sajson::parse(sajson::dynamic_allocation(), sajson::mutable_string_view(p.size(), buffer)); - answer.valid = d.is_valid(); - if (!answer.valid) { + if (!d.is_valid()) { return answer; } - answer.number_count = 0; - answer.object_count = 0; - answer.array_count = 0; - answer.null_count = 0; - answer.true_count = 0; - answer.false_count = 0; sajson_traverse(answer, d.get_root()); free(buffer); + remove_duplicates(answer); return answer; } -void rapid_traverse(stat_t &stats, const rapidjson::Value &v) { +void rapid_traverse(std::vector &answer, const rapidjson::Value &v) { switch (v.GetType()) { - case kNullType: - stats.null_count++; - break; - case kFalseType: - stats.false_count++; - break; - case kTrueType: - stats.true_count++; - break; - case kObjectType: for (Value::ConstMemberIterator m = v.MemberBegin(); m != v.MemberEnd(); ++m) { - rapid_traverse(stats, m->value); + if 
(equals(m->name.GetString(), "user")) { + const rapidjson::Value &child = m->value; + if (child.GetType() == kObjectType) { + for (Value::ConstMemberIterator k = child.MemberBegin(); + k != child.MemberEnd(); ++k) { + if (equals(k->name.GetString(), "id")) { + const rapidjson::Value &val = k->value; + if (val.GetType() == kNumberType) { + answer.push_back(val.GetInt64()); + } + } + } + } + } + rapid_traverse(answer, m->value); } - stats.object_count++; break; case kArrayType: for (Value::ConstValueIterator i = v.Begin(); i != v.End(); ++i) { // v.Size(); - rapid_traverse(stats, *i); + rapid_traverse(answer, *i); } - stats.array_count++; break; - + case kNullType: + case kFalseType: + case kTrueType: case kStringType: - break; - case kNumberType: - stats.number_count++; + default: break; } } -stat_t rapid_computestats(const std::string_view &p) { - stat_t answer; +std::vector rapid_computestats(const std::string_view &p) { + std::vector answer; char *buffer = (char *)malloc(p.size() + 1); memcpy(buffer, p.data(), p.size()); buffer[p.size()] = '\0'; rapidjson::Document d; d.ParseInsitu(buffer); - answer.valid = !d.HasParseError(); - if (!answer.valid) { + if (d.HasParseError()) { return answer; } - answer.number_count = 0; - answer.object_count = 0; - answer.array_count = 0; - answer.null_count = 0; - answer.true_count = 0; - answer.false_count = 0; rapid_traverse(answer, d); free(buffer); + remove_duplicates(answer); return answer; } @@ -262,29 +258,32 @@ int main(int argc, char *argv[]) { std::cout << p.size() << " B "; std::cout << std::endl; } - stat_t s1 = simdjson_computestats(p); + std::vector s1 = simdjson_computestats(p); if (verbose) { printf("simdjson: "); - print_stat(s1); + print_vec(s1); } - stat_t s2 = rapid_computestats(p); + std::vector s2 = rapid_computestats(p); if (verbose) { printf("rapid: "); - print_stat(s2); + print_vec(s2); } - stat_t s3 = sasjon_computestats(p); + std::vector s3 = sasjon_computestats(p); if (verbose) { printf("sasjon: "); - 
print_stat(s3); + print_vec(s3); } - assert(stat_equal(s1, s2)); - assert(stat_equal(s1, s3)); + assert(s1 == s2); + assert(s1 == s3); + size_t size = s1.size(); + int repeat = 10; int volume = p.size(); - BEST_TIME("simdjson ", simdjson_computestats(p).valid, true, , repeat, + BEST_TIME("simdjson ", simdjson_computestats(p).size(), size, , repeat, volume, true); - BEST_TIME("rapid ", rapid_computestats(p).valid, true, , repeat, volume, + + BEST_TIME("rapid ", rapid_computestats(p).size(), size, , repeat, volume, true); - BEST_TIME("sasjon ", sasjon_computestats(p).valid, true, , repeat, volume, + BEST_TIME("sasjon ", sasjon_computestats(p).size(), size, , repeat, volume, true); } diff --git a/benchmark/parseandstatcompetition.cpp b/benchmark/parseandstatcompetition.cpp index 99d8e5d9..8b935b28 100644 --- a/benchmark/parseandstatcompetition.cpp +++ b/benchmark/parseandstatcompetition.cpp @@ -26,16 +26,6 @@ name; #include "sajson.h" -#include "fastjson.cpp" -#include "fastjson_dom.cpp" -#include "gason.cpp" -#include "json11.cpp" -#include "sajson.h" -extern "C" { -#include "ujdecode.h" -#include "ultrajsondec.c" -} - using namespace rapidjson; using namespace std; diff --git a/include/simdjson/parsedjson.h b/include/simdjson/parsedjson.h index 90e4af4d..93933908 100644 --- a/include/simdjson/parsedjson.h +++ b/include/simdjson/parsedjson.h @@ -395,7 +395,6 @@ public: } // move forward in document order - WARN_UNUSED bool move_forward() { if(location + 1 >= tape_length) { return false; // we are at the end! 
@@ -427,13 +426,11 @@ public:
 
     // retrieve the character code of what we're looking at:
     // [{"sltfn are the possibilities
-    WARN_UNUSED
     really_inline u8 get_type() const {
       return current_type;
     }
 
     // get the s64 value at this node; valid only if we're at "l"
-    WARN_UNUSED
     really_inline s64 get_integer() const {
        if(location + 1 >= tape_length) return 0;// default value in case of error
        return (s64) pj.tape[location + 1];
@@ -441,7 +438,6 @@ public:
 
     // get the double value at this node; valid only if
     // we're at "d"
-    WARN_UNUSED
     really_inline double get_double() const {
        if(location + 1 >= tape_length) return NAN;// default value in case of error
        double answer;
@@ -449,10 +445,54 @@ public:
        return answer;
     }
 
+    bool is_object_or_array() const {
+      return is_object_or_array(get_type());
+    }
+
+    bool is_object() const {
+      return get_type() == '{';
+    }
+
+    bool is_array() const {
+      return get_type() == '[';
+    }
+
+    bool is_string() const {
+      return get_type() == '"';
+    }
+
+    bool is_integer() const {
+      return get_type() == 'l';
+    }
+
+    bool is_double() const {
+      return get_type() == 'd';
+    }
+
+    static bool is_object_or_array(u8 type) {
+      return (type == '[' || (type == '{'));
+    }
+
+    // when at {, go one level deep, looking for a given (NUL-terminated) key
+    // if successful, returns true and we are left pointing at the value,
+    // if not, returns false and we are still pointing at the object ({)
+    // (in case of repeated keys, this only finds the first one)
+    bool move_to_key(const char * key) {
+      if(down()) {
+        do {
+          assert(is_string());
+          bool rightkey = (strcmp(get_string(),key)==0);
+          next();
+          if(rightkey) return true;
+        } while(next());
+        up(); // not found: climb back to the object; kept outside assert() so NDEBUG builds still move
+      }
+      return false;
+    }
+
     // get the string value at this node (NULL ended); valid only if we're at "
     // note that tabs, and line endings are escaped in the returned value (see print_with_escapes)
     // return value is valid UTF-8
-    WARN_UNUSED
     really_inline const char * get_string() const {
       return (const char *)(pj.string_buf + (current_val & JSONVALUEMASK)) ;
} @@ -465,7 +505,6 @@ public: // Thus, given [true, null, {"a":1}, [1,2]], we would visit true, null, { and [. // At the object ({) or at the array ([), you can issue a "down" to visit their content. // valid if we're not at the end of a scope (returns true). - WARN_UNUSED really_inline bool next() { if ((current_type == '[') || (current_type == '{')){ // we need to jump @@ -496,6 +535,7 @@ public: return true; } } + // Withing a given scope (series of nodes at the same depth within either an @@ -503,7 +543,6 @@ public: // Thus, given [true, null, {"a":1}, [1,2]], we would visit ], }, null, true when starting at the end // of the scope. // At the object ({) or at the array ([), you can issue a "down" to visit their content. - WARN_UNUSED really_inline bool prev() { if(location - 1 < depthindex[depth].start_of_scope) return false; location -= 1; @@ -526,7 +565,6 @@ public: // within a contained scope. // Valid unless we are at the first level of the document // - WARN_UNUSED really_inline bool up() { if(depth == 1) { return false; // don't allow moving back to root @@ -545,7 +583,6 @@ public: // that deeper scope if it not empty. // Thus, given [true, null, {"a":1}, [1,2]], if we are at the { node, we would move to the // "a" node. - WARN_UNUSED really_inline bool down() { if(location + 1 >= tape_length) return false; if ((current_type == '[') || (current_type == '{')) { diff --git a/jsonchecker/fail39_EXCLUDE.json b/jsonchecker/fail39_EXCLUDE.json new file mode 100644 index 00000000..c9520664 --- /dev/null +++ b/jsonchecker/fail39_EXCLUDE.json @@ -0,0 +1 @@ +{"name":1,"name":2, "this is allowable as per the json spec": true} diff --git a/scripts/parseandstat.sh b/scripts/parseandstat.sh index 7e38c2da..0ca2f8e5 100755 --- a/scripts/parseandstat.sh +++ b/scripts/parseandstat.sh @@ -2,6 +2,7 @@ SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )" cd $SCRIPTPATH/.. 
make parseandstatcompetition +echo "parsing and collecting basic stats on json documents as quickly as possible" echo for i in $SCRIPTPATH/../jsonexamples/*.json; do [ -f "$i" ] || break @@ -9,3 +10,15 @@ for i in $SCRIPTPATH/../jsonexamples/*.json; do $SCRIPTPATH/../parseandstatcompetition $i echo done + +make distinctuseridcompetition +echo "parsing and finding all user.id" +echo + +for i in $SCRIPTPATH/../jsonexamples/twitter.json; do + [ -f "$i" ] || break + echo $i + $SCRIPTPATH/../distinctuseridcompetition jsonexamples/twitter.json + echo +done + diff --git a/tools/json2json.cpp b/tools/json2json.cpp index f4a80285..67596b9f 100644 --- a/tools/json2json.cpp +++ b/tools/json2json.cpp @@ -7,49 +7,37 @@ using namespace std; void compute_dump(ParsedJson::iterator &pjh) { - bool inobject = (pjh.get_type() == '{'); - bool inarray = (pjh.get_type() == '['); - if ((!inobject) && (!inarray)) { - pjh.print(std::cout); // just print the lone value - return; // we are done - } - // we have either an array or an object - bool goingdown = pjh.down(); - if(!goingdown) { - // we have an empty scope - if(inobject) std::cout<<"{}"; - else std::cout<<"[]"; - return; - } - // we have a non-empty scope and we are at the beginning of it - if (inobject) { - assert(pjh.get_scope_type() == '{'); + if (pjh.is_object()) { std::cout << "{"; - assert(pjh.get_type() == '"'); - pjh.print(std::cout); // must be a string - std::cout << ":"; - assert(pjh.next()); - compute_dump(pjh); // let us recurse - while (pjh.next()) { - std::cout << ","; - assert(pjh.get_type() == '"'); - pjh.print(std::cout); + if (pjh.down()) { + pjh.print(std::cout); // must be a string std::cout << ":"; - assert(pjh.next()); + pjh.next(); compute_dump(pjh); // let us recurse + while (pjh.next()) { + std::cout << ","; + pjh.print(std::cout); + std::cout << ":"; + pjh.next(); + compute_dump(pjh); // let us recurse + } + pjh.up(); } std::cout << "}"; - } else { - assert(pjh.get_scope_type() == '['); + } else if 
(pjh.is_array()) { std::cout << "["; - compute_dump(pjh); // let us recurse - while (pjh.next()) { - std::cout << ","; + if (pjh.down()) { compute_dump(pjh); // let us recurse + while (pjh.next()) { + std::cout << ","; + compute_dump(pjh); // let us recurse + } + pjh.up(); } std::cout << "]"; + } else { + pjh.print(std::cout); // just print the lone value } - assert(pjh.up()); } int main(int argc, char *argv[]) { @@ -93,7 +81,7 @@ int main(int argc, char *argv[]) { return EXIT_FAILURE; } bool is_ok = json_parse(p, pj); // do the parsing, return false on error - free((void*)p.data()); + free((void *)p.data()); if (!is_ok) { std::cerr << " Parsing failed. " << std::endl; return EXIT_FAILURE;