From 14039d05a96f426b95b0e626721f62caeff847f8 Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Fri, 23 Oct 2020 08:47:01 -0400 Subject: [PATCH] Adding a new benchmark for ondemand: distinct user id (#1239) * Adding a distinct user id benchmark * reenabling everything * Removing an unnecessary "value()". * Better tests of the examples and some fixes. * Guarding exception code. --- benchmark/bench_ondemand.cpp | 4 + benchmark/distinctuserid/distinctuserid.h | 52 +++++++++++++ benchmark/distinctuserid/dom.h | 89 +++++++++++++++++++++++ benchmark/distinctuserid/ondemand.h | 67 +++++++++++++++++ doc/ondemand.md | 73 +++++++++++++++++-- tests/ondemand/ondemand_basictests.cpp | 85 ++++++++++++++++++++++ 6 files changed, 365 insertions(+), 5 deletions(-) create mode 100644 benchmark/distinctuserid/distinctuserid.h create mode 100644 benchmark/distinctuserid/dom.h create mode 100644 benchmark/distinctuserid/ondemand.h diff --git a/benchmark/bench_ondemand.cpp b/benchmark/bench_ondemand.cpp index bb1a2c42..4cb5ddc8 100644 --- a/benchmark/bench_ondemand.cpp +++ b/benchmark/bench_ondemand.cpp @@ -19,4 +19,8 @@ SIMDJSON_POP_DISABLE_WARNINGS #include "kostya/iter.h" #include "kostya/dom.h" +#include "distinctuserid/ondemand.h" +#include "distinctuserid/dom.h" + + BENCHMARK_MAIN(); diff --git a/benchmark/distinctuserid/distinctuserid.h b/benchmark/distinctuserid/distinctuserid.h new file mode 100644 index 00000000..49a61a08 --- /dev/null +++ b/benchmark/distinctuserid/distinctuserid.h @@ -0,0 +1,52 @@ + +#pragma once +#include +#include +#include "event_counter.h" +#include "json_benchmark.h" + + +bool equals(const char *s1, const char *s2) { return strcmp(s1, s2) == 0; } + +void remove_duplicates(std::vector &v) { + std::sort(v.begin(), v.end()); + auto last = std::unique(v.begin(), v.end()); + v.erase(last, v.end()); +} + +// +// Interface +// + +namespace distinct_user_id { +template static void DistinctUserID(benchmark::State &state); +} // namespace + +// +// 
Implementation +// + +#include "dom.h" + + +namespace distinct_user_id { + +using namespace simdjson; + +template static void DistinctUserID(benchmark::State &state) { + // + // Load the JSON file + // + constexpr const char *TWITTER_JSON = SIMDJSON_BENCHMARK_DATA_DIR "twitter.json"; + error_code error; + padded_string json; + if ((error = padded_string::load(TWITTER_JSON).get(json))) { + std::cerr << error << std::endl; + state.SkipWithError("error loading"); + return; + } + + JsonBenchmark(state, json); +} + +} // namespace distinct_user_id diff --git a/benchmark/distinctuserid/dom.h b/benchmark/distinctuserid/dom.h new file mode 100644 index 00000000..fd54f61b --- /dev/null +++ b/benchmark/distinctuserid/dom.h @@ -0,0 +1,89 @@ +#pragma once + +#if SIMDJSON_EXCEPTIONS + +#include "distinctuserid.h" + +namespace distinct_user_id { + +using namespace simdjson; + + +simdjson_really_inline void simdjson_recurse(std::vector & v, simdjson::dom::element element); +void simdjson_recurse(std::vector & v, simdjson::dom::array array) { + for (auto child : array) { + simdjson_recurse(v, child); + } +} +void simdjson_recurse(std::vector & v, simdjson::dom::object object) { + for (auto [key, value] : object) { + if((key.size() == 4) && (memcmp(key.data(), "user", 4) == 0)) { + // we are in an object under the key "user" + simdjson::error_code error; + simdjson::dom::object child_object; + simdjson::dom::object child_array; + if (not (error = value.get(child_object))) { + for (auto [child_key, child_value] : child_object) { + if((child_key.size() == 2) && (memcmp(child_key.data(), "id", 2) == 0)) { + int64_t x; + if (not (error = child_value.get(x))) { + v.push_back(x); + } + } + simdjson_recurse(v, child_value); + } + } else if (not (error = value.get(child_array))) { + simdjson_recurse(v, child_array); + } + // end of: we are in an object under the key "user" + } else { + simdjson_recurse(v, value); + } + } +} +simdjson_really_inline void simdjson_recurse(std::vector & v, 
simdjson::dom::element element) { + simdjson_unused simdjson::error_code error; + simdjson::dom::array array; + simdjson::dom::object object; + if (not (error = element.get(array))) { + simdjson_recurse(v, array); + } else if (not (error = element.get(object))) { + simdjson_recurse(v, object); + } +} + + + + + +class Dom { +public: + simdjson_really_inline bool Run(const padded_string &json); + simdjson_really_inline const std::vector &Result() { return ids; } + simdjson_really_inline size_t ItemCount() { return ids.size(); } + +private: + dom::parser parser{}; + std::vector ids{}; + +}; +void print_vec(const std::vector &v) { + for (auto i : v) { + std::cout << i << " "; + } + std::cout << std::endl; +} + +simdjson_really_inline bool Dom::Run(const padded_string &json) { + ids.clear(); + dom::element doc = parser.parse(json); + simdjson_recurse(ids, doc); + remove_duplicates(ids); + return true; +} + +BENCHMARK_TEMPLATE(DistinctUserID, Dom); + +} // namespace distinct_user_id + +#endif // SIMDJSON_EXCEPTIONS \ No newline at end of file diff --git a/benchmark/distinctuserid/ondemand.h b/benchmark/distinctuserid/ondemand.h new file mode 100644 index 00000000..fbb10fb9 --- /dev/null +++ b/benchmark/distinctuserid/ondemand.h @@ -0,0 +1,67 @@ +#pragma once + +#if SIMDJSON_EXCEPTIONS + +#include "distinctuserid.h" + +namespace distinct_user_id { + +using namespace simdjson; +using namespace simdjson::builtin; + + +class OnDemand { +public: + OnDemand() { + if(!displayed_implementation) { + std::cout << "On Demand implementation: " << builtin_implementation()->name() << std::endl; + displayed_implementation = true; + } + } + simdjson_really_inline bool Run(const padded_string &json); + simdjson_really_inline const std::vector &Result() { return ids; } + simdjson_really_inline size_t ItemCount() { return ids.size(); } + +private: + ondemand::parser parser{}; + std::vector ids{}; + + static inline bool displayed_implementation = false; +}; + +simdjson_really_inline bool 
OnDemand::Run(const padded_string &json) { + ids.clear(); + // Walk the document, parsing as we go + auto doc = parser.iterate(json); + for (ondemand::object tweet : doc["statuses"]) { + // We believe that all statuses have a matching + // user, and we are willing to throw when they do not: + // + // You might think that you do not need the braces, but + // you do, otherwise you will get the wrong answer. That is + // because you can only have one active object or array + // at a time. + { + ondemand::object user = tweet["user"]; + int64_t id = user["id"]; + ids.push_back(id); + } + // Not all tweets have a "retweeted_status", but when they do + // we want to go and find the user within. + auto retweet = tweet["retweeted_status"]; + if(!retweet.error()) { + ondemand::object retweet_content = retweet; + ondemand::object reuser = retweet_content["user"]; + int64_t rid = reuser["id"]; + ids.push_back(rid); + } + } + remove_duplicates(ids); + return true; +} + +BENCHMARK_TEMPLATE(DistinctUserID, OnDemand); + +} // namespace distinct_user_id + +#endif // SIMDJSON_EXCEPTIONS diff --git a/doc/ondemand.md b/doc/ondemand.md index 93a0eb4e..629cd776 100644 --- a/doc/ondemand.md +++ b/doc/ondemand.md @@ -29,6 +29,11 @@ auto doc = parser.iterate(json); for (auto tweet : doc["statuses"]) { std::string_view text = tweet["text"]; std::string_view screen_name = tweet["user"]["screen_name"]; + std::string_view screen_name; + { + ondemand::object user = tweet["user"]; + screen_name = user["screen_name"]; + } uint64_t retweets = tweet["retweet_count"]; uint64_t favorites = tweet["favorite_count"]; cout << screen_name << " (" << retweets << " retweets / " << favorites << " favorites): " << text << endl; @@ -313,6 +318,11 @@ To help visualize the algorithm, we'll walk through the example C++ given at the rely on error chaining, so it is possible to delay error checks: we shall shortly explain error chaining more fully. 
+ NOTE: You should always have such a `document` instance (here `doc`) and it should remain in scope for the duration + of your parsing function. For example, you should not use the returned document as a temporary (e.g., `auto x = parser.iterate(json).get_object();`) + followed by other operations, because the destruction of the `document` instance makes all of the derived instances + ill-defined. + 3. We iterate over the "statuses" field using a typical C++ iterator, reading past the initial `{ "statuses": [ {`. @@ -355,6 +365,11 @@ To help visualize the algorithm, we'll walk through the example C++ given at the when you attempt to cast the final `simdjson_result` to object. Upon casting, an exception is thrown if there was an error. + NOTE: While the document can be queried once for a key as if it were an object, it is not an actual object + instance. If you need to treat it as an object (e.g., to query more than one key), you can cast it as + such: `ondemand::object root_object = doc.get_object();`. + + 4. We get the `"text"` field as a string. ```c++ @@ -379,7 +394,8 @@ To help visualize the algorithm, we'll walk through the example C++ given at the 4. We get the `"screen_name"` from the `"user"` object. ```c++ - std::string_view screen_name = tweet["user"]["screen_name"]; + ondemand::object user = tweet["user"]; + screen_name = user["screen_name"]; ``` First, `["user"]` checks whether there are any more object fields by looking for either `,` or @@ -387,12 +403,19 @@ To help visualize the algorithm, we'll walk through the example C++ given at the `["screen_name"]` then converts to object, checking for `{`, and finds `"screen_name"`. - To convert to string, `lemire` is written to the document's string buffer, which now has *two* - string_views pointing into it, and looks like `first!\0lemire\0`. 
+ To convert the result to a usable string (i.e., the screen name `lemire`), the characters are written to the document's + string buffer (after possibly unescaping them), which now has *two* string_views pointing into it, and looks like `first!\0lemire\0`. Finally, the temporary user object is destroyed, causing it to skip the remainder of the object (`}`). + NOTE: You may only have one array or object active at any given time. An array or an object becomes + active when the `ondemand::object` or `ondemand::array` is created, and it releases its 'focus' when + its destructor is called. If you create an array or an object located inside a parent object or array, + the child array or object becomes active while the parent becomes temporarily inactive. If you access + several sibling objects or arrays, you must ensure that the destructor is called by scoping each access + (see the Iteration Safety section below for further details). + 5. We get `"retweet_count"` and `"favorite_count"` as unsigned integers. ```c++ @@ -484,8 +507,6 @@ for(auto field : doc.get_object()) { } ``` - - ### Iteration Safety The On Demand API is powerful. To compensate, we add some safeguards to ensure that it can be used without fear @@ -501,6 +522,48 @@ in production systems: if it was `nullptr` but did not care what the actual value was--it will iterate. The destructor automates the iteration. + Some care is needed when using the On Demand API in scenarios where you need to access several sibling arrays or objects because + only one object or array can be active at any one time. Let us consider the following example: + +```C++ + ondemand::parser parser; + const padded_string json = R"({ "parent": {"child1": {"name": "John"} , "child2": {"name": "Daniel"}} })"_padded; + auto doc = parser.iterate(json); + ondemand::object parent = doc["parent"]; + // parent owns the focus + ondemand::object c1 = parent["child1"]; + // c1 owns the focus + // + if(std::string_view(c1["name"]) != "John") { ... 
} + // c2 attempts to grab the focus from parent but fails + ondemand::object c2 = parent["child2"]; + // c2 is now in an unsafe state and the following line would be unsafe + // if(std::string_view(c2["name"]) != "Daniel") { return false; } +``` + + A correct usage is given by the following example: + +```C++ + ondemand::parser parser; + const padded_string json = R"({ "parent": {"child1": {"name": "John"} , "child2": {"name": "Daniel"}} })"_padded; + auto doc = parser.iterate(json); + ondemand::object parent = doc["parent"]; + // At this point, parent owns the focus + { + ondemand::object c1 = parent["child1"]; + // c1 grabbed the focus from parent + if(std::string_view(c1["name"]) != "John") { return false; } + } + // c1 went out of scope, so its destructor was called and the focus + // was handed back to parent. + { + ondemand::object c2 = parent["child2"]; + // c2 grabbed the focus from parent + // the following is safe: + if(std::string_view(c2["name"]) != "Daniel") { return false; } + } +``` + ### Benefits of the On Demand Approach We expect that the On Demand approach has many of the performance benefits of the schema-based approach, while providing a flexibility that is similar to that of the DOM-based approach. 
diff --git a/tests/ondemand/ondemand_basictests.cpp b/tests/ondemand/ondemand_basictests.cpp index ebaa1675..58766088 100644 --- a/tests/ondemand/ondemand_basictests.cpp +++ b/tests/ondemand/ondemand_basictests.cpp @@ -116,6 +116,63 @@ namespace key_string_tests { } +namespace active_tests { +#if SIMDJSON_EXCEPTIONS + bool parser_child() { + TEST_START(); + ondemand::parser parser; + const padded_string json = R"({ "parent": {"child1": {"name": "John"} , "child2": {"name": "Daniel"}} })"_padded; + auto doc = parser.iterate(json); + ondemand::object parent = doc["parent"]; + { + ondemand::object c1 = parent["child1"]; + if(std::string_view(c1["name"]) != "John") { return false; } + } + { + ondemand::object c2 = parent["child2"]; + if(std::string_view(c2["name"]) != "Daniel") { return false; } + } + return true; + } + bool parser_doc_correct() { + TEST_START(); + ondemand::parser parser; + const padded_string json = R"({ "key1": 1, "key2":2, "key3": 3 })"_padded; + auto doc = parser.iterate(json); + ondemand::object root_object = doc.get_object(); + int64_t k1 = root_object["key1"]; + int64_t k2 = root_object["key2"]; + int64_t k3 = root_object["key3"]; + return (k1 == 1) && (k2 == 2) && (k3 == 3); + } + + bool parser_doc_limits() { + TEST_START(); + ondemand::parser parser; + const padded_string json = R"({ "key1": 1, "key2":2, "key3": 3 })"_padded; + auto doc = parser.iterate(json); + int64_t k1 = doc["key1"]; + try { + int64_t k2 = doc["key2"]; + (void) k2; + } catch (simdjson::simdjson_error &) { + return true; // we expect to fail. 
+ } + (void) k1; + return false; + } +#endif + bool run() { + return +#if SIMDJSON_EXCEPTIONS + parser_child() && + parser_doc_correct() && + parser_doc_limits() && +#endif + true; + } + +} namespace number_tests { // ulp distance @@ -815,6 +872,32 @@ namespace twitter_tests { })); TEST_SUCCEED(); } +#if SIMDJSON_EXCEPTIONS + bool twitter_example() { + TEST_START(); + padded_string json; + ASSERT_SUCCESS( padded_string::load(TWITTER_JSON).get(json) ); + ondemand::parser parser; + auto doc = parser.iterate(json); + for (ondemand::object tweet : doc["statuses"]) { + uint64_t id = tweet["id"]; + std::string_view text = tweet["text"]; + std::string_view screen_name; + { + ondemand::object user = tweet["user"]; + screen_name = user["screen_name"]; + } + uint64_t retweets = tweet["retweet_count"]; + uint64_t favorites = tweet["favorite_count"]; + (void) id; + (void) text; + (void) retweets; + (void) favorites; + (void) screen_name; + } + TEST_SUCCEED(); + } +#endif bool twitter_default_profile() { TEST_START(); @@ -972,6 +1055,7 @@ namespace twitter_tests { twitter_image_sizes() && #if SIMDJSON_EXCEPTIONS twitter_count_exception() && + twitter_example() && twitter_default_profile_exception() && twitter_image_sizes_exception() && #endif @@ -1436,6 +1520,7 @@ int main(int argc, char *argv[]) { error_tests::run() && ordering_tests::run() && key_string_tests::run() && + active_tests::run() && true ) { std::cout << "Basic tests are ok." << std::endl;