Adding a new benchmark for ondemand: distinct user id (#1239)

* Adding a distinct user id benchmark

* reenabling everything

* Removing an unnecessary "value()".

* Better tests of the examples and some fixes.

* Guarding exception code.
This commit is contained in:
Daniel Lemire 2020-10-23 08:47:01 -04:00 committed by GitHub
parent c592da4937
commit 14039d05a9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 365 additions and 5 deletions

View File

@ -19,4 +19,8 @@ SIMDJSON_POP_DISABLE_WARNINGS
#include "kostya/iter.h"
#include "kostya/dom.h"
#include "distinctuserid/ondemand.h"
#include "distinctuserid/dom.h"
BENCHMARK_MAIN();

View File

@ -0,0 +1,52 @@
#pragma once
#include <vector>
#include <cstdint>
#include "event_counter.h"
#include "json_benchmark.h"
/// Returns true when the two NUL-terminated C strings have identical contents.
///
/// Both pointers must be non-null and point at NUL-terminated strings.
/// The function is `inline` because it is defined in a header: a non-inline
/// definition would produce duplicate-symbol link errors as soon as the header
/// is included from more than one translation unit.
inline bool equals(const char *s1, const char *s2) { return strcmp(s1, s2) == 0; }
/// Sorts `v` in ascending order and erases repeated values, so that on return
/// `v` holds each distinct value exactly once.
///
/// The function is `inline` because it is defined in a header: a non-inline
/// definition would produce duplicate-symbol link errors as soon as the header
/// is included from more than one translation unit.
inline void remove_duplicates(std::vector<int64_t> &v) {
  // std::unique only collapses *adjacent* duplicates, hence the sort first.
  std::sort(v.begin(), v.end());
  v.erase(std::unique(v.begin(), v.end()), v.end());
}
//
// Interface
//
namespace distinct_user_id {
// Benchmark entry point, declared ahead of its definition further down in this
// header. T is the implementation class being benchmarked.
template<typename T> static void DistinctUserID(benchmark::State &state);
} // namespace distinct_user_id
//
// Implementation
//
#include "dom.h"
namespace distinct_user_id {
using namespace simdjson;
// Benchmark body shared by every implementation T: reads twitter.json from the
// benchmark data directory and passes it to JsonBenchmark, with Dom as the
// second template argument. If the file cannot be loaded, the error is printed
// and the benchmark is skipped.
template<typename T> static void DistinctUserID(benchmark::State &state) {
  constexpr const char *TWITTER_JSON = SIMDJSON_BENCHMARK_DATA_DIR "twitter.json";

  padded_string json;
  const error_code load_error = padded_string::load(TWITTER_JSON).get(json);
  if (!load_error) {
    // File is in memory: run the actual measurement.
    JsonBenchmark<T, Dom>(state, json);
    return;
  }

  // Loading failed: report why and mark the benchmark as skipped.
  std::cerr << load_error << std::endl;
  state.SkipWithError("error loading");
}
} // namespace distinct_user_id

View File

@ -0,0 +1,89 @@
#pragma once
#if SIMDJSON_EXCEPTIONS
#include "distinctuserid.h"
namespace distinct_user_id {
using namespace simdjson;
// Forward declaration of the element overload: the array and object overloads
// defined next call it on their children, while its own definition comes after
// them (the three overloads are mutually recursive).
simdjson_really_inline void simdjson_recurse(std::vector<int64_t> & v, simdjson::dom::element element);
// Visits every element of `array`, appending to `v` any user ids found beneath it.
//
// Marked `inline` because this definition lives in a header; a non-inline
// definition would clash at link time if the header were included from more
// than one translation unit.
inline void simdjson_recurse(std::vector<int64_t> & v, simdjson::dom::array array) {
  for (auto child : array) {
    simdjson_recurse(v, child);
  }
}
// Walks every field of `object` looking for user ids.
//
// - For a field whose key is exactly "user":
//     * if the value is an object, every integer "id" field directly inside it
//       is appended to `v`, and each of its fields is then searched recursively;
//     * if the value is an array, the array is searched recursively.
// - Any other field is searched recursively.
//
// `child_array` must be a dom::array: were it declared as a dom::object, the
// array branch could never succeed (the object conversion has already failed
// by then), and arrays stored under a "user" key would be silently skipped.
//
// Marked `inline` because this definition lives in a header.
inline void simdjson_recurse(std::vector<int64_t> & v, simdjson::dom::object object) {
  for (auto [key, value] : object) {
    const bool is_user_key = (key.size() == 4) && (memcmp(key.data(), "user", 4) == 0);
    if (!is_user_key) {
      simdjson_recurse(v, value);
      continue;
    }
    // we are in an object under the key "user"
    simdjson::dom::object child_object;
    simdjson::dom::array child_array;
    if (!value.get(child_object)) {
      for (auto [child_key, child_value] : child_object) {
        if ((child_key.size() == 2) && (memcmp(child_key.data(), "id", 2) == 0)) {
          int64_t x;
          // Only integer ids are recorded; other "id" values are ignored.
          if (!child_value.get(x)) {
            v.push_back(x);
          }
        }
        simdjson_recurse(v, child_value);
      }
    } else if (!value.get(child_array)) {
      simdjson_recurse(v, child_array);
    }
    // end of: we are in an object under the key "user"
  }
}
// Dispatches on the kind of `element`: arrays and objects are handed to the
// matching overload; any other kind of value is ignored.
simdjson_really_inline void simdjson_recurse(std::vector<int64_t> & v, simdjson::dom::element element) {
  simdjson::dom::array as_array;
  if (!element.get(as_array)) {
    simdjson_recurse(v, as_array);
    return;
  }
  simdjson::dom::object as_object;
  if (!element.get(as_object)) {
    simdjson_recurse(v, as_object);
  }
}
// DOM-based implementation of the distinct-user-id benchmark.
class Dom {
  // Parser kept as a member so the same instance is used by every Run() call.
  dom::parser parser{};
  // Ids gathered by the most recent Run().
  std::vector<int64_t> ids{};

public:
  // Parses `json` and fills the id list (defined after the class).
  simdjson_really_inline bool Run(const padded_string &json);
  // Number of ids currently stored.
  simdjson_really_inline size_t ItemCount() { return ids.size(); }
  // Read-only view of the stored ids.
  simdjson_really_inline const std::vector<int64_t> &Result() { return ids; }
};
// Debugging helper: writes every value of `v` to std::cout, each followed by a
// single space, then ends the line (and flushes) with std::endl.
//
// Marked `inline` because this definition lives in a header; a non-inline
// definition would clash at link time if the header were included from more
// than one translation unit.
inline void print_vec(const std::vector<int64_t> &v) {
  for (const int64_t value : v) {
    std::cout << value << " ";
  }
  std::cout << std::endl;
}
// Parses `json` with the DOM parser, collects the user ids found anywhere in
// the document, then sorts and de-duplicates them. Always returns true; this
// header is only compiled when SIMDJSON_EXCEPTIONS is enabled, and the parse
// result is converted to an element without an explicit error check.
simdjson_really_inline bool Dom::Run(const padded_string &json) {
  // Start from an empty list on every run.
  ids.clear();

  dom::element root = parser.parse(json);
  simdjson_recurse(ids, root);

  remove_duplicates(ids);
  return true;
}
BENCHMARK_TEMPLATE(DistinctUserID, Dom);
} // namespace distinct_user_id
#endif // SIMDJSON_EXCEPTIONS

View File

@ -0,0 +1,67 @@
#pragma once
#if SIMDJSON_EXCEPTIONS
#include "distinctuserid.h"
namespace distinct_user_id {
using namespace simdjson;
using namespace simdjson::builtin;
// On Demand implementation of the distinct-user-id benchmark.
class OnDemand {
  ondemand::parser parser{};
  // Ids gathered by the most recent Run().
  std::vector<int64_t> ids{};
  // Shared by all instances so the implementation name is printed only once.
  static inline bool displayed_implementation = false;

public:
  // Prints the name of the builtin implementation the first time an instance
  // is constructed; later constructions print nothing.
  OnDemand() {
    if (displayed_implementation) {
      return;
    }
    std::cout << "On Demand implementation: " << builtin_implementation()->name() << std::endl;
    displayed_implementation = true;
  }

  // Walks `json` and fills the id list (defined after the class).
  simdjson_really_inline bool Run(const padded_string &json);
  // Number of ids currently stored.
  simdjson_really_inline size_t ItemCount() { return ids.size(); }
  // Read-only view of the stored ids.
  simdjson_really_inline const std::vector<int64_t> &Result() { return ids; }
};
// Collects into `ids` the "id" of the "user" of every entry of "statuses",
// plus the "id" of the "user" found inside "retweeted_status" for entries that
// have one, then sorts and de-duplicates the list. Always returns true; this
// header is only compiled when SIMDJSON_EXCEPTIONS is enabled, and the
// conversions below carry no explicit error checks.
simdjson_really_inline bool OnDemand::Run(const padded_string &json) {
// Start from an empty list on every run.
ids.clear();
// Walk the document, parsing as we go
auto doc = parser.iterate(json);
for (ondemand::object tweet : doc["statuses"]) {
// We believe that all statuses have a matching
// user, and we are willing to throw when they do not:
//
// You might think that you do not need the braces, but
// you do, otherwise you will get the wrong answer. That is
// because you can only have one active object or array
// at a time.
{
ondemand::object user = tweet["user"];
int64_t id = user["id"];
ids.push_back(id);
}
// Not all tweets have a "retweeted_status", but when they do
// we want to go and find the user within.
auto retweet = tweet["retweeted_status"];
// A missing "retweeted_status" shows up as an error on the lookup result
// and is simply skipped.
if(!retweet.error()) {
ondemand::object retweet_content = retweet;
ondemand::object reuser = retweet_content["user"];
int64_t rid = reuser["id"];
ids.push_back(rid);
}
}
// The same user can appear in several tweets: keep each id once.
remove_duplicates(ids);
return true;
}
BENCHMARK_TEMPLATE(DistinctUserID, OnDemand);
} // namespace distinct_user_id
#endif // SIMDJSON_EXCEPTIONS

View File

@ -29,6 +29,11 @@ auto doc = parser.iterate(json);
for (auto tweet : doc["statuses"]) {
std::string_view text = tweet["text"];
std::string_view screen_name = tweet["user"]["screen_name"];
std::string_view screen_name;
{
ondemand::object user = tweet["user"];
screen_name = user["screen_name"];
}
uint64_t retweets = tweet["retweet_count"];
uint64_t favorites = tweet["favorite_count"];
cout << screen_name << " (" << retweets << " retweets / " << favorites << " favorites): " << text << endl;
@ -313,6 +318,11 @@ To help visualize the algorithm, we'll walk through the example C++ given at the
rely on error chaining, so it is possible to delay error checks: we shall shortly explain error
chaining more fully.
NOTE: You should always have such a `document` instance (here `doc`), and it should remain in scope for the duration
of your parsing function. For example, you should not use the returned document as a temporary
(e.g., `auto x = parser.iterate(json).get_object();`) and then carry on with other operations: the destruction
of the `document` instance makes all of the instances derived from it ill-defined.
3. We iterate over the "statuses" field using a typical C++ iterator, reading past the initial
`{ "statuses": [ {`.
@ -355,6 +365,11 @@ To help visualize the algorithm, we'll walk through the example C++ given at the
when you attempt to cast the final `simdjson_result<object>` to object. Upon casting, an exception is
thrown if there was an error.
NOTE: While the document can be queried once for a key as if it were an object, it is not an actual object
instance. If you need to treat it as an object (e.g., to query more than one key), you can cast it as
such: `ondemand::object root_object = doc.get_object();`.
4. We get the `"text"` field as a string.
```c++
@ -379,7 +394,8 @@ To help visualize the algorithm, we'll walk through the example C++ given at the
4. We get the `"screen_name"` from the `"user"` object.
```c++
std::string_view screen_name = tweet["user"]["screen_name"];
ondemand::object user = tweet["user"];
screen_name = user["screen_name"];
```
First, `["user"]` checks whether there are any more object fields by looking for either `,` or
@ -387,12 +403,19 @@ To help visualize the algorithm, we'll walk through the example C++ given at the
`["screen_name"]` then converts to object, checking for `{`, and finds `"screen_name"`.
To convert to string, `lemire` is written to the document's string buffer, which now has *two*
string_views pointing into it, and looks like `first!\0lemire\0`.
To convert the result to a usable string (i.e., the screen name `lemire`), the characters are written to the document's
string buffer (after unescaping them if needed), which now has *two* string_views pointing into it, and looks like `first!\0lemire\0`.
Finally, the temporary user object is destroyed, causing it to skip the remainder of the object
(`}`).
NOTE: You may only have one active array or object at any given time. An array or an object becomes
active when the `ondemand::object` or `ondemand::array` is created, and it releases its 'focus' when
its destructor is called. If you create an array or an object located inside a parent object or array,
the child array or object becomes active while the parent becomes temporarily inactive. If you access
several sibling objects or arrays, you must ensure that the destructor is called by scoping each access
(see the Iteration Safety section below for further details).
5. We get `"retweet_count"` and `"favorite_count"` as unsigned integers.
```c++
@ -484,8 +507,6 @@ for(auto field : doc.get_object()) {
}
```
### Iteration Safety
The On Demand API is powerful. To compensate, we add some safeguards to ensure that it can be used without fear
@ -501,6 +522,48 @@ in production systems:
if it was `nullptr` but did not care what the actual value was--it will iterate. The destructor automates
the iteration.
Some care is needed when using the On Demand API in scenarios where you need to access several sibling arrays or objects because
only one object or array can be active at any one time. Let us consider the following example:
```C++
ondemand::parser parser;
const padded_string json = R"({ "parent": {"child1": {"name": "John"} , "child2": {"name": "Daniel"}} })"_padded;
auto doc = parser.iterate(json);
ondemand::object parent = doc["parent"];
// parent owns the focus
ondemand::object c1 = parent["child1"];
// c1 owns the focus
//
if(std::string_view(c1["name"]) != "John") { ... }
// c2 attempts to grab the focus from parent but fails
ondemand::object c2 = parent["child2"];
// c2 is now in an unsafe state and the following line would be unsafe
// if(std::string_view(c2["name"]) != "Daniel") { return false; }
```
A correct usage is given by the following example:
```C++
ondemand::parser parser;
const padded_string json = R"({ "parent": {"child1": {"name": "John"} , "child2": {"name": "Daniel"}} })"_padded;
auto doc = parser.iterate(json);
ondemand::object parent = doc["parent"];
// At this point, parent owns the focus
{
ondemand::object c1 = parent["child1"];
// c1 grabbed the focus from parent
if(std::string_view(c1["name"]) != "John") { return false; }
}
// c1 went out of scope, so its destructor was called and the focus
// was handed back to parent.
{
ondemand::object c2 = parent["child2"];
// c2 grabbed the focus from parent
// the following is safe:
if(std::string_view(c2["name"]) != "Daniel") { return false; }
}
```
### Benefits of the On Demand Approach
We expect that the On Demand approach has many of the performance benefits of the schema-based approach, while providing a flexibility that is similar to that of the DOM-based approach.

View File

@ -116,6 +116,63 @@ namespace key_string_tests {
}
namespace active_tests {
#if SIMDJSON_EXCEPTIONS
// Checks that two sibling objects ("child1" and "child2") of the same parent
// can both be read when each one is confined to its own scope. Returns false
// as soon as a name does not match the expected value.
bool parser_child() {
TEST_START();
ondemand::parser parser;
const padded_string json = R"({ "parent": {"child1": {"name": "John"} , "child2": {"name": "Daniel"}} })"_padded;
auto doc = parser.iterate(json);
ondemand::object parent = doc["parent"];
// Each child lives in its own block, so it is destroyed before the next
// sibling is requested from `parent`.
{
ondemand::object c1 = parent["child1"];
if(std::string_view(c1["name"]) != "John") { return false; }
}
{
ondemand::object c2 = parent["child2"];
if(std::string_view(c2["name"]) != "Daniel") { return false; }
}
return true;
}
// Checks that several keys can be read from the document root once the root
// has been obtained as an explicit ondemand::object.
bool parser_doc_correct() {
  TEST_START();
  ondemand::parser parser;
  const padded_string json = R"({ "key1": 1, "key2":2, "key3": 3 })"_padded;
  auto doc = parser.iterate(json);
  ondemand::object root_object = doc.get_object();

  // All three values are read first, in document order, and compared afterwards.
  const int64_t first = root_object["key1"];
  const int64_t second = root_object["key2"];
  const int64_t third = root_object["key3"];

  if (first != 1) { return false; }
  if (second != 2) { return false; }
  return third == 3;
}
bool parser_doc_limits() {
TEST_START();
ondemand::parser parser;
const padded_string json = R"({ "key1": 1, "key2":2, "key3": 3 })"_padded;
auto doc = parser.iterate(json);
int64_t k1 = doc["key1"];
try {
int64_t k2 = doc["key2"];
(void) k2;
} catch (simdjson::simdjson_error &) {
return true; // we expect to fail.
}
(void) k1;
return false;
}
#endif
// Runs the tests of this namespace in order, stopping at the first failure.
// Without exception support there is nothing to run and the result is true.
bool run() {
#if SIMDJSON_EXCEPTIONS
  if (!parser_child()) { return false; }
  if (!parser_doc_correct()) { return false; }
  if (!parser_doc_limits()) { return false; }
#endif
  return true;
}
}
namespace number_tests {
// ulp distance
@ -815,6 +872,32 @@ namespace twitter_tests {
}));
TEST_SUCCEED();
}
#if SIMDJSON_EXCEPTIONS
// Runs an On Demand walk over twitter.json, reading a handful of fields from
// every entry of "statuses". The values are discarded; the test only checks
// that the walk completes.
bool twitter_example() {
TEST_START();
padded_string json;
ASSERT_SUCCESS( padded_string::load(TWITTER_JSON).get(json) );
ondemand::parser parser;
auto doc = parser.iterate(json);
for (ondemand::object tweet : doc["statuses"]) {
uint64_t id = tweet["id"];
std::string_view text = tweet["text"];
std::string_view screen_name;
// The "user" object is confined to its own block so that it is destroyed
// before the remaining fields of `tweet` are read.
{
ondemand::object user = tweet["user"];
screen_name = user["screen_name"];
}
uint64_t retweets = tweet["retweet_count"];
uint64_t favorites = tweet["favorite_count"];
// Casts to void silence unused-variable warnings.
(void) id;
(void) text;
(void) retweets;
(void) favorites;
(void) screen_name;
}
TEST_SUCCEED();
}
#endif
bool twitter_default_profile() {
TEST_START();
@ -972,6 +1055,7 @@ namespace twitter_tests {
twitter_image_sizes() &&
#if SIMDJSON_EXCEPTIONS
twitter_count_exception() &&
twitter_example() &&
twitter_default_profile_exception() &&
twitter_image_sizes_exception() &&
#endif
@ -1436,6 +1520,7 @@ int main(int argc, char *argv[]) {
error_tests::run() &&
ordering_tests::run() &&
key_string_tests::run() &&
active_tests::run() &&
true
) {
std::cout << "Basic tests are ok." << std::endl;