Adding a new benchmark for ondemand: distinct user id (#1239)
* Adding a distinct user id benchmark * reenabling everything * Removing an unnecessary "value()". * Better tests of the examples and some fixes. * Guarding exception code.
This commit is contained in:
parent
c592da4937
commit
14039d05a9
|
@ -19,4 +19,8 @@ SIMDJSON_POP_DISABLE_WARNINGS
|
|||
#include "kostya/iter.h"
|
||||
#include "kostya/dom.h"
|
||||
|
||||
#include "distinctuserid/ondemand.h"
|
||||
#include "distinctuserid/dom.h"
|
||||
|
||||
|
||||
BENCHMARK_MAIN();
|
||||
|
|
|
@ -0,0 +1,52 @@
|
|||
|
||||
#pragma once
|
||||
#include <vector>
|
||||
#include <cstdint>
|
||||
#include "event_counter.h"
|
||||
#include "json_benchmark.h"
|
||||
|
||||
|
||||
// Returns true when the two NUL-terminated C strings s1 and s2 have identical
// contents (strcmp reports 0). Declared inline because this helper is defined in
// a header: without it, including the header from more than one translation
// unit would yield duplicate definitions at link time.
inline bool equals(const char *s1, const char *s2) { return strcmp(s1, s2) == 0; }
|
||||
|
||||
// Sorts v in ascending order and removes repeated values in place, so that each
// distinct value remains exactly once. An empty vector is left empty.
// Declared inline because this helper is defined in a header: without it,
// including the header from more than one translation unit would yield
// duplicate definitions at link time.
inline void remove_duplicates(std::vector<int64_t> &v) {
  std::sort(v.begin(), v.end());
  // std::unique only compacts adjacent duplicates; the sort above makes all
  // equal values adjacent.
  v.erase(std::unique(v.begin(), v.end()), v.end());
}
|
||||
|
||||
//
|
||||
// Interface
|
||||
//
|
||||
|
||||
namespace distinct_user_id {
|
||||
template<typename T> static void DistinctUserID(benchmark::State &state);
|
||||
} // namespace
|
||||
|
||||
//
|
||||
// Implementation
|
||||
//
|
||||
|
||||
#include "dom.h"
|
||||
|
||||
|
||||
namespace distinct_user_id {
|
||||
|
||||
using namespace simdjson;
|
||||
|
||||
// Benchmark entry point for the "distinct user id" task.
// Reads twitter.json from the benchmark data directory; when loading fails the
// error is printed to stderr and the benchmark is skipped. Otherwise the
// document is forwarded to JsonBenchmark, instantiated with T and Dom.
template<typename T> static void DistinctUserID(benchmark::State &state) {
  constexpr const char *TWITTER_JSON = SIMDJSON_BENCHMARK_DATA_DIR "twitter.json";

  padded_string json;
  const error_code load_error = padded_string::load(TWITTER_JSON).get(json);
  if (load_error) {
    std::cerr << load_error << std::endl;
    state.SkipWithError("error loading");
    return;
  }

  JsonBenchmark<T, Dom>(state, json);
}
|
||||
|
||||
} // namespace distinct_user_id
|
|
@ -0,0 +1,89 @@
|
|||
#pragma once
|
||||
|
||||
#if SIMDJSON_EXCEPTIONS
|
||||
|
||||
#include "distinctuserid.h"
|
||||
|
||||
namespace distinct_user_id {
|
||||
|
||||
using namespace simdjson;
|
||||
|
||||
|
||||
simdjson_really_inline void simdjson_recurse(std::vector<int64_t> & v, simdjson::dom::element element);
|
||||
void simdjson_recurse(std::vector<int64_t> & v, simdjson::dom::array array) {
|
||||
for (auto child : array) {
|
||||
simdjson_recurse(v, child);
|
||||
}
|
||||
}
|
||||
void simdjson_recurse(std::vector<int64_t> & v, simdjson::dom::object object) {
|
||||
for (auto [key, value] : object) {
|
||||
if((key.size() == 4) && (memcmp(key.data(), "user", 4) == 0)) {
|
||||
// we are in an object under the key "user"
|
||||
simdjson::error_code error;
|
||||
simdjson::dom::object child_object;
|
||||
simdjson::dom::object child_array;
|
||||
if (not (error = value.get(child_object))) {
|
||||
for (auto [child_key, child_value] : child_object) {
|
||||
if((child_key.size() == 2) && (memcmp(child_key.data(), "id", 2) == 0)) {
|
||||
int64_t x;
|
||||
if (not (error = child_value.get(x))) {
|
||||
v.push_back(x);
|
||||
}
|
||||
}
|
||||
simdjson_recurse(v, child_value);
|
||||
}
|
||||
} else if (not (error = value.get(child_array))) {
|
||||
simdjson_recurse(v, child_array);
|
||||
}
|
||||
// end of: we are in an object under the key "user"
|
||||
} else {
|
||||
simdjson_recurse(v, value);
|
||||
}
|
||||
}
|
||||
}
|
||||
// Element overload: dispatches to the array overload when the element is an
// array, to the object overload when it is an object, and does nothing for any
// other kind of element.
simdjson_really_inline void simdjson_recurse(std::vector<int64_t> & v, simdjson::dom::element element) {
  simdjson::dom::array as_array;
  if (element.get(as_array) == simdjson::SUCCESS) {
    simdjson_recurse(v, as_array);
    return;
  }

  simdjson::dom::object as_object;
  if (element.get(as_object) == simdjson::SUCCESS) {
    simdjson_recurse(v, as_object);
  }
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
class Dom {
|
||||
public:
|
||||
simdjson_really_inline bool Run(const padded_string &json);
|
||||
simdjson_really_inline const std::vector<int64_t> &Result() { return ids; }
|
||||
simdjson_really_inline size_t ItemCount() { return ids.size(); }
|
||||
|
||||
private:
|
||||
dom::parser parser{};
|
||||
std::vector<int64_t> ids{};
|
||||
|
||||
};
|
||||
// Debugging aid: writes every value of v to std::cout, each followed by a single
// space, then ends the line (and flushes) with std::endl.
// Declared inline because this helper is defined in a header: without it,
// including the header from more than one translation unit would yield
// duplicate definitions at link time.
inline void print_vec(const std::vector<int64_t> &v) {
  for (const int64_t value : v) {
    std::cout << value << " ";
  }
  std::cout << std::endl;
}
|
||||
|
||||
// Parses json with the DOM parser, gathers ids through simdjson_recurse, then
// sorts and de-duplicates them. Any ids left over from a previous call are
// discarded first. Always returns true; the conversion of the parse result to
// dom::element is the only visible point where a parse error could surface
// (presumably as an exception, since this header is guarded by
// SIMDJSON_EXCEPTIONS).
simdjson_really_inline bool Dom::Run(const padded_string &json) {
  ids.clear();

  const dom::element root = parser.parse(json);
  simdjson_recurse(ids, root);

  remove_duplicates(ids);
  return true;
}
|
||||
|
||||
BENCHMARK_TEMPLATE(DistinctUserID, Dom);
|
||||
|
||||
} // namespace distinct_user_id
|
||||
|
||||
#endif // SIMDJSON_EXCEPTIONS
|
|
@ -0,0 +1,67 @@
|
|||
#pragma once
|
||||
|
||||
#if SIMDJSON_EXCEPTIONS
|
||||
|
||||
#include "distinctuserid.h"
|
||||
|
||||
namespace distinct_user_id {
|
||||
|
||||
using namespace simdjson;
|
||||
using namespace simdjson::builtin;
|
||||
|
||||
|
||||
class OnDemand {
|
||||
public:
|
||||
OnDemand() {
|
||||
if(!displayed_implementation) {
|
||||
std::cout << "On Demand implementation: " << builtin_implementation()->name() << std::endl;
|
||||
displayed_implementation = true;
|
||||
}
|
||||
}
|
||||
simdjson_really_inline bool Run(const padded_string &json);
|
||||
simdjson_really_inline const std::vector<int64_t> &Result() { return ids; }
|
||||
simdjson_really_inline size_t ItemCount() { return ids.size(); }
|
||||
|
||||
private:
|
||||
ondemand::parser parser{};
|
||||
std::vector<int64_t> ids{};
|
||||
|
||||
static inline bool displayed_implementation = false;
|
||||
};
|
||||
|
||||
// Iterates over doc["statuses"] with the On Demand parser and records, for each
// tweet, the id of its "user" and, when a "retweeted_status" is present, the id
// of the user inside it. The collected ids are then sorted and de-duplicated.
// Always returns true; a missing "user" or "id" surfaces as an exception.
simdjson_really_inline bool OnDemand::Run(const padded_string &json) {
  ids.clear();

  // The document is parsed lazily while we walk it.
  auto doc = parser.iterate(json);
  for (ondemand::object status : doc["statuses"]) {
    // Every status is expected to carry a user; if one does not, we accept
    // the resulting exception.
    //
    // The inner scope is mandatory: only one object or array may be active
    // at a time, so the user object has to be destroyed before the status is
    // queried again. Dropping the braces yields wrong results.
    {
      ondemand::object author = status["user"];
      const int64_t author_id = author["id"];
      ids.push_back(author_id);
    }

    // "retweeted_status" is optional; when it exists we also collect the id
    // of the user found inside it.
    auto retweeted = status["retweeted_status"];
    if(!retweeted.error()) {
      ondemand::object original_status = retweeted;
      ondemand::object original_author = original_status["user"];
      const int64_t original_author_id = original_author["id"];
      ids.push_back(original_author_id);
    }
  }

  remove_duplicates(ids);
  return true;
}
|
||||
|
||||
BENCHMARK_TEMPLATE(DistinctUserID, OnDemand);
|
||||
|
||||
} // namespace distinct_user_id
|
||||
|
||||
#endif // SIMDJSON_EXCEPTIONS
|
|
@ -29,6 +29,11 @@ auto doc = parser.iterate(json);
|
|||
for (auto tweet : doc["statuses"]) {
|
||||
std::string_view text = tweet["text"];
|
||||
std::string_view screen_name = tweet["user"]["screen_name"];
|
||||
std::string_view screen_name;
|
||||
{
|
||||
ondemand::object user = tweet["user"];
|
||||
screen_name = user["screen_name"];
|
||||
}
|
||||
uint64_t retweets = tweet["retweet_count"];
|
||||
uint64_t favorites = tweet["favorite_count"];
|
||||
cout << screen_name << " (" << retweets << " retweets / " << favorites << " favorites): " << text << endl;
|
||||
|
@ -313,6 +318,11 @@ To help visualize the algorithm, we'll walk through the example C++ given at the
|
|||
rely on error chaining, so it is possible to delay error checks: we shall shortly explain error
|
||||
chaining more fully.
|
||||
|
||||
NOTE: You should always have such a `document` instance (here `doc`) and it should remain in scope for the duration
|
||||
of your parsing function. E.g., you should not use the returned document as a temporary (e.g., `auto x = parser.iterate(json).get_object();`)
|
||||
followed by other operations as the destruction of the `document` instance makes all of the derived instances
|
||||
ill-defined.
|
||||
|
||||
|
||||
3. We iterate over the "statuses" field using a typical C++ iterator, reading past the initial
|
||||
`{ "statuses": [ {`.
|
||||
|
@ -355,6 +365,11 @@ To help visualize the algorithm, we'll walk through the example C++ given at the
|
|||
when you attempt to cast the final `simdjson_result<object>` to object. Upon casting, an exception is
|
||||
thrown if there was an error.
|
||||
|
||||
NOTE: while the document can be queried once for a key as if it were an object, it is not an actual object
|
||||
instance. If you need to treat it as an object (e.g., to query more than one key), you can cast it as
|
||||
such `ondemand::object root_object = doc.get_object();`.
|
||||
|
||||
|
||||
4. We get the `"text"` field as a string.
|
||||
|
||||
```c++
|
||||
|
@ -379,7 +394,8 @@ To help visualize the algorithm, we'll walk through the example C++ given at the
|
|||
4. We get the `"screen_name"` from the `"user"` object.
|
||||
|
||||
```c++
|
||||
std::string_view screen_name = tweet["user"]["screen_name"];
|
||||
ondemand::object user = tweet["user"];
|
||||
screen_name = user["screen_name"];
|
||||
```
|
||||
|
||||
First, `["user"]` checks whether there are any more object fields by looking for either `,` or
|
||||
|
@ -387,12 +403,19 @@ To help visualize the algorithm, we'll walk through the example C++ given at the
|
|||
|
||||
`["screen_name"]` then converts to object, checking for `{`, and finds `"screen_name"`.
|
||||
|
||||
To convert to string, `lemire` is written to the document's string buffer, which now has *two*
|
||||
string_views pointing into it, and looks like `first!\0lemire\0`.
|
||||
To convert the result to a usable string (i.e., the screen name `lemire`), the characters are written to the document's
|
||||
string buffer (after possibly unescaping them), which now has *two* string_views pointing into it, and looks like `first!\0lemire\0`.
|
||||
|
||||
Finally, the temporary user object is destroyed, causing it to skip the remainder of the object
|
||||
(`}`).
|
||||
|
||||
NOTE: You may only have one active array or object at any given time. An array or an object becomes
|
||||
active when the `ondemand::object` or `ondemand::array` is created, and it releases its 'focus' when
|
||||
its destructor is called. If you create an array or an object located inside a parent object or array,
|
||||
the child array or object becomes active while the parent becomes temporarily inactive. If you access
|
||||
several sibling objects or arrays, you must ensure that the destructor is called by scoping each access
|
||||
(see Iteration Safety section below for further details).
|
||||
|
||||
5. We get `"retweet_count"` and `"favorite_count"` as unsigned integers.
|
||||
|
||||
```c++
|
||||
|
@ -484,8 +507,6 @@ for(auto field : doc.get_object()) {
|
|||
}
|
||||
```
|
||||
|
||||
|
||||
|
||||
### Iteration Safety
|
||||
|
||||
The On Demand API is powerful. To compensate, we add some safeguards to ensure that it can be used without fear
|
||||
|
@ -501,6 +522,48 @@ in production systems:
|
|||
if it was `nullptr` but did not care what the actual value was--it will iterate. The destructor automates
|
||||
the iteration.
|
||||
|
||||
Some care is needed when using the On Demand API in scenarios where you need to access several sibling arrays or objects because
|
||||
only one object or array can be active at any one time. Let us consider the following example:
|
||||
|
||||
```C++
|
||||
ondemand::parser parser;
|
||||
const padded_string json = R"({ "parent": {"child1": {"name": "John"} , "child2": {"name": "Daniel"}} })"_padded;
|
||||
auto doc = parser.iterate(json);
|
||||
ondemand::object parent = doc["parent"];
|
||||
// parent owns the focus
|
||||
ondemand::object c1 = parent["child1"];
|
||||
// c1 owns the focus
|
||||
//
|
||||
if(std::string_view(c1["name"]) != "John") { ... }
|
||||
// c2 attempts to grab the focus from parent but fails
|
||||
ondemand::object c2 = parent["child2"];
|
||||
// c2 is now in an unsafe state and the following line would be unsafe
|
||||
// if(std::string_view(c2["name"]) != "Daniel") { return false; }
|
||||
```
|
||||
|
||||
A correct usage is given by the following example:
|
||||
|
||||
```C++
|
||||
ondemand::parser parser;
|
||||
const padded_string json = R"({ "parent": {"child1": {"name": "John"} , "child2": {"name": "Daniel"}} })"_padded;
|
||||
auto doc = parser.iterate(json);
|
||||
ondemand::object parent = doc["parent"];
|
||||
// At this point, parent owns the focus
|
||||
{
|
||||
ondemand::object c1 = parent["child1"];
|
||||
// c1 grabbed the focus from parent
|
||||
if(std::string_view(c1["name"]) != "John") { return false; }
|
||||
}
|
||||
// c1 went out of scope, so its destructor was called and the focus
|
||||
// was handed back to parent.
|
||||
{
|
||||
ondemand::object c2 = parent["child2"];
|
||||
// c2 grabbed the focus from parent
|
||||
// the following is safe:
|
||||
if(std::string_view(c2["name"]) != "Daniel") { return false; }
|
||||
}
|
||||
```
|
||||
|
||||
### Benefits of the On Demand Approach
|
||||
|
||||
We expect that the On Demand approach has many of the performance benefits of the schema-based approach, while providing a flexibility that is similar to that of the DOM-based approach.
|
||||
|
|
|
@ -116,6 +116,63 @@ namespace key_string_tests {
|
|||
|
||||
}
|
||||
|
||||
namespace active_tests {
|
||||
#if SIMDJSON_EXCEPTIONS
|
||||
bool parser_child() {
|
||||
TEST_START();
|
||||
ondemand::parser parser;
|
||||
const padded_string json = R"({ "parent": {"child1": {"name": "John"} , "child2": {"name": "Daniel"}} })"_padded;
|
||||
auto doc = parser.iterate(json);
|
||||
ondemand::object parent = doc["parent"];
|
||||
{
|
||||
ondemand::object c1 = parent["child1"];
|
||||
if(std::string_view(c1["name"]) != "John") { return false; }
|
||||
}
|
||||
{
|
||||
ondemand::object c2 = parent["child2"];
|
||||
if(std::string_view(c2["name"]) != "Daniel") { return false; }
|
||||
}
|
||||
return true;
|
||||
}
|
||||
bool parser_doc_correct() {
|
||||
TEST_START();
|
||||
ondemand::parser parser;
|
||||
const padded_string json = R"({ "key1": 1, "key2":2, "key3": 3 })"_padded;
|
||||
auto doc = parser.iterate(json);
|
||||
ondemand::object root_object = doc.get_object();
|
||||
int64_t k1 = root_object["key1"];
|
||||
int64_t k2 = root_object["key2"];
|
||||
int64_t k3 = root_object["key3"];
|
||||
return (k1 == 1) && (k2 == 2) && (k3 == 3);
|
||||
}
|
||||
|
||||
bool parser_doc_limits() {
|
||||
TEST_START();
|
||||
ondemand::parser parser;
|
||||
const padded_string json = R"({ "key1": 1, "key2":2, "key3": 3 })"_padded;
|
||||
auto doc = parser.iterate(json);
|
||||
int64_t k1 = doc["key1"];
|
||||
try {
|
||||
int64_t k2 = doc["key2"];
|
||||
(void) k2;
|
||||
} catch (simdjson::simdjson_error &) {
|
||||
return true; // we expect to fail.
|
||||
}
|
||||
(void) k1;
|
||||
return false;
|
||||
}
|
||||
#endif
|
||||
// Runs the tests of this namespace in order, stopping at the first failure.
// Without exception support there is nothing to run and the result is true.
bool run() {
#if SIMDJSON_EXCEPTIONS
  if (!parser_child()) { return false; }
  if (!parser_doc_correct()) { return false; }
  if (!parser_doc_limits()) { return false; }
#endif
  return true;
}
|
||||
|
||||
}
|
||||
namespace number_tests {
|
||||
|
||||
// ulp distance
|
||||
|
@ -815,6 +872,32 @@ namespace twitter_tests {
|
|||
}));
|
||||
TEST_SUCCEED();
|
||||
}
|
||||
#if SIMDJSON_EXCEPTIONS
|
||||
bool twitter_example() {
|
||||
TEST_START();
|
||||
padded_string json;
|
||||
ASSERT_SUCCESS( padded_string::load(TWITTER_JSON).get(json) );
|
||||
ondemand::parser parser;
|
||||
auto doc = parser.iterate(json);
|
||||
for (ondemand::object tweet : doc["statuses"]) {
|
||||
uint64_t id = tweet["id"];
|
||||
std::string_view text = tweet["text"];
|
||||
std::string_view screen_name;
|
||||
{
|
||||
ondemand::object user = tweet["user"];
|
||||
screen_name = user["screen_name"];
|
||||
}
|
||||
uint64_t retweets = tweet["retweet_count"];
|
||||
uint64_t favorites = tweet["favorite_count"];
|
||||
(void) id;
|
||||
(void) text;
|
||||
(void) retweets;
|
||||
(void) favorites;
|
||||
(void) screen_name;
|
||||
}
|
||||
TEST_SUCCEED();
|
||||
}
|
||||
#endif
|
||||
|
||||
bool twitter_default_profile() {
|
||||
TEST_START();
|
||||
|
@ -972,6 +1055,7 @@ namespace twitter_tests {
|
|||
twitter_image_sizes() &&
|
||||
#if SIMDJSON_EXCEPTIONS
|
||||
twitter_count_exception() &&
|
||||
twitter_example() &&
|
||||
twitter_default_profile_exception() &&
|
||||
twitter_image_sizes_exception() &&
|
||||
#endif
|
||||
|
@ -1436,6 +1520,7 @@ int main(int argc, char *argv[]) {
|
|||
error_tests::run() &&
|
||||
ordering_tests::run() &&
|
||||
key_string_tests::run() &&
|
||||
active_tests::run() &&
|
||||
true
|
||||
) {
|
||||
std::cout << "Basic tests are ok." << std::endl;
|
||||
|
|
Loading…
Reference in New Issue