Add top_tweet benchmark to test laziness

This commit is contained in:
John Keiser 2021-01-10 11:52:07 -08:00
parent 3279c2f15b
commit be61650102
12 changed files with 484 additions and 3 deletions

View File

@ -62,4 +62,11 @@ SIMDJSON_POP_DISABLE_WARNINGS
#include "find_tweet/rapidjson.h" #include "find_tweet/rapidjson.h"
#include "find_tweet/nlohmann_json.h" #include "find_tweet/nlohmann_json.h"
#include "top_tweet/simdjson_dom.h"
#include "top_tweet/simdjson_ondemand.h"
#include "top_tweet/yyjson.h"
#include "top_tweet/sajson.h"
#include "top_tweet/rapidjson.h"
#include "top_tweet/nlohmann_json.h"
BENCHMARK_MAIN(); BENCHMARK_MAIN();

View File

@ -12,7 +12,7 @@ struct yyjson_base {
yyjson_val *root = yyjson_doc_get_root(doc); yyjson_val *root = yyjson_doc_get_root(doc);
if (!yyjson_is_obj(root)) { return false; } if (!yyjson_is_obj(root)) { return false; }
yyjson_val *statuses = yyjson_obj_get(root, "statuses"); yyjson_val *statuses = yyjson_obj_get(root, "statuses");
if (!yyjson_is_arr(statuses)) { return "Statuses is not an array!"; } if (!yyjson_is_arr(statuses)) { return false; }
// Walk the document, parsing the tweets as we go // Walk the document, parsing the tweets as we go
size_t tweet_idx, tweets_max; size_t tweet_idx, tweets_max;

View File

@ -14,7 +14,7 @@ struct yyjson_base {
yyjson_val *root = yyjson_doc_get_root(doc); yyjson_val *root = yyjson_doc_get_root(doc);
if (!yyjson_is_obj(root)) { return false; } if (!yyjson_is_obj(root)) { return false; }
yyjson_val *statuses = yyjson_obj_get(root, "statuses"); yyjson_val *statuses = yyjson_obj_get(root, "statuses");
if (!yyjson_is_arr(statuses)) { return "Statuses is not an array!"; } if (!yyjson_is_arr(statuses)) { return false; }
// Walk the document, parsing the tweets as we go // Walk the document, parsing the tweets as we go
size_t tweet_idx, tweets_max; size_t tweet_idx, tweets_max;

View File

@ -37,7 +37,7 @@ struct yyjson_base {
yyjson_val *root = yyjson_doc_get_root(doc); yyjson_val *root = yyjson_doc_get_root(doc);
if (!yyjson_is_obj(root)) { return false; } if (!yyjson_is_obj(root)) { return false; }
yyjson_val *statuses = yyjson_obj_get(root, "statuses"); yyjson_val *statuses = yyjson_obj_get(root, "statuses");
if (!yyjson_is_arr(statuses)) { return "Statuses is not an array!"; } if (!yyjson_is_arr(statuses)) { return false; }
// Walk the document, parsing the tweets as we go // Walk the document, parsing the tweets as we go
size_t tweet_idx, tweets_max; size_t tweet_idx, tweets_max;

View File

@ -0,0 +1,49 @@
# Top Tweet Benchmark
The top_tweet benchmark finds the most-retweeted tweet in a twitter API response.
## Purpose
This scenario tends to measure an implementation's laziness: its ability to avoid parsing unneeded
values, without knowing beforehand which values are needed.
To find the top tweet, an implementation needs to iterate through all tweets, remembering which one
had the highest retweet count. While it scans, it will find many "candidate" tweets with the highest
retweet count *up to that point.* Essentially, it has to keep track of the "top tweet so far" while
it searches. However, only the text and screen_name of the *final* top tweet need to be parsed.
Therefore, JSON parsers that can only parse values on the first pass (such as DOM or streaming
parsers) will be forced to parse text and screen_name of every candidate (if not every single
tweet). Parsers which can delay parsing of values until later will therefore shine in scenarios like
this.
## Rules
The benchmark will be called with `run(padded_string &json, int64_t max_retweet_count, top_tweet_result<StringType> &result)`.
The benchmark must:
- Find the tweet with the highest retweet_count at the top level of the "statuses" array.
- Find the *last* such tweet: if multiple tweets have the same top retweet_count, the last one
should be returned.
- Exclude tweets with retweet_count above max_retweet_count. This restriction is solely here because
the default twitter.json has a rather high retweet count in the third tweet, and to test laziness
the matching tweet needs to be further down in the file.
- Fill in top_tweet_result with the corresponding fields from the matching tweet.
### Abridged Schema
The abridged schema (objects contain more fields than listed here):
```json
{
"statuses": [
{
"text": "i like to tweet", // text containing UTF-8 and escape characters
"user": {
"screen_name": "AlexanderHamilton" // string containing UTF-8 (and escape characters?)
},
"retweet_count": 2, // uint32
},
...
]
}
```

View File

@ -0,0 +1,39 @@
#pragma once
#if SIMDJSON_COMPETITION_NLOHMANN_JSON
#include "top_tweet.h"
namespace top_tweet {
using namespace simdjson;
struct nlohmann_json {
using StringType=std::string;
dom::parser parser{};
bool run(simdjson::padded_string &json, int64_t max_retweet_count, top_tweet_result<StringType> &result) {
result.retweet_count = -1;
nlohmann::json top_tweet{};
auto root = nlohmann::json::parse(json.data(), json.data() + json.size());
for (auto tweet : root["statuses"]) {
int64_t retweet_count = tweet["retweet_count"];
if (retweet_count <= max_retweet_count && retweet_count >= result.retweet_count) {
result.retweet_count = retweet_count;
top_tweet = tweet;
}
}
result.text = top_tweet["text"];
result.screen_name = top_tweet["user"]["screen_name"];
return result.retweet_count != -1;
}
};
BENCHMARK_TEMPLATE(top_tweet, nlohmann_json)->UseManualTime();
} // namespace top_tweet
#endif // SIMDJSON_COMPETITION_NLOHMANN_JSON

View File

@ -0,0 +1,69 @@
#pragma once
#ifdef SIMDJSON_COMPETITION_RAPIDJSON
#include "top_tweet.h"
namespace top_tweet {
using namespace rapidjson;
struct rapidjson_base {
using StringType=std::string_view;
Document doc{};
bool run(Document &root, int64_t max_retweet_count, top_tweet_result<StringType> &result) {
result.retweet_count = -1;
// Loop over the tweets
if (root.HasParseError() || !root.IsObject()) { return false; }
const auto &statuses = root.FindMember("statuses");
if (statuses == root.MemberEnd() || !statuses->value.IsArray()) { return false; }
for (const Value &tweet : statuses->value.GetArray()) {
if (!tweet.IsObject()) { return false; }
// Check if this tweet has a higher retweet count than the current top tweet
const auto &retweet_count_json = tweet.FindMember("retweet_count");
if (retweet_count_json == tweet.MemberEnd() || !retweet_count_json->value.IsInt64()) { return false; }
int64_t retweet_count = retweet_count_json->value.GetInt64();
if (retweet_count <= max_retweet_count && retweet_count >= result.retweet_count) {
result.retweet_count = retweet_count;
// TODO I can't figure out if there's a way to keep the Value to use outside the loop ...
// Get text and screen_name of top tweet
const auto &text = tweet.FindMember("text");
if (text == tweet.MemberEnd() || !text->value.IsString()) { return false; }
result.text = { text->value.GetString(), text->value.GetStringLength() };
const auto &user = tweet.FindMember("user");
if (user == tweet.MemberEnd() || !user->value.IsObject()) { return false; }
const auto &screen_name = user->value.FindMember("screen_name");
if (screen_name == user->value.MemberEnd() || !screen_name->value.IsString()) { return false; }
result.screen_name = { screen_name->value.GetString(), screen_name->value.GetStringLength() };
}
}
return result.retweet_count != -1;
}
};
/// RapidJSON top_tweet implementation using the regular (non in-situ) parse call.
struct rapidjson : rapidjson_base {
  /// Parses `json` into the inherited `doc` with encoding validation enabled, then delegates
  /// the search to rapidjson_base::run.
  bool run(simdjson::padded_string &json, int64_t max_retweet_count, top_tweet_result<StringType> &result) {
    auto &parsed = doc.Parse<kParseValidateEncodingFlag>(json.data());
    return rapidjson_base::run(parsed, max_retweet_count, result);
  }
};
BENCHMARK_TEMPLATE(top_tweet, rapidjson)->UseManualTime();
/// RapidJSON top_tweet implementation using the in-situ parse call on the input buffer.
struct rapidjson_insitu : rapidjson_base {
  /// Parses `json` in place into the inherited `doc` with encoding validation enabled, then
  /// delegates the search to rapidjson_base::run.
  bool run(simdjson::padded_string &json, int64_t max_retweet_count, top_tweet_result<StringType> &result) {
    auto &parsed = doc.ParseInsitu<kParseValidateEncodingFlag>(json.data());
    return rapidjson_base::run(parsed, max_retweet_count, result);
  }
};
BENCHMARK_TEMPLATE(top_tweet, rapidjson_insitu)->UseManualTime();
} // namespace top_tweet
#endif // SIMDJSON_COMPETITION_RAPIDJSON

View File

@ -0,0 +1,62 @@
#pragma once
#ifdef SIMDJSON_COMPETITION_SAJSON
#include "top_tweet.h"
namespace top_tweet {
/// top_tweet implementation backed by sajson.
///
/// Owns a reusable AST buffer handed to sajson's bounded allocation. The buffer is grown when a
/// larger input arrives and released when the object is destroyed.
struct sajson {
  using StringType=std::string_view;

  size_t ast_buffer_size{0};     // capacity of ast_buffer, in size_t words
  size_t *ast_buffer{nullptr};   // malloc'd scratch buffer owned by this object

  sajson() = default;
  // The buffer is uniquely owned: copying would lead to a double free.
  sajson(const sajson &) = delete;
  sajson &operator=(const sajson &) = delete;
  sajson(sajson &&other) noexcept
    : ast_buffer_size{other.ast_buffer_size}, ast_buffer{other.ast_buffer} {
    other.ast_buffer_size = 0;
    other.ast_buffer = nullptr;
  }
  sajson &operator=(sajson &&other) noexcept {
    if (this != &other) {
      std::free(ast_buffer);
      ast_buffer_size = other.ast_buffer_size;
      ast_buffer = other.ast_buffer;
      other.ast_buffer_size = 0;
      other.ast_buffer = nullptr;
    }
    return *this;
  }
  ~sajson() { std::free(ast_buffer); }

  /// @param json               JSON text; handed to sajson as a mutable view over its buffer.
  /// @param max_retweet_count  Tweets with a retweet_count above this value are ignored.
  /// @param result             Receives retweet_count, text and screen_name of the top tweet.
  /// @return true when a qualifying tweet was found; false on allocation failure, a parse
  ///         error, an unexpected JSON shape, or when no tweet qualifies.
  bool run(simdjson::padded_string &json, int64_t max_retweet_count, top_tweet_result<StringType> &result) {
    // Reset here as well so the outcome does not depend on the caller's previous result.
    result.retweet_count = -1;

    // One word per input byte; (re)allocate when there is no buffer or the input has grown.
    if (!ast_buffer || ast_buffer_size < json.size()) {
      std::free(ast_buffer);
      ast_buffer_size = 0;
      ast_buffer = static_cast<size_t *>(std::malloc(json.size() * sizeof(size_t)));
      if (!ast_buffer) { return false; }
      ast_buffer_size = json.size();
    }

    auto doc = ::sajson::parse(
      ::sajson::bounded_allocation(ast_buffer, ast_buffer_size),
      ::sajson::mutable_string_view(json.size(), json.data())
    );
    if (!doc.is_valid()) { return false; }

    auto root = doc.get_root();
    if (root.get_type() != ::sajson::TYPE_OBJECT) { return false; }
    auto statuses = root.get_value_of_key({ "statuses", strlen("statuses") });
    if (statuses.get_type() != ::sajson::TYPE_ARRAY) { return false; }

    for (size_t i=0; i<statuses.get_length(); i++) {
      auto tweet = statuses.get_array_element(i);
      if (tweet.get_type() != ::sajson::TYPE_OBJECT) { return false; }

      // We can't keep a copy of "value" around, so AFAICT we can't lazily parse
      auto retweet_count_val = tweet.get_value_of_key({ "retweet_count", strlen("retweet_count") });
      if (retweet_count_val.get_type() != ::sajson::TYPE_INTEGER) { return false; }
      const int64_t retweet_count = retweet_count_val.get_integer_value();

      // ">=" (not ">") so that the *last* tweet with the top count wins.
      if (retweet_count <= max_retweet_count && retweet_count >= result.retweet_count) {
        result.retweet_count = retweet_count;

        auto text = tweet.get_value_of_key({ "text", strlen("text") });
        if (text.get_type() != ::sajson::TYPE_STRING) { return false; }
        result.text = { text.as_cstring(), text.get_string_length() };

        auto user = tweet.get_value_of_key({ "user", strlen("user") });
        if (user.get_type() != ::sajson::TYPE_OBJECT) { return false; }
        auto screen_name = user.get_value_of_key({ "screen_name", strlen("screen_name") });
        if (screen_name.get_type() != ::sajson::TYPE_STRING) { return false; }
        result.screen_name = { screen_name.as_cstring(), screen_name.get_string_length() };
      }
    }

    return result.retweet_count != -1;
  }
};
BENCHMARK_TEMPLATE(top_tweet, sajson)->UseManualTime();
} // namespace top_tweet
#endif // SIMDJSON_COMPETITION_SAJSON

View File

@ -0,0 +1,39 @@
#pragma once
#if SIMDJSON_EXCEPTIONS
#include "top_tweet.h"
namespace top_tweet {
using namespace simdjson;
/// top_tweet implementation using the simdjson DOM API (exceptions enabled).
///
/// Parses the whole document, remembers the element of the last tweet with the highest
/// retweet_count not exceeding max_retweet_count, and reads text and user.screen_name from
/// that element after the scan.
struct simdjson_dom {
  using StringType=std::string_view;

  dom::parser parser{};

  /// @param json               JSON text to parse.
  /// @param max_retweet_count  Tweets with a retweet_count above this value are ignored.
  /// @param result             Receives retweet_count, text and screen_name of the top tweet.
  /// @return true when a qualifying tweet was found, false otherwise.
  /// @note Parse and lookup failures are reported through simdjson exceptions.
  bool run(simdjson::padded_string &json, int64_t max_retweet_count, top_tweet_result<StringType> &result) {
    result.retweet_count = -1;
    dom::element top_tweet{};

    auto doc = parser.parse(json);
    for (auto tweet : doc["statuses"]) {
      int64_t retweet_count = tweet["retweet_count"];
      // ">=" (not ">") so that the *last* tweet with the top count wins.
      if (retweet_count <= max_retweet_count && retweet_count >= result.retweet_count) {
        result.retweet_count = retweet_count;
        top_tweet = tweet;
      }
    }

    // Nothing qualified: top_tweet is still the default-constructed element and does not refer
    // to any parsed tweet, so it must not be indexed.
    if (result.retweet_count == -1) { return false; }

    result.text = top_tweet["text"];
    result.screen_name = top_tweet["user"]["screen_name"];
    return true;
  }
};
BENCHMARK_TEMPLATE(top_tweet, simdjson_dom)->UseManualTime();
} // namespace top_tweet
#endif // SIMDJSON_EXCEPTIONS

View File

@ -0,0 +1,81 @@
#pragma once
#if SIMDJSON_EXCEPTIONS
#include "top_tweet.h"
namespace top_tweet {
using namespace simdjson;
using namespace simdjson::builtin;
/// top_tweet implementation using the simdjson On Demand API (exceptions enabled).
///
/// While iterating, the text and screen_name *values* of the current best tweet are saved
/// unparsed; they are only converted to string_views once the scan has finished.
struct simdjson_ondemand {
  using StringType=std::string_view;

  ondemand::parser parser{};

  /// @param json               JSON text to iterate over.
  /// @param max_retweet_count  Tweets with a retweet_count above this value are ignored.
  /// @param result             Receives retweet_count, text and screen_name of the top tweet.
  /// @return true when a qualifying tweet was found, false otherwise.
  /// @note Parse and lookup failures are reported through simdjson exceptions.
  bool run(simdjson::padded_string &json, int64_t max_retweet_count, top_tweet_result<StringType> &result) {
    result.retweet_count = -1;
    // We save these DOM values for later so we don't have to parse them
    // into string_views until we're sure which ones we want to parse
    // NOTE: simdjson does not presently support reuse of objects or arrays--just scalars. This is
    // why we have to grab the text and screen_name fields instead of just saving the tweet object.
    ondemand::value screen_name, text;

    auto doc = parser.iterate(json);
    for (auto tweet : doc["statuses"]) {
      // Since text, user.screen_name, and retweet_count generally appear in order, it's nearly free
      // for us to retrieve them here (and will cost a bit more if we do it in the if
      // statement).
      auto tweet_text = tweet["text"];
      auto tweet_screen_name = tweet["user"]["screen_name"];
      int64_t retweet_count = tweet["retweet_count"];
      // ">=" (not ">") so that the *last* tweet with the top count wins.
      if (retweet_count <= max_retweet_count && retweet_count >= result.retweet_count) {
        result.retweet_count = retweet_count;
        // TODO std::move should not be necessary
        text = std::move(tweet_text);
        screen_name = std::move(tweet_screen_name);
      }
    }

    // Nothing qualified: the saved values are still default-constructed and were never pointed
    // at any field, so there is nothing valid to parse.
    if (result.retweet_count == -1) { return false; }

    // Now that we know which was the most retweeted, parse the values in it
    result.screen_name = screen_name;
    result.text = text;
    return true;
  }
};
BENCHMARK_TEMPLATE(top_tweet, simdjson_ondemand)->UseManualTime();
struct simdjson_ondemand_forward_only {
using StringType=std::string_view;
ondemand::parser parser{};
bool run(simdjson::padded_string &json, int64_t max_retweet_count, top_tweet_result<StringType> &result) {
result.retweet_count = -1;
auto doc = parser.iterate(json);
for (auto tweet : doc["statuses"]) {
// Since text, user.screen_name, and retweet_count generally appear in order, it's nearly free
// for us to retrieve them here (and will cost a bit more if we do it in the if
// statement).
auto tweet_text = tweet["text"];
auto tweet_screen_name = tweet["user"]["screen_name"];
int64_t retweet_count = tweet["retweet_count"];
if (retweet_count <= max_retweet_count && retweet_count >= result.retweet_count) {
result.retweet_count = retweet_count;
result.text = tweet_text;
result.screen_name = tweet_screen_name;
}
}
return result.retweet_count != -1;
}
};
BENCHMARK_TEMPLATE(top_tweet, simdjson_ondemand_forward_only)->UseManualTime();
} // namespace top_tweet
#endif // SIMDJSON_EXCEPTIONS

View File

@ -0,0 +1,67 @@
#pragma once
#include "json_benchmark/file_runner.h"
namespace top_tweet {
using namespace json_benchmark;
/// Outcome of a top_tweet run: the winning retweet count plus the winning tweet's author
/// screen name and text, stored as StringType.
template<typename StringType>
struct top_tweet_result {
  int64_t retweet_count{};
  StringType screen_name{};
  StringType text{};

  /// Field-by-field equality; the other result may use a different string type as long as the
  /// two string types are comparable with ==.
  template<typename OtherStringType>
  simdjson_really_inline bool operator==(const top_tweet_result<OtherStringType> &other) const {
    if (!(retweet_count == other.retweet_count)) { return false; }
    if (!(screen_name == other.screen_name)) { return false; }
    return text == other.text;
  }

  /// Negation of operator==.
  template<typename OtherStringType>
  simdjson_really_inline bool operator!=(const top_tweet_result<OtherStringType> &other) const {
    const bool same = (*this == other);
    return !same;
  }
};
/// Writes the three fields of a top_tweet_result to the stream, one "name: value" pair per
/// line, each terminated with std::endl.
template<typename StringType>
simdjson_unused static std::ostream &operator<<(std::ostream &o, const top_tweet_result<StringType> &t) {
  return o << "retweet_count: " << t.retweet_count << std::endl
           << "screen_name: " << t.screen_name << std::endl
           << "text: " << t.text << std::endl;
}
/// Benchmark runner for one top_tweet implementation `I`: loads the twitter JSON file, resets
/// the result before each run, invokes the implementation and compares against a reference.
template<typename I>
struct runner : public file_runner<I> {
  top_tweet_result<typename I::StringType> result{};

  /// Loads TWITTER_JSON through the base runner.
  bool setup(benchmark::State &state) {
    return this->load_json(state, TWITTER_JSON);
  }

  /// Runs the base class's per-iteration preparation and, when it succeeds, resets the retweet
  /// count to the "nothing found" sentinel (-1).
  bool before_run(benchmark::State &state) {
    if (file_runner<I>::before_run(state)) {
      result.retweet_count = -1;
      return true;
    }
    return false;
  }

  /// Invokes the implementation with a maximum retweet count of 60.
  bool run(benchmark::State &) {
    auto &impl = this->implementation;
    return impl.run(this->json, 60, result);
  }

  /// Compares this runner's result with the reference runner's result.
  template<typename R>
  bool diff(benchmark::State &state, runner<R> &reference) {
    return diff_results(state, result, reference.result, diff_flags::NONE);
  }

  /// Each iteration yields a single item (the top tweet).
  size_t items_per_iteration() {
    return 1;
  }
};
// Forward declaration: the DOM implementation serves as the reference for every benchmark.
struct simdjson_dom;

/// Benchmark entry point: runs implementation `I` and checks it against simdjson_dom.
template<typename I> simdjson_really_inline static void top_tweet(benchmark::State &state) {
  using candidate_runner = runner<I>;
  using reference_runner = runner<simdjson_dom>;
  json_benchmark::run_json_benchmark<candidate_runner, reference_runner>(state);
}
} // namespace top_tweet

View File

@ -0,0 +1,68 @@
#pragma once
#ifdef SIMDJSON_COMPETITION_YYJSON
#include "top_tweet.h"
namespace top_tweet {
struct yyjson_base {
using StringType=std::string_view;
bool run(yyjson_doc *doc, int64_t max_retweet_count, top_tweet_result<StringType> &result) {
result.retweet_count = -1;
yyjson_val *top_tweet{};
if (!doc) { return false; }
yyjson_val *root = yyjson_doc_get_root(doc);
if (!yyjson_is_obj(root)) { return false; }
yyjson_val *statuses = yyjson_obj_get(root, "statuses");
if (!yyjson_is_arr(statuses)) { return false; }
// Walk the document, parsing the tweets as we go
size_t tweet_idx, tweets_max;
yyjson_val *tweet;
yyjson_arr_foreach(statuses, tweet_idx, tweets_max, tweet) {
if (!yyjson_is_obj(tweet)) { return false; }
auto retweet_count_val = yyjson_obj_get(tweet, "retweet_count");
if (!yyjson_is_uint(retweet_count_val)) { return false; }
int64_t retweet_count = yyjson_get_uint(retweet_count_val);
if (retweet_count <= max_retweet_count && retweet_count >= result.retweet_count) {
result.retweet_count = retweet_count;
top_tweet = tweet;
}
}
auto text = yyjson_obj_get(top_tweet, "text");
if (!yyjson_is_str(text)) { return false; }
result.text = { yyjson_get_str(text), yyjson_get_len(text) };
auto user = yyjson_obj_get(top_tweet, "user");
if (!yyjson_is_obj(user)) { return false; }
auto screen_name = yyjson_obj_get(user, "screen_name");
if (!yyjson_is_str(screen_name)) { return false; }
result.screen_name = { yyjson_get_str(screen_name), yyjson_get_len(screen_name) };
return result.retweet_count != -1;
}
};
struct yyjson : yyjson_base {
bool run(simdjson::padded_string &json, int64_t max_retweet_count, top_tweet_result<StringType> &result) {
return yyjson_base::run(yyjson_read(json.data(), json.size(), 0), max_retweet_count, result);
}
};
BENCHMARK_TEMPLATE(top_tweet, yyjson)->UseManualTime();
struct yyjson_insitu : yyjson_base {
bool run(simdjson::padded_string &json, int64_t max_retweet_count, top_tweet_result<StringType> &result) {
return yyjson_base::run(yyjson_read_opts(json.data(), json.size(), YYJSON_READ_INSITU, 0, 0), max_retweet_count, result);
}
};
BENCHMARK_TEMPLATE(top_tweet, yyjson_insitu)->UseManualTime();
} // namespace top_tweet
#endif // SIMDJSON_COMPETITION_YYJSON