Add top_tweet benchmark to test laziness
This commit is contained in:
parent
3279c2f15b
commit
be61650102
|
@ -62,4 +62,11 @@ SIMDJSON_POP_DISABLE_WARNINGS
|
|||
#include "find_tweet/rapidjson.h"
|
||||
#include "find_tweet/nlohmann_json.h"
|
||||
|
||||
#include "top_tweet/simdjson_dom.h"
|
||||
#include "top_tweet/simdjson_ondemand.h"
|
||||
#include "top_tweet/yyjson.h"
|
||||
#include "top_tweet/sajson.h"
|
||||
#include "top_tweet/rapidjson.h"
|
||||
#include "top_tweet/nlohmann_json.h"
|
||||
|
||||
BENCHMARK_MAIN();
|
||||
|
|
|
@ -12,7 +12,7 @@ struct yyjson_base {
|
|||
yyjson_val *root = yyjson_doc_get_root(doc);
|
||||
if (!yyjson_is_obj(root)) { return false; }
|
||||
yyjson_val *statuses = yyjson_obj_get(root, "statuses");
|
||||
if (!yyjson_is_arr(statuses)) { return "Statuses is not an array!"; }
|
||||
if (!yyjson_is_arr(statuses)) { return false; }
|
||||
|
||||
// Walk the document, parsing the tweets as we go
|
||||
size_t tweet_idx, tweets_max;
|
||||
|
|
|
@ -14,7 +14,7 @@ struct yyjson_base {
|
|||
yyjson_val *root = yyjson_doc_get_root(doc);
|
||||
if (!yyjson_is_obj(root)) { return false; }
|
||||
yyjson_val *statuses = yyjson_obj_get(root, "statuses");
|
||||
if (!yyjson_is_arr(statuses)) { return "Statuses is not an array!"; }
|
||||
if (!yyjson_is_arr(statuses)) { return false; }
|
||||
|
||||
// Walk the document, parsing the tweets as we go
|
||||
size_t tweet_idx, tweets_max;
|
||||
|
|
|
@ -37,7 +37,7 @@ struct yyjson_base {
|
|||
yyjson_val *root = yyjson_doc_get_root(doc);
|
||||
if (!yyjson_is_obj(root)) { return false; }
|
||||
yyjson_val *statuses = yyjson_obj_get(root, "statuses");
|
||||
if (!yyjson_is_arr(statuses)) { return "Statuses is not an array!"; }
|
||||
if (!yyjson_is_arr(statuses)) { return false; }
|
||||
|
||||
// Walk the document, parsing the tweets as we go
|
||||
size_t tweet_idx, tweets_max;
|
||||
|
|
|
@ -0,0 +1,49 @@
|
|||
# Top Tweet Benchmark
|
||||
|
||||
The top_tweet benchmark finds the most-retweeted tweet in a twitter API response.
|
||||
|
||||
## Purpose
|
||||
|
||||
This scenario tends to measure an implementation's laziness: its ability to avoid parsing unneeded
|
||||
values, without knowing beforehand which values are needed.
|
||||
|
||||
To find the top tweet, an implementation needs to iterate through all tweets, remembering which one
|
||||
had the highest retweet count. While it scans, it will find many "candidate" tweets with the highest
|
||||
retweet count *up to that point.* In other words, while the implementation iterates through tweets, it will
|
||||
have many "candidate" tweets. Essentially, it has to keep track of the "top tweet so far" while it
|
||||
searches. However, only the text and screen_name of the *final* top tweet need to be parsed.
|
||||
Therefore, JSON parsers that can only parse values on the first pass (such as DOM or streaming
|
||||
parsers) will be forced to parse text and screen_name of every candidate (if not every single
|
||||
tweet). Parsers which can delay parsing of values until later will therefore shine in scenarios like
|
||||
this.
|
||||
|
||||
## Rules
|
||||
|
||||
The benchmark will be called with `bool run(simdjson::padded_string &json, int64_t max_retweet_count, top_tweet_result<StringType> &result)`.
|
||||
The benchmark must:
|
||||
- Find the tweet with the highest retweet_count at the top level of the "statuses" array.
|
||||
- Find the *last* such tweet: if multiple tweets have the same top retweet_count, the last one
|
||||
should be returned.
|
||||
- Exclude tweets with retweet_count above max_retweet_count. This restriction is solely here because
|
||||
the default twitter.json has a rather high retweet count in the third tweet, and to test laziness
|
||||
the matching tweet needs to be further down in the file.
|
||||
- Fill in top_tweet_result with the corresponding fields from the matching tweet.
|
||||
|
||||
### Abridged Schema
|
||||
|
||||
The abridged schema (objects contain more fields than listed here):
|
||||
|
||||
```json
|
||||
{
|
||||
"statuses": [
|
||||
{
|
||||
"text": "i like to tweet", // text containing UTF-8 and escape characters
|
||||
"user": {
|
||||
"screen_name": "AlexanderHamilton" // string containing UTF-8 (and escape characters?)
|
||||
},
|
||||
"retweet_count": 2, // uint32
|
||||
},
|
||||
...
|
||||
]
|
||||
}
|
||||
```
|
|
@ -0,0 +1,39 @@
|
|||
#pragma once
|
||||
|
||||
#if SIMDJSON_COMPETITION_NLOHMANN_JSON
|
||||
|
||||
#include "top_tweet.h"
|
||||
|
||||
namespace top_tweet {
|
||||
|
||||
using namespace simdjson;
|
||||
|
||||
struct nlohmann_json {
|
||||
using StringType=std::string;
|
||||
|
||||
dom::parser parser{};
|
||||
|
||||
bool run(simdjson::padded_string &json, int64_t max_retweet_count, top_tweet_result<StringType> &result) {
|
||||
result.retweet_count = -1;
|
||||
nlohmann::json top_tweet{};
|
||||
|
||||
auto root = nlohmann::json::parse(json.data(), json.data() + json.size());
|
||||
for (auto tweet : root["statuses"]) {
|
||||
int64_t retweet_count = tweet["retweet_count"];
|
||||
if (retweet_count <= max_retweet_count && retweet_count >= result.retweet_count) {
|
||||
result.retweet_count = retweet_count;
|
||||
top_tweet = tweet;
|
||||
}
|
||||
}
|
||||
|
||||
result.text = top_tweet["text"];
|
||||
result.screen_name = top_tweet["user"]["screen_name"];
|
||||
return result.retweet_count != -1;
|
||||
}
|
||||
};
|
||||
|
||||
BENCHMARK_TEMPLATE(top_tweet, nlohmann_json)->UseManualTime();
|
||||
|
||||
} // namespace top_tweet
|
||||
|
||||
#endif // SIMDJSON_COMPETITION_NLOHMANN_JSON
|
|
@ -0,0 +1,69 @@
|
|||
#pragma once
|
||||
|
||||
#ifdef SIMDJSON_COMPETITION_RAPIDJSON
|
||||
|
||||
#include "top_tweet.h"
|
||||
|
||||
namespace top_tweet {
|
||||
|
||||
using namespace rapidjson;
|
||||
|
||||
struct rapidjson_base {
|
||||
using StringType=std::string_view;
|
||||
|
||||
Document doc{};
|
||||
|
||||
bool run(Document &root, int64_t max_retweet_count, top_tweet_result<StringType> &result) {
|
||||
result.retweet_count = -1;
|
||||
|
||||
// Loop over the tweets
|
||||
if (root.HasParseError() || !root.IsObject()) { return false; }
|
||||
const auto &statuses = root.FindMember("statuses");
|
||||
if (statuses == root.MemberEnd() || !statuses->value.IsArray()) { return false; }
|
||||
for (const Value &tweet : statuses->value.GetArray()) {
|
||||
if (!tweet.IsObject()) { return false; }
|
||||
|
||||
// Check if this tweet has a higher retweet count than the current top tweet
|
||||
const auto &retweet_count_json = tweet.FindMember("retweet_count");
|
||||
if (retweet_count_json == tweet.MemberEnd() || !retweet_count_json->value.IsInt64()) { return false; }
|
||||
int64_t retweet_count = retweet_count_json->value.GetInt64();
|
||||
if (retweet_count <= max_retweet_count && retweet_count >= result.retweet_count) {
|
||||
result.retweet_count = retweet_count;
|
||||
|
||||
// TODO I can't figure out if there's a way to keep the Value to use outside the loop ...
|
||||
|
||||
// Get text and screen_name of top tweet
|
||||
const auto &text = tweet.FindMember("text");
|
||||
if (text == tweet.MemberEnd() || !text->value.IsString()) { return false; }
|
||||
result.text = { text->value.GetString(), text->value.GetStringLength() };
|
||||
|
||||
const auto &user = tweet.FindMember("user");
|
||||
if (user == tweet.MemberEnd() || !user->value.IsObject()) { return false; }
|
||||
const auto &screen_name = user->value.FindMember("screen_name");
|
||||
if (screen_name == user->value.MemberEnd() || !screen_name->value.IsString()) { return false; }
|
||||
result.screen_name = { screen_name->value.GetString(), screen_name->value.GetStringLength() };
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
return result.retweet_count != -1;
|
||||
}
|
||||
};
|
||||
|
||||
struct rapidjson : rapidjson_base {
|
||||
bool run(simdjson::padded_string &json, int64_t max_retweet_count, top_tweet_result<StringType> &result) {
|
||||
return rapidjson_base::run(doc.Parse<kParseValidateEncodingFlag>(json.data()), max_retweet_count, result);
|
||||
}
|
||||
};
|
||||
BENCHMARK_TEMPLATE(top_tweet, rapidjson)->UseManualTime();
|
||||
|
||||
struct rapidjson_insitu : rapidjson_base {
|
||||
bool run(simdjson::padded_string &json, int64_t max_retweet_count, top_tweet_result<StringType> &result) {
|
||||
return rapidjson_base::run(doc.ParseInsitu<kParseValidateEncodingFlag>(json.data()), max_retweet_count, result);
|
||||
}
|
||||
};
|
||||
BENCHMARK_TEMPLATE(top_tweet, rapidjson_insitu)->UseManualTime();
|
||||
|
||||
} // namespace top_tweet
|
||||
|
||||
#endif // SIMDJSON_COMPETITION_RAPIDJSON
|
|
@ -0,0 +1,62 @@
|
|||
#pragma once
|
||||
|
||||
#ifdef SIMDJSON_COMPETITION_SAJSON
|
||||
|
||||
#include "top_tweet.h"
|
||||
|
||||
namespace top_tweet {
|
||||
|
||||
struct sajson {
|
||||
using StringType=std::string_view;
|
||||
|
||||
size_t ast_buffer_size{0};
|
||||
size_t *ast_buffer{nullptr};
|
||||
|
||||
bool run(simdjson::padded_string &json, int32_t max_retweet_count, top_tweet_result<StringType> &result) {
|
||||
if (!ast_buffer) {
|
||||
ast_buffer_size = json.size();
|
||||
ast_buffer = (size_t *)std::malloc(ast_buffer_size * sizeof(size_t));
|
||||
}
|
||||
auto doc = ::sajson::parse(
|
||||
::sajson::bounded_allocation(ast_buffer, ast_buffer_size),
|
||||
::sajson::mutable_string_view(json.size(), json.data())
|
||||
);
|
||||
if (!doc.is_valid()) { return false; }
|
||||
|
||||
auto root = doc.get_root();
|
||||
if (root.get_type() != ::sajson::TYPE_OBJECT) { return false; }
|
||||
auto statuses = root.get_value_of_key({ "statuses", strlen("statuses") });
|
||||
if (statuses.get_type() != ::sajson::TYPE_ARRAY) { return false; }
|
||||
|
||||
for (size_t i=0; i<statuses.get_length(); i++) {
|
||||
auto tweet = statuses.get_array_element(i);
|
||||
if (tweet.get_type() != ::sajson::TYPE_OBJECT) { return false; }
|
||||
|
||||
// We can't keep a copy of "value" around, so AFAICT we can't lazily parse
|
||||
auto retweet_count_val = tweet.get_value_of_key({ "retweet_count", strlen("retweet_count") });
|
||||
if (retweet_count_val.get_type() != ::sajson::TYPE_INTEGER) { return false; }
|
||||
int32_t retweet_count = retweet_count_val.get_integer_value();
|
||||
if (retweet_count <= max_retweet_count && retweet_count >= result.retweet_count) {
|
||||
result.retweet_count = retweet_count;
|
||||
|
||||
auto text = tweet.get_value_of_key({ "text", strlen("text") });
|
||||
if (text.get_type() != ::sajson::TYPE_STRING) { return false; }
|
||||
result.text = { text.as_cstring(), text.get_string_length() };
|
||||
|
||||
auto user = tweet.get_value_of_key({ "user", strlen("user") });
|
||||
if (user.get_type() != ::sajson::TYPE_OBJECT) { return false; }
|
||||
auto screen_name = user.get_value_of_key({ "screen_name", strlen("screen_name") });
|
||||
if (screen_name.get_type() != ::sajson::TYPE_STRING) { return false; }
|
||||
result.screen_name = { screen_name.as_cstring(), screen_name.get_string_length() };
|
||||
}
|
||||
}
|
||||
|
||||
return result.retweet_count != -1;
|
||||
}
|
||||
};
|
||||
|
||||
BENCHMARK_TEMPLATE(top_tweet, sajson)->UseManualTime();
|
||||
|
||||
} // namespace top_tweet
|
||||
|
||||
#endif // SIMDJSON_COMPETITION_SAJSON
|
|
@ -0,0 +1,39 @@
|
|||
#pragma once
|
||||
|
||||
#if SIMDJSON_EXCEPTIONS
|
||||
|
||||
#include "top_tweet.h"
|
||||
|
||||
namespace top_tweet {
|
||||
|
||||
using namespace simdjson;
|
||||
|
||||
struct simdjson_dom {
|
||||
using StringType=std::string_view;
|
||||
|
||||
dom::parser parser{};
|
||||
|
||||
bool run(simdjson::padded_string &json, int64_t max_retweet_count, top_tweet_result<StringType> &result) {
|
||||
result.retweet_count = -1;
|
||||
dom::element top_tweet{};
|
||||
|
||||
auto doc = parser.parse(json);
|
||||
for (auto tweet : doc["statuses"]) {
|
||||
int64_t retweet_count = tweet["retweet_count"];
|
||||
if (retweet_count <= max_retweet_count && retweet_count >= result.retweet_count) {
|
||||
result.retweet_count = retweet_count;
|
||||
top_tweet = tweet;
|
||||
}
|
||||
}
|
||||
|
||||
result.text = top_tweet["text"];
|
||||
result.screen_name = top_tweet["user"]["screen_name"];
|
||||
return result.retweet_count != -1;
|
||||
}
|
||||
};
|
||||
|
||||
BENCHMARK_TEMPLATE(top_tweet, simdjson_dom)->UseManualTime();
|
||||
|
||||
} // namespace top_tweet
|
||||
|
||||
#endif // SIMDJSON_EXCEPTIONS
|
|
@ -0,0 +1,81 @@
|
|||
#pragma once
|
||||
|
||||
#if SIMDJSON_EXCEPTIONS
|
||||
|
||||
#include "top_tweet.h"
|
||||
|
||||
namespace top_tweet {
|
||||
|
||||
using namespace simdjson;
|
||||
using namespace simdjson::builtin;
|
||||
|
||||
struct simdjson_ondemand {
|
||||
using StringType=std::string_view;
|
||||
|
||||
ondemand::parser parser{};
|
||||
|
||||
bool run(simdjson::padded_string &json, int64_t max_retweet_count, top_tweet_result<StringType> &result) {
|
||||
result.retweet_count = -1;
|
||||
// We save these DOM values for later so we don't have to parse them
|
||||
// into string_views until we're sure which ones we want to parse
|
||||
// NOTE: simdjson does not presently support reuse of objects or arrays--just scalars. This is
|
||||
// why we have to grab the text and screen_name fields instead of just saving the tweet object.
|
||||
ondemand::value screen_name, text;
|
||||
|
||||
auto doc = parser.iterate(json);
|
||||
for (auto tweet : doc["statuses"]) {
|
||||
// Since text, user.screen_name, and retweet_count generally appear in order, it's nearly free
|
||||
// for us to retrieve them here (and will cost a bit more if we do it in the if
|
||||
// statement).
|
||||
auto tweet_text = tweet["text"];
|
||||
auto tweet_screen_name = tweet["user"]["screen_name"];
|
||||
int64_t retweet_count = tweet["retweet_count"];
|
||||
if (retweet_count <= max_retweet_count && retweet_count >= result.retweet_count) {
|
||||
result.retweet_count = retweet_count;
|
||||
// TODO std::move should not be necessary
|
||||
text = std::move(tweet_text);
|
||||
screen_name = std::move(tweet_screen_name);
|
||||
}
|
||||
}
|
||||
|
||||
// Now that we know which was the most retweeted, parse the values in it
|
||||
result.screen_name = screen_name;
|
||||
result.text = text;
|
||||
return result.retweet_count != -1;
|
||||
}
|
||||
};
|
||||
|
||||
BENCHMARK_TEMPLATE(top_tweet, simdjson_ondemand)->UseManualTime();
|
||||
|
||||
struct simdjson_ondemand_forward_only {
|
||||
using StringType=std::string_view;
|
||||
|
||||
ondemand::parser parser{};
|
||||
|
||||
bool run(simdjson::padded_string &json, int64_t max_retweet_count, top_tweet_result<StringType> &result) {
|
||||
result.retweet_count = -1;
|
||||
|
||||
auto doc = parser.iterate(json);
|
||||
for (auto tweet : doc["statuses"]) {
|
||||
// Since text, user.screen_name, and retweet_count generally appear in order, it's nearly free
|
||||
// for us to retrieve them here (and will cost a bit more if we do it in the if
|
||||
// statement).
|
||||
auto tweet_text = tweet["text"];
|
||||
auto tweet_screen_name = tweet["user"]["screen_name"];
|
||||
int64_t retweet_count = tweet["retweet_count"];
|
||||
if (retweet_count <= max_retweet_count && retweet_count >= result.retweet_count) {
|
||||
result.retweet_count = retweet_count;
|
||||
result.text = tweet_text;
|
||||
result.screen_name = tweet_screen_name;
|
||||
}
|
||||
}
|
||||
|
||||
return result.retweet_count != -1;
|
||||
}
|
||||
};
|
||||
|
||||
BENCHMARK_TEMPLATE(top_tweet, simdjson_ondemand_forward_only)->UseManualTime();
|
||||
|
||||
} // namespace top_tweet
|
||||
|
||||
#endif // SIMDJSON_EXCEPTIONS
|
|
@ -0,0 +1,67 @@
|
|||
|
||||
#pragma once
|
||||
|
||||
#include "json_benchmark/file_runner.h"
|
||||
|
||||
namespace top_tweet {
|
||||
|
||||
using namespace json_benchmark;
|
||||
|
||||
template<typename StringType>
|
||||
struct top_tweet_result {
|
||||
int64_t retweet_count{};
|
||||
StringType screen_name{};
|
||||
StringType text{};
|
||||
template<typename OtherStringType>
|
||||
simdjson_really_inline bool operator==(const top_tweet_result<OtherStringType> &other) const {
|
||||
return retweet_count == other.retweet_count &&
|
||||
screen_name == other.screen_name &&
|
||||
text == other.text;
|
||||
}
|
||||
template<typename OtherStringType>
|
||||
simdjson_really_inline bool operator!=(const top_tweet_result<OtherStringType> &other) const { return !(*this == other); }
|
||||
};
|
||||
|
||||
template<typename StringType>
|
||||
simdjson_unused static std::ostream &operator<<(std::ostream &o, const top_tweet_result<StringType> &t) {
|
||||
o << "retweet_count: " << t.retweet_count << std::endl;
|
||||
o << "screen_name: " << t.screen_name << std::endl;
|
||||
o << "text: " << t.text << std::endl;
|
||||
return o;
|
||||
}
|
||||
|
||||
template<typename I>
|
||||
struct runner : public file_runner<I> {
|
||||
top_tweet_result<typename I::StringType> result{};
|
||||
|
||||
bool setup(benchmark::State &state) {
|
||||
return this->load_json(state, TWITTER_JSON);
|
||||
}
|
||||
|
||||
bool before_run(benchmark::State &state) {
|
||||
if (!file_runner<I>::before_run(state)) { return false; }
|
||||
result.retweet_count = -1;
|
||||
return true;
|
||||
}
|
||||
|
||||
bool run(benchmark::State &) {
|
||||
return this->implementation.run(this->json, 60, result);
|
||||
}
|
||||
|
||||
template<typename R>
|
||||
bool diff(benchmark::State &state, runner<R> &reference) {
|
||||
return diff_results(state, result, reference.result, diff_flags::NONE);
|
||||
}
|
||||
|
||||
size_t items_per_iteration() {
|
||||
return 1;
|
||||
}
|
||||
};
|
||||
|
||||
struct simdjson_dom;
|
||||
|
||||
// Benchmark entry point: runs implementation I, using the simdjson_dom runner as the reference.
template<typename I> simdjson_really_inline static void top_tweet(benchmark::State &state) {
  using tested_runner = runner<I>;
  using reference_runner = runner<simdjson_dom>;
  json_benchmark::run_json_benchmark<tested_runner, reference_runner>(state);
}
|
||||
|
||||
} // namespace top_tweet
|
|
@ -0,0 +1,68 @@
|
|||
#pragma once
|
||||
|
||||
#ifdef SIMDJSON_COMPETITION_YYJSON
|
||||
|
||||
#include "top_tweet.h"
|
||||
|
||||
namespace top_tweet {
|
||||
|
||||
struct yyjson_base {
|
||||
using StringType=std::string_view;
|
||||
|
||||
bool run(yyjson_doc *doc, int64_t max_retweet_count, top_tweet_result<StringType> &result) {
|
||||
result.retweet_count = -1;
|
||||
|
||||
yyjson_val *top_tweet{};
|
||||
|
||||
if (!doc) { return false; }
|
||||
yyjson_val *root = yyjson_doc_get_root(doc);
|
||||
if (!yyjson_is_obj(root)) { return false; }
|
||||
yyjson_val *statuses = yyjson_obj_get(root, "statuses");
|
||||
if (!yyjson_is_arr(statuses)) { return false; }
|
||||
|
||||
// Walk the document, parsing the tweets as we go
|
||||
size_t tweet_idx, tweets_max;
|
||||
yyjson_val *tweet;
|
||||
yyjson_arr_foreach(statuses, tweet_idx, tweets_max, tweet) {
|
||||
if (!yyjson_is_obj(tweet)) { return false; }
|
||||
|
||||
auto retweet_count_val = yyjson_obj_get(tweet, "retweet_count");
|
||||
if (!yyjson_is_uint(retweet_count_val)) { return false; }
|
||||
int64_t retweet_count = yyjson_get_uint(retweet_count_val);
|
||||
if (retweet_count <= max_retweet_count && retweet_count >= result.retweet_count) {
|
||||
result.retweet_count = retweet_count;
|
||||
top_tweet = tweet;
|
||||
}
|
||||
}
|
||||
|
||||
auto text = yyjson_obj_get(top_tweet, "text");
|
||||
if (!yyjson_is_str(text)) { return false; }
|
||||
result.text = { yyjson_get_str(text), yyjson_get_len(text) };
|
||||
|
||||
auto user = yyjson_obj_get(top_tweet, "user");
|
||||
if (!yyjson_is_obj(user)) { return false; }
|
||||
auto screen_name = yyjson_obj_get(user, "screen_name");
|
||||
if (!yyjson_is_str(screen_name)) { return false; }
|
||||
result.screen_name = { yyjson_get_str(screen_name), yyjson_get_len(screen_name) };
|
||||
|
||||
return result.retweet_count != -1;
|
||||
}
|
||||
};
|
||||
|
||||
struct yyjson : yyjson_base {
|
||||
bool run(simdjson::padded_string &json, int64_t max_retweet_count, top_tweet_result<StringType> &result) {
|
||||
return yyjson_base::run(yyjson_read(json.data(), json.size(), 0), max_retweet_count, result);
|
||||
}
|
||||
};
|
||||
BENCHMARK_TEMPLATE(top_tweet, yyjson)->UseManualTime();
|
||||
|
||||
struct yyjson_insitu : yyjson_base {
|
||||
bool run(simdjson::padded_string &json, int64_t max_retweet_count, top_tweet_result<StringType> &result) {
|
||||
return yyjson_base::run(yyjson_read_opts(json.data(), json.size(), YYJSON_READ_INSITU, 0, 0), max_retweet_count, result);
|
||||
}
|
||||
};
|
||||
BENCHMARK_TEMPLATE(top_tweet, yyjson_insitu)->UseManualTime();
|
||||
|
||||
} // namespace top_tweet
|
||||
|
||||
#endif // SIMDJSON_COMPETITION_YYJSON
|
Loading…
Reference in New Issue