diff --git a/benchmark/bench_ondemand.cpp b/benchmark/bench_ondemand.cpp index fd3f3bf3..b12781e3 100644 --- a/benchmark/bench_ondemand.cpp +++ b/benchmark/bench_ondemand.cpp @@ -62,4 +62,11 @@ SIMDJSON_POP_DISABLE_WARNINGS #include "find_tweet/rapidjson.h" #include "find_tweet/nlohmann_json.h" +#include "top_tweet/simdjson_dom.h" +#include "top_tweet/simdjson_ondemand.h" +#include "top_tweet/yyjson.h" +#include "top_tweet/sajson.h" +#include "top_tweet/rapidjson.h" +#include "top_tweet/nlohmann_json.h" + BENCHMARK_MAIN(); diff --git a/benchmark/distinct_user_id/yyjson.h b/benchmark/distinct_user_id/yyjson.h index 8a1614a2..e949ef14 100644 --- a/benchmark/distinct_user_id/yyjson.h +++ b/benchmark/distinct_user_id/yyjson.h @@ -12,7 +12,7 @@ struct yyjson_base { yyjson_val *root = yyjson_doc_get_root(doc); if (!yyjson_is_obj(root)) { return false; } yyjson_val *statuses = yyjson_obj_get(root, "statuses"); - if (!yyjson_is_arr(statuses)) { return "Statuses is not an array!"; } + if (!yyjson_is_arr(statuses)) { return false; } // Walk the document, parsing the tweets as we go size_t tweet_idx, tweets_max; diff --git a/benchmark/find_tweet/yyjson.h b/benchmark/find_tweet/yyjson.h index a1ff827a..3a8a6477 100644 --- a/benchmark/find_tweet/yyjson.h +++ b/benchmark/find_tweet/yyjson.h @@ -14,7 +14,7 @@ struct yyjson_base { yyjson_val *root = yyjson_doc_get_root(doc); if (!yyjson_is_obj(root)) { return false; } yyjson_val *statuses = yyjson_obj_get(root, "statuses"); - if (!yyjson_is_arr(statuses)) { return "Statuses is not an array!"; } + if (!yyjson_is_arr(statuses)) { return false; } // Walk the document, parsing the tweets as we go size_t tweet_idx, tweets_max; diff --git a/benchmark/partial_tweets/yyjson.h b/benchmark/partial_tweets/yyjson.h index 7ca47ded..a1586f2a 100644 --- a/benchmark/partial_tweets/yyjson.h +++ b/benchmark/partial_tweets/yyjson.h @@ -37,7 +37,7 @@ struct yyjson_base { yyjson_val *root = yyjson_doc_get_root(doc); if (!yyjson_is_obj(root)) { return 
false; } yyjson_val *statuses = yyjson_obj_get(root, "statuses"); - if (!yyjson_is_arr(statuses)) { return "Statuses is not an array!"; } + if (!yyjson_is_arr(statuses)) { return false; } // Walk the document, parsing the tweets as we go size_t tweet_idx, tweets_max; diff --git a/benchmark/top_tweet/README.md b/benchmark/top_tweet/README.md new file mode 100644 index 00000000..b8fdc5d5 --- /dev/null +++ b/benchmark/top_tweet/README.md @@ -0,0 +1,49 @@ +# Top Tweet Benchmark + +The top_tweet benchmark finds the most-retweeted tweet in a twitter API response. + +## Purpose + +This scenario tends to measure an implementation's laziness: its ability to avoid parsing unneeded +values, without knowing beforehand which values are needed. + +To find the top tweet, an implementation needs to iterate through all tweets, remembering which one +had the highest retweet count. While it scans, it will find many "candidate" tweets with the highest +retweet count *up to that point*. In other words, it has to keep track of the "top tweet so far" +while it searches, without knowing whether a later tweet will displace it. +However, only the text and screen_name of the *final* top tweet need to be parsed. +Therefore, JSON parsers that can only parse values on the first pass (such as DOM or streaming +parsers) will be forced to parse text and screen_name of every candidate (if not every single +tweet). Parsers which can delay parsing of values until later will therefore shine in scenarios like +this. + +## Rules + +The benchmark will be called with `run(padded_string &json, int64_t max_retweet_count, top_tweet_result &result)`. +The benchmark must: +- Find the tweet with the highest retweet_count at the top level of the "statuses" array. +- Find the *last* such tweet: if multiple tweets have the same top retweet_count, the last one + should be returned. +- Exclude tweets with retweet_count above max_retweet_count. 
This restriction is solely here because + the default twitter.json has a rather high retweet count in the third tweet, and to test laziness + the matching tweet needs to be further down in the file. +- Fill in top_tweet_result with the corresponding fields from the matching tweet. + +### Abridged Schema + +The abridged schema (objects contain more fields than listed here): + +```json +{ + "statuses": [ + { + "text": "i like to tweet", // text containing UTF-8 and escape characters + "user": { + "screen_name": "AlexanderHamilton" // string containing UTF-8 (and escape characters?) + }, + "retweet_count": 2, // uint32 + }, + ... + ] +} +``` diff --git a/benchmark/top_tweet/nlohmann_json.h b/benchmark/top_tweet/nlohmann_json.h new file mode 100644 index 00000000..b16b203f --- /dev/null +++ b/benchmark/top_tweet/nlohmann_json.h @@ -0,0 +1,39 @@ +#pragma once + +#if SIMDJSON_COMPETITION_NLOHMANN_JSON + +#include "top_tweet.h" + +namespace top_tweet { + +using namespace simdjson; + +struct nlohmann_json { + using StringType=std::string; + + dom::parser parser{}; + + bool run(simdjson::padded_string &json, int64_t max_retweet_count, top_tweet_result &result) { + result.retweet_count = -1; + nlohmann::json top_tweet{}; + + auto root = nlohmann::json::parse(json.data(), json.data() + json.size()); + for (auto tweet : root["statuses"]) { + int64_t retweet_count = tweet["retweet_count"]; + if (retweet_count <= max_retweet_count && retweet_count >= result.retweet_count) { + result.retweet_count = retweet_count; + top_tweet = tweet; + } + } + + result.text = top_tweet["text"]; + result.screen_name = top_tweet["user"]["screen_name"]; + return result.retweet_count != -1; + } +}; + +BENCHMARK_TEMPLATE(top_tweet, nlohmann_json)->UseManualTime(); + +} // namespace top_tweet + +#endif // SIMDJSON_COMPETITION_NLOHMANN_JSON \ No newline at end of file diff --git a/benchmark/top_tweet/rapidjson.h b/benchmark/top_tweet/rapidjson.h new file mode 100644 index 00000000..4c47bf16 --- 
/dev/null +++ b/benchmark/top_tweet/rapidjson.h @@ -0,0 +1,69 @@ +#pragma once + +#ifdef SIMDJSON_COMPETITION_RAPIDJSON + +#include "top_tweet.h" + +namespace top_tweet { + +using namespace rapidjson; + +struct rapidjson_base { + using StringType=std::string_view; + + Document doc{}; + + bool run(Document &root, int64_t max_retweet_count, top_tweet_result &result) { + result.retweet_count = -1; + + // Loop over the tweets + if (root.HasParseError() || !root.IsObject()) { return false; } + const auto &statuses = root.FindMember("statuses"); + if (statuses == root.MemberEnd() || !statuses->value.IsArray()) { return false; } + for (const Value &tweet : statuses->value.GetArray()) { + if (!tweet.IsObject()) { return false; } + + // Check if this tweet has a higher retweet count than the current top tweet + const auto &retweet_count_json = tweet.FindMember("retweet_count"); + if (retweet_count_json == tweet.MemberEnd() || !retweet_count_json->value.IsInt64()) { return false; } + int64_t retweet_count = retweet_count_json->value.GetInt64(); + if (retweet_count <= max_retweet_count && retweet_count >= result.retweet_count) { + result.retweet_count = retweet_count; + + // TODO I can't figure out if there's a way to keep the Value to use outside the loop ... 
+ + // Get text and screen_name of top tweet + const auto &text = tweet.FindMember("text"); + if (text == tweet.MemberEnd() || !text->value.IsString()) { return false; } + result.text = { text->value.GetString(), text->value.GetStringLength() }; + + const auto &user = tweet.FindMember("user"); + if (user == tweet.MemberEnd() || !user->value.IsObject()) { return false; } + const auto &screen_name = user->value.FindMember("screen_name"); + if (screen_name == user->value.MemberEnd() || !screen_name->value.IsString()) { return false; } + result.screen_name = { screen_name->value.GetString(), screen_name->value.GetStringLength() }; + + } + } + + return result.retweet_count != -1; + } +}; + +struct rapidjson : rapidjson_base { + bool run(simdjson::padded_string &json, int64_t max_retweet_count, top_tweet_result &result) { + return rapidjson_base::run(doc.Parse(json.data()), max_retweet_count, result); + } +}; +BENCHMARK_TEMPLATE(top_tweet, rapidjson)->UseManualTime(); + +struct rapidjson_insitu : rapidjson_base { + bool run(simdjson::padded_string &json, int64_t max_retweet_count, top_tweet_result &result) { + return rapidjson_base::run(doc.ParseInsitu(json.data()), max_retweet_count, result); + } +}; +BENCHMARK_TEMPLATE(top_tweet, rapidjson_insitu)->UseManualTime(); + +} // namespace partial_tweets + +#endif // SIMDJSON_COMPETITION_RAPIDJSON diff --git a/benchmark/top_tweet/sajson.h b/benchmark/top_tweet/sajson.h new file mode 100644 index 00000000..5f4cc055 --- /dev/null +++ b/benchmark/top_tweet/sajson.h @@ -0,0 +1,62 @@ +#pragma once + +#ifdef SIMDJSON_COMPETITION_SAJSON + +#include "top_tweet.h" + +namespace top_tweet { + +struct sajson { + using StringType=std::string_view; + + size_t ast_buffer_size{0}; + size_t *ast_buffer{nullptr}; + + bool run(simdjson::padded_string &json, int32_t max_retweet_count, top_tweet_result &result) { + if (!ast_buffer) { + ast_buffer_size = json.size(); + ast_buffer = (size_t *)std::malloc(ast_buffer_size * sizeof(size_t)); + } + 
auto doc = ::sajson::parse( + ::sajson::bounded_allocation(ast_buffer, ast_buffer_size), + ::sajson::mutable_string_view(json.size(), json.data()) + ); + if (!doc.is_valid()) { return false; } + + auto root = doc.get_root(); + if (root.get_type() != ::sajson::TYPE_OBJECT) { return false; } + auto statuses = root.get_value_of_key({ "statuses", strlen("statuses") }); + if (statuses.get_type() != ::sajson::TYPE_ARRAY) { return false; } + + for (size_t i=0; i= result.retweet_count) { + result.retweet_count = retweet_count; + + auto text = tweet.get_value_of_key({ "text", strlen("text") }); + if (text.get_type() != ::sajson::TYPE_STRING) { return false; } + result.text = { text.as_cstring(), text.get_string_length() }; + + auto user = tweet.get_value_of_key({ "user", strlen("user") }); + if (user.get_type() != ::sajson::TYPE_OBJECT) { return false; } + auto screen_name = user.get_value_of_key({ "screen_name", strlen("screen_name") }); + if (screen_name.get_type() != ::sajson::TYPE_STRING) { return false; } + result.screen_name = { screen_name.as_cstring(), screen_name.get_string_length() }; + } + } + + return result.retweet_count != -1; + } +}; + +BENCHMARK_TEMPLATE(top_tweet, sajson)->UseManualTime(); + +} // namespace top_tweet + +#endif // SIMDJSON_COMPETITION_SAJSON \ No newline at end of file diff --git a/benchmark/top_tweet/simdjson_dom.h b/benchmark/top_tweet/simdjson_dom.h new file mode 100644 index 00000000..3a6648f1 --- /dev/null +++ b/benchmark/top_tweet/simdjson_dom.h @@ -0,0 +1,39 @@ +#pragma once + +#if SIMDJSON_EXCEPTIONS + +#include "top_tweet.h" + +namespace top_tweet { + +using namespace simdjson; + +struct simdjson_dom { + using StringType=std::string_view; + + dom::parser parser{}; + + bool run(simdjson::padded_string &json, int64_t max_retweet_count, top_tweet_result &result) { + result.retweet_count = -1; + dom::element top_tweet{}; + + auto doc = parser.parse(json); + for (auto tweet : doc["statuses"]) { + int64_t retweet_count = 
tweet["retweet_count"]; + if (retweet_count <= max_retweet_count && retweet_count >= result.retweet_count) { + result.retweet_count = retweet_count; + top_tweet = tweet; + } + } + + result.text = top_tweet["text"]; + result.screen_name = top_tweet["user"]["screen_name"]; + return result.retweet_count != -1; + } +}; + +BENCHMARK_TEMPLATE(top_tweet, simdjson_dom)->UseManualTime(); + +} // namespace top_tweet + +#endif // SIMDJSON_EXCEPTIONS \ No newline at end of file diff --git a/benchmark/top_tweet/simdjson_ondemand.h b/benchmark/top_tweet/simdjson_ondemand.h new file mode 100644 index 00000000..10943df8 --- /dev/null +++ b/benchmark/top_tweet/simdjson_ondemand.h @@ -0,0 +1,81 @@ +#pragma once + +#if SIMDJSON_EXCEPTIONS + +#include "top_tweet.h" + +namespace top_tweet { + +using namespace simdjson; +using namespace simdjson::builtin; + +struct simdjson_ondemand { + using StringType=std::string_view; + + ondemand::parser parser{}; + + bool run(simdjson::padded_string &json, int64_t max_retweet_count, top_tweet_result &result) { + result.retweet_count = -1; + // We save these DOM values for later so we don't have to parse them + // into string_views until we're sure which ones we want to parse + // NOTE: simdjson does not presently support reuse of objects or arrays--just scalars. This is + // why we have to grab the text and screen_name fields instead of just saving the tweet object. + ondemand::value screen_name, text; + + auto doc = parser.iterate(json); + for (auto tweet : doc["statuses"]) { + // Since text, user.screen_name, and retweet_count generally appear in order, it's nearly free + // for us to retrieve them here (and will cost a bit more if we do it in the if + // statement). 
+ auto tweet_text = tweet["text"]; + auto tweet_screen_name = tweet["user"]["screen_name"]; + int64_t retweet_count = tweet["retweet_count"]; + if (retweet_count <= max_retweet_count && retweet_count >= result.retweet_count) { + result.retweet_count = retweet_count; + // TODO std::move should not be necessary + text = std::move(tweet_text); + screen_name = std::move(tweet_screen_name); + } + } + + // Now that we know which was the most retweeted, parse the values in it + result.screen_name = screen_name; + result.text = text; + return result.retweet_count != -1; + } +}; + +BENCHMARK_TEMPLATE(top_tweet, simdjson_ondemand)->UseManualTime(); + +struct simdjson_ondemand_forward_only { + using StringType=std::string_view; + + ondemand::parser parser{}; + + bool run(simdjson::padded_string &json, int64_t max_retweet_count, top_tweet_result &result) { + result.retweet_count = -1; + + auto doc = parser.iterate(json); + for (auto tweet : doc["statuses"]) { + // Since text, user.screen_name, and retweet_count generally appear in order, it's nearly free + // for us to retrieve them here (and will cost a bit more if we do it in the if + // statement). 
+ auto tweet_text = tweet["text"]; + auto tweet_screen_name = tweet["user"]["screen_name"]; + int64_t retweet_count = tweet["retweet_count"]; + if (retweet_count <= max_retweet_count && retweet_count >= result.retweet_count) { + result.retweet_count = retweet_count; + result.text = tweet_text; + result.screen_name = tweet_screen_name; + } + } + + return result.retweet_count != -1; + } +}; + +BENCHMARK_TEMPLATE(top_tweet, simdjson_ondemand_forward_only)->UseManualTime(); + +} // namespace top_tweet + +#endif // SIMDJSON_EXCEPTIONS diff --git a/benchmark/top_tweet/top_tweet.h b/benchmark/top_tweet/top_tweet.h new file mode 100644 index 00000000..d64b42a1 --- /dev/null +++ b/benchmark/top_tweet/top_tweet.h @@ -0,0 +1,67 @@ + +#pragma once + +#include "json_benchmark/file_runner.h" + +namespace top_tweet { + +using namespace json_benchmark; + +template +struct top_tweet_result { + int64_t retweet_count{}; + StringType screen_name{}; + StringType text{}; + template + simdjson_really_inline bool operator==(const top_tweet_result &other) const { + return retweet_count == other.retweet_count && + screen_name == other.screen_name && + text == other.text; + } + template + simdjson_really_inline bool operator!=(const top_tweet_result &other) const { return !(*this == other); } +}; + +template +simdjson_unused static std::ostream &operator<<(std::ostream &o, const top_tweet_result &t) { + o << "retweet_count: " << t.retweet_count << std::endl; + o << "screen_name: " << t.screen_name << std::endl; + o << "text: " << t.text << std::endl; + return o; +} + +template +struct runner : public file_runner { + top_tweet_result result{}; + + bool setup(benchmark::State &state) { + return this->load_json(state, TWITTER_JSON); + } + + bool before_run(benchmark::State &state) { + if (!file_runner::before_run(state)) { return false; } + result.retweet_count = -1; + return true; + } + + bool run(benchmark::State &) { + return this->implementation.run(this->json, 60, result); + } + + template 
+ bool diff(benchmark::State &state, runner &reference) { + return diff_results(state, result, reference.result, diff_flags::NONE); + } + + size_t items_per_iteration() { + return 1; + } +}; + +struct simdjson_dom; + +template simdjson_really_inline static void top_tweet(benchmark::State &state) { + json_benchmark::run_json_benchmark, runner>(state); +} + +} // namespace top_tweet diff --git a/benchmark/top_tweet/yyjson.h b/benchmark/top_tweet/yyjson.h new file mode 100644 index 00000000..d7b310e5 --- /dev/null +++ b/benchmark/top_tweet/yyjson.h @@ -0,0 +1,68 @@ +#pragma once + +#ifdef SIMDJSON_COMPETITION_YYJSON + +#include "top_tweet.h" + +namespace top_tweet { + +struct yyjson_base { + using StringType=std::string_view; + + bool run(yyjson_doc *doc, int64_t max_retweet_count, top_tweet_result &result) { + result.retweet_count = -1; + + yyjson_val *top_tweet{}; + + if (!doc) { return false; } + yyjson_val *root = yyjson_doc_get_root(doc); + if (!yyjson_is_obj(root)) { return false; } + yyjson_val *statuses = yyjson_obj_get(root, "statuses"); + if (!yyjson_is_arr(statuses)) { return false; } + + // Walk the document, parsing the tweets as we go + size_t tweet_idx, tweets_max; + yyjson_val *tweet; + yyjson_arr_foreach(statuses, tweet_idx, tweets_max, tweet) { + if (!yyjson_is_obj(tweet)) { return false; } + + auto retweet_count_val = yyjson_obj_get(tweet, "retweet_count"); + if (!yyjson_is_uint(retweet_count_val)) { return false; } + int64_t retweet_count = yyjson_get_uint(retweet_count_val); + if (retweet_count <= max_retweet_count && retweet_count >= result.retweet_count) { + result.retweet_count = retweet_count; + top_tweet = tweet; + } + } + + auto text = yyjson_obj_get(top_tweet, "text"); + if (!yyjson_is_str(text)) { return false; } + result.text = { yyjson_get_str(text), yyjson_get_len(text) }; + + auto user = yyjson_obj_get(top_tweet, "user"); + if (!yyjson_is_obj(user)) { return false; } + auto screen_name = yyjson_obj_get(user, "screen_name"); + if 
(!yyjson_is_str(screen_name)) { return false; } + result.screen_name = { yyjson_get_str(screen_name), yyjson_get_len(screen_name) }; + + return result.retweet_count != -1; + } +}; + +struct yyjson : yyjson_base { + bool run(simdjson::padded_string &json, int64_t max_retweet_count, top_tweet_result &result) { + return yyjson_base::run(yyjson_read(json.data(), json.size(), 0), max_retweet_count, result); + } +}; +BENCHMARK_TEMPLATE(top_tweet, yyjson)->UseManualTime(); + +struct yyjson_insitu : yyjson_base { + bool run(simdjson::padded_string &json, int64_t max_retweet_count, top_tweet_result &result) { + return yyjson_base::run(yyjson_read_opts(json.data(), json.size(), YYJSON_READ_INSITU, 0, 0), max_retweet_count, result); + } +}; +BENCHMARK_TEMPLATE(top_tweet, yyjson_insitu)->UseManualTime(); + +} // namespace top_tweet + +#endif // SIMDJSON_COMPETITION_YYJSON