From c7fd7353a8fbd10e7c7913ca9bb4574aff856a71 Mon Sep 17 00:00:00 2001 From: Nicolas Boyer <83141563+NicolasJiaxin@users.noreply.github.com> Date: Thu, 3 Jun 2021 16:41:00 -0400 Subject: [PATCH] Add RapidJSON and nlohmann_json SAX to top_tweet benchmark (#1599) * Add rapidjson_sax.h and fix typo in rapidjson.h * Add nlohmann_json_sax.h and add user key check for screen_name in rapidjson_sax * Change std::string_view assignment for text and screen_name. --- benchmark/bench_ondemand.cpp | 2 + benchmark/top_tweet/nlohmann_json_sax.h | 99 +++++++++++++++++++++++ benchmark/top_tweet/rapidjson.h | 2 +- benchmark/top_tweet/rapidjson_sax.h | 101 ++++++++++++++++++++++++ 4 files changed, 203 insertions(+), 1 deletion(-) create mode 100644 benchmark/top_tweet/nlohmann_json_sax.h create mode 100644 benchmark/top_tweet/rapidjson_sax.h diff --git a/benchmark/bench_ondemand.cpp b/benchmark/bench_ondemand.cpp index fa766fc2..8c821719 100644 --- a/benchmark/bench_ondemand.cpp +++ b/benchmark/bench_ondemand.cpp @@ -75,6 +75,8 @@ SIMDJSON_POP_DISABLE_WARNINGS #include "top_tweet/yyjson.h" #include "top_tweet/sajson.h" #include "top_tweet/rapidjson.h" +#include "top_tweet/rapidjson_sax.h" #include "top_tweet/nlohmann_json.h" +#include "top_tweet/nlohmann_json_sax.h" BENCHMARK_MAIN(); diff --git a/benchmark/top_tweet/nlohmann_json_sax.h b/benchmark/top_tweet/nlohmann_json_sax.h new file mode 100644 index 00000000..4deafd04 --- /dev/null +++ b/benchmark/top_tweet/nlohmann_json_sax.h @@ -0,0 +1,99 @@ +#pragma once + +#ifdef SIMDJSON_COMPETITION_NLOHMANN_JSON + +#include "top_tweet.h" + +namespace top_tweet { + +using json = nlohmann::json; + +struct nlohmann_json_sax { + using StringType=std::string; + + struct Handler : json::json_sax_t + { + // Assume every tweet/retweet starts with "metadata" key and ends with "retweeted" key. Ignore everything in a retweet. + // Assume that the first valid key encountered outside a retweet is the correct key. 
+ enum state { // Bitset to store state of search + key_text = (1<<0), + key_screen_name = (1<<1), + key_rt = (1<<2), + found_text = (1<<3), + found_screen_name = (1<<4), + found_rt = (1<<5) + }; + int values = state::key_text; + bool userobject = false; // If in a user object + bool inretweet = false; + int64_t max_rt; + int rt; + string_t screen_name; + string_t text; + top_tweet_result& result; + + Handler(top_tweet_result &r,int64_t m) : result(r), max_rt(m) { } + + bool key(string_t& val) override { + if (!inretweet) { // If not in a retweet object, find relevant keys + if (val.compare("retweeted_status") == 0) { inretweet = true; } // Check if entering retweet + else if (val.compare("metadata") == 0) { values = 0; } // Reset + else if (!(values & found_text) && (val.compare("text") == 0)) { values |= (key_text); } + else if ((val.compare("user") == 0)) { userobject = true; } + else if (!(values & found_screen_name) && userobject && (val.compare("screen_name") == 0)) { values |= (key_screen_name); } + else if (!(values & found_rt) && (val.compare("retweet_count") == 0)) { values |= (key_rt); } + } + else if (val.compare("retweeted") == 0) { inretweet = false; } // Check if end of retweet + return true; + } + bool number_unsigned(number_unsigned_t val) override { + if (values & key_rt && !(values & found_rt)) { // retweet_count + rt = val; + values &= ~(key_rt); + values |= (found_rt); + if (rt <= max_rt && rt >= result.retweet_count) { // Check if current tweet has more retweet than previous top tweet + result.retweet_count = rt; + result.text = text; + result.screen_name = screen_name; + } + } + return true; + } + bool string(string_t& val) override { + if (values & key_text && !(values & found_text)) { // text + text = val; + values &= ~(key_text); + values |= (found_text); + } + else if (values & key_screen_name && !(values & found_screen_name)) { // user.screen_name + screen_name = val; + userobject = false; + values &= ~(key_screen_name); + values |= 
(found_screen_name); + } + return true; + } + // Irrelevant events + bool null() override { return true; } + bool boolean(bool val) override { return true; } + bool number_float(number_float_t val, const string_t& s) override { return true; } + bool number_integer(number_integer_t val) override { return true; } + bool start_object(std::size_t elements) override { return true; } + bool end_object() override { return true; } + bool start_array(std::size_t elements) override { return true; } + bool end_array() override { return true; } + bool binary(json::binary_t& val) override { return true; } + bool parse_error(std::size_t position, const std::string& last_token, const json::exception& ex) override { return false; } + }; // Handler + + bool run(simdjson::padded_string &json, int64_t max_retweet_count, top_tweet_result &result) { + result.retweet_count = -1; + Handler handler(result,max_retweet_count); + json::sax_parse(json.data(), &handler); + return true; + } +}; // nlohmann_json_sax +BENCHMARK_TEMPLATE(top_tweet, nlohmann_json_sax)->UseManualTime(); +} // namespace top_tweet + +#endif // SIMDJSON_COMPETITION_NLOHMANN_JSON \ No newline at end of file diff --git a/benchmark/top_tweet/rapidjson.h b/benchmark/top_tweet/rapidjson.h index 4c47bf16..64105dd1 100644 --- a/benchmark/top_tweet/rapidjson.h +++ b/benchmark/top_tweet/rapidjson.h @@ -64,6 +64,6 @@ struct rapidjson_insitu : rapidjson_base { }; BENCHMARK_TEMPLATE(top_tweet, rapidjson_insitu)->UseManualTime(); -} // namespace partial_tweets +} // namespace top_tweet #endif // SIMDJSON_COMPETITION_RAPIDJSON diff --git a/benchmark/top_tweet/rapidjson_sax.h b/benchmark/top_tweet/rapidjson_sax.h new file mode 100644 index 00000000..74c8d4d5 --- /dev/null +++ b/benchmark/top_tweet/rapidjson_sax.h @@ -0,0 +1,101 @@ +#pragma once + +#ifdef SIMDJSON_COMPETITION_RAPIDJSON + +#include "top_tweet.h" +#include + +namespace top_tweet { + +using namespace rapidjson; + +struct rapidjson_sax { + using 
StringType=std::string_view; + struct Handler { + // Assume every tweet/retweet starts with "metadata" key and ends with "retweeted" key. Ignore everything in a retweet. + // Assume that the first valid key encountered outside a retweet is the correct key. + enum state { // Bit set to keep track of state of search for keys + key_text = (1<<0), + key_screen_name = (1<<1), + key_rt = (1<<2), + found_text = (1<<3), + found_screen_name = (1<<4), + found_rt = (1<<5) + }; + int values = state::key_text; + int rt; + StringType text; + StringType screen_name; + bool inretweet = false; + bool userobject = false; + top_tweet_result& result; + int64_t max_rt; + + Handler(top_tweet_result &r,int64_t m) : result(r), max_rt(m) { } + + bool Key(const char* key, SizeType length, bool copy) { + if (!inretweet) { + if ((length == 16) && (memcmp(key,"retweeted_status",16) == 0)) { inretweet = true; } // Check if entering retweet + else if ((length == 8) && (memcmp(key,"metadata",8) == 0)) { values = 0; } // Reset + else if (!(values & found_text) && (length == 4) && (memcmp(key,"text",4) == 0)) { values |= (key_text); } + else if ((length == 4) && (memcmp(key,"user",4) == 0)) { userobject = true; } + else if (!(values & found_screen_name) && userobject && (length == 11) && memcmp(key,"screen_name",11) == 0) { values |= (key_screen_name); } + else if (!(values & found_rt) && (length == 13) && (memcmp(key,"retweet_count",13) == 0)) { values |= (key_rt); } + } + else if ((length == 9) && (memcmp(key,"retweeted",9) == 0)) { inretweet = false; } // Check if end of retweet + return true; + } + bool String(const char* str, SizeType length, bool copy) { + if (values & key_text && !(values & found_text)) { // text + text = {str,length}; + values &= ~(key_text); + values |= (found_text); + } + else if (values & key_screen_name && !(values & found_screen_name)) { // user.screen_name + screen_name = {str,length}; + values &= ~(key_screen_name); + values |= (found_screen_name); + userobject = 
false; + } + return true; + } + bool Uint(unsigned i) { + if (values & key_rt && !(values & found_rt)) { // retweet_count + rt = i; + values &= ~(key_rt); + values |= (found_rt); + if (rt <= max_rt && rt >= result.retweet_count) { // Check if current tweet has more retweet than previous top tweet + result.retweet_count = rt; + result.text = text; + result.screen_name = screen_name; + } + } + return true; + } + // Irrelevant events + bool Null() { return true; } + bool Bool(bool b) { return true; } + bool Double(double d) { return true; } + bool Int(int i) { return true; } + bool Int64(int64_t i) { return true; } + bool Uint64(uint64_t i) { return true; } + bool RawNumber(const char* str, SizeType length, bool copy) { return true; } + bool StartObject() { return true; } + bool EndObject(SizeType memberCount) { return true; } + bool StartArray() { return true; } + bool EndArray(SizeType elementCount) { return true; } + }; // handler + + bool run(simdjson::padded_string &json, int64_t max_retweet_count, top_tweet_result &result) { + result.retweet_count = -1; + Reader reader; + Handler handler(result,max_retweet_count); + InsituStringStream ss(json.data()); + reader.Parse(ss,handler); + return true; + } +}; // rapidjson_sax +BENCHMARK_TEMPLATE(top_tweet, rapidjson_sax)->UseManualTime(); +} // namespace top_tweet + +#endif // SIMDJSON_COMPETITION_RAPIDJSON