Handle in situ document copies outside of the loop

This commit is contained in:
John Keiser 2021-01-05 10:42:17 -08:00
parent dcd2e13aec
commit bc6907d280
29 changed files with 83 additions and 60 deletions

View File

@ -15,6 +15,7 @@ struct runner : public json_benchmark::file_runner<I> {
}
/**
 * Per-run preparation: delegate to file_runner<I>::before_run first and,
 * only if that succeeds, empty the ids collected by the previous run.
 *
 * @return false when the file_runner step fails, true otherwise.
 */
bool before_run(benchmark::State &state) {
  const bool base_ok = json_benchmark::file_runner<I>::before_run(state);
  if (base_ok) {
    // Start each run with no ids from the previous one.
    ids.clear();
  }
  return base_ok;
}
@ -24,6 +25,7 @@ struct runner : public json_benchmark::file_runner<I> {
}
bool after_run(benchmark::State &state) {
if (!json_benchmark::file_runner<I>::after_run(state)) { return false; }
std::sort(ids.begin(), ids.end());
auto last = std::unique(ids.begin(), ids.end());
ids.erase(last, ids.end());

View File

@ -12,24 +12,25 @@ struct rapidjson_base {
Document doc{};
bool run(Document &root, std::vector<uint64_t> &ids) {
if (root.HasParseError() || !root.IsObject()) { return false; }
if (root.HasParseError()) { printf("parse error\n"); return false; }
if (!root.IsObject()) { printf("root is not an object\n"); return false; }
auto statuses = root.FindMember("statuses");
if (statuses == root.MemberEnd() || !statuses->value.IsArray()) { return false; }
if (statuses == root.MemberEnd() || !statuses->value.IsArray()) { printf("statuses is not an array\n"); return false; }
for (auto &tweet : statuses->value.GetArray()) {
if (!tweet.IsObject()) { return false; }
auto user = tweet.FindMember("user");
if (user == tweet.MemberEnd() || !user->value.IsObject()) { return false; }
if (user == tweet.MemberEnd() || !user->value.IsObject()) { printf("user is not an object\n"); return false; }
auto id = user->value.FindMember("id");
if (id == user->value.MemberEnd() || !id->value.IsUint64()) { return false; }
if (id == user->value.MemberEnd() || !id->value.IsUint64()) { printf("id is not an int\n"); return false; }
ids.push_back(id->value.GetUint64());
auto retweet = tweet.FindMember("retweeted_status");
if (retweet != tweet.MemberEnd()) {
if (!retweet->value.IsObject()) { return false; }
if (!retweet->value.IsObject()) { printf("retweet is not an object\n"); return false; }
user = retweet->value.FindMember("user");
if (user == retweet->value.MemberEnd() || !user->value.IsObject()) { return false; }
if (user == retweet->value.MemberEnd() || !user->value.IsObject()) { printf("rewtweet.user is not an object\n"); return false; }
id = user->value.FindMember("id");
if (id == user->value.MemberEnd() || !id->value.IsUint64()) { return false; }
if (id == user->value.MemberEnd() || !id->value.IsUint64()) { printf("retweet.id is not an int\n"); return false; }
ids.push_back(id->value.GetUint64());
}
}
@ -39,16 +40,15 @@ struct rapidjson_base {
};
struct rapidjson : public rapidjson_base {
bool run(const padded_string &json, std::vector<uint64_t> &ids) {
bool run(simdjson::padded_string &json, std::vector<uint64_t> &ids) {
return rapidjson_base::run(doc.Parse<kParseValidateEncodingFlag>(json.data()), ids);
}
};
BENCHMARK_TEMPLATE(distinct_user_id, rapidjson)->UseManualTime();
struct rapidjson_insitu : public rapidjson_base {
bool run(const padded_string &json, std::vector<uint64_t> &ids) {
padded_string json_copy{json.data(), json.size()};
return rapidjson_base::run(doc.ParseInsitu<kParseValidateEncodingFlag>(json_copy.data()), ids);
bool run(simdjson::padded_string &json, std::vector<uint64_t> &ids) {
return rapidjson_base::run(doc.ParseInsitu<kParseValidateEncodingFlag>(json.data()), ids);
}
};
BENCHMARK_TEMPLATE(distinct_user_id, rapidjson_insitu)->UseManualTime();

View File

@ -11,7 +11,7 @@ using namespace simdjson;
struct simdjson_dom {
dom::parser parser{};
bool run(const simdjson::padded_string &json, std::vector<uint64_t> &ids) {
bool run(simdjson::padded_string &json, std::vector<uint64_t> &ids) {
// Walk the document, parsing as we go
auto doc = parser.parse(json);
for (dom::object tweet : doc["statuses"]) {

View File

@ -12,7 +12,7 @@ using namespace simdjson::builtin;
struct simdjson_ondemand {
ondemand::parser parser{};
bool run(const simdjson::padded_string &json, std::vector<uint64_t> &ids) {
bool run(simdjson::padded_string &json, std::vector<uint64_t> &ids) {
// Walk the document, parsing as we go
auto doc = parser.iterate(json);
for (ondemand::object tweet : doc.find_field("statuses")) {

View File

@ -7,7 +7,7 @@
namespace distinct_user_id {
struct yyjson {
bool run(const simdjson::padded_string &json, std::vector<uint64_t> &ids) {
bool run(simdjson::padded_string &json, std::vector<uint64_t> &ids) {
// Walk the document, parsing the tweets as we go
yyjson_doc *doc = yyjson_read(json.data(), json.size(), 0);
if (!doc) { return false; }

View File

@ -14,6 +14,7 @@ struct runner : public json_benchmark::file_runner<I> {
}
/**
 * Per-run preparation: delegate to file_runner<I>::before_run first and,
 * only if that succeeds, reset the result text to the empty string.
 *
 * @return false when the file_runner step fails, true otherwise.
 */
bool before_run(benchmark::State &state) {
  const bool base_ok = json_benchmark::file_runner<I>::before_run(state);
  if (base_ok) {
    // Forget whatever text the previous run found.
    text = "";
  }
  return base_ok;
}

View File

@ -32,16 +32,15 @@ struct rapidjson_base {
};
struct rapidjson : public rapidjson_base {
bool run(const padded_string &json, uint64_t find_id, std::string_view &text) {
bool run(simdjson::padded_string &json, uint64_t find_id, std::string_view &text) {
return rapidjson_base::run(doc.Parse<kParseValidateEncodingFlag>(json.data()), find_id, text);
}
};
BENCHMARK_TEMPLATE(find_tweet, rapidjson)->UseManualTime();
struct rapidjson_insitu : public rapidjson_base {
bool run(const padded_string &json, uint64_t find_id, std::string_view &text) {
padded_string json_copy{json.data(), json.size()};
return rapidjson_base::run(doc.ParseInsitu<kParseValidateEncodingFlag>(json_copy.data()), find_id, text);
bool run(simdjson::padded_string &json, uint64_t find_id, std::string_view &text) {
return rapidjson_base::run(doc.ParseInsitu<kParseValidateEncodingFlag>(json.data()), find_id, text);
}
};
BENCHMARK_TEMPLATE(find_tweet, rapidjson_insitu)->UseManualTime();

View File

@ -11,7 +11,7 @@ using namespace simdjson;
struct simdjson_dom {
dom::parser parser{};
bool run(const simdjson::padded_string &json, uint64_t find_id, std::string_view &text) {
bool run(simdjson::padded_string &json, uint64_t find_id, std::string_view &text) {
text = "";
auto doc = parser.parse(json);
for (auto tweet : doc["statuses"]) {

View File

@ -12,7 +12,7 @@ using namespace simdjson::builtin;
struct simdjson_ondemand {
ondemand::parser parser{};
bool run(const simdjson::padded_string &json, uint64_t find_id, std::string_view &text) {
bool run(simdjson::padded_string &json, uint64_t find_id, std::string_view &text) {
// Walk the document, parsing as we go
auto doc = parser.iterate(json);
for (auto tweet : doc.find_field("statuses")) {

View File

@ -7,7 +7,7 @@
namespace find_tweet {
struct yyjson {
bool run(const simdjson::padded_string &json, uint64_t find_id, std::string_view &text) {
bool run(simdjson::padded_string &json, uint64_t find_id, std::string_view &text) {
// Walk the document, parsing the tweets as we go
yyjson_doc *doc = yyjson_read(json.data(), json.size(), 0);
if (!doc) { return false; }

View File

@ -7,16 +7,25 @@ namespace json_benchmark {
template<typename I>
struct file_runner : public runner_base<I> {
simdjson::padded_string original_json{};
simdjson::padded_string json{};
bool load_json(benchmark::State &state, const char *file) {
simdjson_warn_unused bool load_json(benchmark::State &state, const char *file) {
simdjson::error_code error;
if ((error = simdjson::padded_string::load(file).get(json))) {
if ((error = simdjson::padded_string::load(file).get(original_json))) {
std::stringstream err;
err << "error loading " << file << ": " << error;
state.SkipWithError(err.str().data());
return false;
}
json = simdjson::padded_string(original_json.data(), original_json.size());
return true;
}
/**
 * Per-run preparation for file-backed benchmarks.
 *
 * Chains to the base class's before_run hook, then overwrites the working
 * buffer `json` with the bytes of `original_json`.
 *
 * @return false if the base-class hook fails, true otherwise.
 */
simdjson_warn_unused bool before_run(benchmark::State &state) {
  // Chain to the matching base hook (before_run, not after_run) so the per-run
  // order stays before_run -> run -> after_run at every level of the hierarchy.
  // NOTE(review): assumes runner_base<I> declares before_run(benchmark::State&) - confirm.
  if (!runner_base<I>::before_run(state)) { return false; }
  // Copy the original json in case we did *in situ* last time
  std::memcpy(json.data(), original_json.data(), original_json.size());
  return true;
}

View File

@ -24,11 +24,15 @@ template<typename B, typename R> static void run_json_benchmark(benchmark::State
// Warmup and equality check (make sure the data is right!)
B bench;
if (!bench.setup(state)) { return; }
if (!bench.before_run(state)) { state.SkipWithError("warmup document before_run failed"); return; }
if (!bench.run(state)) { state.SkipWithError("warmup document reading failed"); return; }
if (!bench.after_run(state)) { state.SkipWithError("warmup document after_run failed"); return; }
{
  // Run the reference implementation R once through the same
  // setup -> before_run -> run -> after_run sequence, then compare its result
  // with the warmed-up benchmark. Every failure is reported and aborts the
  // benchmark immediately, matching the warmup checks.
  R reference;
  if (!reference.setup(state)) { return; }
  if (!reference.before_run(state)) { state.SkipWithError("reference before_run failed"); return; }
  if (!reference.run(state)) { state.SkipWithError("reference document reading failed"); return; }
  if (!reference.after_run(state)) { state.SkipWithError("reference after_run failed"); return; }
  if (!bench.diff(state, reference)) { return; }
}

View File

@ -6,9 +6,17 @@
namespace json_benchmark {
template<typename I>
struct const_json_runner : public runner_base<I> {
const simdjson::padded_string &json;
const_json_runner(const simdjson::padded_string &_json) : json{_json} {}
struct string_runner : public runner_base<I> {
const simdjson::padded_string &original_json;
simdjson::padded_string json;
string_runner(const simdjson::padded_string &_json) : original_json{_json}, json(original_json.data(), original_json.size()) {}
/**
 * Per-run preparation for string-backed benchmarks.
 *
 * Chains to the base class's before_run hook, then overwrites the working
 * buffer `json` with the bytes of `original_json`.
 *
 * @return false if the base-class hook fails, true otherwise.
 */
simdjson_warn_unused bool before_run(benchmark::State &state) {
  // Chain to the matching base hook (before_run, not after_run).
  // NOTE(review): assumes runner_base<I> declares before_run(benchmark::State&) - confirm.
  if (!runner_base<I>::before_run(state)) { return false; }
  // Copy the original json in case we did *in situ*
  std::memcpy(json.data(), original_json.data(), original_json.size());
  return true;
}
/** Get the total number of bytes processed in each iteration. Used for metrics like bytes/second. */
size_t bytes_per_iteration() {

View File

@ -2,7 +2,7 @@
#if SIMDJSON_EXCEPTIONS
#include "json_benchmark/const_json_runner.h"
#include "json_benchmark/string_runner.h"
#include <vector>
#include <random>
@ -27,12 +27,13 @@ simdjson_unused static std::ostream &operator<<(std::ostream &o, const point &p)
}
template<typename I>
struct runner : public json_benchmark::const_json_runner<I> {
struct runner : public json_benchmark::string_runner<I> {
std::vector<point> points;
runner() : json_benchmark::const_json_runner<I>(get_built_json_array()) {}
runner() : json_benchmark::string_runner<I>(get_built_json_array()) {}
/**
 * Per-run preparation: delegate to string_runner<I>::before_run first and,
 * only if that succeeds, drop the points collected by the previous run.
 *
 * @return false when the string_runner step fails, true otherwise.
 */
bool before_run(benchmark::State &state) {
  const bool base_ok = json_benchmark::string_runner<I>::before_run(state);
  if (base_ok) {
    // Start each run with an empty result vector.
    points.clear();
  }
  return base_ok;
}

View File

@ -34,23 +34,22 @@ struct rapidjson_base {
};
struct rapidjson : public rapidjson_base {
bool run(const padded_string &json, std::vector<point> &points) {
bool run(simdjson::padded_string &json, std::vector<point> &points) {
return rapidjson_base::run(doc.Parse<kParseValidateEncodingFlag>(json.data()), points);
}
};
BENCHMARK_TEMPLATE(kostya, rapidjson)->UseManualTime();
struct rapidjson_lossless : public rapidjson_base {
bool run(const padded_string &json, std::vector<point> &points) {
bool run(simdjson::padded_string &json, std::vector<point> &points) {
return rapidjson_base::run(doc.Parse<kParseValidateEncodingFlag | kParseFullPrecisionFlag>(json.data()), points);
}
};
BENCHMARK_TEMPLATE(kostya, rapidjson_lossless)->UseManualTime();
struct rapidjson_insitu : public rapidjson_base {
bool run(const padded_string &json, std::vector<point> &points) {
padded_string json_copy{json.data(), json.size()};
return rapidjson_base::run(doc.ParseInsitu<kParseValidateEncodingFlag>(json_copy.data()), points);
bool run(simdjson::padded_string &json, std::vector<point> &points) {
return rapidjson_base::run(doc.ParseInsitu<kParseValidateEncodingFlag>(json.data()), points);
}
};
BENCHMARK_TEMPLATE(kostya, rapidjson_insitu)->UseManualTime();

View File

@ -11,7 +11,7 @@ using namespace simdjson;
struct simdjson_dom {
dom::parser parser{};
bool run(const simdjson::padded_string &json, std::vector<point> &points) {
bool run(simdjson::padded_string &json, std::vector<point> &points) {
for (auto point : parser.parse(json)["coordinates"]) {
points.emplace_back(kostya::point{point["x"], point["y"], point["z"]});
}

View File

@ -12,7 +12,7 @@ using namespace simdjson::builtin;
struct simdjson_ondemand {
ondemand::parser parser{};
bool run(const simdjson::padded_string &json, std::vector<point> &points) {
bool run(simdjson::padded_string &json, std::vector<point> &points) {
auto doc = parser.iterate(json);
for (ondemand::object point : doc.find_field("coordinates")) {
points.emplace_back(kostya::point{point.find_field("x"), point.find_field("y"), point.find_field("z")});

View File

@ -24,7 +24,7 @@ struct yyjson {
}
}
bool run(const simdjson::padded_string &json, std::vector<point> &points) {
bool run(simdjson::padded_string &json, std::vector<point> &points) {
yyjson_doc *doc = yyjson_read(json.data(), json.size(), 0);
if (!doc) { return false; }
yyjson_val *root = yyjson_doc_get_root(doc);

View File

@ -1,6 +1,6 @@
#pragma once
#include "json_benchmark/const_json_runner.h"
#include "json_benchmark/string_runner.h"
#include <random>
namespace large_random {
@ -24,12 +24,13 @@ simdjson_unused static std::ostream &operator<<(std::ostream &o, const point &p)
}
template<typename I>
struct runner : public json_benchmark::const_json_runner<I> {
struct runner : public json_benchmark::string_runner<I> {
std::vector<point> points;
runner() : json_benchmark::const_json_runner<I>(get_built_json_array()) {}
runner() : json_benchmark::string_runner<I>(get_built_json_array()) {}
/**
 * Per-run preparation: delegate to string_runner<I>::before_run first and,
 * only if that succeeds, drop the points collected by the previous run.
 *
 * @return false when the string_runner step fails, true otherwise.
 */
bool before_run(benchmark::State &state) {
  const bool base_ok = json_benchmark::string_runner<I>::before_run(state);
  if (base_ok) {
    // Start each run with an empty result vector.
    points.clear();
  }
  return base_ok;
}

View File

@ -31,23 +31,22 @@ struct rapidjson_base {
};
struct rapidjson : public rapidjson_base {
bool run(const simdjson::padded_string &json, std::vector<point> &points) {
bool run(simdjson::padded_string &json, std::vector<point> &points) {
return rapidjson_base::run(doc.Parse<kParseValidateEncodingFlag>(json.data()), points);
}
};
BENCHMARK_TEMPLATE(large_random, rapidjson)->UseManualTime();
struct rapidjson_lossless : public rapidjson_base {
bool run(const simdjson::padded_string &json, std::vector<point> &points) {
bool run(simdjson::padded_string &json, std::vector<point> &points) {
return rapidjson_base::run(doc.Parse<kParseValidateEncodingFlag | kParseFullPrecisionFlag>(json.data()), points);
}
};
BENCHMARK_TEMPLATE(large_random, rapidjson_lossless)->UseManualTime();
struct rapidjson_insitu : public rapidjson_base {
bool run(const simdjson::padded_string &json, std::vector<point> &points) {
padded_string json_copy{json.data(), json.size()};
return rapidjson_base::run(doc.ParseInsitu<kParseValidateEncodingFlag>(json_copy.data()), points);
bool run(simdjson::padded_string &json, std::vector<point> &points) {
return rapidjson_base::run(doc.ParseInsitu<kParseValidateEncodingFlag>(json.data()), points);
}
};
BENCHMARK_TEMPLATE(large_random, rapidjson_insitu)->UseManualTime();

View File

@ -11,7 +11,7 @@ using namespace simdjson;
struct simdjson_dom {
dom::parser parser{};
bool run(const simdjson::padded_string &json, std::vector<point> &points) {
bool run(simdjson::padded_string &json, std::vector<point> &points) {
for (auto point : parser.parse(json)) {
points.emplace_back(large_random::point{point["x"], point["y"], point["z"]});
}

View File

@ -12,7 +12,7 @@ using namespace simdjson::builtin;
struct simdjson_ondemand {
ondemand::parser parser{};
bool run(const simdjson::padded_string &json, std::vector<point> &points) {
bool run(simdjson::padded_string &json, std::vector<point> &points) {
auto doc = parser.iterate(json);
for (ondemand::object coord : doc) {
points.emplace_back(point{coord.find_field("x"), coord.find_field("y"), coord.find_field("z")});

View File

@ -12,7 +12,7 @@ using namespace simdjson::builtin;
struct simdjson_ondemand_unordered {
ondemand::parser parser{};
bool run(const simdjson::padded_string &json, std::vector<point> &points) {
bool run(simdjson::padded_string &json, std::vector<point> &points) {
auto doc = parser.iterate(json);
for (ondemand::object coord : doc) {
points.emplace_back(large_random::point{coord["x"], coord["y"], coord["z"]});

View File

@ -24,7 +24,7 @@ struct yyjson {
}
}
bool run(const simdjson::padded_string &json, std::vector<point> &points) {
bool run(simdjson::padded_string &json, std::vector<point> &points) {
// Walk the document, parsing the tweets as we go
yyjson_doc *doc = yyjson_read(json.data(), json.size(), 0);
if (!doc) { return false; }

View File

@ -16,6 +16,7 @@ struct runner : public json_benchmark::file_runner<I> {
}
/**
 * Per-run preparation: delegate to file_runner<I>::before_run first and,
 * only if that succeeds, drop the tweets collected by the previous run.
 *
 * @return false when the file_runner step fails, true otherwise.
 */
bool before_run(benchmark::State &state) {
  const bool base_ok = json_benchmark::file_runner<I>::before_run(state);
  if (base_ok) {
    // Start each run with an empty result vector.
    tweets.clear();
  }
  return base_ok;
}

View File

@ -60,19 +60,18 @@ struct rapidjson_base {
};
struct rapidjson : public rapidjson_base {
bool run(const padded_string &json, std::vector<tweet> &tweets) {
bool run(simdjson::padded_string &json, std::vector<tweet> &tweets) {
return rapidjson_base::run(doc.Parse<kParseValidateEncodingFlag>(json.data()), tweets);
}
};
BENCHMARK_TEMPLATE(partial_tweets, rapidjson)->UseManualTime();
// TODO this fails!
// struct rapidjson_insitu : public rapidjson_base {
// bool run(simdjson::padded_string &json, std::vector<tweet> &tweets) {
// return rapidjson_base::run(doc.ParseInsitu<kParseValidateEncodingFlag>(json.data()), tweets);
// }
// };
// BENCHMARK_TEMPLATE(partial_tweets, rapidjson_insitu)->UseManualTime();
/**
 * RapidJSON variant that parses with ParseInsitu directly on the buffer it is
 * handed, then extracts tweets through the shared rapidjson_base::run.
 */
struct rapidjson_insitu : public rapidjson_base {
  /**
   * @param json   Buffer passed to ParseInsitu.
   *               NOTE(review): in situ parsing presumably rewrites these bytes,
   *               so the caller must restore them between runs - confirm.
   * @param tweets Output vector forwarded to rapidjson_base::run.
   * @return whatever rapidjson_base::run reports for the parsed document.
   */
  bool run(simdjson::padded_string &json, std::vector<tweet> &tweets) {
    auto &parsed = doc.ParseInsitu<kParseValidateEncodingFlag>(json.data());
    return rapidjson_base::run(parsed, tweets);
  }
};
BENCHMARK_TEMPLATE(partial_tweets, rapidjson_insitu)->UseManualTime();
} // namespace partial_tweets

View File

@ -16,7 +16,7 @@ struct simdjson_dom {
return element;
}
bool run(const padded_string &json, std::vector<tweet> &tweets) {
bool run(simdjson::padded_string &json, std::vector<tweet> &tweets) {
for (dom::element tweet : parser.parse(json)["statuses"]) {
auto user = tweet["user"];
tweets.emplace_back(partial_tweets::tweet{

View File

@ -21,7 +21,7 @@ struct simdjson_ondemand {
return { user.find_field("id"), user.find_field("screen_name") };
}
bool run(const padded_string &json, std::vector<tweet> &tweets) {
bool run(simdjson::padded_string &json, std::vector<tweet> &tweets) {
// Walk the document, parsing the tweets as we go
auto doc = parser.iterate(json);
for (ondemand::object tweet : doc.find_field("statuses")) {

View File

@ -30,7 +30,7 @@ struct yyjson {
return { get_uint64(user, "id"), get_string_view(user, "screen_name") };
}
bool run(const padded_string &json, std::vector<tweet> &tweets) {
bool run(simdjson::padded_string &json, std::vector<tweet> &tweets) {
// Walk the document, parsing the tweets as we go
yyjson_doc *doc = yyjson_read(json.data(), json.size(), 0);
if (!doc) { return false; }