[WIP] tweet reader SAX benchmark

This commit is contained in:
John Keiser 2020-08-08 15:19:56 -07:00
parent ce8d0f8135
commit 7e74d30f45
3 changed files with 500 additions and 7 deletions

View File

@ -1,5 +1,13 @@
include_directories( . linux )
link_libraries(simdjson simdjson-flags simdjson-windows-headers test-data)
link_libraries(simdjson-windows-headers test-data)
# bench_sax.cpp #includes simdjson.cpp directly, so this target links the
# include-source / internal-flags targets rather than the simdjson library.
# It is declared before the link_libraries(simdjson simdjson-flags) call so that
# directory-wide link does not apply to it.
if (TARGET benchmark::benchmark)
add_executable(bench_sax bench_sax.cpp)
target_link_libraries(bench_sax simdjson-internal-flags simdjson-include-source benchmark::benchmark)
endif (TARGET benchmark::benchmark)
link_libraries(simdjson simdjson-flags)
add_executable(benchfeatures benchfeatures.cpp)
add_executable(get_corpus_benchmark get_corpus_benchmark.cpp)
add_executable(perfdiff perfdiff.cpp)
@ -14,12 +22,6 @@ target_compile_definitions(parse_nonumberparsing PRIVATE SIMDJSON_SKIPNUMBERPARS
add_executable(parse_nostringparsing parse.cpp)
target_compile_definitions(parse_nostringparsing PRIVATE SIMDJSON_SKIPSTRINGPARSING)
if (TARGET benchmark::benchmark)
link_libraries(benchmark::benchmark)
add_executable(bench_parse_call bench_parse_call.cpp)
add_executable(bench_dom_api bench_dom_api.cpp)
endif()
if (TARGET competition-all)
add_executable(distinctuseridcompetition distinctuseridcompetition.cpp)
target_link_libraries(distinctuseridcompetition competition-core)
@ -34,4 +36,10 @@ if (TARGET competition-all)
target_compile_definitions(allparsingcompetition PRIVATE ALLPARSER)
endif()
# Google Benchmark based executables; only built when the benchmark::benchmark
# target is available. link_libraries() affects targets created after it, so it
# must precede the add_executable() calls it is meant to cover.
if (TARGET benchmark::benchmark)
link_libraries(benchmark::benchmark)
add_executable(bench_parse_call bench_parse_call.cpp)
add_executable(bench_dom_api bench_dom_api.cpp)
endif()
include(checkperf.cmake)

264
benchmark/bench_sax.cpp Normal file
View File

@ -0,0 +1,264 @@
#define SIMDJSON_IMPLEMENTATION_FALLBACK 0
#define SIMDJSON_IMPLEMENTATION_WESTMERE 0
#define SIMDJSON_IMPLEMENTATION_AMD64 0
#include "simdjson.h"
#include "simdjson.cpp"
using namespace simdjson;
using namespace haswell;
using namespace haswell::stage2;
SIMDJSON_TARGET_HASWELL
namespace twitter {
// True when the bytes at KEY begin with the quoted JSON key "MATCH" (opening AND closing quote
// included, so "id" does not match "id_str"). MATCH must be a string literal: it is pasted
// between two quote literals at compile time. KEY is parenthesized so that compound
// expressions (pointer arithmetic, ?:) are cast as a whole.
#define KEY_IS(KEY, MATCH) (!strncmp((const char *)(KEY), "\"" MATCH "\"", sizeof("\"" MATCH "\"") - 1))
// The subset of a tweet's "user" object that the benchmark extracts.
struct twitter_user {
  // Numeric user id; stays 0 when the field is absent or null.
  uint64_t id = 0;
  // View of the parsed screen name; empty until one is read.
  std::string_view screen_name;
};
// The subset of a tweet that the benchmark extracts. Numeric fields default to 0 and
// string fields to an empty view.
struct tweet {
  uint64_t id = 0;
  std::string_view text;
  std::string_view created_at;
  uint64_t in_reply_to_status_id = 0;
  uint64_t retweet_count = 0;
  uint64_t favorite_count = 0;
  twitter_user user;
};
// Extracts tweets from a JSON document by running stage 1 and then walking the
// document with a visitor, without building a DOM.
struct sax_tweet_reader {
// Tweets found by the most recent read_tweets() call (cleared at the start of each call).
std::vector<tweet> tweets;
// Buffer that parsed string values are written into; the string_views in `tweets` point into it.
std::unique_ptr<uint8_t[]> string_buf;
// Document size (in bytes) passed to the last successful set_capacity(); starts at 0.
size_t capacity;
// Parser implementation used for stage 1 and iterated over by the visitor.
dom_parser_implementation dom_parser;
sax_tweet_reader();
// Allocates string_buf and the dom_parser for documents of up to new_capacity bytes.
error_code set_capacity(size_t new_capacity);
// Parses json and fills `tweets`; returns the first error encountered, or SUCCESS.
error_code read_tweets(padded_string &json);
}; // struct sax_tweet_reader
} // namespace twitter
namespace twitter {
// Visitor handed to json_iterator::walk_document(); tracks where we are in the
// document and writes the tweet fields it recognizes into `tweets`.
struct sax_tweet_reader_visitor {
// True between seeing the top-level "statuses" key and the end of its array.
bool in_statuses{false};
// True between seeing a tweet's "user" key and the end of the user object.
bool in_user{false};
// Output vector (owned by the caller); a new entry is appended for every tweet object.
std::vector<tweet> &tweets;
// Next free byte in the string buffer; advanced after every parsed string.
uint8_t *current_string_buf_loc;
// When non-null, the next primitive must be an unsigned integer (or null) and is stored here.
uint64_t *expect_int{};
// When non-null, the next primitive must be a string and a view of it is stored here.
std::string_view *expect_string{};
sax_tweet_reader_visitor(std::vector<tweet> &_tweets, uint8_t *string_buf);
simdjson_really_inline error_code visit_document_start(json_iterator &iter);
simdjson_really_inline error_code visit_object_start(json_iterator &iter);
simdjson_really_inline error_code visit_key(json_iterator &iter, const uint8_t *key);
simdjson_really_inline error_code visit_primitive(json_iterator &iter, const uint8_t *value);
simdjson_really_inline error_code visit_array_start(json_iterator &iter);
simdjson_really_inline error_code visit_array_end(json_iterator &iter);
simdjson_really_inline error_code visit_object_end(json_iterator &iter);
simdjson_really_inline error_code visit_document_end(json_iterator &iter);
simdjson_really_inline error_code visit_empty_array(json_iterator &iter);
simdjson_really_inline error_code visit_empty_object(json_iterator &iter);
simdjson_really_inline error_code visit_root_primitive(json_iterator &iter, const uint8_t *value);
simdjson_really_inline error_code increment_count(json_iterator &iter);
}; // sax_tweet_reader_visitor
// Creates an empty reader: no tweets, no string buffer, capacity 0. Nothing is allocated here.
sax_tweet_reader::sax_tweet_reader() : tweets{}, string_buf{}, capacity{0}, dom_parser() {}
// Sizes the string buffer and the dom_parser for documents of up to new_capacity bytes.
//
// Returns MEMALLOC if the string buffer cannot be allocated, or whatever error the
// dom_parser reports. On failure `string_buf` and `capacity` keep their previous values,
// so `capacity` never claims more room than `string_buf` actually has.
error_code sax_tweet_reader::set_capacity(size_t new_capacity) {
  // string_capacity copied from document::allocate
  size_t string_capacity = SIMDJSON_ROUNDUP_N(5 * new_capacity / 3 + 32, 64);
  // Allocate into a temporary first: new (std::nothrow) returns nullptr on failure, and we
  // must not lose the old buffer (or keep a stale capacity) in that case.
  std::unique_ptr<uint8_t[]> new_string_buf(new (std::nothrow) uint8_t[string_capacity]);
  if (!new_string_buf) { return MEMALLOC; }
  if (auto error = dom_parser.set_capacity(new_capacity)) { return error; }
  if (capacity == 0) { // set max depth the first time only
    if (auto error = dom_parser.set_max_depth(DEFAULT_MAX_DEPTH)) { return error; }
  }
  // Everything succeeded: commit the new buffer and capacity together.
  string_buf.swap(new_string_buf);
  capacity = new_capacity;
  return SUCCESS;
}
// Parses `json` and fills `tweets` with the tweets found under the top-level "statuses" key.
//
// Buffers are (re)allocated automatically when the document is larger than the current
// capacity. Returns the first error from allocation, stage 1, or the document walk;
// SUCCESS otherwise. The string_views stored in `tweets` point into `string_buf`, so they
// are invalidated by the next call.
error_code sax_tweet_reader::read_tweets(padded_string &json) {
  tweets.clear();

  // Allocate capacity if needed. Grow to the size of THIS document, not to the old capacity.
  if (capacity < json.size()) {
    if (auto error = set_capacity(json.size())) { return error; }
  }

  // Run stage 1 first.
  if (auto error = dom_parser.stage1((uint8_t *)json.data(), json.size(), false)) { return error; }

  // Then walk the document, parsing the tweets as we go
  json_iterator iter(dom_parser, 0);
  sax_tweet_reader_visitor visitor(tweets, string_buf.get());
  if (auto error = iter.walk_document<false>(visitor)) { return error; }
  return SUCCESS;
}
// Binds the visitor to the output vector and starts writing strings at the beginning of string_buf.
// The remaining state (in_statuses, in_user, expect_int, expect_string) uses its in-class defaults.
sax_tweet_reader_visitor::sax_tweet_reader_visitor(std::vector<tweet> &_tweets, uint8_t *string_buf)
: tweets{_tweets},
current_string_buf_loc{string_buf} {
}
// Called at the start of the document walk; only logs and always succeeds.
simdjson_really_inline error_code sax_tweet_reader_visitor::visit_document_start(json_iterator &iter) {
iter.log_start_value("document");
return SUCCESS;
}
// Start of a non-empty array. An array is only an error when the preceding key asked
// for an int or a string (a slot is still pending).
simdjson_really_inline error_code sax_tweet_reader_visitor::visit_array_start(json_iterator &iter) {
  // iter.log_start_value("array");
  const bool scalar_pending = (expect_int != nullptr) || (expect_string != nullptr);
  if (!scalar_pending) { return SUCCESS; }

  iter.log_error("expected int/string");
  return TAPE_ERROR;
}
// Start of a non-empty object. Rejects an object where an int/string was expected, and
// appends a fresh tweet when the object is an element of the "statuses" array.
simdjson_really_inline error_code sax_tweet_reader_visitor::visit_object_start(json_iterator &iter) {
  // iter.log_start_value("object");
  if (expect_int != nullptr || expect_string != nullptr) {
    iter.log_error("expected int/string");
    return TAPE_ERROR;
  }

  // { "statuses": [ {
  const bool starting_tweet = in_statuses && iter.depth == 3;
  if (starting_tweet) {
    iter.log_start_value("tweet");
    tweets.emplace_back();
  }
  return SUCCESS;
}
// Called for every object key. Outside "statuses" it only looks for the top-level
// "statuses" key; inside, it arms expect_int / expect_string so that the next primitive
// is stored into the matching field of the current tweet (or its user).
// Always returns SUCCESS; unrecognized keys are ignored.
simdjson_really_inline error_code sax_tweet_reader_visitor::visit_key(json_iterator &iter, const uint8_t *key) {
  // iter.log_value("key");
  if (!in_statuses) {
    if (iter.depth == 1 && KEY_IS(key, "statuses")) {
      iter.log_start_value("statuses");
      in_statuses = true;
    }
    return SUCCESS;
  }

  if (iter.depth == 3) {
    // in tweet: { "statuses": [ { <key>
    // NOTE: KEY_IS compares the surrounding quotes too, so the caller doesn't have to check " for us at all
    if (KEY_IS(key, "user")) {
      iter.log_start_value("user");
      in_user = true;
    } else if (KEY_IS(key, "id")) {
      iter.log_value("id");
      expect_int = &tweets.back().id;
    } else if (KEY_IS(key, "in_reply_to_status_id")) {
      iter.log_value("in_reply_to_status_id");
      expect_int = &tweets.back().in_reply_to_status_id;
    } else if (KEY_IS(key, "retweet_count")) {
      iter.log_value("retweet_count");
      expect_int = &tweets.back().retweet_count;
    } else if (KEY_IS(key, "favorite_count")) {
      iter.log_value("favorite_count");
      expect_int = &tweets.back().favorite_count;
    } else if (KEY_IS(key, "text")) {
      iter.log_value("text");
      expect_string = &tweets.back().text;
    } else if (KEY_IS(key, "created_at")) {
      iter.log_value("created_at");
      expect_string = &tweets.back().created_at;
    }
  } else if (iter.depth == 4 && in_user) {
    // in user: { "statuses": [ { "user": { <key>
    if (KEY_IS(key, "id")) {
      iter.log_value("id");
      expect_int = &tweets.back().user.id;
    } else if (KEY_IS(key, "screen_name")) {
      iter.log_value("screen_name");
      expect_string = &tweets.back().user.screen_name;
    }
  }
  return SUCCESS;
}
// Called for every primitive value. If the preceding key armed expect_int or
// expect_string, the value is parsed into that slot and the slot is disarmed; any other
// primitive is ignored.
// Returns the number-parsing error for a non-null, non-integer value in an int slot, or
// STRING_ERROR for a non-string / malformed string in a string slot.
simdjson_really_inline error_code sax_tweet_reader_visitor::visit_primitive(json_iterator &iter, const uint8_t *value) {
  // iter.log_value("primitive");
  if (expect_int) {
    iter.log_value("int");
    auto error = numberparsing::parse_unsigned(value).get(*expect_int);
    // A failed number parse is tolerated only when the value is null.
    if (error && !atomparsing::is_valid_null_atom(value)) {
      iter.log_error("expected number or null");
      return error;
    }
    expect_int = nullptr;
    return SUCCESS;
  }

  if (expect_string) {
    iter.log_value("string");
    // Must be a string!
    if (value[0] != '"') {
      iter.log_error("expected string");
      return STRING_ERROR;
    }
    uint8_t *const string_start = current_string_buf_loc;
    auto string_end = stringparsing::parse_string(value, string_start);
    if (!string_end) {
      iter.log_error("error parsing string");
      return STRING_ERROR;
    }
    *expect_string = std::string_view((const char *)string_start, string_end - string_start);
    // The next string is written right after this one.
    current_string_buf_loc = string_end;
    expect_string = nullptr;
  }
  return SUCCESS;
}
// End of an array. Closing the array at depth 2 while in "statuses" means the
// { "statuses": [ ... ] list is finished.
simdjson_really_inline error_code sax_tweet_reader_visitor::visit_array_end(json_iterator &iter) {
  // iter.log_end_value("array");
  if (!in_statuses || iter.depth != 2) { return SUCCESS; }

  iter.log_end_value("statuses");
  in_statuses = false;
  return SUCCESS;
}
// End of an object. Depth 4 closes a tweet's user object (when one is open); depth 3
// closes the tweet itself.
simdjson_really_inline error_code sax_tweet_reader_visitor::visit_object_end(json_iterator &iter) {
  // iter.log_end_value("object");
  switch (iter.depth) {
    case 4: // { "statuses": [ { "user": { ... }
      if (in_user) {
        iter.log_end_value("user");
        in_user = false;
      }
      break;
    case 3: // { "statuses": [ { ... }
      if (in_statuses) { iter.log_end_value("tweet"); }
      break;
    default:
      break;
  }
  return SUCCESS;
}
// Called at the end of the document walk; only logs and always succeeds.
simdjson_really_inline error_code sax_tweet_reader_visitor::visit_document_end(json_iterator &iter) {
iter.log_end_value("document");
return SUCCESS;
}
// An empty array ([]). Only an error when an int or string was expected at this position.
simdjson_really_inline error_code sax_tweet_reader_visitor::visit_empty_array(json_iterator &iter) {
  // iter.log_value("empty array");
  if (expect_int == nullptr && expect_string == nullptr) { return SUCCESS; }

  iter.log_error("expected int/string");
  return TAPE_ERROR;
}
// An empty object ({}). Only an error when an int or string was expected at this position.
simdjson_really_inline error_code sax_tweet_reader_visitor::visit_empty_object(json_iterator &iter) {
  // iter.log_value("empty object");
  if (expect_int == nullptr && expect_string == nullptr) { return SUCCESS; }

  iter.log_error("expected int/string");
  return TAPE_ERROR;
}
// A document whose root is a bare primitive cannot contain tweets: always logs and returns TAPE_ERROR.
// The value pointer is intentionally unnamed because it is not inspected.
simdjson_really_inline error_code sax_tweet_reader_visitor::visit_root_primitive(json_iterator &iter, const uint8_t *) {
// iter.log_value("root primitive");
iter.log_error("unexpected root primitive");
return TAPE_ERROR;
}
// This visitor keeps no element counts, so this hook does nothing and always succeeds.
simdjson_really_inline error_code sax_tweet_reader_visitor::increment_count(json_iterator &) { return SUCCESS; }
} // namespace twitter
SIMDJSON_UNTARGET_REGION
SIMDJSON_PUSH_DISABLE_ALL_WARNINGS
#include <benchmark/benchmark.h>
SIMDJSON_POP_DISABLE_WARNINGS
using namespace benchmark;
using namespace std;
// Path of the benchmark input. SIMDJSON_BENCHMARK_DATA_DIR is not defined in this file
// (presumably supplied by the build -- confirm); the file name is appended by literal
// concatenation, so the macro must already end with a path separator.
const char *TWITTER_JSON = SIMDJSON_BENCHMARK_DATA_DIR "twitter.json";
// Benchmark: repeatedly extract all tweets from twitter.json with the SAX reader.
//
// Reports bytes/s ("Gigabytes"), documents/s ("docs") and tweets/s ("tweets").
// Any failure (loading the file, allocating, parsing) is reported through
// State::SkipWithError so the framework records it instead of the process aborting on
// an uncaught exception or the benchmark returning without ever running its loop.
static void sax_tweets(State& state) {
  // Load twitter.json to a buffer
  padded_string json;
  if (auto error = padded_string::load(TWITTER_JSON).get(json)) {
    state.SkipWithError(error_message(error));
    return;
  }

  // Allocate the reader up front so allocation is not part of the timed loop
  twitter::sax_tweet_reader reader;
  if (auto error = reader.set_capacity(json.size())) {
    state.SkipWithError(error_message(error));
    return;
  }

  // Timed loop
  size_t bytes = 0;
  size_t tweets = 0;
  for (SIMDJSON_UNUSED auto _ : state) {
    if (auto error = reader.read_tweets(json)) {
      // After SkipWithError the loop must be left explicitly.
      state.SkipWithError(error_message(error));
      return;
    }
    bytes += json.size();
    tweets += reader.tweets.size();
  }

  // Gigabyte: https://en.wikipedia.org/wiki/Gigabyte
  state.counters["Gigabytes"] = benchmark::Counter(
    double(bytes), benchmark::Counter::kIsRate,
    benchmark::Counter::OneK::kIs1000); // For GiB : kIs1024
  state.counters["docs"] = Counter(double(state.iterations()), benchmark::Counter::kIsRate);
  state.counters["tweets"] = Counter(double(tweets), benchmark::Counter::kIsRate);
}
// Register sax_tweets: run 10 repetitions, add a custom "max" statistic (the largest
// value across repetitions) and print only the aggregate rows.
BENCHMARK(sax_tweets)->Repetitions(10)->ComputeStatistics("max", [](const std::vector<double>& v) -> double {
return *(std::max_element(std::begin(v), std::end(v)));
})->DisplayAggregatesOnly(true);
BENCHMARK_MAIN();

View File

@ -501,6 +501,227 @@ simdjson_really_inline bool parse_number(const uint8_t *const src, W &writer) {
return is_structural_or_whitespace(*p);
}
// Parse any number from 0 to 18,446,744,073,709,551,615
//
// Returns NUMBER_ERROR when there are no digits, when a multi-digit number starts with 0,
// when the digits are not followed by a structural or whitespace character, or when the
// value does not fit in a uint64_t.
simdjson_really_inline simdjson_result<uint64_t> parse_unsigned(const uint8_t * const src) noexcept {
  //
  // Accumulate the digits.
  //
  // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
  uint64_t value = 0;
  const uint8_t *cursor = src;
  while (parse_digit(*cursor, value)) { cursor++; }
  const int num_digits = int(cursor - src);

  // No digits at all, or a leading zero followed by more digits.
  const bool missing_digits = (num_digits == 0);
  const bool leading_zero = (num_digits > 1) && (*src == '0');
  if (missing_digits || leading_zero) { return NUMBER_ERROR; }

  // The number must end at a structural character or whitespace.
  if (!is_structural_or_whitespace(*cursor)) { return NUMBER_ERROR; }

  // The longest uint64_t is 20 digits, so only 20+ digit numbers need an overflow check.
  if (num_digits >= 20) {
    // - More than 20 digits never fits.
    // - A 20 digit number starting with 2-9 never fits: the biggest uint64_t is
    //   18,446,744,073,709,551,615.
    // - A 20 digit number starting with 1 that wrapped around is smaller than
    //   1,553,255,926,290,448,384, while one that did not wrap is at least
    //   10,000,000,000,000,000,000. INT64_MAX (9,223,372,036,854,775,807) lies between
    //   the two, so "value <= INT64_MAX" identifies the wrapped case.
    const bool too_long = (num_digits > 20);
    if (too_long || src[0] != uint8_t('1') || value <= uint64_t(INT64_MAX)) { return NUMBER_ERROR; }
  }
  return value;
}
// Parse any number from 0 to 18,446,744,073,709,551,615
// Call this version of the method if you regularly expect 8- or 16-digit numbers.
// simdjson_really_inline simdjson_result<uint64_t> parse_large_unsigned(const uint8_t * const src) noexcept {
// const uint8_t *p = src;
// //
// // Parse the integer part.
// //
// const uint8_t *const start_digits = p;
// uint64_t i = 0;
// if (is_made_of_eight_digits_fast(p)) {
// i = i * 100000000 + parse_eight_digits_unrolled(p);
// p += 8;
// if (is_made_of_eight_digits_fast(p)) {
// i = i * 100000000 + parse_eight_digits_unrolled(p);
// p += 8;
// if (parse_digit(*p, i)) { // digit 17
// p++;
// if (parse_digit(*p, i)) { // digit 18
// p++;
// if (parse_digit(*p, i)) { // digit 19
// p++;
// if (parse_digit(*p, i)) { // digit 20
// p++;
// if (parse_digit(*p, i)) { return NUMBER_ERROR; } // 21 digits is an error
// // Positive overflow check:
// // - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the
// // biggest uint64_t.
// // - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX.
// // If we got here, it's a 20 digit number starting with the digit "1".
// // - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller
// // than 1,553,255,926,290,448,384.
// // - That is smaller than the smallest possible 20-digit number the user could write:
// // 10,000,000,000,000,000,000.
// // - Therefore, if the number is positive and lower than that, it's overflow.
// // - The value we are looking at is less than or equal to 9,223,372,036,854,775,808 (INT64_MAX).
// //
// if (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return NUMBER_ERROR; }
// }
// }
// }
// }
// } // 16 digits
// } else { // 8 digits
// // Less than 8 digits can't overflow, simpler logic here.
// if (parse_digit(*p, i)) { p++; } else { return NUMBER_ERROR; }
// while (parse_digit(*p, i)) { p++; }
// }
// if (!is_structural_or_whitespace(*p, i)) { return NUMBER_ERROR; }
// // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
// int digit_count = int(p - src);
// if (digit_count == 0 || ('0' == *src && digit_count > 1)) { return NUMBER_ERROR; }
// return i;
// }
// Parse any number from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807
//
// Returns NUMBER_ERROR when there are no digits, when a multi-digit number starts with 0,
// when the digits are not followed by a structural or whitespace character, or when the
// value does not fit in an int64_t.
simdjson_really_inline simdjson_result<int64_t> parse_integer(const uint8_t *src) noexcept {
  //
  // Check for minus sign
  //
  bool negative = (*src == '-');
  const uint8_t *p = src + negative;

  //
  // Parse the integer part.
  //
  // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
  const uint8_t *const start_digits = p;
  uint64_t i = 0;
  while (parse_digit(*p, i)) { p++; }

  // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
  int digit_count = int(p - start_digits);
  if (digit_count == 0 || ('0' == *start_digits && digit_count > 1)) { return NUMBER_ERROR; }
  if (!is_structural_or_whitespace(*p)) { return NUMBER_ERROR; }

  // Both INT64_MIN (-9,223,372,036,854,775,808) and INT64_MAX (9,223,372,036,854,775,807)
  // have 19 digits, so anything longer is out of range regardless of sign.
  if (digit_count > 19) { return NUMBER_ERROR; }

  // With at most 19 digits the accumulator cannot have wrapped (9,999,999,999,999,999,999
  // is below 2^64), so i holds the exact magnitude and can be range-checked directly:
  // positives may go up to INT64_MAX, negatives one further (INT64_MAX + 1 == -INT64_MIN).
  if (i > uint64_t(INT64_MAX) + uint64_t(negative)) { return NUMBER_ERROR; }

  // Two's complement negation in unsigned arithmetic, so -INT64_MIN needs no special case.
  return int64_t(negative ? (~i + 1) : i);
}
// simdjson_really_inline simdjson_result<double> parse_double(const uint8_t * src) noexcept {
// //
// // Check for minus sign
// //
// bool negative = (*src == '-');
// src += negative;
// //
// // Parse the integer part.
// //
// uint64_t i = 0;
// const uint8_t *p = src;
// p += parse_digit(*p, i);
// bool leading_zero = (i == 0);
// while (parse_digit(*p, i)) { p++; }
// // no integer digits, or 0123 (zero must be solo)
// if ( p == src || (leading_zero && p != src+1)) { return NUMBER_ERROR; }
// //
// // Parse the decimal part.
// //
// int64_t exponent = 0;
// bool overflow;
// if (likely(*p == '.')) {
// p++;
// const uint8_t *start_decimal_digits = p;
// if (!parse_digit(*p, i)) { return NUMBER_ERROR; } // no decimal digits
// p++;
// while (parse_digit(*p, i)) { p++; }
// exponent = -(p - start_decimal_digits);
// // Overflow check. 19 digits (minus the decimal) may be overflow.
// overflow = p-src-1 >= 19;
// if (SIMDJSON_unlikely(overflow && leading_zero)) {
// // Skip leading 0.00000 and see if it still overflows
// const uint8_t *start_digits = src + 2;
// while (*start_digits == '0') { start_digits++; }
// overflow = start_digits-src >= 19;
// }
// } else {
// overflow = p-src >= 19;
// }
// //
// // Parse the exponent
// //
// if (*p == 'e' || *p == 'E') {
// p++;
// bool exp_neg = *p == '-';
// p += exp_neg || *p == '+';
// uint64_t exp = 0;
// const uint8_t *start_exp_digits = p;
// while (parse_digit(*p, exp)) { p++; }
// // no exp digits, or 20+ exp digits
// if (p-start_exp_digits == 0 || p-start_exp_digits > 19) { return NUMBER_ERROR; }
// exponent += exp_neg ? 0-exp : exp;
// overflow = overflow || exponent < FASTFLOAT_SMALLEST_POWER || exponent > FASTFLOAT_LARGEST_POWER;
// }
// //
// // Assemble (or slow-parse) the float
// //
// if (likely(!overflow)) {
// bool success = false;
// double d = compute_float_64(exponent, i, negative, &success);
// if (success) { return d; }
// }
// double d;
// if (!parse_float_strtod(src-negative, &d)) {
// return NUMBER_ERROR;
// }
// return d;
// }
#endif // SIMDJSON_SKIPNUMBERPARSING
} // namespace numberparsing