Trying to migrate distinctuseridcompetition to new API. (#624)

* Trying to migrate distinctuseridcompetition to new API.

* Ok. Good performance + got rid of old API.
This commit is contained in:
Daniel Lemire 2020-03-26 12:06:28 -04:00 committed by GitHub
parent 06587824be
commit ab0e22a316
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed files with 69 additions and 20 deletions

View File

@ -30,28 +30,74 @@ void print_vec(const std::vector<int64_t> &v) {
std::cout << std::endl;
}
void simdjson_scan(std::vector<int64_t> &answer, simdjson::ParsedJson::Iterator i) {
while (i.move_forward()) {
if (i.get_scope_type() == '{') {
bool found_user = (i.get_string_length() == 4) &&
(memcmp(i.get_string(), "user", 4) == 0);
i.move_to_value();
if (found_user) {
if (i.is_object() && i.move_to_key("id", 2)) {
if (i.is_integer()) {
answer.push_back(i.get_integer());
// clang-format off
// simdjson_recurse below come be implemented like so but it is slow:
/*void simdjson_recurse(std::vector<int64_t> & v, simdjson::document::element element) {
if (element.is_array()) {
auto [array, array_error] = element.as_array();
for (auto child : array) {
if (child.is_array() || child.is_object()) {
simdjson_recurse(v, child);
}
}
} else if (element.is_object()) {
auto [object, error] = element.as_object();
int64_t id;
object["user"]["id"].as_int64_t().tie(id,error);
if(!error) {
v.push_back(id);
}
for (auto [key, value] : object) {
if (value.is_array() || value.is_object()) {
simdjson_recurse(v, value);
}
}
}
}*/
// clang-format on
void simdjson_recurse(std::vector<int64_t> & v, simdjson::document::element element) {
if (element.is_array()) {
auto array = element.as_array();
for (auto child : array) {
if (child.is_array() || child.is_object()) {
simdjson_recurse(v, child);
}
}
} else if (element.is_object()) {
auto object = element.as_object();
for (auto [key, value] : object) {
if((key.size() == 4) && (memcmp(key.data(), "user", 4) == 0)) {
// we are in an object under the key "user"
if(value.is_object()) {
auto child_object = value.as_object();
for (auto [child_key, child_value] : child_object) {
if((child_key.size() == 2) && (memcmp(child_key.data(), "id", 2) == 0)) {
if(child_value.is_integer()) {
v.push_back(child_value.as_int64_t());
}
}
if (child_value.is_array() || child_value.is_object()) {
simdjson_recurse(v, child_value);
}
}
i.up();
} else if (value.is_array()) {
simdjson_recurse(v, value);
}
// end of: we are in an object under the key "user"
} else if (value.is_array() || value.is_object()) {
simdjson_recurse(v, value);
}
}
}
}
__attribute__((noinline)) std::vector<int64_t>
simdjson_just_dom(simdjson::ParsedJson &pj) {
simdjson_just_dom(simdjson::document &doc) {
std::vector<int64_t> answer;
simdjson_scan(answer, pj);
simdjson_recurse(answer, doc.root());
remove_duplicates(answer);
return answer;
}
@ -59,15 +105,16 @@ simdjson_just_dom(simdjson::ParsedJson &pj) {
__attribute__((noinline)) std::vector<int64_t>
simdjson_compute_stats(const simdjson::padded_string &p) {
std::vector<int64_t> answer;
ParsedJson pj = simdjson::build_parsed_json(p);
simdjson_scan(answer, pj);
simdjson::document::parser parser;
simdjson::document &doc = parser.parse(p);
simdjson_recurse(answer, doc.root());
remove_duplicates(answer);
return answer;
}
__attribute__((noinline)) bool
simdjson_just_parse(const simdjson::padded_string &p) {
return simdjson::document::parse(p).error() != simdjson::SUCCESS;
return simdjson::document::parse(p).error() == simdjson::SUCCESS;
}
void sajson_traverse(std::vector<int64_t> &answer, const sajson::value &node) {
@ -307,7 +354,7 @@ int main(int argc, char *argv[]) {
printf(
"name cycles_per_byte cycles_per_byte_err gb_per_s gb_per_s_err \n");
}
BEST_TIME("simdjson ", simdjson_compute_stats(p).size(), size, , repeat,
BEST_TIME("simdjson ", simdjson_compute_stats(p).size(), size, , repeat,
volume, !just_data);
BEST_TIME("rapid ", rapid_compute_stats(p).size(), size, , repeat, volume,
!just_data);
@ -319,10 +366,12 @@ int main(int argc, char *argv[]) {
volume, !just_data);
BEST_TIME("sasjon (just parse) ", sasjon_just_parse(p), false, , repeat,
volume, !just_data);
ParsedJson dsimdjson = build_parsed_json(p);
BEST_TIME("simdjson (just dom) ", simdjson_just_dom(dsimdjson).size(), size,
simdjson::document::parser parser;
simdjson::document &doc = parser.parse(p);
BEST_TIME("simdjson (just dom)", simdjson_just_dom(doc).size(), size,
, repeat, volume, !just_data);
char *buffer = (char *)malloc(p.size());
char *buffer = (char *)malloc(p.size() + 1);
buffer[p.size()] = '\0';
memcpy(buffer, p.data(), p.size());
rapidjson::Document drapid;
drapid.ParseInsitu<kParseValidateEncodingFlag>(buffer);