From b0e6bfa84c2b53513de81aa8cf28e6c0392a0df2 Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Wed, 12 Jun 2019 16:29:24 -0400 Subject: [PATCH] Simpler iteration code (#190) * Adding convenience method to simplify code. * Simplifying the iteration code. --- README.md | 45 +++++--------------- benchmark/distinctuseridcompetition.cpp | 56 +++++++------------------ include/simdjson/parsedjson.h | 29 ++++++++++--- 3 files changed, 47 insertions(+), 83 deletions(-) diff --git a/README.md b/README.md index b640b9bb..5ae63ace 100644 --- a/README.md +++ b/README.md @@ -378,46 +378,21 @@ void compute_dump(ParsedJson::iterator &pjh) { The following function will find all user.id integers: ```C -void simdjson_traverse(std::vector &answer, ParsedJson::iterator &i) { - switch (i.get_type()) { - case '{': - if (i.down()) { - do { - bool founduser = equals(i.get_string(), "user"); - i.next(); // move to value - if (i.is_object()) { - if (founduser && i.move_to_key("id")) { +void simdjson_scan(std::vector &answer, ParsedJson::iterator &i) { + while(i.move_forward()) { + if(i.get_scope_type() == '{') { + bool founduser = (i.get_string_length() == 4) && (memcmp(i.get_string(), "user", 4) == 0); + i.move_to_value(); + if(founduser) { + if(i.is_object() && i.move_to_key("id",2)) { if (i.is_integer()) { answer.push_back(i.get_integer()); } i.up(); } - simdjson_traverse(answer, i); - } else if (i.is_array()) { - simdjson_traverse(answer, i); - } - } while (i.next()); - i.up(); - } - break; - case '[': - if (i.down()) { - do { - if (i.is_object_or_array()) { - simdjson_traverse(answer, i); - } - } while (i.next()); - i.up(); - } - break; - case 'l': - case 'd': - case 'n': - case 't': - case 'f': - default: - break; - } + } + } + } } ``` diff --git a/benchmark/distinctuseridcompetition.cpp b/benchmark/distinctuseridcompetition.cpp index a99f11d1..d62217a7 100644 --- a/benchmark/distinctuseridcompetition.cpp +++ b/benchmark/distinctuseridcompetition.cpp @@ -30,54 +30,28 @@ void 
print_vec(const std::vector &v) { std::cout << std::endl; } -void simdjson_traverse(std::vector &answer, ParsedJson::iterator &i) { - switch (i.get_type()) { - case '{': - if (i.down()) { - do { - bool founduser = (i.get_string_length() == 4) && (memcmp(i.get_string(), "user", 4) == 0); - i.move_to_value(); // move to value - if (i.is_object()) { - if (founduser && i.move_to_key("id")) { +void simdjson_scan(std::vector &answer, ParsedJson::iterator &i) { + while(i.move_forward()) { + if(i.get_scope_type() == '{') { + bool founduser = (i.get_string_length() == 4) && (memcmp(i.get_string(), "user", 4) == 0); + i.move_to_value(); + if(founduser) { + if(i.is_object() && i.move_to_key("id",2)) { if (i.is_integer()) { answer.push_back(i.get_integer()); - } + } i.up(); - } - simdjson_traverse(answer, i); - } else if (i.is_array()) { - simdjson_traverse(answer, i); - } - } while (i.next()); - i.up(); - } - break; - case '[': - if (i.down()) { - do { - if (i.is_object_or_array()) { - simdjson_traverse(answer, i); - } - } while (i.next()); - i.up(); - } - break; - case 'l': - case 'd': - case 'n': - case 't': - case 'f': - default: - break; - } + } + } + } + } } __attribute__ ((noinline)) std::vector simdjson_justdom(ParsedJson &pj) { std::vector answer; ParsedJson::iterator i(pj); - - simdjson_traverse(answer, i); + simdjson_scan(answer,i); remove_duplicates(answer); return answer; } @@ -90,8 +64,7 @@ std::vector simdjson_computestats(const padded_string &p) { return answer; } ParsedJson::iterator i(pj); - - simdjson_traverse(answer, i); + simdjson_scan(answer,i); remove_duplicates(answer); return answer; } @@ -338,7 +311,6 @@ int main(int argc, char *argv[]) { } BEST_TIME("simdjson ", simdjson_computestats(p).size(), size, , repeat, volume, !justdata); - BEST_TIME("rapid ", rapid_computestats(p).size(), size, , repeat, volume, !justdata); BEST_TIME("sasjon ", sasjon_computestats(p).size(), size, , repeat, volume, diff --git a/include/simdjson/parsedjson.h 
b/include/simdjson/parsedjson.h index e74be117..1d04ec9a 100644 --- a/include/simdjson/parsedjson.h +++ b/include/simdjson/parsedjson.h @@ -207,11 +207,17 @@ public: // when at {, go one level deep, looking for a given key // if successful, we are left pointing at the value, // if not, we are still pointing at the object ({) - // (in case of repeated keys, this only finds the first one) + // (in case of repeated keys, this only finds the first one). // We seek the key using C's strcmp so if your JSON strings contain // NULL chars, this would trigger a false positive: if you expect that // to be the case, take extra precautions. inline bool move_to_key(const char * key); + // when at {, go one level deep, looking for a given key + // if successful, we are left pointing at the value, + // if not, we are still pointing at the object ({) + // (in case of repeated keys, this only finds the first one). + // The string we search for can contain NULL values. + inline bool move_to_key(const char * key, uint32_t length); // when at a key location within an object, this moves to the accompanying value (located next to it). // this is equivalent but much faster than calling "next()". @@ -355,10 +361,6 @@ bool ParsedJson::iterator::move_forward() { } else if ((current_type == ']') || (current_type == '}')) { // Leaving a scope. depth--; - if(depth == 0) { - // Should not be necessary - return false; - } } else if ((current_type == 'd') || (current_type == 'l')) { // d and l types use 2 locations on the tape, not just one. 
location += 1; @@ -393,6 +395,21 @@ bool ParsedJson::iterator::move_to_key(const char * key) { return false; } +bool ParsedJson::iterator::move_to_key(const char * key, uint32_t length) { + if(down()) { + do { + assert(is_string()); + bool rightkey = ((get_string_length() == length) && (memcmp(get_string(),key,length)==0)); + move_to_value(); + if(rightkey) { + return true; + } + } while(next()); + up(); // not found: step back to the enclosing object; kept outside assert() so it still runs when NDEBUG is defined + } + return false; +} + bool ParsedJson::iterator::prev() { if(location - 1 < depthindex[depth].start_of_scope) { @@ -456,7 +473,7 @@ void ParsedJson::iterator::to_start_scope() { } bool ParsedJson::iterator::next() { - size_t npos; // next position + size_t npos; if ((current_type == '[') || (current_type == '{')){ // we need to jump npos = ( current_val & JSONVALUEMASK);