Add document::element::is_float() and rewrite tools/jsonstats on top of it.

* include/simdjson/document.h, include/simdjson/inline/document.h:
  declare and define element::is_float(), true only for DOUBLE tape entries.
* tools/jsonstats.cpp: replace the raw tape scan with a recursive walk over
  document::element (recurse), and collect additional statistics: key count,
  maximum key/string length, maximum depth, ASCII key/string counts and
  maximum object/array size. The report is printed as a brace-delimited
  "name" = value list instead of two whitespace-separated lines.

diff --git a/include/simdjson/document.h b/include/simdjson/document.h
index f54ffd2a..e6ce79c2 100644
--- a/include/simdjson/document.h
+++ b/include/simdjson/document.h
@@ -613,6 +613,8 @@ public:
   really_inline bool is_number() const noexcept;
   /** Whether this is a JSON integer (e.g. 1 or -1, but *not* 1.0 or 1e2) */
   really_inline bool is_integer() const noexcept;
+  /** Whether this is a JSON number but not an integer */
+  really_inline bool is_float() const noexcept;
   /** Whether this is a JSON string (e.g. "abc") */
   really_inline bool is_string() const noexcept;
   /** Whether this is a JSON array (e.g. []) */
diff --git a/include/simdjson/inline/document.h b/include/simdjson/inline/document.h
index d5b45424..ff8fc309 100644
--- a/include/simdjson/inline/document.h
+++ b/include/simdjson/inline/document.h
@@ -914,6 +914,9 @@ really_inline bool document::element::is_bool() const noexcept {
 really_inline bool document::element::is_number() const noexcept {
   return type() == internal::tape_type::UINT64 || type() == internal::tape_type::INT64 || type() == internal::tape_type::DOUBLE;
 }
+really_inline bool document::element::is_float() const noexcept {
+  return type() == internal::tape_type::DOUBLE;
+}
 really_inline bool document::element::is_integer() const noexcept {
   return type() == internal::tape_type::UINT64 || type() == internal::tape_type::INT64;
 }
diff --git a/tools/jsonstats.cpp b/tools/jsonstats.cpp
index 2a122958..86c88b27 100644
--- a/tools/jsonstats.cpp
+++ b/tools/jsonstats.cpp
@@ -31,83 +31,118 @@ struct stat_s {
   size_t false_count;
   size_t byte_count;
   size_t structural_indexes_count;
+  size_t key_count;
+  size_t key_maximum_length;
+  size_t maximum_depth;
+  size_t ascii_key_count;
+  size_t ascii_string_count;
+  size_t maximum_object_size;
+  size_t maximum_array_size;
+  size_t string_maximum_length;
   bool valid;
 };
 
 using stat_t = struct stat_s;
 
-stat_t simdjson_compute_stats(const simdjson::padded_string &p) {
-  stat_t answer;
-  simdjson::ParsedJson pj = simdjson::build_parsed_json(p);
-  answer.valid = pj.is_valid();
-  if (!answer.valid) {
-    std::cerr << pj.get_error_message() << std::endl;
-    return answer;
-  }
-  answer.backslash_count =
-      count_backslash(reinterpret_cast<const uint8_t *>(p.data()), p.size());
-  answer.non_ascii_byte_count = count_nonasciibytes(
-      reinterpret_cast<const uint8_t *>(p.data()), p.size());
-  answer.byte_count = p.size();
-  answer.integer_count = 0;
-  answer.float_count = 0;
-  answer.object_count = 0;
-  answer.array_count = 0;
-  answer.null_count = 0;
-  answer.true_count = 0;
-  answer.false_count = 0;
-  answer.string_count = 0;
-  answer.structural_indexes_count = pj.n_structural_indexes;
-  size_t tape_idx = 0;
-  uint64_t tape_val = pj.doc.tape[tape_idx++];
-  uint8_t type = (tape_val >> 56);
-  size_t how_many = 0;
-  assert(type == 'r');
-  how_many = tape_val & simdjson::internal::JSON_VALUE_MASK;
-  for (; tape_idx < how_many; tape_idx++) {
-    tape_val = pj.doc.tape[tape_idx];
-    // uint64_t payload = tape_val & simdjson::internal::JSON_VALUE_MASK;
-    type = (tape_val >> 56);
-    switch (type) {
-    case 'l': // we have a long int
-      answer.integer_count++;
-      tape_idx++; // skipping the integer
-      break;
-    case 'u': // we have a long uint
-      answer.integer_count++;
-      tape_idx++; // skipping the integer
-      break;
-    case 'd': // we have a double
-      answer.float_count++;
-      tape_idx++; // skipping the double
-      break;
-    case 'n': // we have a null
-      answer.null_count++;
-      break;
-    case 't': // we have a true
-      answer.true_count++;
-      break;
-    case 'f': // we have a false
-      answer.false_count++;
-      break;
-    case '{': // we have an object
-      answer.object_count++;
-      break;
-    case '}': // we end an object
-      break;
-    case '[': // we start an array
-      answer.array_count++;
-      break;
-    case ']': // we end an array
-      break;
-    case '"': // we have a string
-      answer.string_count++;
-      break;
-    default:
-      break; // ignore
+bool is_ascii(const std::string_view &v) {
+  for (size_t i = 0; i < v.size(); i++) {
+    if (static_cast<unsigned char>(v[i]) >= 128) {
+      return false;
     }
   }
-  return answer;
+  return true;
+}
+
+void recurse(simdjson::document::element element, stat_t &s, size_t depth) {
+  if (depth > s.maximum_depth) {
+    s.maximum_depth = depth;
+  }
+  if (element.is_array()) {
+    s.array_count++;
+    auto [array, array_error] = element.as_array();
+    if (!array_error) {
+      size_t counter = 0;
+      for (auto child : array) {
+        counter++;
+        recurse(child, s, depth + 1);
+      }
+      if (counter > s.maximum_array_size) {
+        s.maximum_array_size = counter;
+      }
+    }
+  } else if (element.is_object()) {
+    s.object_count++;
+    auto [object, object_error] = element.as_object();
+    if (!object_error) {
+      size_t counter = 0;
+      for (auto [key, value] : object) {
+        counter++;
+        if (is_ascii(key)) {
+          s.ascii_key_count++;
+          s.ascii_string_count++;
+        }
+        if (key.size() > s.key_maximum_length) {
+          s.key_maximum_length = key.size();
+        }
+        if (key.size() > s.string_maximum_length) {
+          s.string_maximum_length = key.size();
+        }
+        s.string_count++;
+        s.key_count++;
+        recurse(value, s, depth + 1);
+      }
+      if (counter > s.maximum_object_size) {
+        s.maximum_object_size = counter;
+      }
+    }
+  } else {
+    if (element.is_float()) {
+      s.float_count++;
+    } else if (element.is_integer()) {
+      s.integer_count++;
+    } else if (element.is_bool()) {
+      if (element.as_bool()) {
+        s.true_count++;
+      } else {
+        s.false_count++;
+      }
+    } else if (element.is_null()) {
+      s.null_count++;
+    } else if (element.is_string()) {
+      s.string_count++;
+      if (is_ascii(element.as_string())) {
+        s.ascii_string_count++;
+      }
+      const std::string_view strval = element.as_string();
+      if (strval.size() > s.string_maximum_length) {
+        s.string_maximum_length = strval.size();
+      }
+    } else {
+      throw std::runtime_error("unrecognized node.");
+    }
+  }
+}
+
+stat_t simdjson_compute_stats(const simdjson::padded_string &p) {
+  stat_t s{};
+  simdjson::document::parser parser;
+  auto [doc, error] = parser.parse(p);
+  if (error) {
+    s.valid = false;
+    std::cerr << error << std::endl;
+    return s;
+  }
+  s.valid = true;
+  s.backslash_count =
+      count_backslash(reinterpret_cast<const uint8_t *>(p.data()), p.size());
+  s.non_ascii_byte_count = count_nonasciibytes(
+      reinterpret_cast<const uint8_t *>(p.data()), p.size());
+  s.byte_count = p.size();
+  s.structural_indexes_count = parser.n_structural_indexes;
+
+  // simdjson::document::iterator iter(doc);
+  recurse(doc.root(), s, 0);
+  return s;
 }
 
 int main(int argc, char *argv[]) {
@@ -122,6 +157,7 @@ int main(int argc, char *argv[]) {
     std::cerr << "warning: ignoring everything after " << argv[myoptind + 1]
               << std::endl;
   }
+
   auto [p, error] = simdjson::padded_string::load(filename);
   if (error) {
     std::cerr << "Could not load the file " << filename << std::endl;
@@ -132,13 +168,37 @@ int main(int argc, char *argv[]) {
     std::cerr << "not a valid JSON" << std::endl;
     return EXIT_FAILURE;
   }
+  // Future work: the proper way to do what follows would be to create
+  // a JSON object and then to serialize it.
 
-  printf("# integer_count float_count string_count backslash_count "
-         "non_ascii_byte_count object_count array_count null_count true_count "
-         "false_count byte_count structural_indexes_count\n");
-  printf("%zu %zu %zu %zu %zu %zu %zu %zu %zu %zu %zu %zu\n", s.integer_count,
-         s.float_count, s.string_count, s.backslash_count,
-         s.non_ascii_byte_count, s.object_count, s.array_count, s.null_count,
-         s.true_count, s.false_count, s.byte_count, s.structural_indexes_count);
-  return EXIT_SUCCESS;
+  printf(R"({
+      "integer_count" = %10zu,
+      "float_count" = %10zu,
+      "string_count" = %10zu,
+      "ascii_string_count" = %10zu,
+      "string_maximum_length" = %10zu,
+      "backslash_count" = %10zu,
+      "non_ascii_byte_count" = %10zu,
+      "object_count" = %10zu,
+      "maximum_object_size" = %10zu,
+      "array_count" = %10zu,
+      "maximum_array_size" = %10zu,
+      "null_count" = %10zu,
+      "true_count" = %10zu,
+      "false_count" = %10zu,
+      "byte_count" = %10zu,
+      "structural_indexes_count" = %10zu,
+      "key_count" = %10zu,
+      "ascii_key_count" = %10zu,
+      "key_maximum_length" = %10zu,
+      "maximum_depth" = %10zu
 }
+)",
+         s.integer_count, s.float_count, s.string_count, s.ascii_string_count,
+         s.string_maximum_length, s.backslash_count, s.non_ascii_byte_count,
+         s.object_count, s.maximum_object_size, s.array_count,
+         s.maximum_array_size, s.null_count, s.true_count, s.false_count,
+         s.byte_count, s.structural_indexes_count, s.key_count,
+         s.ascii_key_count, s.key_maximum_length, s.maximum_depth);
+  return EXIT_SUCCESS;
+}
\ No newline at end of file