2018-12-23 01:13:42 +08:00
|
|
|
#include <iostream>
|
2020-05-19 22:31:27 +08:00
|
|
|
#include <set>
|
2018-12-23 01:13:42 +08:00
|
|
|
|
2020-03-03 06:23:19 +08:00
|
|
|
#include "simdjson.h"
|
2020-06-24 04:44:25 +08:00
|
|
|
|
|
|
|
SIMDJSON_PUSH_DISABLE_ALL_WARNINGS
|
2020-05-30 03:39:23 +08:00
|
|
|
#ifndef __cpp_exceptions
|
|
|
|
#define CXXOPTS_NO_EXCEPTIONS
|
|
|
|
#endif
|
|
|
|
#include "cxxopts.hpp"
|
2020-06-24 04:44:25 +08:00
|
|
|
SIMDJSON_POP_DISABLE_WARNINGS
|
2018-12-23 01:13:42 +08:00
|
|
|
|
2019-07-31 05:18:10 +08:00
|
|
|
/**
 * Counts the bytes in [input, input + length) whose most significant bit is set,
 * i.e. the bytes with a value of 128 or more.
 *
 * @param input  first byte of the buffer to scan
 * @param length number of bytes to scan
 * @return how many of the scanned bytes are >= 0x80
 */
size_t count_nonasciibytes(const uint8_t *input, size_t length) {
  size_t high_bit_total = 0;
  const uint8_t *const stop = input + length;
  for (const uint8_t *cursor = input; cursor != stop; ++cursor) {
    // Shifting right by 7 leaves exactly the top bit: 1 for non-ASCII, 0 otherwise.
    high_bit_total += static_cast<size_t>(*cursor >> 7);
  }
  return high_bit_total;
}
|
2018-12-23 01:13:42 +08:00
|
|
|
|
2019-07-31 05:18:10 +08:00
|
|
|
/**
 * Counts the backslash characters ('\\') in [input, input + length).
 *
 * @param input  first byte of the buffer to scan
 * @param length number of bytes to scan
 * @return number of bytes equal to '\\'
 */
size_t count_backslash(const uint8_t *input, size_t length) {
  size_t total = 0;
  // The scan order does not matter for a count, so walk from the end down to 0.
  for (size_t pos = length; pos-- > 0;) {
    if (input[pos] == '\\') {
      ++total;
    }
  }
  return total;
}
|
|
|
|
|
|
|
|
/**
 * Statistics gathered about one JSON document.
 *
 * Every member carries a default initializer so that a plain `stat_t s;` starts
 * from all-zero counters (and valid == false) instead of indeterminate values.
 * The type is still an aggregate, so `stat_t s{}` keeps working.
 */
struct stat_s {
  // Number tallies.
  size_t integer_count{0};            // values readable as int64_t
  size_t integer32_count{0};          // int64_t values that fit in int32_t
  size_t unsigned_integer32_count{0}; // int64_t values that fit in uint32_t
  size_t unsigned_integer_count{0};   // values readable as uint64_t
  size_t float_count{0};              // values read as double

  // String tallies (object keys are counted as strings as well).
  size_t string_count{0};
  size_t string_byte_count{0};

  // Raw-input tallies computed over the input bytes.
  size_t backslash_count{0};
  size_t non_ascii_byte_count{0};

  // Container and literal tallies.
  size_t object_count{0};
  size_t array_count{0};
  size_t null_count{0};
  size_t true_count{0};
  size_t false_count{0};

  size_t byte_count{0};               // size of the input in bytes
  size_t structural_indexes_count{0}; // taken from the parser after parsing

  // Key tallies and extrema.
  size_t key_count{0};
  size_t key_maximum_length{0};
  size_t maximum_depth{0};
  size_t ascii_key_count{0};
  size_t ascii_string_count{0};
  size_t maximum_object_size{0};
  size_t maximum_array_size{0};
  size_t string_maximum_length{0};
  size_t repeated_key_byte_count{0};  // bytes of keys that had been seen before

  bool valid{false};                  // true once the document parsed successfully

  // Non-owning views of the keys encountered. They borrow storage from whatever
  // produced the keys, so they are only safe to dereference while that storage lives.
  std::set<std::string_view> all_keys{};
  std::set<std::string_view> repeated_keys{};
};

using stat_t = stat_s;
|
2018-12-23 01:13:42 +08:00
|
|
|
|
2020-03-26 06:01:23 +08:00
|
|
|
/**
 * Tells whether every byte of `v` is a 7-bit ASCII value (below 128).
 *
 * @param v text to inspect; an empty view counts as ASCII
 * @return false as soon as a byte with its top bit set is found, true otherwise
 */
bool is_ascii(const std::string_view &v) {
  for (const char ch : v) {
    // Go through unsigned char so the test does not depend on the signedness of char.
    const unsigned char byte = static_cast<unsigned char>(ch);
    if ((byte & 0x80u) != 0) {
      return false;
    }
  }
  return true;
}
|
|
|
|
|
2020-03-29 02:43:41 +08:00
|
|
|
/**
 * Visits `element` and, recursively, everything nested inside it, updating the
 * counters and maxima in `s`.
 *
 * @param element DOM node to inspect
 * @param s       statistics accumulator, updated in place
 * @param depth   nesting depth of `element`; children are visited with depth + 1
 */
void recurse(simdjson::dom::element element, stat_t &s, size_t depth) {
  if (depth > s.maximum_depth) {
    s.maximum_depth = depth;
  }
  simdjson::error_code error;
  if (element.is<simdjson::dom::array>()) {
    s.array_count++;
    simdjson::dom::array array;
    if (not (error = element.get(array))) {
      size_t counter = 0;
      for (auto child : array) {
        counter++;
        recurse(child, s, depth + 1);
      }
      if (counter > s.maximum_array_size) {
        s.maximum_array_size = counter;
      }
    }
  } else if (element.is<simdjson::dom::object>()) {
    s.object_count++;
    simdjson::dom::object object;
    if (not (error = element.get(object))) {
      size_t counter = 0;
      for (auto [key, value] : object) {
        counter++;
        // insert() reports whether the key was new, so one lookup serves both
        // the "first time seen" and the "repeated" case.
        const bool first_occurrence = s.all_keys.insert(key).second;
        if (not first_occurrence) {
          s.repeated_keys.insert(key);
          s.repeated_key_byte_count += key.size();
        }
        if (is_ascii(key)) {
          s.ascii_key_count++;
          s.ascii_string_count++;
        }
        if (key.size() > s.key_maximum_length) {
          s.key_maximum_length = key.size();
        }
        if (key.size() > s.string_maximum_length) {
          s.string_maximum_length = key.size();
        }
        // Keys are tallied as strings too.
        s.string_count++;
        s.string_byte_count += key.size();
        s.key_count++;
        recurse(value, s, depth + 1);
      }
      if (counter > s.maximum_object_size) {
        s.maximum_object_size = counter;
      }
    }
  } else {
    // Because an int can sometimes be represented as a double, we need to check
    // whether the value is an integer first, before ever asking for a double.
    const bool is_signed_integer = element.is<int64_t>();
    if (is_signed_integer) {
      s.integer_count++;
      int64_t v;
      error = element.get(v);
      SIMDJSON_ASSUME(!error);
      if ((v >= std::numeric_limits<int32_t>::min()) and (v <= std::numeric_limits<int32_t>::max())) {
        s.integer32_count++;
      }
      if ((v >= std::numeric_limits<uint32_t>::min()) and (v <= std::numeric_limits<uint32_t>::max())) {
        s.unsigned_integer32_count++;
      }
    }
    // No `else` before this test on purpose: a non-negative integer is counted
    // both as a signed and as an unsigned integer.
    if (element.is<uint64_t>()) {
      s.unsigned_integer_count++;
    } else if (is_signed_integer) {
      // Negative integer: already tallied above. It must stop here, otherwise
      // the double test below would accept it and count it as a float as well.
    } else if (element.is<double>()) {
      s.float_count++;
    } else if (element.is<bool>()) {
      bool v;
      error = element.get(v);
      SIMDJSON_ASSUME(!error);
      if (v) {
        s.true_count++;
      } else {
        s.false_count++;
      }
    } else if (element.is_null()) {
      s.null_count++;
    } else if (element.is<std::string_view>()) {
      s.string_count++;
      std::string_view v;
      error = element.get(v);
      SIMDJSON_ASSUME(!error);
      if (is_ascii(v)) {
        s.ascii_string_count++;
      }
      s.string_byte_count += v.size();
      if (v.size() > s.string_maximum_length) {
        s.string_maximum_length = v.size();
      }
    } else {
      SIMDJSON_UNREACHABLE();
    }
  }
}
|
|
|
|
|
2019-07-31 05:18:10 +08:00
|
|
|
/**
 * Parses `p` as JSON and collects statistics about it.
 *
 * On a parse error the error is written to std::cerr and the returned record has
 * valid == false with every counter left at zero.
 *
 * NOTE(review): the key sets in the result hold string_views handed out while
 * walking the parsed document, and the parser is a local of this function.
 * Callers should presumably only use the sizes of those sets, not their
 * contents - confirm against simdjson's lifetime rules.
 *
 * @param p padded JSON text
 * @return the statistics record for `p`
 */
stat_t simdjson_compute_stats(const simdjson::padded_string &p) {
  stat_t stats{};
  simdjson::dom::parser parser;
  simdjson::dom::element root;
  const auto parse_error = parser.parse(p).get(root);
  if (parse_error) {
    stats.valid = false;
    std::cerr << parse_error << std::endl;
    return stats;
  }
  stats.valid = true;

  // Byte-level figures are computed straight from the raw input.
  const uint8_t *const raw_bytes = reinterpret_cast<const uint8_t *>(p.data());
  const size_t raw_size = p.size();
  stats.backslash_count = count_backslash(raw_bytes, raw_size);
  stats.non_ascii_byte_count = count_nonasciibytes(raw_bytes, raw_size);
  stats.byte_count = raw_size;
  stats.structural_indexes_count = parser.implementation->n_structural_indexes;

  // Everything else comes from walking the DOM, starting at depth 0.
  recurse(root, stats, 0);
  return stats;
}
|
|
|
|
|
|
|
|
int main(int argc, char *argv[]) {
|
2020-05-30 03:39:23 +08:00
|
|
|
#ifdef __cpp_exceptions
|
|
|
|
try {
|
|
|
|
#endif
|
|
|
|
std::string progName = "jsonstat";
|
|
|
|
std::string progUsage = "Reads json, prints stats.\n";
|
|
|
|
progUsage += argv[0];
|
|
|
|
progUsage += " <jsonfile>";
|
|
|
|
|
|
|
|
cxxopts::Options options(progName, progUsage);
|
|
|
|
|
|
|
|
options.add_options()
|
|
|
|
("h,help", "Print usage.")
|
|
|
|
("f,file", "File name.", cxxopts::value<std::string>())
|
|
|
|
;
|
|
|
|
|
|
|
|
options.parse_positional({"file"});
|
|
|
|
auto result = options.parse(argc, argv);
|
|
|
|
|
|
|
|
if(result.count("help")) {
|
|
|
|
std::cerr << options.help() << std::endl;
|
|
|
|
return EXIT_SUCCESS;
|
2018-12-23 01:13:42 +08:00
|
|
|
}
|
2020-05-30 03:39:23 +08:00
|
|
|
|
|
|
|
if(!result.count("file")) {
|
|
|
|
std::cerr << "No filename specified." << std::endl;
|
|
|
|
std::cerr << options.help() << std::endl;
|
|
|
|
return EXIT_FAILURE;
|
2018-12-23 01:13:42 +08:00
|
|
|
}
|
2020-03-26 06:01:23 +08:00
|
|
|
|
2020-05-30 03:39:23 +08:00
|
|
|
const char *filename = result["file"].as<std::string>().c_str();
|
|
|
|
|
2020-06-22 06:26:44 +08:00
|
|
|
simdjson::padded_string p;
|
|
|
|
auto error = simdjson::padded_string::load(filename).get(p);
|
2020-03-07 10:14:34 +08:00
|
|
|
if (error) {
|
2018-12-23 01:13:42 +08:00
|
|
|
std::cerr << "Could not load the file " << filename << std::endl;
|
|
|
|
return EXIT_FAILURE;
|
|
|
|
}
|
2019-07-31 05:18:10 +08:00
|
|
|
stat_t s = simdjson_compute_stats(p);
|
|
|
|
if (!s.valid) {
|
2018-12-23 01:13:42 +08:00
|
|
|
std::cerr << "not a valid JSON" << std::endl;
|
|
|
|
return EXIT_FAILURE;
|
|
|
|
}
|
2020-03-26 06:01:23 +08:00
|
|
|
// Future work: the proper way to do the what follows would be to create
|
|
|
|
// a JSON object and then to serialize it.
|
2018-12-23 01:13:42 +08:00
|
|
|
|
2020-03-26 06:01:23 +08:00
|
|
|
printf(R"({
|
2020-05-30 03:39:23 +08:00
|
|
|
"integer_count" = %10zu,
|
|
|
|
"integer32_count" = %10zu,
|
|
|
|
"unsigned_integer32_count" = %10zu,
|
|
|
|
"unsigned_integer_count" = %10zu,
|
|
|
|
"float_count" = %10zu,
|
|
|
|
"string_count" = %10zu,
|
|
|
|
"string_byte_count" = %10zu,
|
|
|
|
"ascii_string_count" = %10zu,
|
|
|
|
"string_maximum_length" = %10zu,
|
|
|
|
"backslash_count" = %10zu,
|
|
|
|
"non_ascii_byte_count" = %10zu,
|
|
|
|
"object_count" = %10zu,
|
|
|
|
"maximum_object_size" = %10zu,
|
|
|
|
"array_count" = %10zu,
|
|
|
|
"maximum_array_size" = %10zu,
|
|
|
|
"null_count" = %10zu,
|
|
|
|
"true_count" = %10zu,
|
|
|
|
"false_count" = %10zu,
|
|
|
|
"byte_count" = %10zu,
|
|
|
|
"structural_indexes_count" = %10zu,
|
|
|
|
"key_count" = %10zu,
|
|
|
|
"ascii_key_count" = %10zu,
|
|
|
|
"key_maximum_length" = %10zu,
|
|
|
|
"key_distinct_count" = %10zu,
|
2020-05-19 22:31:27 +08:00
|
|
|
"repeated_key_distinct_count"= %10zu,
|
2020-05-30 03:39:23 +08:00
|
|
|
"repeated_key_byte_count" = %10zu;
|
|
|
|
"maximum_depth" = %10zu
|
2018-12-23 01:13:42 +08:00
|
|
|
}
|
2020-03-26 06:01:23 +08:00
|
|
|
)",
|
2020-05-13 21:31:32 +08:00
|
|
|
s.integer_count,s.integer32_count,s.unsigned_integer32_count,s.unsigned_integer_count,
|
2020-05-19 22:31:27 +08:00
|
|
|
s.float_count, s.string_count, s.string_byte_count, s.ascii_string_count,
|
2020-03-26 06:01:23 +08:00
|
|
|
s.string_maximum_length, s.backslash_count, s.non_ascii_byte_count,
|
|
|
|
s.object_count, s.maximum_object_size, s.array_count,
|
|
|
|
s.maximum_array_size, s.null_count, s.true_count, s.false_count,
|
|
|
|
s.byte_count, s.structural_indexes_count, s.key_count,
|
2020-05-19 22:31:27 +08:00
|
|
|
s.ascii_key_count, s.key_maximum_length, s.all_keys.size(), s.repeated_keys.size(),
|
|
|
|
s.repeated_key_byte_count, s.maximum_depth);
|
2020-03-26 06:01:23 +08:00
|
|
|
return EXIT_SUCCESS;
|
2020-05-30 03:39:23 +08:00
|
|
|
#ifdef __cpp_exceptions
|
|
|
|
} catch (const cxxopts::OptionException& e) {
|
|
|
|
std::cout << "error parsing options: " << e.what() << std::endl;
|
|
|
|
return EXIT_FAILURE;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
}
|