parent
7cde65aa6e
commit
8769e42a56
|
@ -613,6 +613,8 @@ public:
|
||||||
really_inline bool is_number() const noexcept;
|
really_inline bool is_number() const noexcept;
|
||||||
/** Whether this is a JSON integer (e.g. 1 or -1, but *not* 1.0 or 1e2) */
|
/** Whether this is a JSON integer (e.g. 1 or -1, but *not* 1.0 or 1e2) */
|
||||||
really_inline bool is_integer() const noexcept;
|
really_inline bool is_integer() const noexcept;
|
||||||
|
/** Whether this is a JSON number but not an integer */
|
||||||
|
really_inline bool is_float() const noexcept;
|
||||||
/** Whether this is a JSON string (e.g. "abc") */
|
/** Whether this is a JSON string (e.g. "abc") */
|
||||||
really_inline bool is_string() const noexcept;
|
really_inline bool is_string() const noexcept;
|
||||||
/** Whether this is a JSON array (e.g. []) */
|
/** Whether this is a JSON array (e.g. []) */
|
||||||
|
|
|
@ -914,6 +914,9 @@ really_inline bool document::element::is_bool() const noexcept {
|
||||||
// True when the element's tape type is any of the numeric kinds:
// UINT64, INT64 or DOUBLE.
really_inline bool document::element::is_number() const noexcept {
  const auto kind = type();
  return kind == internal::tape_type::UINT64 ||
         kind == internal::tape_type::INT64 ||
         kind == internal::tape_type::DOUBLE;
}
|
||||||
|
// True only when the element's tape type is DOUBLE, i.e. a number that is
// not stored as one of the integer kinds.
really_inline bool document::element::is_float() const noexcept {
  const auto kind = type();
  return kind == internal::tape_type::DOUBLE;
}
|
||||||
// True when the element's tape type is one of the integer kinds
// (UINT64 or INT64); DOUBLE is excluded.
really_inline bool document::element::is_integer() const noexcept {
  const auto kind = type();
  return kind == internal::tape_type::UINT64 ||
         kind == internal::tape_type::INT64;
}
|
||||||
|
|
|
@ -31,83 +31,118 @@ struct stat_s {
|
||||||
size_t false_count;
|
size_t false_count;
|
||||||
size_t byte_count;
|
size_t byte_count;
|
||||||
size_t structural_indexes_count;
|
size_t structural_indexes_count;
|
||||||
|
size_t key_count;
|
||||||
|
size_t key_maximum_length;
|
||||||
|
size_t maximum_depth;
|
||||||
|
size_t ascii_key_count;
|
||||||
|
size_t ascii_string_count;
|
||||||
|
size_t maximum_object_size;
|
||||||
|
size_t maximum_array_size;
|
||||||
|
size_t string_maximum_length;
|
||||||
bool valid;
|
bool valid;
|
||||||
};
|
};
|
||||||
|
|
||||||
using stat_t = struct stat_s;
|
using stat_t = struct stat_s;
|
||||||
|
|
||||||
/**
 * Reports whether a string consists solely of 7-bit ASCII bytes.
 *
 * Each byte is inspected as an unsigned value so that bytes with the high bit
 * set are detected regardless of whether plain `char` is signed.
 *
 * @param v the bytes to inspect; may be empty.
 * @return false as soon as a byte with value >= 128 is found, true otherwise
 *         (including for an empty view).
 */
bool is_ascii(std::string_view v) {
  for (const char c : v) {
    if (static_cast<unsigned char>(c) >= 128) {
      return false;
    }
  }
  return true;
}
|
||||||
|
|
||||||
|
/**
 * Visits `element` and, recursively, everything nested inside it, updating
 * the counters and maxima in `s`.
 *
 * - Arrays and objects bump their own count, recurse into each child at
 *   `depth + 1`, and update the corresponding maximum size.
 * - Every object key is counted both as a key and as a string, and takes
 *   part in the ASCII counters and in both maximum-length statistics.
 * - Scalars bump exactly one of the float/integer/true/false/null/string
 *   counters; string values also feed the ASCII and maximum-length stats.
 *
 * @param element the node to inspect.
 * @param s       accumulator that is updated in place.
 * @param depth   nesting level of `element`; used to track `maximum_depth`.
 * @throws std::runtime_error if `element` matches none of the type checks.
 */
void recurse(simdjson::document::element element, stat_t &s, size_t depth) {
  if (depth > s.maximum_depth) {
    s.maximum_depth = depth;
  }

  if (element.is_array()) {
    s.array_count++;
    auto [array, array_error] = element.as_array();
    if (array_error) {
      return; // the array is counted, but its children cannot be visited
    }
    size_t child_count = 0;
    for (auto child : array) {
      child_count++;
      recurse(child, s, depth + 1);
    }
    if (child_count > s.maximum_array_size) {
      s.maximum_array_size = child_count;
    }
    return;
  }

  if (element.is_object()) {
    s.object_count++;
    auto [object, object_error] = element.as_object();
    if (object_error) {
      return; // the object is counted, but its members cannot be visited
    }
    size_t member_count = 0;
    for (auto [key, value] : object) {
      member_count++;
      // A key is tallied both as a key and as a string.
      if (is_ascii(key)) {
        s.ascii_key_count++;
        s.ascii_string_count++;
      }
      if (key.size() > s.key_maximum_length) {
        s.key_maximum_length = key.size();
      }
      if (key.size() > s.string_maximum_length) {
        s.string_maximum_length = key.size();
      }
      s.string_count++;
      s.key_count++;
      recurse(value, s, depth + 1);
    }
    if (member_count > s.maximum_object_size) {
      s.maximum_object_size = member_count;
    }
    return;
  }

  if (element.is_float()) {
    s.float_count++;
    return;
  }
  if (element.is_integer()) {
    s.integer_count++;
    return;
  }
  if (element.is_bool()) {
    if (element.as_bool()) {
      s.true_count++;
    } else {
      s.false_count++;
    }
    return;
  }
  if (element.is_null()) {
    s.null_count++;
    return;
  }
  if (element.is_string()) {
    s.string_count++;
    // Fetch the value once and reuse it for both statistics.
    const std::string_view strval = element.as_string();
    if (is_ascii(strval)) {
      s.ascii_string_count++;
    }
    if (strval.size() > s.string_maximum_length) {
      s.string_maximum_length = strval.size();
    }
    return;
  }

  throw std::runtime_error("unrecognized node.");
}
|
||||||
|
|
||||||
|
/**
 * Parses `p` and gathers statistics about the resulting JSON document.
 *
 * The returned record starts out value-initialized. If parsing fails, the
 * error is written to std::cerr and the record is returned with `valid` set
 * to false. Otherwise `valid` is true, the byte-level counters are computed
 * from the raw input, the structural index count is taken from the parser,
 * and the remaining fields are filled in by walking the document with
 * recurse() starting at depth 0.
 *
 * @param p the padded JSON text to analyze.
 * @return the collected statistics.
 */
stat_t simdjson_compute_stats(const simdjson::padded_string &p) {
  stat_t stats{};

  simdjson::document::parser parser;
  auto [parsed, parse_error] = parser.parse(p);
  if (parse_error) {
    std::cerr << parse_error << std::endl;
    stats.valid = false;
    return stats;
  }
  stats.valid = true;

  // Byte-level statistics are computed directly on the raw input.
  const uint8_t *const bytes = reinterpret_cast<const uint8_t *>(p.data());
  const size_t length = p.size();
  stats.backslash_count = count_backslash(bytes, length);
  stats.non_ascii_byte_count = count_nonasciibytes(bytes, length);
  stats.byte_count = length;
  stats.structural_indexes_count = parser.n_structural_indexes;

  // Everything else comes from a walk over the parsed tree.
  recurse(parsed.root(), stats, 0);
  return stats;
}
|
||||||
|
|
||||||
int main(int argc, char *argv[]) {
|
int main(int argc, char *argv[]) {
|
||||||
|
@ -122,6 +157,7 @@ int main(int argc, char *argv[]) {
|
||||||
std::cerr << "warning: ignoring everything after " << argv[myoptind + 1]
|
std::cerr << "warning: ignoring everything after " << argv[myoptind + 1]
|
||||||
<< std::endl;
|
<< std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
auto [p, error] = simdjson::padded_string::load(filename);
|
auto [p, error] = simdjson::padded_string::load(filename);
|
||||||
if (error) {
|
if (error) {
|
||||||
std::cerr << "Could not load the file " << filename << std::endl;
|
std::cerr << "Could not load the file " << filename << std::endl;
|
||||||
|
@ -132,13 +168,37 @@ int main(int argc, char *argv[]) {
|
||||||
std::cerr << "not a valid JSON" << std::endl;
|
std::cerr << "not a valid JSON" << std::endl;
|
||||||
return EXIT_FAILURE;
|
return EXIT_FAILURE;
|
||||||
}
|
}
|
||||||
|
// Future work: the proper way to do what follows would be to create
|
||||||
|
// a JSON object and then to serialize it.
|
||||||
|
|
||||||
printf("# integer_count float_count string_count backslash_count "
|
printf(R"({
|
||||||
"non_ascii_byte_count object_count array_count null_count true_count "
|
"integer_count" = %10zu,
|
||||||
"false_count byte_count structural_indexes_count\n");
|
"float_count" = %10zu,
|
||||||
printf("%zu %zu %zu %zu %zu %zu %zu %zu %zu %zu %zu %zu\n", s.integer_count,
|
"string_count" = %10zu,
|
||||||
s.float_count, s.string_count, s.backslash_count,
|
"ascii_string_count" = %10zu,
|
||||||
s.non_ascii_byte_count, s.object_count, s.array_count, s.null_count,
|
"string_maximum_length" = %10zu,
|
||||||
s.true_count, s.false_count, s.byte_count, s.structural_indexes_count);
|
"backslash_count" = %10zu,
|
||||||
return EXIT_SUCCESS;
|
"non_ascii_byte_count" = %10zu,
|
||||||
|
"object_count" = %10zu,
|
||||||
|
"maximum_object_size" = %10zu,
|
||||||
|
"array_count" = %10zu,
|
||||||
|
"maximum_array_size" = %10zu,
|
||||||
|
"null_count" = %10zu,
|
||||||
|
"true_count" = %10zu,
|
||||||
|
"false_count" = %10zu,
|
||||||
|
"byte_count" = %10zu,
|
||||||
|
"structural_indexes_count" = %10zu,
|
||||||
|
"key_count" = %10zu,
|
||||||
|
"ascii_key_count" = %10zu,
|
||||||
|
"key_maximum_length" = %10zu,
|
||||||
|
"maximum_depth" = %10zu
|
||||||
}
|
}
|
||||||
|
)",
|
||||||
|
s.integer_count, s.float_count, s.string_count, s.ascii_string_count,
|
||||||
|
s.string_maximum_length, s.backslash_count, s.non_ascii_byte_count,
|
||||||
|
s.object_count, s.maximum_object_size, s.array_count,
|
||||||
|
s.maximum_array_size, s.null_count, s.true_count, s.false_count,
|
||||||
|
s.byte_count, s.structural_indexes_count, s.key_count,
|
||||||
|
s.ascii_key_count, s.key_maximum_length, s.maximum_depth);
|
||||||
|
return EXIT_SUCCESS;
|
||||||
|
}
|
Loading…
Reference in New Issue