407 lines
11 KiB
C++
407 lines
11 KiB
C++
#include <cassert>
|
|
#include <climits>
|
|
#include <cstring>
|
|
#include <inttypes.h>
|
|
#include <iostream>
|
|
#include <math.h>
|
|
#include <stdbool.h>
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
|
|
#ifndef JSON_TEST_STRINGS
|
|
#define JSON_TEST_STRINGS
|
|
#endif
|
|
|
|
#ifndef _MSC_VER
|
|
#include <dirent.h>
|
|
#else
|
|
#include <dirent_portable.h>
|
|
#endif
|
|
#include "simdjson.h"
|
|
|
|
char *fullpath;
|
|
|
|
size_t bad_string;
|
|
size_t good_string;
|
|
size_t empty_string;
|
|
|
|
size_t total_string_length;
|
|
bool probable_bug;
|
|
// borrowed code (sajson?)
|
|
|
|
static inline bool read_hex(const char *p, unsigned &u) {
|
|
unsigned v = 0;
|
|
int i = 4;
|
|
while (i--) {
|
|
unsigned char c = *p++;
|
|
if (c >= '0' && c <= '9') {
|
|
c = static_cast<unsigned char>(c - '0');
|
|
} else if (c >= 'a' && c <= 'f') {
|
|
c = static_cast<unsigned char>(c - 'a' + 10);
|
|
} else if (c >= 'A' && c <= 'F') {
|
|
c = static_cast<unsigned char>(c - 'A' + 10);
|
|
} else {
|
|
return false;
|
|
}
|
|
v = (v << 4) + c;
|
|
}
|
|
|
|
u = v;
|
|
return true;
|
|
}
|
|
|
|
static inline void write_utf8(unsigned codepoint, char *&end) {
|
|
if (codepoint < 0x80) {
|
|
*end++ = static_cast<char>(codepoint);
|
|
} else if (codepoint < 0x800) {
|
|
*end++ = static_cast<char>(0xC0 | (codepoint >> 6));
|
|
*end++ = static_cast<char>(0x80 | (codepoint & 0x3F));
|
|
} else if (codepoint < 0x10000) {
|
|
*end++ = static_cast<char>(0xE0 | (codepoint >> 12));
|
|
*end++ = static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F));
|
|
*end++ = static_cast<char>(0x80 | (codepoint & 0x3F));
|
|
} else {
|
|
assert(codepoint < 0x200000);
|
|
*end++ = static_cast<char>(0xF0 | (codepoint >> 18));
|
|
*end++ = static_cast<char>(0x80 | ((codepoint >> 12) & 0x3F));
|
|
*end++ = static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F));
|
|
*end++ = static_cast<char>(0x80 | (codepoint & 0x3F));
|
|
}
|
|
}
|
|
|
|
static bool parse_string(const char *p, char *output, char **end) {
|
|
if (*p != '"')
|
|
return false;
|
|
p++;
|
|
|
|
for (;;) {
|
|
#if (CHAR_MIN < 0) || (!defined(CHAR_MIN)) // the '!defined' is just paranoia
|
|
// in this path, char is *signed*
|
|
if ((*p >= 0 && *p < 0x20)) {
|
|
return false; // unescaped
|
|
}
|
|
#else
|
|
// we have unsigned chars
|
|
if (*p < 0x20) {
|
|
return false; // unescaped
|
|
}
|
|
#endif
|
|
|
|
switch (*p) {
|
|
case '"':
|
|
*output = '\0'; // end
|
|
*end = output;
|
|
return true;
|
|
case '\\':
|
|
++p;
|
|
|
|
char replacement;
|
|
switch (*p) {
|
|
case '"':
|
|
replacement = '"';
|
|
goto replace;
|
|
case '\\':
|
|
replacement = '\\';
|
|
goto replace;
|
|
case '/':
|
|
replacement = '/';
|
|
goto replace;
|
|
case 'b':
|
|
replacement = '\b';
|
|
goto replace;
|
|
case 'f':
|
|
replacement = '\f';
|
|
goto replace;
|
|
case 'n':
|
|
replacement = '\n';
|
|
goto replace;
|
|
case 'r':
|
|
replacement = '\r';
|
|
goto replace;
|
|
case 't':
|
|
replacement = '\t';
|
|
goto replace;
|
|
replace:
|
|
*output++ = replacement;
|
|
++p;
|
|
break;
|
|
case 'u': {
|
|
++p;
|
|
unsigned u;
|
|
if (!read_hex(p, u))
|
|
return false;
|
|
|
|
p += 4;
|
|
if (u >= 0xD800 && u <= 0xDBFF) {
|
|
char p0 = p[0];
|
|
char p1 = p[1];
|
|
if (p0 != '\\' || p1 != 'u') {
|
|
return false;
|
|
}
|
|
p += 2;
|
|
unsigned v;
|
|
if (!read_hex(p, v))
|
|
return false;
|
|
|
|
p += 4;
|
|
|
|
if (v < 0xDC00 || v > 0xDFFF) {
|
|
return false;
|
|
}
|
|
u = 0x10000 + (((u - 0xD800) << 10) | (v - 0xDC00));
|
|
}
|
|
write_utf8(u, output);
|
|
break;
|
|
}
|
|
default:
|
|
return false;
|
|
}
|
|
break;
|
|
|
|
default:
|
|
// validate UTF-8
|
|
unsigned char c0 = p[0];
|
|
if (c0 < 128) {
|
|
*output++ = *p++;
|
|
} else if (c0 < 224) {
|
|
unsigned char c1 = p[1];
|
|
if (c1 < 128 || c1 >= 192) {
|
|
return false;
|
|
}
|
|
output[0] = c0;
|
|
output[1] = c1;
|
|
output += 2;
|
|
p += 2;
|
|
} else if (c0 < 240) {
|
|
unsigned char c1 = p[1];
|
|
if (c1 < 128 || c1 >= 192) {
|
|
return false;
|
|
}
|
|
unsigned char c2 = p[2];
|
|
if (c2 < 128 || c2 >= 192) {
|
|
return false;
|
|
}
|
|
output[0] = c0;
|
|
output[1] = c1;
|
|
output[2] = c2;
|
|
output += 3;
|
|
p += 3;
|
|
} else if (c0 < 248) {
|
|
unsigned char c1 = p[1];
|
|
if (c1 < 128 || c1 >= 192) {
|
|
return false;
|
|
}
|
|
unsigned char c2 = p[2];
|
|
if (c2 < 128 || c2 >= 192) {
|
|
return false;
|
|
}
|
|
unsigned char c3 = p[3];
|
|
if (c3 < 128 || c3 >= 192) {
|
|
return false;
|
|
}
|
|
output[0] = c0;
|
|
output[1] = c1;
|
|
output[2] = c2;
|
|
output[3] = c3;
|
|
output += 4;
|
|
p += 4;
|
|
} else {
|
|
return false;
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
// end of borrowed code
|
|
char *big_buffer; // global variable
|
|
|
|
void found_bad_string(const uint8_t *buf) {
|
|
bad_string++;
|
|
char *end;
|
|
if (parse_string((const char *)buf, big_buffer, &end)) {
|
|
printf("WARNING: Sajson-like parser seems to think that the string is "
|
|
"valid %32s \n",
|
|
buf);
|
|
probable_bug = true;
|
|
}
|
|
}
|
|
|
|
void print_hex(const char *s, size_t len) {
|
|
for (size_t i = 0; i < len; i++) {
|
|
printf("%02x ", s[i] & 0xFF);
|
|
}
|
|
}
|
|
|
|
void print_cmp_hex(const char *s1, const char *s2, size_t len) {
|
|
for (size_t i = 0; i < len; i++) {
|
|
printf("%02x ", (s1[i] ^ s2[i]) & 0xFF);
|
|
}
|
|
}
|
|
|
|
void found_string(const uint8_t *buf, const uint8_t *parsed_begin,
|
|
const uint8_t *parsed_end) {
|
|
size_t this_len = parsed_end - parsed_begin;
|
|
total_string_length += this_len;
|
|
good_string++;
|
|
char *end = NULL;
|
|
if (!parse_string((const char *)buf, big_buffer, &end)) {
|
|
printf("WARNING: reference parser seems to think that the string is NOT "
|
|
"valid %32s \n",
|
|
buf);
|
|
}
|
|
if (end == big_buffer) {
|
|
// we have a zero-length string
|
|
if (parsed_begin != parsed_end) {
|
|
printf("WARNING: We have a zero-length but gap is %zu \n",
|
|
(size_t)(parsed_end - parsed_begin));
|
|
probable_bug = true;
|
|
}
|
|
empty_string++;
|
|
return;
|
|
}
|
|
size_t len = end - big_buffer;
|
|
if (len != this_len) {
|
|
printf("WARNING: lengths on parsed strings disagree %zu %zu \n", this_len,
|
|
len);
|
|
printf("\nour parsed string : '%*s'\n\n", (int)this_len,
|
|
(const char *)parsed_begin);
|
|
print_hex((const char *)parsed_begin, this_len);
|
|
printf("\n");
|
|
|
|
printf("reference parsing :'%*s'\n\n", (int)len, big_buffer);
|
|
print_hex((const char *)big_buffer, len);
|
|
printf("\n");
|
|
|
|
probable_bug = true;
|
|
}
|
|
if (memcmp(big_buffer, parsed_begin, this_len) != 0) {
|
|
printf("WARNING: parsed strings disagree \n");
|
|
printf("Lengths %zu %zu \n", this_len, len);
|
|
|
|
printf("\nour parsed string : '%*s'\n", (int)this_len,
|
|
(const char *)parsed_begin);
|
|
print_hex((const char *)parsed_begin, this_len);
|
|
printf("\n");
|
|
|
|
printf("reference parsing :'%*s'\n", (int)len, big_buffer);
|
|
print_hex((const char *)big_buffer, len);
|
|
printf("\n");
|
|
|
|
print_cmp_hex((const char *)parsed_begin, big_buffer, this_len);
|
|
|
|
probable_bug = true;
|
|
}
|
|
}
|
|
|
|
#include "simdjson.h"
|
|
#include "simdjson.cpp"
|
|
|
|
/**
|
|
* Does the file filename ends with the given extension.
|
|
*/
|
|
static bool has_extension(const char *filename, const char *extension) {
|
|
const char *ext = strrchr(filename, '.');
|
|
return (ext && !strcmp(ext, extension));
|
|
}
|
|
|
|
bool starts_with(const char *pre, const char *str) {
|
|
size_t lenpre = strlen(pre), lenstr = strlen(str);
|
|
return lenstr < lenpre ? false : strncmp(pre, str, lenpre) == 0;
|
|
}
|
|
|
|
bool validate(const char *dirname) {
|
|
size_t total_strings = 0;
|
|
probable_bug = false;
|
|
const char *extension = ".json";
|
|
size_t dirlen = strlen(dirname);
|
|
struct dirent **entry_list;
|
|
int c = scandir(dirname, &entry_list, 0, alphasort);
|
|
if (c < 0) {
|
|
printf("error accessing %s \n", dirname);
|
|
return false;
|
|
}
|
|
if (c == 0) {
|
|
printf("nothing in dir %s \n", dirname);
|
|
return false;
|
|
}
|
|
bool needsep = (strlen(dirname) > 1) && (dirname[strlen(dirname) - 1] != '/');
|
|
for (int i = 0; i < c; i++) {
|
|
const char *name = entry_list[i]->d_name;
|
|
if (has_extension(name, extension)) {
|
|
size_t filelen = strlen(name);
|
|
fullpath = (char *)malloc(dirlen + filelen + 1 + 1);
|
|
strcpy(fullpath, dirname);
|
|
if (needsep) {
|
|
fullpath[dirlen] = '/';
|
|
strcpy(fullpath + dirlen + 1, name);
|
|
} else {
|
|
strcpy(fullpath + dirlen, name);
|
|
}
|
|
auto [p, error] = simdjson::padded_string::load(fullpath);
|
|
if (error) {
|
|
std::cerr << "Could not load the file " << fullpath << std::endl;
|
|
return EXIT_FAILURE;
|
|
}
|
|
big_buffer = (char *)malloc(p.size());
|
|
if (big_buffer == NULL) {
|
|
std::cerr << "can't allocate memory" << std::endl;
|
|
return false;
|
|
}
|
|
bad_string = 0;
|
|
good_string = 0;
|
|
total_string_length = 0;
|
|
empty_string = 0;
|
|
simdjson::dom::parser parser;
|
|
auto err = parser.parse(p).error();
|
|
bool isok = (err == simdjson::error_code::SUCCESS);
|
|
free(big_buffer);
|
|
if (good_string > 0) {
|
|
printf("File %40s %s --- bad strings: %10zu \tgood strings: %10zu\t "
|
|
"empty strings: %10zu "
|
|
"\taverage string length: %.1f \n",
|
|
name, isok ? " is valid " : " is not valid ", bad_string,
|
|
good_string, empty_string,
|
|
static_cast<double>(total_string_length) / static_cast<double>(good_string));
|
|
} else if (bad_string > 0) {
|
|
printf("File %40s %s --- bad strings: %10zu \n", name,
|
|
isok ? " is valid " : " is not valid ", bad_string);
|
|
}
|
|
total_strings += bad_string + good_string;
|
|
free(fullpath);
|
|
}
|
|
}
|
|
printf("%zu strings checked.\n", total_strings);
|
|
if (probable_bug) {
|
|
fprintf(stderr, "STRING PARSING FAILS?\n");
|
|
} else {
|
|
printf("All ok.\n");
|
|
}
|
|
for (int i = 0; i < c; ++i)
|
|
free(entry_list[i]);
|
|
free(entry_list);
|
|
return probable_bug == false;
|
|
}
|
|
|
|
int main(int argc, char *argv[]) {
|
|
if (argc != 2) {
|
|
std::cerr << "Usage: " << argv[0] << " <directorywithjsonfiles>"
|
|
<< std::endl;
|
|
#if defined(SIMDJSON_TEST_DATA_DIR) && defined(SIMDJSON_BENCHMARK_DATA_DIR)
|
|
std::cout << "We are going to assume you mean to use the '"
|
|
<< SIMDJSON_TEST_DATA_DIR << "' and '"
|
|
<< SIMDJSON_BENCHMARK_DATA_DIR << "'directories." << std::endl;
|
|
return validate(SIMDJSON_TEST_DATA_DIR) &&
|
|
validate(SIMDJSON_BENCHMARK_DATA_DIR)
|
|
? EXIT_SUCCESS
|
|
: EXIT_FAILURE;
|
|
#else
|
|
std::cout << "We are going to assume you mean to use the 'jsonchecker' and "
|
|
"'jsonexamples' directories."
|
|
<< std::endl;
|
|
return validate("jsonchecker/") && validate("jsonexamples/") ? EXIT_SUCCESS
|
|
: EXIT_FAILURE;
|
|
#endif
|
|
}
|
|
return validate(argv[1]) ? EXIT_SUCCESS : EXIT_FAILURE;
|
|
}
|