Added more thorough testing.
This commit is contained in:
parent
f0af315315
commit
18633e02d2
17
Makefile
17
Makefile
|
@ -7,7 +7,7 @@
|
|||
.PHONY: clean cleandist
|
||||
|
||||
CXXFLAGS = -std=c++11 -g2 -O3 -march=native -Wall -Wextra -Wshadow -Iinclude -Ibenchmark/linux -Idependencies/rapidjson/include -Idependencies/sajson/include
|
||||
EXECUTABLES=parse jsoncheck numberparsingcheck minifiercompetition parsingcompetition minify allparserscheckfile
|
||||
EXECUTABLES=parse jsoncheck numberparsingcheck stringparsingcheck minifiercompetition parsingcompetition minify allparserscheckfile
|
||||
|
||||
HEADERS= include/jsonparser/simdutf8check.h include/jsonparser/stringparsing.h include/jsonparser/numberparsing.h include/jsonparser/jsonparser.h include/jsonparser/common_defs.h include/jsonparser/jsonioutil.h benchmark/benchmark.h benchmark/linux/linux-perf-events.h include/jsonparser/simdjson_internal.h include/jsonparser/stage1_find_marks.h include/jsonparser/stage2_flatten.h include/jsonparser/stage34_unified.h include/jsonparser/jsoncharutils.h
|
||||
LIBFILES=src/jsonioutil.cpp src/jsonparser.cpp src/stage1_find_marks.cpp src/stage2_flatten.cpp src/stage34_unified.cpp
|
||||
|
@ -24,9 +24,15 @@ LIBS=$(RAPIDJSON_INCLUDE) $(SAJSON_INCLUDE)
|
|||
|
||||
all: $(LIBS) $(EXECUTABLES)
|
||||
|
||||
test: jsoncheck numberparsingcheck
|
||||
-./numberparsingcheck
|
||||
test: jsoncheck numberparsingcheck stringparsingcheck
|
||||
./numberparsingcheck
|
||||
./stringparsingcheck
|
||||
./jsoncheck
|
||||
@echo
|
||||
@tput setaf 2
|
||||
@echo "It looks like the code is good!"
|
||||
@tput sgr0
|
||||
|
||||
|
||||
$(SAJSON_INCLUDE):
|
||||
git submodule update --init --recursive
|
||||
|
@ -48,6 +54,11 @@ jsoncheck:tests/jsoncheck.cpp $(HEADERS) $(LIBFILES)
|
|||
numberparsingcheck:tests/numberparsingcheck.cpp $(HEADERS) $(LIBFILES)
|
||||
$(CXX) $(CXXFLAGS) -o numberparsingcheck tests/numberparsingcheck.cpp src/jsonioutil.cpp src/jsonparser.cpp src/stage1_find_marks.cpp src/stage2_flatten.cpp -I. $(LIBFLAGS) -DJSON_TEST_NUMBERS
|
||||
|
||||
|
||||
stringparsingcheck:tests/stringparsingcheck.cpp $(HEADERS) $(LIBFILES)
|
||||
$(CXX) $(CXXFLAGS) -o stringparsingcheck tests/stringparsingcheck.cpp src/jsonioutil.cpp src/jsonparser.cpp src/stage1_find_marks.cpp src/stage2_flatten.cpp -I. $(LIBFLAGS) -DJSON_TEST_STRINGS
|
||||
|
||||
|
||||
minifiercompetition: benchmark/minifiercompetition.cpp $(HEADERS) $(MINIFIERHEADERS) $(LIBFILES) $(MINIFIERLIBFILES)
|
||||
$(CXX) $(CXXFLAGS) -o minifiercompetition $(LIBFILES) $(MINIFIERLIBFILES) benchmark/minifiercompetition.cpp -I. $(LIBFLAGS)
|
||||
|
||||
|
|
|
@ -17,7 +17,7 @@ static const u8 escape_map[256] = {
|
|||
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x4.
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x5c, 0, 0, 0, // 0x5.
|
||||
0, 0, 0x08, 0, 0, 0, 0x12, 0, 0, 0, 0, 0, 0, 0, 0x0a, 0, // 0x6.
|
||||
0, 0, 0x08, 0, 0, 0, 0x0c, 0, 0, 0, 0, 0, 0, 0, 0x0a, 0, // 0x6.
|
||||
0, 0, 0x0d, 0, 0x09, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x7.
|
||||
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
|
@ -72,6 +72,9 @@ really_inline bool parse_string(const u8 *buf, UNUSED size_t len,
|
|||
using namespace std;
|
||||
const u8 *src = &buf[offset + 1]; // we know that buf at offset is a "
|
||||
u8 *dst = pj.current_string_buf_loc;
|
||||
#ifdef JSON_TEST_STRINGS // for unit testing
|
||||
u8 *const start_of_string = dst;
|
||||
#endif
|
||||
#ifdef DEBUG
|
||||
cout << "Entering parse string with offset " << offset << "\n";
|
||||
#endif
|
||||
|
@ -104,6 +107,7 @@ really_inline bool parse_string(const u8 *buf, UNUSED size_t len,
|
|||
m256 unitsep = _mm256_set1_epi8(0x1F);
|
||||
m256 unescaped_vec = _mm256_cmpeq_epi8(_mm256_max_epu8(unitsep,v),unitsep);// could do it with saturated subtraction
|
||||
#endif // CHECKUNESCAPED
|
||||
|
||||
u32 quote_dist = __builtin_ctz(quote_bits);
|
||||
u32 bs_dist = __builtin_ctz(bs_bits);
|
||||
// store to dest unconditionally - we can overwrite the bits we don't like
|
||||
|
@ -122,12 +126,20 @@ really_inline bool parse_string(const u8 *buf, UNUSED size_t len,
|
|||
|
||||
pj.write_tape(depth, pj.current_string_buf_loc - pj.string_buf, '"');
|
||||
|
||||
pj.current_string_buf_loc = dst + quote_dist + 1;
|
||||
pj.current_string_buf_loc = dst + quote_dist + 1; // the +1 is due to the 0 value
|
||||
#ifdef CHECKUNESCAPED
|
||||
// check that there is no unescaped char before the quote
|
||||
u32 unescaped_bits = (u32)_mm256_movemask_epi8(unescaped_vec);
|
||||
return ((quote_bits - 1) & (~ quote_bits) & unescaped_bits) == 0;
|
||||
bool is_ok = ((quote_bits - 1) & (~ quote_bits) & unescaped_bits) == 0;
|
||||
#ifdef JSON_TEST_STRINGS // for unit testing
|
||||
if(is_ok) foundString(buf + offset,start_of_string,pj.current_string_buf_loc - 1);
|
||||
else foundBadString(buf + offset);
|
||||
#endif // JSON_TEST_STRINGS
|
||||
return is_ok;
|
||||
#else //CHECKUNESCAPED
|
||||
#ifdef JSON_TEST_STRINGS // for unit testing
|
||||
foundString(buf + offset,start_of_string,pj.current_string_buf_loc);
|
||||
#endif // JSON_TEST_STRINGS
|
||||
return true;
|
||||
#endif //CHECKUNESCAPED
|
||||
} else if (quote_dist > bs_dist) {
|
||||
|
@ -139,6 +151,9 @@ really_inline bool parse_string(const u8 *buf, UNUSED size_t len,
|
|||
// we are going to need the unescaped_bits to check for unescaped chars
|
||||
u32 unescaped_bits = (u32)_mm256_movemask_epi8(unescaped_vec);
|
||||
if(((bs_bits - 1) & (~ bs_bits) & unescaped_bits) != 0) {
|
||||
#ifdef JSON_TEST_STRINGS // for unit testing
|
||||
foundBadString(buf + offset);
|
||||
#endif // JSON_TEST_STRINGS
|
||||
return false;
|
||||
}
|
||||
#endif //CHECKUNESCAPED
|
||||
|
@ -149,6 +164,9 @@ really_inline bool parse_string(const u8 *buf, UNUSED size_t len,
|
|||
src += bs_dist;
|
||||
dst += bs_dist;
|
||||
if (!handle_unicode_codepoint(&src, &dst)) {
|
||||
#ifdef JSON_TEST_STRINGS // for unit testing
|
||||
foundBadString(buf + offset);
|
||||
#endif // JSON_TEST_STRINGS
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
|
@ -157,8 +175,12 @@ really_inline bool parse_string(const u8 *buf, UNUSED size_t len,
|
|||
// note this may reach beyond the part of the buffer we've actually
|
||||
// seen. I think this is ok
|
||||
u8 escape_result = escape_map[escape_char];
|
||||
if (!escape_result)
|
||||
if (!escape_result) {
|
||||
#ifdef JSON_TEST_STRINGS // for unit testing
|
||||
foundBadString(buf + offset);
|
||||
#endif // JSON_TEST_STRINGS
|
||||
return false; // bogus escape value is an error
|
||||
}
|
||||
dst[bs_dist] = escape_result;
|
||||
src += bs_dist + 2;
|
||||
dst += bs_dist + 1;
|
||||
|
@ -171,6 +193,9 @@ really_inline bool parse_string(const u8 *buf, UNUSED size_t len,
|
|||
#ifdef CHECKUNESCAPED
|
||||
// check for unescaped chars
|
||||
if(_mm256_testz_si256(unescaped_vec,unescaped_vec) != 1) {
|
||||
#ifdef JSON_TEST_STRINGS // for unit testing
|
||||
foundBadString(buf + offset);
|
||||
#endif // JSON_TEST_STRINGS
|
||||
return false;
|
||||
}
|
||||
#endif // CHECKUNESCAPED
|
||||
|
|
|
@ -0,0 +1 @@
|
|||
["this is an unclosed string ]
|
|
@ -21,6 +21,11 @@ bool startsWith(const char *pre, const char *str) {
|
|||
return lenstr < lenpre ? false : strncmp(pre, str, lenpre) == 0;
|
||||
}
|
||||
|
||||
// Reports whether the text `pre` occurs anywhere inside `str`
// (plain substring search, case-sensitive).
bool contains(const char *pre, const char *str) {
  const char *hit = strstr(str, pre);
  return hit != NULL;
}
|
||||
|
||||
|
||||
bool validate(const char *dirname) {
|
||||
bool everythingfine = true;
|
||||
// init_state_machine(); // no longer necessary
|
||||
|
@ -36,6 +41,7 @@ bool validate(const char *dirname) {
|
|||
printf("nothing in dir %s \n", dirname);
|
||||
return false;
|
||||
}
|
||||
size_t howmany = 0;
|
||||
bool needsep = (strlen(dirname) > 1) && (dirname[strlen(dirname) - 1] != '/');
|
||||
for (int i = 0; i < c; i++) {
|
||||
const char *name = entry_list[i]->d_name;
|
||||
|
@ -51,15 +57,18 @@ bool validate(const char *dirname) {
|
|||
strcpy(fullpath + dirlen, name);
|
||||
}
|
||||
std::pair<u8 *, size_t> p = get_corpus(fullpath);
|
||||
// terrible hack but just to get it working
|
||||
ParsedJson *pj_ptr = allocate_ParsedJson(p.second);
|
||||
if(pj_ptr == NULL) {
|
||||
std::cerr<< "can't allocate memory"<<std::endl;
|
||||
return false;
|
||||
}
|
||||
++howmany;
|
||||
ParsedJson &pj(*pj_ptr);
|
||||
bool isok = json_parse(p.first, p.second, pj);
|
||||
if (startsWith("pass", name)) {
|
||||
if(contains("EXCLUDE",name)) {
|
||||
// skipping
|
||||
howmany--;
|
||||
} else if (startsWith("pass", name)) {
|
||||
if (!isok) {
|
||||
printf("warning: file %s should pass but it fails.\n", name);
|
||||
everythingfine = false;
|
||||
|
@ -81,6 +90,8 @@ bool validate(const char *dirname) {
|
|||
for (int i = 0; i < c; ++i)
|
||||
free(entry_list[i]);
|
||||
free(entry_list);
|
||||
printf("%zu files checked.\n", howmany);
|
||||
if(everythingfine) printf("All ok!\n");
|
||||
return everythingfine;
|
||||
}
|
||||
|
||||
|
|
|
@ -2,11 +2,10 @@
|
|||
#include <cstring>
|
||||
#include <dirent.h>
|
||||
#include <inttypes.h>
|
||||
#include <math.h>
|
||||
#include <stdbool.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <inttypes.h>
|
||||
#include <math.h>
|
||||
|
||||
#ifndef JSON_TEST_NUMBERS
|
||||
#define JSON_TEST_NUMBERS
|
||||
|
@ -16,55 +15,73 @@
|
|||
|
||||
int parse_error;
|
||||
char *fullpath;
|
||||
enum{PARSE_WARNING, PARSE_ERROR};
|
||||
enum { PARSE_WARNING, PARSE_ERROR };
|
||||
|
||||
size_t float_count;
|
||||
size_t int_count;
|
||||
size_t invalid_count;
|
||||
|
||||
inline void foundInvalidNumber(const u8 * buf) {
|
||||
// strings that start with these should not be parsed as numbers
|
||||
const char *really_bad[] = {"013}", "0x14", "0e]", "0e+]", "0e+-1]"};
|
||||
|
||||
// Returns true when `str` begins with the characters of `pre`.
// A `str` shorter than `pre` can never match.
bool startsWith(const char *pre, const char *str) {
  const size_t prefix_len = strlen(pre);
  if (strlen(str) < prefix_len) {
    return false;
  }
  return strncmp(pre, str, prefix_len) == 0;
}
|
||||
bool is_in_bad_list(char *buf) {
|
||||
for (size_t i = 0; i < sizeof(really_bad) / sizeof(really_bad[0]); i++)
|
||||
if (startsWith(really_bad[i], buf))
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
inline void foundInvalidNumber(const u8 *buf) {
|
||||
invalid_count++;
|
||||
char * endptr;
|
||||
char *endptr;
|
||||
double expected = strtod((char *)buf, &endptr);
|
||||
if(endptr != (char *)buf) {
|
||||
printf("Warning: foundInvalidNumber %.32s whereas strtod parses it to %f, ", buf, expected);
|
||||
printf(" while parsing %s \n", fullpath);
|
||||
parse_error |= PARSE_WARNING;
|
||||
if (endptr != (char *)buf) {
|
||||
if (!is_in_bad_list((char *)buf)) {
|
||||
printf(
|
||||
"Warning: foundInvalidNumber %.32s whereas strtod parses it to %f, ",
|
||||
buf, expected);
|
||||
printf(" while parsing %s \n", fullpath);
|
||||
parse_error |= PARSE_WARNING;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inline void foundInteger(int64_t result, const u8 * buf) {
|
||||
inline void foundInteger(int64_t result, const u8 *buf) {
|
||||
int_count++;
|
||||
char * endptr;
|
||||
long long expected = strtoll((char *)buf, & endptr, 10);
|
||||
if((endptr == (char *)buf) || (expected != result)) {
|
||||
char *endptr;
|
||||
long long expected = strtoll((char *)buf, &endptr, 10);
|
||||
if ((endptr == (char *)buf) || (expected != result)) {
|
||||
printf("Error: parsed %" PRId64 " out of %.32s, ", result, buf);
|
||||
printf(" while parsing %s \n", fullpath);
|
||||
parse_error |= PARSE_ERROR;
|
||||
}
|
||||
}
|
||||
|
||||
inline void foundFloat(double result, const u8 * buf) {
|
||||
char * endptr;
|
||||
inline void foundFloat(double result, const u8 *buf) {
|
||||
char *endptr;
|
||||
float_count++;
|
||||
double expected = strtod((char *)buf, &endptr);
|
||||
if(endptr == (char *)buf) {
|
||||
printf("parsed %f from %.32s whereas strtod refuses to parse a float, ", result, buf);
|
||||
if (endptr == (char *)buf) {
|
||||
printf("parsed %f from %.32s whereas strtod refuses to parse a float, ",
|
||||
result, buf);
|
||||
printf(" while parsing %s \n", fullpath);
|
||||
parse_error |= PARSE_ERROR;
|
||||
}
|
||||
// we want to get some reasonable relative accuracy
|
||||
if(fabs(expected - result)/fmin(fabs(expected),fabs(result)) > 0.000000000000001) {
|
||||
if (fabs(expected - result) / fmin(fabs(expected), fabs(result)) >
|
||||
0.000000000000001) {
|
||||
printf("parsed %.32f from \n", result);
|
||||
printf(" %.32s whereas strtod gives\n", buf);
|
||||
printf(" %.32f,", expected);
|
||||
printf(" %.32f,", expected);
|
||||
printf(" while parsing %s \n", fullpath);
|
||||
parse_error |= PARSE_ERROR;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
#include "jsonparser/jsonparser.h"
|
||||
#include "src/stage34_unified.cpp"
|
||||
|
||||
|
@ -76,13 +93,10 @@ static bool hasExtension(const char *filename, const char *extension) {
|
|||
return (ext && !strcmp(ext, extension));
|
||||
}
|
||||
|
||||
bool startsWith(const char *pre, const char *str) {
|
||||
size_t lenpre = strlen(pre), lenstr = strlen(str);
|
||||
return lenstr < lenpre ? false : strncmp(pre, str, lenpre) == 0;
|
||||
}
|
||||
|
||||
bool validate(const char *dirname) {
|
||||
parse_error = 0;
|
||||
size_t total_count = 0;
|
||||
|
||||
// init_state_machine(); // no longer necessary
|
||||
const char *extension = ".json";
|
||||
size_t dirlen = strlen(dirname);
|
||||
|
@ -112,28 +126,32 @@ bool validate(const char *dirname) {
|
|||
std::pair<u8 *, size_t> p = get_corpus(fullpath);
|
||||
// terrible hack but just to get it working
|
||||
ParsedJson *pj_ptr = allocate_ParsedJson(p.second);
|
||||
if(pj_ptr == NULL) {
|
||||
std::cerr<< "can't allocate memory"<<std::endl;
|
||||
if (pj_ptr == NULL) {
|
||||
std::cerr << "can't allocate memory" << std::endl;
|
||||
return false;
|
||||
}
|
||||
float_count = 0;
|
||||
int_count = 0;
|
||||
invalid_count = 0;
|
||||
total_count += float_count + int_count + invalid_count;
|
||||
ParsedJson &pj(*pj_ptr);
|
||||
bool isok =
|
||||
json_parse(p.first, p.second, pj);
|
||||
if(int_count+float_count+invalid_count > 0) {
|
||||
printf("File %40s %s --- integers: %10zu floats: %10zu invalid: %10zu total numbers: %10zu \n", name,
|
||||
isok ? " is valid " :
|
||||
" is not valid ",int_count, float_count, invalid_count, int_count+float_count+invalid_count);
|
||||
bool isok = json_parse(p.first, p.second, pj);
|
||||
if (int_count + float_count + invalid_count > 0) {
|
||||
printf("File %40s %s --- integers: %10zu floats: %10zu invalid: %10zu "
|
||||
"total numbers: %10zu \n",
|
||||
name, isok ? " is valid " : " is not valid ", int_count,
|
||||
float_count, invalid_count,
|
||||
int_count + float_count + invalid_count);
|
||||
}
|
||||
free(p.first);
|
||||
free(fullpath);
|
||||
deallocate_ParsedJson(pj_ptr);
|
||||
}
|
||||
}
|
||||
if((parse_error & PARSE_ERROR) != 0) {
|
||||
if ((parse_error & PARSE_ERROR) != 0) {
|
||||
printf("NUMBER PARSING FAILS?\n");
|
||||
} else {
|
||||
printf("All ok.\n");
|
||||
}
|
||||
for (int i = 0; i < c; ++i)
|
||||
free(entry_list[i]);
|
||||
|
@ -145,10 +163,11 @@ int main(int argc, char *argv[]) {
|
|||
if (argc != 2) {
|
||||
std::cerr << "Usage: " << argv[0] << " <directorywithjsonfiles>"
|
||||
<< std::endl;
|
||||
std::cout
|
||||
<< "We are going to assume you mean to use the 'jsonchecker' and 'jsonexamples' directories."
|
||||
<< std::endl;
|
||||
return validate("jsonchecker/") && validate("jsonexamples/") ? EXIT_SUCCESS : EXIT_FAILURE;
|
||||
std::cout << "We are going to assume you mean to use the 'jsonchecker' and "
|
||||
"'jsonexamples' directories."
|
||||
<< std::endl;
|
||||
return validate("jsonchecker/") && validate("jsonexamples/") ? EXIT_SUCCESS
|
||||
: EXIT_FAILURE;
|
||||
}
|
||||
return validate(argv[1]) ? EXIT_SUCCESS : EXIT_FAILURE;
|
||||
}
|
||||
|
|
|
@ -0,0 +1,381 @@
|
|||
#include <assert.h>
|
||||
#include <cstring>
|
||||
#include <dirent.h>
|
||||
#include <inttypes.h>
|
||||
#include <math.h>
|
||||
#include <stdbool.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#ifndef JSON_TEST_STRINGS
|
||||
#define JSON_TEST_STRINGS
|
||||
#endif
|
||||
|
||||
#include "jsonparser/common_defs.h"
|
||||
|
||||
char *fullpath;
|
||||
|
||||
size_t bad_string;
|
||||
size_t good_string;
|
||||
size_t empty_string;
|
||||
|
||||
size_t total_string_length;
|
||||
bool probable_bug;
|
||||
// borrowed code (sajson?)
|
||||
|
||||
// Decodes exactly four hexadecimal digits (upper or lower case) starting at p.
// On success the resulting value is stored in u and true is returned.
// On the first non-hex character the function returns false and leaves u
// untouched; no character after the offending one is read.
static inline bool read_hex(const char *p, unsigned &u) {
  unsigned value = 0;
  for (int k = 0; k < 4; ++k) {
    const unsigned char ch = p[k];
    unsigned digit;
    if (ch >= '0' && ch <= '9') {
      digit = ch - '0';
    } else if (ch >= 'a' && ch <= 'f') {
      digit = ch - 'a' + 10;
    } else if (ch >= 'A' && ch <= 'F') {
      digit = ch - 'A' + 10;
    } else {
      return false;
    }
    value = (value << 4) + digit;
  }
  u = value;
  return true;
}
|
||||
|
||||
// Appends the UTF-8 encoding of `codepoint` at `end` and advances `end`
// past the bytes written (1 to 4 bytes depending on the magnitude of the
// code point). Code points of 0x200000 and above trip the assertion.
static inline void write_utf8(unsigned codepoint, char *&end) {
  char *cursor = end;
  if (codepoint < 0x80) {
    // plain ASCII: one byte
    cursor[0] = static_cast<char>(codepoint);
    end = cursor + 1;
    return;
  }
  if (codepoint < 0x800) {
    // two bytes: 110xxxxx 10xxxxxx
    cursor[0] = static_cast<char>(0xC0 | (codepoint >> 6));
    cursor[1] = static_cast<char>(0x80 | (codepoint & 0x3F));
    end = cursor + 2;
    return;
  }
  if (codepoint < 0x10000) {
    // three bytes: 1110xxxx 10xxxxxx 10xxxxxx
    cursor[0] = static_cast<char>(0xE0 | (codepoint >> 12));
    cursor[1] = static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F));
    cursor[2] = static_cast<char>(0x80 | (codepoint & 0x3F));
    end = cursor + 3;
    return;
  }
  // four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  assert(codepoint < 0x200000);
  cursor[0] = static_cast<char>(0xF0 | (codepoint >> 18));
  cursor[1] = static_cast<char>(0x80 | ((codepoint >> 12) & 0x3F));
  cursor[2] = static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F));
  cursor[3] = static_cast<char>(0x80 | (codepoint & 0x3F));
  end = cursor + 4;
}
|
||||
|
||||
static bool parse_string(const char *p, char *output, char **end) {
|
||||
if (*p != '"')
|
||||
return false;
|
||||
p++;
|
||||
|
||||
for (;;) {
|
||||
|
||||
if ((*p >= 0 && *p < 0x20)) {
|
||||
return false; // unescaped
|
||||
}
|
||||
|
||||
switch (*p) {
|
||||
case '"':
|
||||
*output = '\0'; // end
|
||||
*end = output;
|
||||
return true;
|
||||
case '\\':
|
||||
++p;
|
||||
|
||||
char replacement;
|
||||
switch (*p) {
|
||||
case '"':
|
||||
replacement = '"';
|
||||
goto replace;
|
||||
case '\\':
|
||||
replacement = '\\';
|
||||
goto replace;
|
||||
case '/':
|
||||
replacement = '/';
|
||||
goto replace;
|
||||
case 'b':
|
||||
replacement = '\b';
|
||||
goto replace;
|
||||
case 'f':
|
||||
replacement = '\f';
|
||||
goto replace;
|
||||
case 'n':
|
||||
replacement = '\n';
|
||||
goto replace;
|
||||
case 'r':
|
||||
replacement = '\r';
|
||||
goto replace;
|
||||
case 't':
|
||||
replacement = '\t';
|
||||
goto replace;
|
||||
replace:
|
||||
*output++ = replacement;
|
||||
++p;
|
||||
break;
|
||||
case 'u': {
|
||||
++p;
|
||||
unsigned u;
|
||||
if (!read_hex(p, u))
|
||||
return false;
|
||||
|
||||
p += 4;
|
||||
if (u >= 0xD800 && u <= 0xDBFF) {
|
||||
char p0 = p[0];
|
||||
char p1 = p[1];
|
||||
if (p0 != '\\' || p1 != 'u') {
|
||||
return false;
|
||||
}
|
||||
p += 2;
|
||||
unsigned v;
|
||||
if (!read_hex(p, v))
|
||||
return false;
|
||||
|
||||
p += 4;
|
||||
|
||||
if (v < 0xDC00 || v > 0xDFFF) {
|
||||
return false;
|
||||
}
|
||||
u = 0x10000 + (((u - 0xD800) << 10) | (v - 0xDC00));
|
||||
}
|
||||
write_utf8(u, output);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
// validate UTF-8
|
||||
unsigned char c0 = p[0];
|
||||
if (c0 < 128) {
|
||||
*output++ = *p++;
|
||||
} else if (c0 < 224) {
|
||||
unsigned char c1 = p[1];
|
||||
if (c1 < 128 || c1 >= 192) {
|
||||
return false;
|
||||
}
|
||||
output[0] = c0;
|
||||
output[1] = c1;
|
||||
output += 2;
|
||||
p += 2;
|
||||
} else if (c0 < 240) {
|
||||
unsigned char c1 = p[1];
|
||||
if (c1 < 128 || c1 >= 192) {
|
||||
return false;
|
||||
}
|
||||
unsigned char c2 = p[2];
|
||||
if (c2 < 128 || c2 >= 192) {
|
||||
return false;
|
||||
}
|
||||
output[0] = c0;
|
||||
output[1] = c1;
|
||||
output[2] = c2;
|
||||
output += 3;
|
||||
p += 3;
|
||||
} else if (c0 < 248) {
|
||||
unsigned char c1 = p[1];
|
||||
if (c1 < 128 || c1 >= 192) {
|
||||
return false;
|
||||
}
|
||||
unsigned char c2 = p[2];
|
||||
if (c2 < 128 || c2 >= 192) {
|
||||
return false;
|
||||
}
|
||||
unsigned char c3 = p[3];
|
||||
if (c3 < 128 || c3 >= 192) {
|
||||
return false;
|
||||
}
|
||||
output[0] = c0;
|
||||
output[1] = c1;
|
||||
output[2] = c2;
|
||||
output[3] = c3;
|
||||
output += 4;
|
||||
p += 4;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
// end of borrowed code
|
||||
|
||||
inline void foundBadString(const u8 *buf) {
|
||||
bad_string++;
|
||||
char *end;
|
||||
char bigbuffer[4096]; // if some strings exceeds 4k, this will fail!
|
||||
if (parse_string((const char *)buf, bigbuffer, &end)) {
|
||||
printf("WARNING: Sajson-like parser seems to think that the string is "
|
||||
"valid %32s \n",
|
||||
buf);
|
||||
probable_bug = true;
|
||||
}
|
||||
}
|
||||
|
||||
// Writes the `len` bytes starting at `s` to stdout as two-digit lowercase
// hexadecimal values, each followed by a space.
void print_hex(const char *s, size_t len) {
  const char *const stop = s + len;
  for (const char *cur = s; cur != stop; ++cur) {
    printf("%02x ", *cur & 0xFF);
  }
}
|
||||
|
||||
// Writes to stdout, for each of the first `len` positions, the XOR of the
// corresponding bytes of `s1` and `s2` as two hex digits followed by a space.
// Identical bytes therefore show up as "00".
void print_cmp_hex(const char *s1, const char *s2, size_t len) {
  size_t pos = 0;
  while (pos < len) {
    const int delta = (s1[pos] ^ s2[pos]) & 0xFF;
    printf("%02x ", delta);
    ++pos;
  }
}
|
||||
|
||||
inline void foundString(const u8 *buf, const u8 *parsed_begin,
|
||||
const u8 *parsed_end) {
|
||||
size_t thislen = parsed_end - parsed_begin;
|
||||
total_string_length += thislen;
|
||||
good_string++;
|
||||
char *end;
|
||||
char bigbuffer[4096]; // if some strings exceeds 4k, this will fail!
|
||||
if (!parse_string((const char *)buf, bigbuffer, &end)) {
|
||||
printf("WARNING: reference parser seems to think that the string is NOT "
|
||||
"valid %32s \n",
|
||||
buf);
|
||||
}
|
||||
if (end == bigbuffer) {
|
||||
// we have a zero-length string
|
||||
if (parsed_begin != parsed_end) {
|
||||
printf("WARNING: We have a zero-length but gap is %zu \n",
|
||||
parsed_end - parsed_begin);
|
||||
probable_bug = true;
|
||||
}
|
||||
empty_string++;
|
||||
return;
|
||||
}
|
||||
size_t len = end - bigbuffer;
|
||||
if (len != thislen) {
|
||||
printf("WARNING: lengths on parsed strings disagree %zu %zu \n", thislen,
|
||||
len);
|
||||
printf("\nour parsed string : '%*s'\n\n", (int)thislen,
|
||||
(char *)parsed_begin);
|
||||
print_hex((char *)parsed_begin, thislen);
|
||||
printf("\n");
|
||||
|
||||
printf("reference parsing :'%*s'\n\n", (int)len, bigbuffer);
|
||||
print_hex((char *)bigbuffer, len);
|
||||
printf("\n");
|
||||
|
||||
probable_bug = true;
|
||||
}
|
||||
if (memcmp(bigbuffer, parsed_begin, thislen) != 0) {
|
||||
printf("WARNING: parsed strings disagree \n");
|
||||
printf("Lengths %zu %zu \n", thislen, len);
|
||||
|
||||
printf("\nour parsed string : '%*s'\n", (int)thislen,
|
||||
(char *)parsed_begin);
|
||||
print_hex((char *)parsed_begin, thislen);
|
||||
printf("\n");
|
||||
|
||||
printf("reference parsing :'%*s'\n", (int)len, bigbuffer);
|
||||
print_hex((char *)bigbuffer, len);
|
||||
printf("\n");
|
||||
|
||||
print_cmp_hex((char *)parsed_begin, bigbuffer, thislen);
|
||||
|
||||
probable_bug = true;
|
||||
}
|
||||
}
|
||||
|
||||
#include "jsonparser/jsonparser.h"
|
||||
#include "src/stage34_unified.cpp"
|
||||
|
||||
/**
 * Tells whether filename ends with the given extension. The comparison is
 * made from the last '.' of filename (dot included), so extension is expected
 * to carry its leading dot, e.g. ".json". A name without any '.' never
 * matches.
 */
static bool hasExtension(const char *filename, const char *extension) {
  const char *last_dot = strrchr(filename, '.');
  if (last_dot == NULL) {
    return false;
  }
  return strcmp(last_dot, extension) == 0;
}
|
||||
|
||||
// Returns true when `str` begins with the characters of `pre`.
// A `str` shorter than `pre` can never match.
bool startsWith(const char *pre, const char *str) {
  const size_t prefix_len = strlen(pre);
  if (strlen(str) < prefix_len) {
    return false;
  }
  return strncmp(pre, str, prefix_len) == 0;
}
|
||||
|
||||
bool validate(const char *dirname) {
|
||||
size_t total_strings = 0;
|
||||
probable_bug = false;
|
||||
const char *extension = ".json";
|
||||
size_t dirlen = strlen(dirname);
|
||||
struct dirent **entry_list;
|
||||
int c = scandir(dirname, &entry_list, 0, alphasort);
|
||||
if (c < 0) {
|
||||
printf("error accessing %s \n", dirname);
|
||||
return false;
|
||||
}
|
||||
if (c == 0) {
|
||||
printf("nothing in dir %s \n", dirname);
|
||||
return false;
|
||||
}
|
||||
bool needsep = (strlen(dirname) > 1) && (dirname[strlen(dirname) - 1] != '/');
|
||||
for (int i = 0; i < c; i++) {
|
||||
const char *name = entry_list[i]->d_name;
|
||||
if (hasExtension(name, extension)) {
|
||||
size_t filelen = strlen(name);
|
||||
fullpath = (char *)malloc(dirlen + filelen + 1 + 1);
|
||||
strcpy(fullpath, dirname);
|
||||
if (needsep) {
|
||||
fullpath[dirlen] = '/';
|
||||
strcpy(fullpath + dirlen + 1, name);
|
||||
} else {
|
||||
strcpy(fullpath + dirlen, name);
|
||||
}
|
||||
std::pair<u8 *, size_t> p = get_corpus(fullpath);
|
||||
// terrible hack but just to get it working
|
||||
ParsedJson *pj_ptr = allocate_ParsedJson(p.second);
|
||||
if (pj_ptr == NULL) {
|
||||
std::cerr << "can't allocate memory" << std::endl;
|
||||
return false;
|
||||
}
|
||||
bad_string = 0;
|
||||
good_string = 0;
|
||||
total_string_length = 0;
|
||||
empty_string = 0;
|
||||
ParsedJson &pj(*pj_ptr);
|
||||
bool isok = json_parse(p.first, p.second, pj);
|
||||
if (good_string > 0) {
|
||||
printf("File %40s %s --- bad strings: %10zu \tgood strings: %10zu\t "
|
||||
"empty strings: %10zu "
|
||||
"\taverage string length: %.1f \n",
|
||||
name, isok ? " is valid " : " is not valid ", bad_string,
|
||||
good_string, empty_string,
|
||||
(double)total_string_length / good_string);
|
||||
} else if (bad_string > 0) {
|
||||
printf("File %40s %s --- bad strings: %10zu \n", name,
|
||||
isok ? " is valid " : " is not valid ", bad_string);
|
||||
}
|
||||
total_strings += bad_string + good_string;
|
||||
free(p.first);
|
||||
free(fullpath);
|
||||
deallocate_ParsedJson(pj_ptr);
|
||||
}
|
||||
}
|
||||
printf("%zu strings checked.\n", total_strings);
|
||||
if (probable_bug) {
|
||||
printf("STRING PARSING FAILS?\n");
|
||||
} else {
|
||||
printf("All ok.\n");
|
||||
}
|
||||
for (int i = 0; i < c; ++i)
|
||||
free(entry_list[i]);
|
||||
free(entry_list);
|
||||
return probable_bug == false;
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
if (argc != 2) {
|
||||
std::cerr << "Usage: " << argv[0] << " <directorywithjsonfiles>"
|
||||
<< std::endl;
|
||||
std::cout << "We are going to assume you mean to use the 'jsonchecker' and "
|
||||
"'jsonexamples' directories."
|
||||
<< std::endl;
|
||||
return validate("jsonchecker/") && validate("jsonexamples/") ? EXIT_SUCCESS
|
||||
: EXIT_FAILURE;
|
||||
}
|
||||
return validate(argv[1]) ? EXIT_SUCCESS : EXIT_FAILURE;
|
||||
}
|
Loading…
Reference in New Issue