Merge branch 'master' of https://github.com/lemire/simdjson into HEAD

This commit is contained in:
ioioioio 2019-06-28 15:18:05 -04:00
commit c2e4b8ca9a
7 changed files with 81 additions and 58 deletions

View File

@@ -12,6 +12,11 @@ JSON documents are everywhere on the Internet. Servers spend a lot of time parsi
<img src="images/logo.png" width="10%">
## Real-world usage
- [Microsoft FishStore](https://github.com/microsoft/FishStore)
- [Yandex ClickHouse](https://github.com/yandex/ClickHouse)
## Paper
A description of the design and implementation of simdjson appears at https://arxiv.org/abs/1902.08318 and an informal blog post providing some background and context is at https://branchfree.org/2019/02/25/paper-parsing-gigabytes-of-json-per-second/.
@@ -517,5 +522,11 @@ This helps as we redefine some new characters as pseudo-structural such as the c
- Yang, Shiyang. Validation of XML Document Based on Parallel Bit Stream Technology. Diss. Applied Sciences: School of Computing Science, 2013.
- N. Nakasato, "Implementation of a parallel tree method on a GPU", Journal of Computational Science, vol. 3, no. 3, pp. 132-141, 2012.
## Funding
The work is supported by the Natural Sciences and Engineering Research Council of Canada under grant number RGPIN-2017-03910.
[license]: LICENSE
[license img]: https://img.shields.io/badge/License-Apache%202-blue.svg

View File

@@ -105,8 +105,10 @@ int main(int argc, char *argv[]) {
rapid_correct_checkencoding ? "correct" : "invalid");
printf("sajson : %s \n",
sajson_correct ? "correct" : "invalid");
if(oursreturn == simdjson::DEPTH_ERROR) {
printf("simdjson encountered a DEPTH_ERROR, it was parametrized to reject documents with depth exceeding %zu.\n", maxdepth);
if (oursreturn == simdjson::DEPTH_ERROR) {
printf("simdjson encountered a DEPTH_ERROR, it was parametrized to "
"reject documents with depth exceeding %zu.\n",
maxdepth);
}
if ((ours_correct != rapid_correct_checkencoding) ||
(rapid_correct_checkencoding != sajson_correct) ||

View File

@@ -15,10 +15,10 @@ bool skyprophet_test() {
std::vector<std::string> data;
char buf[1024];
for (size_t i = 0; i < n_records; ++i) {
auto n = sprintf(buf,
"{\"id\": %zu, \"name\": \"name%zu\", \"gender\": \"%s\", "
auto n =
sprintf(buf, "{\"id\": %zu, \"name\": \"name%zu\", \"gender\": \"%s\", "
"\"school\": {\"id\": %zu, \"name\": \"school%zu\"}}",
i, i, (i % 2) ? "male" : "female", i % 10, i % 10);
i, i, (i % 2) ? "male" : "female", i % 10, i % 10);
data.emplace_back(std::string(buf, n));
}
for (size_t i = 0; i < n_records; ++i) {

View File

@@ -31,7 +31,6 @@ bool contains(const char *pre, const char *str) {
return (strstr(str, pre) != nullptr);
}
bool validate(const char *dirname) {
bool everythingfine = true;
const char *extension = ".json";
@@ -46,9 +45,10 @@ bool validate(const char *dirname) {
printf("nothing in dir %s \n", dirname);
return false;
}
bool * isfileasexpected = new bool[c];
for(int i = 0; i < c; i++) { isfileasexpected[i] = true;
}
bool *isfileasexpected = new bool[c];
for (int i = 0; i < c; i++) {
isfileasexpected[i] = true;
}
size_t howmany = 0;
bool needsep = (strlen(dirname) > 1) && (dirname[strlen(dirname) - 1] != '/');
for (int i = 0; i < c; i++) {
@@ -68,47 +68,50 @@ bool validate(const char *dirname) {
padded_string p;
try {
get_corpus(fullpath).swap(p);
} catch (const std::exception& e) {
} catch (const std::exception &e) {
std::cerr << "Could not load the file " << fullpath << std::endl;
return EXIT_FAILURE;
}
ParsedJson pj;
bool allocok = pj.allocateCapacity(p.size(), 1024);
if(!allocok) {
std::cerr << "can't allocate memory"<<std::endl;
if (!allocok) {
std::cerr << "can't allocate memory" << std::endl;
return false;
}
++howmany;
const int parseRes = json_parse(p, pj);
printf("%s\n", parseRes == 0 ? "ok" : "invalid");
if(contains("EXCLUDE",name)) {
if (contains("EXCLUDE", name)) {
// skipping
howmany--;
} else if (startsWith("pass", name) && parseRes != 0) {
isfileasexpected[i] = false;
printf("warning: file %s should pass but it fails. Error is: %s\n", name, simdjson::errorMsg(parseRes).data());
everythingfine = false;
isfileasexpected[i] = false;
printf("warning: file %s should pass but it fails. Error is: %s\n",
name, simdjson::errorMsg(parseRes).data());
everythingfine = false;
} else if (startsWith("fail", name) && parseRes == 0) {
isfileasexpected[i] = false;
printf("warning: file %s should fail but it passes.\n", name);
everythingfine = false;
isfileasexpected[i] = false;
printf("warning: file %s should fail but it passes.\n", name);
everythingfine = false;
}
free(fullpath);
}
}
printf("%zu files checked.\n", howmany);
if(everythingfine) {
if (everythingfine) {
printf("All ok!\n");
} else {
fprintf(stderr, "There were problems! Consider reviewing the following files:\n");
for(int i = 0; i < c; i++) {
if(!isfileasexpected[i]) { fprintf(stderr, "%s \n", entry_list[i]->d_name);
}
fprintf(stderr,
"There were problems! Consider reviewing the following files:\n");
for (int i = 0; i < c; i++) {
if (!isfileasexpected[i]) {
fprintf(stderr, "%s \n", entry_list[i]->d_name);
}
}
}
for (int i = 0; i < c; ++i) {
free(entry_list[i]);
}
}
free(entry_list);
delete[] isfileasexpected;
return everythingfine;
@@ -124,9 +127,8 @@ int main(int argc, char *argv[]) {
<< std::endl;
return validate("jsonchecker/") ? EXIT_SUCCESS : EXIT_FAILURE;
#else
std::cout
<< "We are going to assume you mean to use the '"<< SIMDJSON_TEST_DATA_DIR <<"' directory."
<< std::endl;
std::cout << "We are going to assume you mean to use the '"
<< SIMDJSON_TEST_DATA_DIR << "' directory." << std::endl;
return validate(SIMDJSON_TEST_DATA_DIR) ? EXIT_SUCCESS : EXIT_FAILURE;
#endif
}

View File

@@ -30,7 +30,8 @@ bool startsWith(const char *pre, const char *str) {
}
bool is_in_bad_list(const char *buf) {
if(buf[0] != '0') return false;
if (buf[0] != '0')
return false;
for (size_t i = 0; i < sizeof(really_bad) / sizeof(really_bad[0]); i++)
if (startsWith(really_bad[i], buf))
return true;
@@ -68,19 +69,22 @@ inline void foundFloat(double result, const uint8_t *buf) {
float_count++;
double expected = strtod((const char *)buf, &endptr);
if (endptr == (const char *)buf) {
fprintf(stderr, "parsed %f from %.32s whereas strtod refuses to parse a float, ",
result, buf);
fprintf(stderr,
"parsed %f from %.32s whereas strtod refuses to parse a float, ",
result, buf);
fprintf(stderr, " while parsing %s \n", fullpath);
parse_error |= PARSE_ERROR;
}
if( fpclassify(expected) != fpclassify(result) ) {
fprintf(stderr, "floats not in the same category expected: %f observed: %f \n", expected, result);
if (fpclassify(expected) != fpclassify(result)) {
fprintf(stderr,
"floats not in the same category expected: %f observed: %f \n",
expected, result);
fprintf(stderr, "%.32s\n", buf);
parse_error |= PARSE_ERROR;
}
// we want to get some reasonable relative accuracy
else if (fabs(expected - result) >
1e-14 * fmin(fabs(expected), fabs(result))) {
else if (fabs(expected - result) >
1e-14 * fmin(fabs(expected), fabs(result))) {
fprintf(stderr, "parsed %.128e from \n", result);
fprintf(stderr, " %.32s whereas strtod gives\n", buf);
fprintf(stderr, " %.128e,", expected);
@@ -131,7 +135,7 @@ bool validate(const char *dirname) {
padded_string p;
try {
get_corpus(fullpath).swap(p);
} catch (const std::exception& e) {
} catch (const std::exception &e) {
std::cout << "Could not load the file " << fullpath << std::endl;
return EXIT_FAILURE;
}
@@ -172,12 +176,14 @@ int main(int argc, char *argv[]) {
if (argc != 2) {
std::cerr << "Usage: " << argv[0] << " <directorywithjsonfiles>"
<< std::endl;
#if defined(SIMDJSON_TEST_DATA_DIR) && defined(SIMDJSON_BENCHMARK_DATA_DIR)
std::cout
<< "We are going to assume you mean to use the '"<< SIMDJSON_TEST_DATA_DIR <<"' and '"<< SIMDJSON_BENCHMARK_DATA_DIR <<"'directories."
<< std::endl;
return validate(SIMDJSON_TEST_DATA_DIR) && validate(SIMDJSON_BENCHMARK_DATA_DIR) ? EXIT_SUCCESS
: EXIT_FAILURE;
#if defined(SIMDJSON_TEST_DATA_DIR) && defined(SIMDJSON_BENCHMARK_DATA_DIR)
std::cout << "We are going to assume you mean to use the '"
<< SIMDJSON_TEST_DATA_DIR << "' and '"
<< SIMDJSON_BENCHMARK_DATA_DIR << "'directories." << std::endl;
return validate(SIMDJSON_TEST_DATA_DIR) &&
validate(SIMDJSON_BENCHMARK_DATA_DIR)
? EXIT_SUCCESS
: EXIT_FAILURE;
#else
std::cout << "We are going to assume you mean to use the 'jsonchecker' and "
"'jsonexamples' directories."

View File

@@ -1,14 +1,14 @@
#include <iostream>
#include "../singleheader/simdjson.h"
#include <iostream>
int main() {
const char * filename = JSON_TEST_PATH;
const char *filename = JSON_TEST_PATH;
padded_string p = get_corpus(filename);
ParsedJson pj = build_parsed_json(p); // do the parsing
if( ! pj.isValid() ) {
if (!pj.isValid()) {
return EXIT_FAILURE;
}
if( ! pj.allocateCapacity(p.size()) ) {
if (!pj.allocateCapacity(p.size())) {
return EXIT_FAILURE;
}
const int res = json_parse(p, pj);

View File

@@ -2,11 +2,11 @@
#include <cstring>
#include <dirent.h>
#include <inttypes.h>
#include <iostream>
#include <math.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#ifndef JSON_TEST_STRINGS
#define JSON_TEST_STRINGS
@@ -201,7 +201,7 @@ static bool parse_string(const char *p, char *output, char **end) {
}
}
// end of borrowed code
char * bigbuffer; // global variable
char *bigbuffer; // global variable
inline void foundBadString(const uint8_t *buf) {
bad_string++;
@@ -328,18 +328,18 @@ bool validate(const char *dirname) {
padded_string p;
try {
get_corpus(fullpath).swap(p);
} catch (const std::exception& e) {
} catch (const std::exception &e) {
std::cout << "Could not load the file " << fullpath << std::endl;
return EXIT_FAILURE;
}
}
ParsedJson pj;
bool allocok = pj.allocateCapacity(p.size(), 1024);
if (!allocok) {
std::cerr << "can't allocate memory" << std::endl;
return false;
}
bigbuffer = (char *) malloc(p.size());
if(bigbuffer == NULL) {
bigbuffer = (char *)malloc(p.size());
if (bigbuffer == NULL) {
std::cerr << "can't allocate memory" << std::endl;
return false;
}
@@ -380,12 +380,14 @@ int main(int argc, char *argv[]) {
if (argc != 2) {
std::cerr << "Usage: " << argv[0] << " <directorywithjsonfiles>"
<< std::endl;
#if defined(SIMDJSON_TEST_DATA_DIR) && defined(SIMDJSON_BENCHMARK_DATA_DIR)
std::cout
<< "We are going to assume you mean to use the '"<< SIMDJSON_TEST_DATA_DIR <<"' and '"<< SIMDJSON_BENCHMARK_DATA_DIR <<"'directories."
<< std::endl;
return validate(SIMDJSON_TEST_DATA_DIR) && validate(SIMDJSON_BENCHMARK_DATA_DIR) ? EXIT_SUCCESS
: EXIT_FAILURE;
#if defined(SIMDJSON_TEST_DATA_DIR) && defined(SIMDJSON_BENCHMARK_DATA_DIR)
std::cout << "We are going to assume you mean to use the '"
<< SIMDJSON_TEST_DATA_DIR << "' and '"
<< SIMDJSON_BENCHMARK_DATA_DIR << "'directories." << std::endl;
return validate(SIMDJSON_TEST_DATA_DIR) &&
validate(SIMDJSON_BENCHMARK_DATA_DIR)
? EXIT_SUCCESS
: EXIT_FAILURE;
#else
std::cout << "We are going to assume you mean to use the 'jsonchecker' and "
"'jsonexamples' directories."