Use parse_many in examples/tests/docs

This commit is contained in:
John Keiser 2020-03-05 11:05:37 -08:00
parent cfef4ff2ad
commit 9a7c8fb5be
11 changed files with 289 additions and 256 deletions

4
.gitignore vendored
View File

@ -77,7 +77,6 @@ objs
/json2json
/jsoncheck
/jsoncheck_noavx
/jsonstream_test
/jsonpointer
/jsonstats
/integer_tests
@ -88,6 +87,7 @@ objs
/ossfuzz-out
/out
/parse
/parse_many_test
/parse_nonumberparsing
/parse_nostringparsing
/parse_noutf8validation
@ -110,7 +110,7 @@ objs
/tests/jsoncheck
/tests/pointercheck
/tests/integer_tests
/tests/jsonstream_test
/tests/parse_many_test
/tests/readme_examples
/tools/json2json
/tools/jsonstats

View File

@ -95,7 +95,7 @@ JSON_INCLUDE:=dependencies/json/single_include/nlohmann/json.hpp
EXTRAOBJECTS=ujdecode.o
MAINEXECUTABLES=parse minify json2json jsonstats statisticalmodel jsonpointer get_corpus_benchmark
TESTEXECUTABLES=jsoncheck jsoncheck_noavx integer_tests numberparsingcheck stringparsingcheck pointercheck jsonstream_test basictests readme_examples
TESTEXECUTABLES=jsoncheck jsoncheck_noavx integer_tests numberparsingcheck stringparsingcheck pointercheck parse_many_test basictests readme_examples
COMPARISONEXECUTABLES=minifiercompetition parsingcompetition parseandstatcompetition distinctuseridcompetition allparserscheckfile allparsingcompetition
SUPPLEMENTARYEXECUTABLES=parse_noutf8validation parse_nonumberparsing parse_nostringparsing
@ -127,8 +127,8 @@ run_stringparsingcheck: stringparsingcheck
run_jsoncheck: jsoncheck
./jsoncheck
run_jsonstream_test: jsonstream_test
./jsonstream_test
run_parse_many_test: parse_many_test
./parse_many_test
run_jsoncheck_noavx: jsoncheck_noavx
./jsoncheck_noavx
@ -148,12 +148,12 @@ $(FEATURE_JSON_FILES): benchmark/genfeaturejson.rb
run_benchfeatures: benchfeatures $(FEATURE_JSON_FILES)
./benchfeatures -n 1000
test: run_basictests run_readme_examples run_jsoncheck run_numberparsingcheck run_integer_tests run_stringparsingcheck run_jsonstream_test run_pointercheck run_testjson2json_sh run_issue150_sh run_jsoncheck_noavx
test: run_basictests run_readme_examples run_jsoncheck run_numberparsingcheck run_integer_tests run_stringparsingcheck run_parse_many_test run_pointercheck run_testjson2json_sh run_issue150_sh run_jsoncheck_noavx
@echo "It looks like the code is good!"
quiettest: run_basictests run_readme_examples run_jsoncheck run_numberparsingcheck run_integer_tests run_stringparsingcheck run_jsoncheck run_jsonstream_test run_pointercheck run_testjson2json_sh run_issue150_sh run_jsoncheck_noavx
quiettest: run_basictests run_readme_examples run_jsoncheck run_numberparsingcheck run_integer_tests run_stringparsingcheck run_jsoncheck run_parse_many_test run_pointercheck run_testjson2json_sh run_issue150_sh run_jsoncheck_noavx
quicktests: run_basictests run_readme_examples run_jsoncheck run_numberparsingcheck run_integer_tests run_stringparsingcheck run_jsoncheck run_jsonstream_test run_pointercheck run_jsoncheck_noavx
quicktests: run_basictests run_readme_examples run_jsoncheck run_numberparsingcheck run_integer_tests run_stringparsingcheck run_jsoncheck run_parse_many_test run_pointercheck run_jsoncheck_noavx
slowtests: run_testjson2json_sh run_issue150_sh
@ -207,8 +207,8 @@ parse_nostringparsing: benchmark/parse.cpp $(HEADERS) $(LIBFILES)
jsoncheck:tests/jsoncheck.cpp $(HEADERS) $(LIBFILES)
$(CXX) $(CXXFLAGS) -o jsoncheck tests/jsoncheck.cpp -I. $(LIBFILES) $(LIBFLAGS)
jsonstream_test:tests/jsonstream_test.cpp $(HEADERS) $(LIBFILES)
$(CXX) $(CXXFLAGS) -o jsonstream_test tests/jsonstream_test.cpp -I. $(LIBFILES) $(LIBFLAGS)
parse_many_test:tests/parse_many_test.cpp $(HEADERS) $(LIBFILES)
$(CXX) $(CXXFLAGS) -o parse_many_test tests/parse_many_test.cpp -I. $(LIBFILES) $(LIBFLAGS)
jsoncheck_noavx:tests/jsoncheck.cpp $(HEADERS) $(LIBFILES)

View File

@ -12,7 +12,6 @@ JSON documents are everywhere on the Internet. Servers spend a lot of time parsi
<img src="images/logo.png" width="10%">
## Real-world usage
- [Microsoft FishStore](https://github.com/microsoft/FishStore)
@ -189,12 +188,10 @@ for (padded_string json : { string("[1, 2, 3]"), string("true"), string("[ true,
## Newline-Delimited JSON (ndjson) and JSON lines
The simdjson library also supports multithreaded JSON streaming through a large file containing many smaller JSON documents in either [ndjson](http://ndjson.org) or [JSON lines](http://jsonlines.org) format. We support files larger than 4GB.
**API and detailed documentation found [here](doc/JsonStream.md).**
The simdjson library also supports multithreaded JSON streaming through a large file containing many smaller JSON documents in either [ndjson](http://ndjson.org) or [JSON lines](http://jsonlines.org) format. If your JSON documents all contain arrays or objects, we even support direct file concatenation without whitespace. The concatenated file has no size restrictions (including larger than 4GB), though each individual document must be less than 4GB.
Here is a simple example, using single header simdjson:
```cpp
#include "simdjson.h"
#include "simdjson.cpp"
@ -202,20 +199,15 @@ Here is a simple example, using single header simdjson:
int parse_file(const char *filename) {
simdjson::padded_string p = simdjson::get_corpus(filename);
simdjson::document::parser parser;
simdjson::JsonStream js{p};
int parse_res = simdjson::SUCCESS_AND_HAS_MORE;
while (parse_res == simdjson::SUCCESS_AND_HAS_MORE) {
parse_res = js.json_parse(parser);
//Do something with parser...
}
for (const document &doc : parser.parse_many(p)) {
// do something with the document ...
}
}
```
## Usage: easy single-header version
See the "singleheader" repository for a single header version. See the included
See the [singleheader](singleheader) directory for a single header version. See the included
file "amalgamation_demo.cpp" for usage. This requires no specific build system: just
copy the files into your project's include path. You can then include them quite simply:
@ -238,10 +230,8 @@ int main(int argc, char *argv[]) {
}
```
Note: In some settings, it might be desirable to precompile `simdjson.cpp` instead of including it.
## Usage (old-school Makefile on platforms like Linux or macOS)
Requirements: recent clang or gcc, and make. We recommend at least GNU GCC/G++ 7 or LLVM clang 6. A 64-bit system like Linux or macOS is expected.

View File

@ -130,34 +130,30 @@ int main(int argc, char *argv[]) {
simdjson::padded_string p = simdjson::get_corpus(filename);
auto [doc, error] = simdjson::document::parse(p); // do the parsing
if (error) {
std::cout << "document::parse failed" << std::endl;
std::cout << "parse failed" << std::endl;
std::cout << "error code: " << error << std::endl;
std::cout << error_message(error) << std::endl;
} else {
std::cout << "document::parse valid" << std::endl;
std::cout << "parse valid" << std::endl;
}
if(argc == 2) {
return EXIT_SUCCESS;
}
//JsonStream
// parse_many
const char * filename2 = argv[2];
simdjson::padded_string p2 = simdjson::get_corpus(filename2);
simdjson::document::parser parser;
simdjson::JsonStream js{p2};
int parse_res = simdjson::SUCCESS_AND_HAS_MORE;
while (parse_res == simdjson::SUCCESS_AND_HAS_MORE) {
parse_res = js.json_parse(parser);
for (auto result : parser.parse_many(p2)) {
error = result.error;
}
if( ! parser.is_valid()) {
std::cout << "JsonStream not valid" << std::endl;
if (error) {
std::cout << "parse_many failed" << std::endl;
std::cout << "error code: " << error << std::endl;
std::cout << error_message(error) << std::endl;
} else {
std::cout << "JsonStream valid" << std::endl;
std::cout << "parse_many valid" << std::endl;
}
return EXIT_SUCCESS;
}
' >> "${DEMOCPP}"

View File

@ -2,9 +2,10 @@
#include <algorithm>
#include <chrono>
#include <vector>
#include "simdjson.h"
#include <map>
#include "simdjson.h"
#define NB_ITERATION 5
#define MIN_BATCH_SIZE 200000
#define MAX_BATCH_SIZE 10000000
@ -32,123 +33,120 @@ int main (int argc, char *argv[]){
std::cerr << "Could not load the file " << filename << std::endl;
return EXIT_FAILURE;
}
if (test_baseline) {
std::wclog << "Baseline: Getline + normal parse... " << std::endl;
std::cout << "Gigabytes/second\t" << "Nb of documents parsed" << std::endl;
for (auto i = 0; i < 3; i++) {
//Actual test
simdjson::ParsedJson pj;
bool allocok = pj.allocate_capacity(p.size());
if (!allocok) {
std::cerr << "failed to allocate memory" << std::endl;
return EXIT_FAILURE;
}
std::istringstream ss(std::string(p.data(), p.size()));
auto start = std::chrono::steady_clock::now();
int count = 0;
std::string line;
int parse_res = simdjson::SUCCESS;
while (getline(ss, line)) {
parse_res = simdjson::json_parse(line, pj);
count++;
}
auto end = std::chrono::steady_clock::now();
std::chrono::duration<double> secs = end - start;
double speedinGBs = (p.size()) / (secs.count() * 1000000000.0);
std::cout << speedinGBs << "\t\t\t\t" << count << std::endl;
if (parse_res != simdjson::SUCCESS) {
std::cerr << "Parsing failed" << std::endl;
exit(1);
}
}
}
std::map<size_t, double> batch_size_res;
if(test_per_batch) {
std::wclog << "Jsonstream: Speed per batch_size... from " << MIN_BATCH_SIZE
<< " bytes to " << MAX_BATCH_SIZE << " bytes..." << std::endl;
std::cout << "Batch Size\t" << "Gigabytes/second\t" << "Nb of documents parsed" << std::endl;
for (size_t i = MIN_BATCH_SIZE; i <= MAX_BATCH_SIZE; i += (MAX_BATCH_SIZE - MIN_BATCH_SIZE) / 50) {
batch_size_res.insert(std::pair<size_t, double>(i, 0));
int count;
for (size_t j = 0; j < 5; j++) {
if (test_baseline) {
std::wclog << "Baseline: Getline + normal parse... " << std::endl;
std::cout << "Gigabytes/second\t" << "Nb of documents parsed" << std::endl;
for (auto i = 0; i < 3; i++) {
//Actual test
simdjson::ParsedJson pj;
simdjson::JsonStream js{p, i};
int parse_res = simdjson::SUCCESS_AND_HAS_MORE;
simdjson::document::parser parser;
bool allocok = parser.allocate_capacity(p.size());
if (!allocok) {
std::cerr << "failed to allocate memory" << std::endl;
return EXIT_FAILURE;
}
std::istringstream ss(std::string(p.data(), p.size()));
auto start = std::chrono::steady_clock::now();
count = 0;
while (parse_res == simdjson::SUCCESS_AND_HAS_MORE) {
parse_res = js.json_parse(pj);
int count = 0;
std::string line;
int parse_res = simdjson::SUCCESS;
while (getline(ss, line)) {
// TODO we're likely triggering simdjson's padding reallocation here. Is that intentional?
parser.parse(line);
count++;
}
auto end = std::chrono::steady_clock::now();
std::chrono::duration<double> secs = end - start;
double speedinGBs = (p.size()) / (secs.count() * 1000000000.0);
if (speedinGBs > batch_size_res.at(i))
batch_size_res[i] = speedinGBs;
std::cout << speedinGBs << "\t\t\t\t" << count << std::endl;
if (parse_res != simdjson::SUCCESS) {
std::wcerr << "Parsing failed with: " << simdjson::error_message(parse_res).c_str() << std::endl;
std::cerr << "Parsing failed" << std::endl;
exit(1);
}
}
std::cout << i << "\t\t" << std::fixed << std::setprecision(3) << batch_size_res.at(i) << "\t\t\t\t" << count << std::endl;
}
}
if(test_best_batch) {
size_t optimal_batch_size;
if(test_per_batch)
optimal_batch_size = (*min_element(batch_size_res.begin(), batch_size_res.end(), compare)).first;
else
optimal_batch_size = MIN_BATCH_SIZE;
std::wclog << "Starting speed test... Best of " << NB_ITERATION << " iterations..." << std::endl;
std::wclog << "Seemingly optimal batch_size: " << optimal_batch_size << "..." << std::endl;
std::vector<double> res;
for (int i = 0; i < NB_ITERATION; i++) {
//Actual test
simdjson::ParsedJson pj;
simdjson::JsonStream js{p, 4000000};
int parse_res = simdjson::SUCCESS_AND_HAS_MORE;
auto start = std::chrono::steady_clock::now();
while (parse_res == simdjson::SUCCESS_AND_HAS_MORE) {
parse_res = js.json_parse(pj);
}
auto end = std::chrono::steady_clock::now();
std::chrono::duration<double> secs = end - start;
res.push_back(secs.count());
if (parse_res != simdjson::SUCCESS) {
std::cerr << "Parsing failed" << std::endl;
exit(1);
}
}
std::min(res.begin(), res.end());
std::map<size_t, double> batch_size_res;
if(test_per_batch) {
std::wclog << "parse_many: Speed per batch_size... from " << MIN_BATCH_SIZE
<< " bytes to " << MAX_BATCH_SIZE << " bytes..." << std::endl;
std::cout << "Batch Size\t" << "Gigabytes/second\t" << "Nb of documents parsed" << std::endl;
for (size_t i = MIN_BATCH_SIZE; i <= MAX_BATCH_SIZE; i += (MAX_BATCH_SIZE - MIN_BATCH_SIZE) / 50) {
batch_size_res.insert(std::pair<size_t, double>(i, 0));
int count;
for (size_t j = 0; j < 5; j++) {
//Actual test
simdjson::document::parser parser;
simdjson::error_code error;
double min_result = *min_element(res.begin(), res.end());
double speedinGBs = (p.size()) / (min_result * 1000000000.0);
std::cout << "Min: " << min_result << " bytes read: " << p.size()
<< " Gigabytes/second: " << speedinGBs << std::endl;
}
auto start = std::chrono::steady_clock::now();
count = 0;
for (auto result : parser.parse_many(p, 4000000)) {
error = result.error;
count++;
}
auto end = std::chrono::steady_clock::now();
std::chrono::duration<double> secs = end - start;
double speedinGBs = (p.size()) / (secs.count() * 1000000000.0);
if (speedinGBs > batch_size_res.at(i))
batch_size_res[i] = speedinGBs;
if (error != simdjson::SUCCESS) {
std::wcerr << "Parsing failed with: " << simdjson::error_message(error).c_str() << std::endl;
exit(1);
}
}
std::cout << i << "\t\t" << std::fixed << std::setprecision(3) << batch_size_res.at(i) << "\t\t\t\t" << count << std::endl;
}
}
if (test_best_batch) {
size_t optimal_batch_size;
if (test_per_batch) {
optimal_batch_size = (*min_element(batch_size_res.begin(), batch_size_res.end(), compare)).first;
} else {
optimal_batch_size = MIN_BATCH_SIZE;
}
std::wclog << "Starting speed test... Best of " << NB_ITERATION << " iterations..." << std::endl;
std::wclog << "Seemingly optimal batch_size: " << optimal_batch_size << "..." << std::endl;
std::vector<double> res;
for (int i = 0; i < NB_ITERATION; i++) {
// Actual test
simdjson::document::parser parser;
simdjson::error_code error;
auto start = std::chrono::steady_clock::now();
// TODO this includes allocation of the parser; is that intentional?
for (auto result : parser.parse_many(p, 4000000)) {
error = result.error;
}
auto end = std::chrono::steady_clock::now();
std::chrono::duration<double> secs = end - start;
res.push_back(secs.count());
if (error != simdjson::SUCCESS) {
std::wcerr << "Parsing failed with: " << simdjson::error_message(error).c_str() << std::endl;
exit(1);
}
}
std::min(res.begin(), res.end());
double min_result = *min_element(res.begin(), res.end());
double speedinGBs = (p.size()) / (min_result * 1000000000.0);
std::cout << "Min: " << min_result << " bytes read: " << p.size()
<< " Gigabytes/second: " << speedinGBs << std::endl;
}
return 0;
}

View File

@ -1,7 +1,12 @@
# simdjson
## JsonStream
NOTE: The JsonStream API has been deprecated, though its functionality remains! Please use
`parser.parse_many()` instead. See API documentation for more.
An interface for working with files or streams that contain multiple JSON documents, as quickly and conveniently as possible.
## Contents
- [simdjson](#simdjson)
- [JsonStream](#jsonstream)

View File

@ -111,13 +111,13 @@ private:
inline size_t get_current_buffer_loc() const { return current_buffer_loc; }
/**
* Returns the total amount of complete documents parsed by the JsonStream,
* Returns the total amount of complete documents parsed by the stream,
* in the current buffer, at the given time.
*/
inline size_t get_n_parsed_docs() const { return n_parsed_docs; }
/**
* Returns the total amount of data (in bytes) parsed by the JsonStream,
* Returns the total amount of data (in bytes) parsed by the stream,
* in the current buffer, at the given time.
*/
inline size_t get_n_bytes_parsed() const { return n_bytes_parsed; }
@ -149,7 +149,7 @@ private:
#endif
template <class string_container> friend class JsonStream;
friend class document::parser;
}; // end of class JsonStream
}; // class document::stream
} // end of namespace simdjson
#endif // SIMDJSON_DOCUMENT_STREAM_H

View File

@ -64,7 +64,7 @@ public:
* @param buf the json document to parse. *MUST* be allocated up to len + SIMDJSON_PADDING bytes.
* @param len the length of the json document.
* @param parser the parser with the buffers to use. *MUST* have allocated up to at least len capacity.
* @param streaming whether this is being called by a JsonStream parser.
* @param streaming whether this is being called by document::parser::parse_many.
* @return the error code, or SUCCESS if there was no error.
*/
WARN_UNUSED virtual error_code stage1(const uint8_t *buf, size_t len, document::parser &parser, bool streaming) const noexcept = 0;
@ -82,7 +82,7 @@ public:
WARN_UNUSED virtual error_code stage2(const uint8_t *buf, size_t len, document::parser &parser) const noexcept = 0;
/**
* Stage 2 of the document parser for JsonStream.
* Stage 2 of the document parser for document::parser::parse_many.
*
* Overridden by each implementation.
*

View File

@ -14,7 +14,7 @@ add_cpp_test(basictests)
#endif()
add_cpp_test(jsoncheck)
add_cpp_test(jsonstream_test)
add_cpp_test(parse_many_test)
add_cpp_test(pointercheck)
add_cpp_test(integer_tests)

View File

@ -251,16 +251,13 @@ static bool parse_json_message_issue467(char const* message, std::size_t len, si
std::cerr << "Failed to allocated memory for simdjson::document::parser" << std::endl;
return false;
}
int res;
simdjson::padded_string str(message,len);
simdjson::JsonStream<simdjson::padded_string> js(str, parser.capacity());
do {
res = js.json_parse(parser);
count++;
} while (res == simdjson::SUCCESS_AND_HAS_MORE);
if (res != simdjson::SUCCESS) {
std::cerr << "Failed with simdjson error= " << simdjson::error_message(res) << std::endl;
return false;
for (auto [doc, error] : parser.parse_many(str, parser.capacity())) {
if (error) {
std::cerr << "Failed with simdjson error= " << simdjson::error_message(error) << std::endl;
return false;
}
count++;
}
if(count != expectedcount) {
std::cerr << "bad count" << std::endl;
@ -407,49 +404,48 @@ bool JsonStream_utf8_test() {
i, i, (i % 2) ? "" : "", i % 10, i % 10);
data += std::string(buf, n);
}
for(size_t batch_size = 1000; batch_size < 2000; batch_size += (batch_size>1050?10:1)) {
printf(".");
fflush(NULL);
simdjson::padded_string str(data);
simdjson::JsonStream<simdjson::padded_string> js{str, batch_size};
int parse_res = simdjson::SUCCESS_AND_HAS_MORE;
size_t count = 0;
simdjson::document::parser parser;
while (parse_res == simdjson::SUCCESS_AND_HAS_MORE) {
parse_res = js.json_parse(parser);
simdjson::document::iterator iter(parser);
if(!iter.is_object()) {
printf("Root should be object\n");
return false;
}
if(!iter.down()) {
printf("Root should not be emtpy\n");
return false;
}
if(!iter.is_string()) {
printf("Object should start with string key\n");
return false;
}
if(strcmp(iter.get_string(),"id")!=0) {
printf("There should a single key, id.\n");
return false;
}
iter.move_to_value();
if(!iter.is_integer()) {
printf("Value of image should be integer\n");
return false;
}
int64_t keyid = iter.get_integer();
if(keyid != (int64_t)count) {
printf("key does not match %d, expected %d\n",(int) keyid, (int) count);
return false;
}
count++;
}
if(count != n_records) {
printf("Something is wrong in JsonStream_utf8_test at window size = %zu.\n", batch_size);
const size_t batch_size = 1000;
printf(".");
fflush(NULL);
simdjson::padded_string str(data);
simdjson::JsonStream<simdjson::padded_string> js{str, batch_size};
int parse_res = simdjson::SUCCESS_AND_HAS_MORE;
size_t count = 0;
simdjson::document::parser parser;
while (parse_res == simdjson::SUCCESS_AND_HAS_MORE) {
parse_res = js.json_parse(parser);
simdjson::document::iterator iter(parser);
if(!iter.is_object()) {
printf("Root should be object\n");
return false;
}
if(!iter.down()) {
printf("Root should not be emtpy\n");
return false;
}
if(!iter.is_string()) {
printf("Object should start with string key\n");
return false;
}
if(strcmp(iter.get_string(),"id")!=0) {
printf("There should a single key, id.\n");
return false;
}
iter.move_to_value();
if(!iter.is_integer()) {
printf("Value of image should be integer\n");
return false;
}
int64_t keyid = iter.get_integer();
if(keyid != (int64_t)count) {
printf("key does not match %d, expected %d\n",(int) keyid, (int) count);
return false;
}
count++;
}
if(count != n_records) {
printf("Something is wrong in JsonStream_utf8_test at window size = %zu.\n", batch_size);
return false;
}
printf("ok\n");
return true;
@ -469,49 +465,48 @@ bool JsonStream_test() {
i, i, (i % 2) ? "homme" : "femme", i % 10, i % 10);
data += std::string(buf, n);
}
for(size_t batch_size = 1000; batch_size < 2000; batch_size += (batch_size>1050?10:1)) {
printf(".");
fflush(NULL);
simdjson::padded_string str(data);
simdjson::JsonStream<simdjson::padded_string> js{str, batch_size};
int parse_res = simdjson::SUCCESS_AND_HAS_MORE;
size_t count = 0;
simdjson::document::parser parser;
while (parse_res == simdjson::SUCCESS_AND_HAS_MORE) {
parse_res = js.json_parse(parser);
simdjson::document::iterator iter(parser);
if(!iter.is_object()) {
printf("Root should be object\n");
return false;
}
if(!iter.down()) {
printf("Root should not be emtpy\n");
return false;
}
if(!iter.is_string()) {
printf("Object should start with string key\n");
return false;
}
if(strcmp(iter.get_string(),"id")!=0) {
printf("There should a single key, id.\n");
return false;
}
iter.move_to_value();
if(!iter.is_integer()) {
printf("Value of image should be integer\n");
return false;
}
int64_t keyid = iter.get_integer();
if(keyid != (int64_t)count) {
printf("key does not match %d, expected %d\n",(int) keyid, (int) count);
return false;
}
count++;
}
if(count != n_records) {
printf("Something is wrong in JsonStream_test at window size = %zu.\n", batch_size);
const size_t batch_size = 1000;
printf(".");
fflush(NULL);
simdjson::padded_string str(data);
simdjson::JsonStream<simdjson::padded_string> js{str, batch_size};
int parse_res = simdjson::SUCCESS_AND_HAS_MORE;
size_t count = 0;
simdjson::document::parser parser;
while (parse_res == simdjson::SUCCESS_AND_HAS_MORE) {
parse_res = js.json_parse(parser);
simdjson::document::iterator iter(parser);
if(!iter.is_object()) {
printf("Root should be object\n");
return false;
}
if(!iter.down()) {
printf("Root should not be emtpy\n");
return false;
}
if(!iter.is_string()) {
printf("Object should start with string key\n");
return false;
}
if(strcmp(iter.get_string(),"id")!=0) {
printf("There should a single key, id.\n");
return false;
}
iter.move_to_value();
if(!iter.is_integer()) {
printf("Value of image should be integer\n");
return false;
}
int64_t keyid = iter.get_integer();
if(keyid != (int64_t)count) {
printf("key does not match %d, expected %d\n",(int) keyid, (int) count);
return false;
}
count++;
}
if(count != n_records) {
printf("Something is wrong in JsonStream_test at window size = %zu.\n", batch_size);
return false;
}
printf("ok\n");
return true;
@ -565,6 +560,54 @@ bool document_stream_test() {
return true;
}
// returns true if successful
// Regression test for parse_many() on a stream of concatenated JSON documents
// whose keys and values contain multi-byte UTF-8 text ("été", "éventail"),
// swept across many batch sizes so that document boundaries land at varied
// offsets relative to the batch window. Returns true on success.
bool document_stream_utf8_test() {
printf("Running document_stream_utf8_test");
fflush(NULL);
// Number of small JSON documents concatenated into the test stream.
const size_t n_records = 10000;
std::string data;
char buf[1024];
for (size_t i = 0; i < n_records; ++i) {
auto n = sprintf(buf,
"{\"id\": %zu, \"name\": \"name%zu\", \"gender\": \"%s\", "
"\"été\": {\"id\": %zu, \"name\": \"éventail%zu\"}}",
i, i, (i % 2) ? "" : "", i % 10, i % 10);
data += std::string(buf, n);
}
// Sweep batch sizes 1000..1999: step 1 up to 1050, then step 10, to cover
// many distinct alignments of document boundaries against the batch size.
for(size_t batch_size = 1000; batch_size < 2000; batch_size += (batch_size>1050?10:1)) {
printf(".");
fflush(NULL);
simdjson::padded_string str(data);
simdjson::document::parser parser;
size_t count = 0;
for (auto [doc, error] : parser.parse_many(str, batch_size)) {
if (error) {
printf("Error at on document %zd at batch size %zu: %s\n", count, batch_size, simdjson::error_message(error).c_str());
return false;
}
// Each document's "id" field was written as its ordinal position, so it
// must match the running count; a mismatch means documents were skipped,
// duplicated, or delivered out of order.
auto [keyid, error2] = doc["id"].as_int64_t();
if (error2) {
printf("Error getting id as int64 on document %zd at batch size %zu: %s\n", count, batch_size, simdjson::error_message(error2).c_str());
return false;
}
if (keyid != int64_t(count)) {
printf("key does not match %ld, expected %zd on document %zd at batch size %zu\n", keyid, count, count, batch_size);
return false;
}
count++;
}
// Every generated document must have been parsed exactly once.
if(count != n_records) {
printf("Found wrong number of documents %zd, expected %zd at batch size %zu\n", count, n_records, batch_size);
return false;
}
}
printf("ok\n");
return true;
}
// returns true if successful
bool skyprophet_test() {
const size_t n_records = 100000;
@ -830,12 +873,6 @@ int main() {
std::cout << "Running basic tests." << std::endl;
if(!json_issue467())
return EXIT_FAILURE;
if(!JsonStream_test())
return EXIT_FAILURE;
if(!JsonStream_utf8_test())
return EXIT_FAILURE;
if(!document_stream_test())
return EXIT_FAILURE;
if(!number_test_small_integers())
return EXIT_FAILURE;
if(!stable_test())
@ -852,6 +889,14 @@ int main() {
return EXIT_FAILURE;
if (!dom_api::run_tests())
return EXIT_FAILURE;
if(!document_stream_test())
return EXIT_FAILURE;
if(!document_stream_utf8_test())
return EXIT_FAILURE;
if(!JsonStream_test())
return EXIT_FAILURE;
if(!JsonStream_utf8_test())
return EXIT_FAILURE;
std::cout << "Basic tests are ok." << std::endl;
return EXIT_SUCCESS;
}

View File

@ -15,7 +15,7 @@
#include "simdjson.h"
/**
* Does the file filename ends with the given extension.
* Does the file filename end with the given extension.
*/
static bool has_extension(const char *filename, const char *extension) {
const char *ext = strrchr(filename, '.');
@ -76,37 +76,36 @@ bool validate(const char *dirname) {
/* The actual test*/
simdjson::padded_string p;
simdjson::padded_string json;
try {
simdjson::get_corpus(fullpath).swap(p);
simdjson::get_corpus(fullpath).swap(json);
} catch (const std::exception &) {
std::cerr << "Could not load the file " << fullpath << std::endl;
return EXIT_FAILURE;
}
simdjson::ParsedJson pj;
simdjson::JsonStream js{p};
simdjson::document::parser parser;
++how_many;
int parse_res = simdjson::SUCCESS_AND_HAS_MORE;
while(parse_res == simdjson::SUCCESS_AND_HAS_MORE){
parse_res = js.json_parse(pj);
simdjson::error_code error = simdjson::SUCCESS;
for (auto result : parser.parse_many(json)) {
error = result.error;
}
printf("%s\n", parse_res == 0 ? "ok" : "invalid");
printf("%s\n", error ? "ok" : "invalid");
/* Check if the file is supposed to pass or not. Print the results */
if (contains("EXCLUDE", name)) {
// skipping
how_many--;
} else if (starts_with("pass", name) and (has_extension(extension1, name) or has_extension(extension2, name)) and parse_res != 0) {
} else if (starts_with("pass", name) and (has_extension(extension1, name) or has_extension(extension2, name)) and error) {
is_file_as_expected[i] = false;
printf("warning: file %s should pass but it fails. Error is: %s\n",
name, simdjson::error_message(parse_res).data());
printf("size of file in bytes: %zu \n", p.size());
name, simdjson::error_message(error).data());
printf("size of file in bytes: %zu \n", json.size());
everything_fine = false;
} else if ( starts_with("fail", name) and (not starts_with("fail10.json", name)) and parse_res == 0) {
} else if ( starts_with("fail", name) and (not starts_with("fail10.json", name)) and !error) {
is_file_as_expected[i] = false;
printf("warning: file %s should fail but it passes.\n", name);
printf("size of file in bytes: %zu \n", p.size());
printf("size of file in bytes: %zu \n", json.size());
everything_fine = false;
}
free(fullpath);