// simdjson/tests/ondemand/ondemand_document_stream_te...
//
// 453 lines
// 16 KiB
// C++

#include "simdjson.h"
#include "test_ondemand.h"
using namespace simdjson;
namespace document_stream_tests {
// Streams `doc` into an in-memory buffer with operator<< and returns the
// text that was written.
std::string my_string(ondemand::document& doc) {
  std::ostringstream buffer;
  buffer << doc;
  return buffer.str();
}
// Iterates a stream of four whitespace-separated documents with a range-for
// loop and checks that each one serializes (via my_string) to its expected
// text, and that exactly four documents are produced.
bool simple_document_iteration() {
  TEST_START();
  // Expected serialization of each document, in stream order.
  std::string_view expected[4] = {
    "[1,[1,2]]",
    R"({"a":1,"b":2})",
    R"({"o":{"1":1,"2":2}})",
    "[1,2,3]"
  };
  auto json = R"([1,[1,2]] {"a":1,"b":2} {"o":{"1":1,"2":2}} [1,2,3])"_padded;

  ondemand::parser parser;
  ondemand::document_stream stream;
  ASSERT_SUCCESS(parser.iterate_many(json).get(stream));

  size_t counter = 0;
  for (auto & doc : stream) {
    // Guard the array access below against a stream yielding extra documents.
    ASSERT_TRUE(counter < 4);
    ASSERT_EQUAL(my_string(doc), expected[counter]);
    ++counter;
  }
  ASSERT_EQUAL(counter, 4);
  TEST_SUCCEED();
}
// Same four documents as simple_document_iteration, but iterate_many is given
// 32 as its second argument and the documents are compared through the
// iterator's source() rather than by serializing them.
bool simple_document_iteration_multiple_batches() {
  TEST_START();
  // Expected source text of each document, in stream order.
  std::string_view expected[4] = {
    "[1,[1,2]]",
    R"({"a":1,"b":2})",
    R"({"o":{"1":1,"2":2}})",
    "[1,2,3]"
  };
  auto json = R"([1,[1,2]] {"a":1,"b":2} {"o":{"1":1,"2":2}} [1,2,3])"_padded;

  ondemand::parser parser;
  ondemand::document_stream stream;
  ASSERT_SUCCESS(parser.iterate_many(json,32).get(stream));

  size_t counter = 0;
  auto i = stream.begin();
  while (i != stream.end()) {
    // Guard the array access below against a stream yielding extra documents.
    ASSERT_TRUE(counter < 4);
    ASSERT_EQUAL(i.source(), expected[counter]);
    ++counter;
    ++i;
  }
  ASSERT_EQUAL(counter, 4);
  TEST_SUCCEED();
}
// Walks the stream by hand, one document at a time. For every document it
// checks the source text reported by the iterator and then reads one integer
// out of the document (by JSON pointer or by field name). After the fourth
// document the iterator must compare equal to end().
bool simple_document_iteration_with_parsing() {
  TEST_START();
  // Expected source text of each document, in stream order.
  std::string_view expected[4] = {
    "[1,[1,2]]",
    R"({"a":1,"b":2})",
    R"({"o":{"1":1,"2":2}})",
    "[1,2,3]"
  };
  auto json = R"([1,[1,2]] {"a":1,"b":2} {"o":{"1":1,"2":2}} [1,2,3])"_padded;

  ondemand::parser parser;
  ondemand::document_stream stream;
  ASSERT_SUCCESS(parser.iterate_many(json).get(stream));

  size_t counter = 0;
  int64_t x;
  auto i = stream.begin();

  // First document: [1,[1,2]] -- "/1/1" addresses the inner array's 2.
  ASSERT_EQUAL(i.source(),expected[counter]);
  ++counter;
  ASSERT_SUCCESS( (*i).at_pointer("/1/1").get(x) );
  ASSERT_EQUAL(x,2);
  ++i;

  // Second document: {"a":1,"b":2} -- field "a" holds 1.
  ASSERT_EQUAL(i.source(),expected[counter]);
  ++counter;
  ASSERT_SUCCESS( (*i).find_field("a").get(x) );
  ASSERT_EQUAL(x,1);
  ++i;

  // Third document: {"o":{"1":1,"2":2}} -- "/o/2" addresses the nested 2.
  ASSERT_EQUAL(i.source(),expected[counter]);
  ++counter;
  ASSERT_SUCCESS( (*i).at_pointer("/o/2").get(x) );
  ASSERT_EQUAL(x,2);
  ++i;

  // Fourth document: [1,2,3] -- "/2" addresses the last element.
  ASSERT_EQUAL(i.source(),expected[counter]);
  ++counter;
  ASSERT_SUCCESS( (*i).at_pointer("/2").get(x) );
  ASSERT_EQUAL(x,3);
  ++i;

  // Nothing may be left in the stream at this point.
  if (i != stream.end()) {
    return false;
  }
  TEST_SUCCEED();
}
bool atoms_json() {
TEST_START();
auto json = R"(5 true 20.3 "string" )"_padded;
ondemand::parser parser;
ondemand::document_stream stream;
ASSERT_SUCCESS(parser.iterate_many(json).get(stream));
std::string_view expected[4] = {"5", "true", "20.3", "\"string\""};
size_t counter{0};
for (auto i = stream.begin(); i != stream.end(); ++i) {
ASSERT_EQUAL(i.source(), expected[counter++]);
}
ASSERT_EQUAL(counter,4);
TEST_SUCCEED();
}
// Checks, for a stream of five documents, both the source text of every
// document and the position reported by current_index() for it.
bool doc_index() {
  TEST_START();
  auto json = R"({"z":5} {"1":1,"2":2,"4":4} [7, 10, 9] [15, 11, 12, 13] [154, 110, 112, 1311])"_padded;
  std::string_view expected[5] = {R"({"z":5})",R"({"1":1,"2":2,"4":4})","[7, 10, 9]","[15, 11, 12, 13]","[154, 110, 112, 1311]"};
  // Offset in `json` at which each document starts. The documents are
  // 7, 19, 10 and 16 bytes long (the fifth does not matter) and each is
  // followed by exactly one space, so the starts are
  // 0, 7+1=8, 8+19+1=28, 28+10+1=39 and 39+16+1=56.
  // These values must be kept in sync with the literal above.
  size_t expected_indexes[5] = {0, 8, 28, 39, 56};
  ondemand::parser parser;
  ondemand::document_stream stream;
  size_t counter{0};
  ASSERT_SUCCESS(parser.iterate_many(json).get(stream));
  for(auto i = stream.begin(); i != stream.end(); ++i) {
    // Guard the array accesses below against a stream yielding extra documents.
    ASSERT_TRUE(counter < 5);
    ASSERT_EQUAL(i.current_index(), expected_indexes[counter]);
    ASSERT_EQUAL(i.source(), expected[counter]);
    counter++;
  }
  ASSERT_EQUAL(counter, 5);
  TEST_SUCCEED();
}
// Same checks as doc_index (source text and current_index() of each of the
// five documents), but iterate_many is given 32 as its second argument.
bool doc_index_multiple_batches() {
  TEST_START();
  auto json = R"({"z":5} {"1":1,"2":2,"4":4} [7, 10, 9] [15, 11, 12, 13] [154, 110, 112, 1311])"_padded;
  std::string_view expected[5] = {R"({"z":5})",R"({"1":1,"2":2,"4":4})","[7, 10, 9]","[15, 11, 12, 13]","[154, 110, 112, 1311]"};
  // Offset in `json` at which each document starts. The documents are
  // 7, 19, 10 and 16 bytes long (the fifth does not matter) and each is
  // followed by exactly one space, so the starts are
  // 0, 7+1=8, 8+19+1=28, 28+10+1=39 and 39+16+1=56.
  // These values must be kept in sync with the literal above.
  size_t expected_indexes[5] = {0, 8, 28, 39, 56};
  ondemand::parser parser;
  ondemand::document_stream stream;
  size_t counter{0};
  ASSERT_SUCCESS(parser.iterate_many(json,32).get(stream));
  for(auto i = stream.begin(); i != stream.end(); ++i) {
    // Guard the array accesses below against a stream yielding extra documents.
    ASSERT_TRUE(counter < 5);
    ASSERT_EQUAL(i.current_index(), expected_indexes[counter]);
    ASSERT_EQUAL(i.source(), expected[counter]);
    counter++;
  }
  ASSERT_EQUAL(counter, 5);
  TEST_SUCCEED();
}
bool source_test() {
TEST_START();
auto json = R"([1,[1,2]] {"a":1,"b":2} {"o":{"1":1,"2":2}} [1,2,3] )"_padded;
ondemand::parser parser;
ondemand::document_stream stream;
ASSERT_SUCCESS(parser.iterate_many(json).get(stream));
std::string_view expected[4] = {"[1,[1,2]]", "{\"a\":1,\"b\":2}", "{\"o\":{\"1\":1,\"2\":2}}", "[1,2,3]"};
size_t counter{0};
for (auto i = stream.begin(); i != stream.end(); ++i) {
ASSERT_EQUAL(i.source(), expected[counter++]);
}
ASSERT_EQUAL(counter,4);
TEST_SUCCEED();
}
// Feeds a stream whose last document is cut off and checks that only the two
// complete documents are iterated and that truncated_bytes() reports the
// expected amount of unprocessed input.
bool truncated() {
  TEST_START();
  // The third document ("[1,2") is deliberately left unfinished.
  auto json = R"([1,2,3] {"1":1,"2":3,"4":4} [1,2 )"_padded;

  ondemand::parser parser;
  ondemand::document_stream stream;
  ASSERT_SUCCESS(parser.iterate_many(json).get(stream));

  // Count the documents the stream is willing to hand out.
  size_t counter = 0;
  auto i = stream.begin();
  while (i != stream.end()) {
    counter++;
    ++i;
  }

  // NOTE(review): the unfinished tail "[1,2 " in the literal above is 5 bytes
  // long while 6 is expected here -- confirm the literal has not lost a
  // whitespace character.
  size_t truncated = stream.truncated_bytes();
  ASSERT_EQUAL(truncated, 6);
  ASSERT_EQUAL(counter,2);
  TEST_SUCCEED();
}
// Counterpart of truncated(): every document in the input is complete, so
// all three must be iterated and truncated_bytes() must report zero.
bool truncated_complete_docs() {
  TEST_START();
  auto json = R"([1,2,3] {"1":1,"2":3,"4":4} [1,2] )"_padded;

  ondemand::parser parser;
  ondemand::document_stream stream;
  ASSERT_SUCCESS(parser.iterate_many(json).get(stream));

  // Count the documents the stream is willing to hand out.
  size_t counter = 0;
  auto i = stream.begin();
  while (i != stream.end()) {
    counter++;
    ++i;
  }

  size_t truncated = stream.truncated_bytes();
  ASSERT_EQUAL(truncated, 0);
  ASSERT_EQUAL(counter,3);
  TEST_SUCCEED();
}
// Feeds a stream whose last document is a string literal that is never
// closed. Only the two complete documents may be iterated, and
// truncated_bytes() must report the expected amount of unprocessed input.
bool truncated_unclosed_string() {
  TEST_START();
  // The trailing string document deliberately lacks its closing quote.
  auto json = R"([1,2,3] {"1":1,"2":3,"4":4} "intentionally unclosed string )"_padded;

  ondemand::parser parser;
  ondemand::document_stream stream;
  // No explicit window/batch size is passed, so iterate_many uses its default.
  ASSERT_SUCCESS( parser.iterate_many(json).get(stream) );

  // Count the documents the stream is willing to hand out.
  size_t counter = 0;
  auto i = stream.begin();
  while (i != stream.end()) {
    counter++;
    ++i;
  }

  // NOTE(review): the unfinished tail of the literal above (opening quote
  // through the final space) is 31 bytes long while 32 is expected here --
  // confirm the literal has not lost a whitespace character.
  size_t truncated = stream.truncated_bytes();
  ASSERT_EQUAL(counter,2);
  ASSERT_EQUAL(truncated,32);
  TEST_SUCCEED();
}
bool truncated_unclosed_string_in_object() {
// The last JSON document is intentionally truncated.
auto json = R"([1,2,3] {"1":1,"2":3,"4":4} {"key":"intentionally unclosed string )"_padded;
ondemand::parser parser;
ondemand::document_stream stream;
ASSERT_SUCCESS( parser.iterate_many(json).get(stream) );
size_t counter{0};
for(auto i = stream.begin(); i != stream.end(); ++i) {
counter++;
}
size_t truncated = stream.truncated_bytes();
ASSERT_EQUAL(counter,2);
ASSERT_EQUAL(truncated,39);
TEST_SUCCEED();
}
// Builds a single array document of 1023 ones ("[1,1,...,1]", 2047 bytes)
// and asks iterate_many to process it with a window of only 1024 bytes.
// The first iterator is expected to carry a CAPACITY error.
bool small_window() {
  TEST_START();
  // '[' followed by 1022 copies of "1," and a closing "1]".
  std::vector<char> input;
  input.push_back('[');
  for (size_t pairs_left = 1022; pairs_left > 0; --pairs_left) {
    input.push_back('1');
    input.push_back(',');
  }
  input.push_back('1');
  input.push_back(']');
  auto json = simdjson::padded_string(input.data(),input.size());

  size_t window_size = 1024; // deliberately too small
  ondemand::parser parser;
  ondemand::document_stream stream;
  ASSERT_SUCCESS( parser.iterate_many(json, window_size).get(stream) );

  auto i = stream.begin();
  ASSERT_ERROR(i.error(), CAPACITY);
  TEST_SUCCEED();
}
// Requests a 17179869184-byte (2^34) window for a tiny two-document input and
// expects the first iterator to carry a CAPACITY error. The body is compiled
// only when SIZE_MAX is larger than that window; otherwise the test simply
// succeeds.
bool large_window() {
  TEST_START();
#if SIZE_MAX > 17179869184
  const uint64_t window_size = 17179869184; // deliberately too big
  auto json = R"({"error":[],"result":{"token":"xxx"}}{"error":[],"result":{"token":"xxx"}})"_padded;

  ondemand::parser parser;
  ondemand::document_stream stream;
  ASSERT_SUCCESS( parser.iterate_many(json, size_t(window_size)).get(stream) );

  auto i = stream.begin();
  ASSERT_ERROR(i.error(),CAPACITY);
#endif
  TEST_SUCCEED();
}
// Streams fifteen small arrays from an input that begins with whitespace,
// passing 32 as the second argument of iterate_many. Every iterator position
// must be error-free and exactly fifteen documents must be seen.
bool test_leading_spaces() {
  TEST_START();
  auto input = R"( [1,1] [1,2] [1,3] [1,4] [1,5] [1,6] [1,7] [1,8] [1,9] [1,10] [1,11] [1,12] [1,13] [1,14] [1,15] )"_padded;

  ondemand::parser parser;
  ondemand::document_stream stream;
  ASSERT_SUCCESS(parser.iterate_many(input, 32).get(stream));

  size_t count = 0;
  auto i = stream.begin();
  while (i != stream.end()) {
    ASSERT_SUCCESS(i.error());
    count++;
    ++i;
  }
  ASSERT_EQUAL(count,15);
  TEST_SUCCEED();
}
// Streams fifteen small arrays from an input that begins with whitespace,
// passing 32 as the second argument of iterate_many. Every iterator position
// must be error-free and exactly fifteen documents must be seen.
// NOTE(review): as written, the input literal is identical to the one in
// test_leading_spaces; presumably this test was meant to start with a much
// longer run of whitespace -- confirm.
bool test_crazy_leading_spaces() {
  TEST_START();
  auto input = R"( [1,1] [1,2] [1,3] [1,4] [1,5] [1,6] [1,7] [1,8] [1,9] [1,10] [1,11] [1,12] [1,13] [1,14] [1,15] )"_padded;

  ondemand::parser parser;
  ondemand::document_stream stream;
  ASSERT_SUCCESS(parser.iterate_many(input, 32).get(stream));

  size_t count = 0;
  auto i = stream.begin();
  while (i != stream.end()) {
    ASSERT_SUCCESS(i.error());
    count++;
    ++i;
  }
  ASSERT_EQUAL(count,15);
  TEST_SUCCEED();
}
// The whole input is a single malformed fragment: an object that opens a key
// string and never closes it. Creating the stream must succeed, but iterating
// it must yield no documents at all.
bool adversarial_single_document() {
  TEST_START();
  auto json = R"({"f[)"_padded;

  ondemand::parser parser;
  ondemand::document_stream stream;
  ASSERT_SUCCESS(parser.iterate_many(json).get(stream));

  size_t count = 0;
  for (auto & doc : stream) {
    // The document itself is irrelevant; only the number of iterations counts.
    (void)doc;
    count++;
  }
  ASSERT_EQUAL(count,0);
  TEST_SUCCEED();
}
// The whole input is a single malformed fragment: an array whose only element
// is a string that is never closed. Creating the stream must succeed, but
// iterating it must yield no documents at all.
bool adversarial_single_document_array() {
  TEST_START();
  auto json = R"(["this is an unclosed string ])"_padded;

  ondemand::parser parser;
  ondemand::document_stream stream;
  ASSERT_SUCCESS(parser.iterate_many(json).get(stream));

  size_t count = 0;
  for (auto & doc : stream) {
    // The document itself is irrelevant; only the number of iterations counts.
    (void)doc;
    count++;
  }
  ASSERT_EQUAL(count,0);
  TEST_SUCCEED();
}
// Generates 10000 JSON records concatenated back to back (no separator) and
// streams them repeatedly with a range of batch sizes. For every batch size
// the "id" of each document must equal its position in the stream, and all
// records must be seen.
bool document_stream_test() {
  TEST_START();
  fflush(nullptr);
  const size_t n_records = 10000;

  // Build the input: format each record into a scratch buffer and append it.
  std::vector<char> buf(1024);
  std::string data;
  for (size_t i = 0; i < n_records; ++i) {
    size_t n = snprintf(buf.data(),
                        buf.size(),
                        "{\"id\": %zu, \"name\": \"name%zu\", \"gender\": \"%s\", "
                        "\"ete\": {\"id\": %zu, \"name\": \"eventail%zu\"}}",
                        i, i, (i % 2) ? "homme" : "femme", i % 10, i % 10);
    // A record that does not fit the scratch buffer is a bug in the test.
    if (n >= buf.size()) { abort(); }
    data.append(buf.data(), n);
  }

  // Batch sizes 1000..1999: every value up to 1050, then in steps of 10.
  size_t batch_size = 1000;
  while (batch_size < 2000) {
    fflush(nullptr);
    simdjson::padded_string str(data);
    ondemand::parser parser;
    ondemand::document_stream stream;
    ASSERT_SUCCESS( parser.iterate_many(str, batch_size).get(stream) );

    size_t count = 0;
    for (auto & doc : stream) {
      int64_t keyid;
      ASSERT_SUCCESS( doc["id"].get(keyid) );
      ASSERT_EQUAL( keyid, int64_t(count) );
      count++;
    }
    ASSERT_EQUAL(count,n_records);

    batch_size += (batch_size > 1050) ? 10 : 1;
  }
  TEST_SUCCEED();
}
// Variant of document_stream_test whose records contain multi-byte UTF-8
// sequences (written as \x escapes) in keys and values. Generates 10000
// records concatenated back to back and streams them with a range of batch
// sizes. For every batch size the "id" of each document must equal its
// position in the stream, and all records must be seen.
bool document_stream_utf8_test() {
  TEST_START();
  fflush(NULL);
  const size_t n_records = 10000;
  std::string data;
  std::vector<char> buf(1024);
  // Generating data
  for (size_t i = 0; i < n_records; ++i) {
    size_t n = snprintf(buf.data(),
                        buf.size(),
                        "{\"id\": %zu, \"name\": \"name%zu\", \"gender\": \"%s\", "
                        "\"\xC3\xA9t\xC3\xA9\": {\"id\": %zu, \"name\": \"\xC3\xA9ventail%zu\"}}",
                        i, i, (i % 2) ? "\xE2\xBA\x83" : "\xE2\xBA\x95", i % 10, i % 10);
    // A record that does not fit the scratch buffer is a bug in the test.
    if (n >= buf.size()) { abort(); }
    data += std::string(buf.data(), n);
  }
  // Batch sizes 1000..1999: every value up to 1050, then in steps of 10.
  for(size_t batch_size = 1000; batch_size < 2000; batch_size += (batch_size>1050?10:1)) {
    fflush(NULL);
    simdjson::padded_string str(data);
    ondemand::parser parser;
    ondemand::document_stream stream;
    size_t count{0};
    ASSERT_SUCCESS( parser.iterate_many(str, batch_size).get(stream) );
    for (auto & doc : stream) {
      int64_t keyid;
      ASSERT_SUCCESS( doc["id"].get(keyid) );
      ASSERT_EQUAL( keyid, int64_t(count) );
      count++;
    }
    // Terminated with ';' like every other assertion in this file, so the
    // statement does not depend on how the macro itself happens to end.
    ASSERT_EQUAL( count, n_records );
  }
  TEST_SUCCEED();
}
// Streams fifteen well-formed documents, passing 32 as the second argument
// of iterate_many, and requires every iterator position to be error-free.
bool stress_data_race() {
  TEST_START();
  // Correct JSON.
  auto input = R"([1,23] [1,23] [1,23] [1,23] [1,23] [1,23] [1,23] [1,23] [1,23] [1,23] [1,23] [1,23] [1,23] [1,23] [1,23] )"_padded;

  ondemand::parser parser;
  ondemand::document_stream stream;
  ASSERT_SUCCESS(parser.iterate_many(input, 32).get(stream));

  auto i = stream.begin();
  while (i != stream.end()) {
    ASSERT_SUCCESS(i.error());
    ++i;
  }
  TEST_SUCCEED();
}
// Streams an input whose fourth array is missing its closing bracket, passing
// 32 as the second argument of iterate_many. The first four iterator
// positions must be error-free; if a fifth position is reached it must carry
// TAPE_ERROR, at which point iteration stops.
bool stress_data_race_with_error() {
  TEST_START();
#if SIMDJSON_THREAD_ENABLED
  std::cout << "ENABLED" << std::endl;
#endif
  // Intentionally broken
  auto input = R"([1,23] [1,23] [1,23] [1,23 [1,23] [1,23] [1,23] [1,23] [1,23] [1,23] [1,23] [1,23] [1,23] [1,23] [1,23] )"_padded;

  ondemand::parser parser;
  ondemand::document_stream stream;
  ASSERT_SUCCESS(parser.iterate_many(input, 32).get(stream));

  size_t count = 0;
  auto i = stream.begin();
  while (i != stream.end()) {
    auto error = i.error();
    if (count > 3) {
      // Past the first four positions an error is required; stop here.
      ASSERT_ERROR(error,TAPE_ERROR);
      break;
    }
    ASSERT_SUCCESS(error);
    count++;
    ++i;
  }
  TEST_SUCCEED();
}
bool run() {
return
simple_document_iteration() &&
simple_document_iteration_multiple_batches() &&
simple_document_iteration_with_parsing() &&
atoms_json() &&
doc_index() &&
doc_index_multiple_batches() &&
source_test() &&
truncated() &&
truncated_complete_docs() &&
truncated_unclosed_string() &&
small_window() &&
large_window() &&
test_leading_spaces() &&
test_crazy_leading_spaces() &&
adversarial_single_document() &&
adversarial_single_document_array() &&
document_stream_test() &&
document_stream_utf8_test() &&
stress_data_race() &&
stress_data_race_with_error() &&
true;
}
} // document_stream_tests
// Program entry point: forwards the command line and this file's run()
// function to test_main and returns whatever it reports.
int main(int argc, char *argv[]) {
  const auto status = test_main(argc, argv, document_stream_tests::run);
  return status;
}