Fix an issue with the truncated_bytes function. (#1674)

This commit is contained in:
Daniel Lemire 2021-07-30 13:12:42 -04:00 committed by GitHub
parent f657516a7e
commit d83e69d977
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 56 additions and 2 deletions

View File

@ -1046,7 +1046,12 @@ for (auto doc: stream) {
ondemand::value val;
error = doc.at_pointer("/4").get(val);
// error == simdjson::CAPACITY
if(error) { std::cerr << error << std::endl; break; }
if(error) {
std::cerr << error << std::endl;
// We left 293 bytes unprocessed at the tail end of the input.
std::cout << " unprocessed bytes at the end: " << stream.truncated_bytes() << std::endl;
break;
}
}
counter++;
}
@ -1062,6 +1067,7 @@ This example should print out:
5 = 5
5 = 5
This parser can't support a document that big
unprocessed bytes at the end: 293
```
If your documents are large (e.g., larger than a megabyte), then the `iterate_many` function is maybe ill-suited. It is really meant to support reading efficiently streams of relatively small documents (e.g., a few kilobytes each). If you have larger documents, you should use other functions like `iterate`.

View File

@ -256,6 +256,7 @@ inline size_t document_stream::size_in_bytes() const noexcept {
}
// Returns the number of bytes at the tail end of the input that were left
// unprocessed (i.e., that belong to a document the stream could not parse).
inline size_t document_stream::truncated_bytes() const noexcept {
// A CAPACITY error means the current batch could not be processed at all,
// so everything from the start of this batch to the end of the input is unprocessed.
if(error == CAPACITY) { return len - batch_start; }
// NOTE(review): this relies on the stage-1 streaming indexer's convention that
// structural_indexes[n_structural_indexes] and structural_indexes[n_structural_indexes + 1]
// bracket the truncated region (the first holding the larger offset, so the size_t
// subtraction stays non-negative) — confirm against the partial/streaming stage-1 code.
return parser->implementation->structural_indexes[parser->implementation->n_structural_indexes] - parser->implementation->structural_indexes[parser->implementation->n_structural_indexes + 1];
}

View File

@ -126,6 +126,7 @@ inline size_t document_stream::size_in_bytes() const noexcept {
}
// Returns the number of bytes at the tail end of the input that were left
// unprocessed (i.e., that belong to a document the stream could not parse).
inline size_t document_stream::truncated_bytes() const noexcept {
// A CAPACITY error means the current batch could not be processed at all,
// so everything from the start of this batch to the end of the input is unprocessed.
if(error == CAPACITY) { return len - batch_start; }
// NOTE(review): this relies on the stage-1 streaming indexer's convention that
// structural_indexes[n_structural_indexes] and structural_indexes[n_structural_indexes + 1]
// bracket the truncated region (the first holding the larger offset, so the size_t
// subtraction stays non-negative) — confirm against the partial/streaming stage-1 code.
return parser->implementation->structural_indexes[parser->implementation->n_structural_indexes] - parser->implementation->structural_indexes[parser->implementation->n_structural_indexes + 1];
}

View File

@ -611,6 +611,43 @@ namespace document_stream_tests {
return true;
}
// Regression test for issue #1668: when a single document exceeds the batch
// capacity, truncated_bytes() should report the whole input as unprocessed.
bool issue1668() {
TEST_START();
// One JSON array of 100 integers — larger than the 50-byte batch below.
auto json = R"([1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100])"_padded;
simdjson::dom::parser odparser;
simdjson::dom::document_stream odstream;
// Batch capacity of 50 bytes is deliberately too small for the document.
ASSERT_SUCCESS( odparser.parse_many(json.data(), json.length(), 50).get(odstream) );
for (auto doc: odstream) {
simdjson::dom::element val;
// The oversized document must surface as a CAPACITY error...
ASSERT_ERROR(doc.at_pointer("/40").get(val), simdjson::CAPACITY);
// ...and, since nothing was parsed, every input byte counts as truncated.
ASSERT_EQUAL(odstream.truncated_bytes(), json.length());
}
TEST_SUCCEED();
}
// Regression test for issue #1668 (long variant): several small documents fit
// in the batch and parse fine; the final oversized document triggers CAPACITY,
// and truncated_bytes() should report only that document's bytes as unprocessed.
bool issue1668_long() {
TEST_START();
// Six small 5-element arrays followed by one 100-element array that
// exceeds the 50-byte batch capacity used below.
auto json = R"([1,2,3,4,5] [1,2,3,4,5] [1,2,3,4,5] [1,2,3,4,5] [1,2,3,4,5] [1,2,3,4,5] [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100])"_padded;
simdjson::dom::parser odparser;
simdjson::dom::document_stream odstream;
size_t counter{0};
ASSERT_SUCCESS( odparser.parse_many(json.data(), json.length(), 50).get(odstream) );
for (auto doc: odstream) {
if(counter < 6) {
// The first six small documents parse successfully within the batch.
int64_t val;
ASSERT_SUCCESS(doc.at_pointer("/4").get(val));
ASSERT_EQUAL(val, 5);
} else {
// The seventh document is too large for the batch capacity.
simdjson::dom::element val;
ASSERT_ERROR(doc.at_pointer("/4").get(val), simdjson::CAPACITY);
// We left 293 bytes unprocessed.
ASSERT_EQUAL(odstream.truncated_bytes(), 293);
}
counter++;
}
TEST_SUCCEED();
}
bool small_window() {
std::cout << "Running " << __func__ << std::endl;
std::vector<char> input;

View File

@ -231,6 +231,7 @@ namespace document_stream_tests {
ASSERT_SUCCESS( parser.iterate_many(json, window_size).get(stream) );
auto i = stream.begin();
ASSERT_ERROR(i.error(), CAPACITY);
ASSERT_EQUAL(stream.truncated_bytes(), json.length());
TEST_SUCCEED();
}
@ -356,6 +357,7 @@ namespace document_stream_tests {
for (auto doc: odstream) {
ondemand::value val;
ASSERT_ERROR(doc.at_pointer("/40").get(val), CAPACITY);
ASSERT_EQUAL(odstream.truncated_bytes(), json.length());
}
TEST_SUCCEED();
}
@ -376,6 +378,8 @@ namespace document_stream_tests {
} else {
ondemand::value val;
ASSERT_ERROR(doc.at_pointer("/4").get(val), CAPACITY);
// We left 293 bytes unprocessed.
ASSERT_EQUAL(odstream.truncated_bytes(), 293);
}
counter++;
}

View File

@ -428,7 +428,12 @@ bool stream_capacity_example() {
ondemand::value val;
error = doc.at_pointer("/4").get(val);
// error == simdjson::CAPACITY
if(error) { std::cerr << error << std::endl; break; }
if(error) {
std::cerr << error << std::endl;
// We left 293 bytes unprocessed at the tail end of the input.
std::cout << " unprocessed bytes at the end: " << stream.truncated_bytes() << std::endl;
break;
}
}
counter++;
}