Prototype test for issue 1299: using parse_many, find the location of the end of the last document (#1301)
* Prototype test for issue 1299. * This improves the documentation. * Removing trailing white spaces. * Removing trailing spaces * Trailing.
This commit is contained in:
parent
725ca010e7
commit
9304d88920
|
@ -13,6 +13,8 @@ Contents
|
||||||
- [Support](#support)
|
- [Support](#support)
|
||||||
- [API](#api)
|
- [API](#api)
|
||||||
- [Use cases](#use-cases)
|
- [Use cases](#use-cases)
|
||||||
|
- [Tracking your position](#tracking-your-position)
|
||||||
|
- [Incomplete streams](#incomplete-streams)
|
||||||
|
|
||||||
Motivation
|
Motivation
|
||||||
-----------
|
-----------
|
||||||
|
@ -158,3 +160,58 @@ From [jsonlines.org](http://jsonlines.org/examples/):
|
||||||
```
|
```
|
||||||
JSON Lines' biggest strength is in handling lots of similar nested data structures. One .jsonl file is easier to
|
JSON Lines' biggest strength is in handling lots of similar nested data structures. One .jsonl file is easier to
|
||||||
work with than a directory full of XML files.
|
work with than a directory full of XML files.
|
||||||
|
|
||||||
|
|
||||||
|
Tracking your position
|
||||||
|
-----------
|
||||||
|
|
||||||
|
Some users would like to know where the document they parsed is in the input array of bytes.
|
||||||
|
It is possible to do so by accessing the iterator directly and calling its `current_index()`
|
||||||
|
method which reports the location (in bytes) of the current document in the input stream.
|
||||||
|
|
||||||
|
Let us illustrate the idea with code:
|
||||||
|
|
||||||
|
|
||||||
|
```C++
|
||||||
|
auto json = R"([1,2,3] {"1":1,"2":3,"4":4} [1,2,3] )"_padded;
|
||||||
|
simdjson::dom::parser parser;
|
||||||
|
simdjson::dom::document_stream stream;
|
||||||
|
ASSERT_SUCCESS( parser.parse_many(json).get(stream) );
|
||||||
|
auto i = stream.begin();
|
||||||
|
for(; i != stream.end(); ++i) {
|
||||||
|
auto doc = *i;
|
||||||
|
if(!doc.error()) {
|
||||||
|
std::cout << "got full document at " << i.current_index() << std::endl;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
size_t index = i.current_index();
|
||||||
|
if(index != 38) {
|
||||||
|
std::cerr << "Expected to stop after the three full documents " << std::endl;
|
||||||
|
std::cerr << "index = " << index << std::endl;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
This code will print:
|
||||||
|
```
|
||||||
|
got full document at 0
|
||||||
|
got full document at 9
|
||||||
|
got full document at 29
|
||||||
|
```
|
||||||
|
|
||||||
|
The last call to `i.current_index()` returns the byte index 38, which is just beyond
|
||||||
|
the last document.
|
||||||
|
|
||||||
|
Incomplete streams
|
||||||
|
-----------
|
||||||
|
|
||||||
|
Some users may need to work with truncated streams while tracking their location in the stream.
|
||||||
|
The same code, using `current_index()`, will work. However, if the last block (by default 1MB)
|
||||||
|
terminates with an unclosed string, then no JSON document within this last block will validate.
|
||||||
|
In particular, it means that if your input string is `[1,2,3] {"1":1,"2":3,"4":4} [1,2` then
|
||||||
|
no JSON document will be successfully parsed. The error `simdjson::UNCLOSED_STRING` will be
|
||||||
|
given (even with the first JSON document). It is then your responsibility to terminate the input
|
||||||
|
maybe by appending the missing data at the end of the truncated string, or by copying the truncated
|
||||||
|
data before the continuing input.
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -312,7 +312,93 @@ namespace document_stream_tests {
|
||||||
return count == 1;
|
return count == 1;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
bool simple_example() {
|
||||||
|
std::cout << "Running " << __func__ << std::endl;
|
||||||
|
// The last JSON document is
|
||||||
|
// intentionally truncated.
|
||||||
|
auto json = R"([1,2,3] {"1":1,"2":3,"4":4} [1,2,3] )"_padded;
|
||||||
|
simdjson::dom::parser parser;
|
||||||
|
size_t count = 0;
|
||||||
|
simdjson::dom::document_stream stream;
|
||||||
|
// We use a window of json.size() though any large value would do.
|
||||||
|
ASSERT_SUCCESS( parser.parse_many(json, json.size()).get(stream) );
|
||||||
|
auto i = stream.begin();
|
||||||
|
for(; i != stream.end(); ++i) {
|
||||||
|
auto doc = *i;
|
||||||
|
if(!doc.error()) {
|
||||||
|
std::cout << "got full document at " << i.current_index() << std::endl;
|
||||||
|
count++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if(count != 3) {
|
||||||
|
std::cerr << "Expected to get three full documents " << std::endl;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
size_t index = i.current_index();
|
||||||
|
if(index != 38) {
|
||||||
|
std::cerr << "Expected to stop after the three full documents " << std::endl;
|
||||||
|
std::cerr << "index = " << index << std::endl;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
bool truncated_window() {
|
||||||
|
std::cout << "Running " << __func__ << std::endl;
|
||||||
|
// The last JSON document is
|
||||||
|
// intentionally truncated.
|
||||||
|
auto json = R"([1,2,3] {"1":1,"2":3,"4":4} [1,2 )"_padded;
|
||||||
|
simdjson::dom::parser parser;
|
||||||
|
size_t count = 0;
|
||||||
|
simdjson::dom::document_stream stream;
|
||||||
|
// We use a window of json.size() though any large value would do.
|
||||||
|
ASSERT_SUCCESS( parser.parse_many(json, json.size()).get(stream) );
|
||||||
|
auto i = stream.begin();
|
||||||
|
for(; i != stream.end(); ++i) {
|
||||||
|
auto doc = *i;
|
||||||
|
if(!doc.error()) {
|
||||||
|
std::cout << "got full document at " << i.current_index() << std::endl;
|
||||||
|
count++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if(count != 2) {
|
||||||
|
std::cerr << "Expected to get two full documents " << std::endl;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
size_t index = i.current_index();
|
||||||
|
if(index != 29) {
|
||||||
|
std::cerr << "Expected to stop after the two full documents " << std::endl;
|
||||||
|
std::cerr << "index = " << index << std::endl;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool truncated_window_unclosed_string() {
|
||||||
|
std::cout << "Running " << __func__ << std::endl;
|
||||||
|
// The last JSON document is intentionally truncated. In this instance, we use
|
||||||
|
// a truncated string which will create trouble since stage 1 will recognize the
|
||||||
|
// JSON as invalid and refuse to even start parsing.
|
||||||
|
auto json = R"([1,2,3] {"1":1,"2":3,"4":4} "intentionally unclosed string )"_padded;
|
||||||
|
simdjson::dom::parser parser;
|
||||||
|
simdjson::dom::document_stream stream;
|
||||||
|
// We use a window of json.size() though any large value would do.
|
||||||
|
ASSERT_SUCCESS( parser.parse_many(json,json.size()).get(stream) );
|
||||||
|
// Rest is ineffective because stage 1 fails.
|
||||||
|
auto i = stream.begin();
|
||||||
|
for(; i != stream.end(); ++i) {
|
||||||
|
auto doc = *i;
|
||||||
|
if(!doc.error()) {
|
||||||
|
std::cout << "got full document at " << i.current_index() << std::endl;
|
||||||
|
return false;
|
||||||
|
} else {
|
||||||
|
std::cout << doc.error() << std::endl;
|
||||||
|
return (doc.error() == simdjson::UNCLOSED_STRING);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
bool small_window() {
|
bool small_window() {
|
||||||
std::cout << "Running " << __func__ << std::endl;
|
std::cout << "Running " << __func__ << std::endl;
|
||||||
char input[2049];
|
char input[2049];
|
||||||
|
@ -502,7 +588,10 @@ namespace document_stream_tests {
|
||||||
}
|
}
|
||||||
|
|
||||||
bool run() {
|
bool run() {
|
||||||
return issue1307() &&
|
return simple_example() &&
|
||||||
|
truncated_window() &&
|
||||||
|
truncated_window_unclosed_string() &&
|
||||||
|
issue1307() &&
|
||||||
issue1308() &&
|
issue1308() &&
|
||||||
issue1309() &&
|
issue1309() &&
|
||||||
issue1310() &&
|
issue1310() &&
|
||||||
|
|
Loading…
Reference in New Issue