Prototype test for issue 1299: using parse_many, find the location of the end of the last document (#1301)
* Prototype test for issue 1299. * This improves the documentation. * Removing trailing white spaces. * Removing trailing spaces * Trailing.
This commit is contained in:
parent
725ca010e7
commit
9304d88920
|
@ -13,6 +13,8 @@ Contents
|
|||
- [Support](#support)
|
||||
- [API](#api)
|
||||
- [Use cases](#use-cases)
|
||||
- [Tracking your position](#tracking-your-position)
|
||||
- [Incomplete streams](#incomplete-streams)
|
||||
|
||||
Motivation
|
||||
-----------
|
||||
|
@ -158,3 +160,58 @@ From [jsonlines.org](http://jsonlines.org/examples/):
|
|||
```
|
||||
JSON Lines' biggest strength is in handling lots of similar nested data structures. One .jsonl file is easier to
|
||||
work with than a directory full of XML files.
|
||||
|
||||
|
||||
Tracking your position
|
||||
-----------
|
||||
|
||||
Some users would like to know where the document they parsed is in the input array of bytes.
|
||||
It is possible to do so by directly accessing the iterator and calling its `current_index()`
|
||||
method which reports the location (in bytes) of the current document in the input stream.
|
||||
|
||||
Let us illustrate the idea with code:
|
||||
|
||||
|
||||
```C++
|
||||
auto json = R"([1,2,3] {"1":1,"2":3,"4":4} [1,2,3] )"_padded;
|
||||
simdjson::dom::parser parser;
|
||||
simdjson::dom::document_stream stream;
|
||||
ASSERT_SUCCESS( parser.parse_many(json).get(stream) );
|
||||
auto i = stream.begin();
|
||||
for(; i != stream.end(); ++i) {
|
||||
auto doc = *i;
|
||||
if(!doc.error()) {
|
||||
std::cout << "got full document at " << i.current_index() << std::endl;
|
||||
}
|
||||
}
|
||||
size_t index = i.current_index();
|
||||
if(index != 38) {
|
||||
std::cerr << "Expected to stop after the three full documents " << std::endl;
|
||||
std::cerr << "index = " << index << std::endl;
|
||||
return false;
|
||||
}
|
||||
```
|
||||
|
||||
This code will print:
|
||||
```
|
||||
got full document at 0
|
||||
got full document at 9
|
||||
got full document at 29
|
||||
```
|
||||
|
||||
The last call to `i.current_index()` returns the byte index 38, which is just beyond
|
||||
the last document.
|
||||
|
||||
Incomplete streams
|
||||
-----------
|
||||
|
||||
Some users may need to work with truncated streams while tracking their location in the stream.
|
||||
The same code, using `current_index()`, will work. However, if the last block (by default 1MB)
|
||||
terminates with an unclosed string, then no JSON document within this last block will validate.
|
||||
In particular, it means that if your input string is `[1,2,3] {"1":1,"2":3,"4":4} "intentionally unclosed string` then
|
||||
no JSON document will be successfully parsed. The error `simdjson::UNCLOSED_STRING` will be
|
||||
given (even with the first JSON document). It is then your responsibility to terminate the input,
|
||||
maybe by appending the missing data at the end of the truncated string, or by copying the truncated
|
||||
data before the continuing input.
|
||||
|
||||
|
||||
|
|
|
@ -312,7 +312,93 @@ namespace document_stream_tests {
|
|||
return count == 1;
|
||||
}
|
||||
#endif
|
||||
bool simple_example() {
|
||||
std::cout << "Running " << __func__ << std::endl;
|
||||
// The last JSON document is
|
||||
// intentionally truncated.
|
||||
auto json = R"([1,2,3] {"1":1,"2":3,"4":4} [1,2,3] )"_padded;
|
||||
simdjson::dom::parser parser;
|
||||
size_t count = 0;
|
||||
simdjson::dom::document_stream stream;
|
||||
// We use a window of json.size() though any large value would do.
|
||||
ASSERT_SUCCESS( parser.parse_many(json, json.size()).get(stream) );
|
||||
auto i = stream.begin();
|
||||
for(; i != stream.end(); ++i) {
|
||||
auto doc = *i;
|
||||
if(!doc.error()) {
|
||||
std::cout << "got full document at " << i.current_index() << std::endl;
|
||||
count++;
|
||||
}
|
||||
}
|
||||
if(count != 3) {
|
||||
std::cerr << "Expected to get three full documents " << std::endl;
|
||||
return false;
|
||||
}
|
||||
size_t index = i.current_index();
|
||||
if(index != 38) {
|
||||
std::cerr << "Expected to stop after the three full documents " << std::endl;
|
||||
std::cerr << "index = " << index << std::endl;
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
bool truncated_window() {
|
||||
std::cout << "Running " << __func__ << std::endl;
|
||||
// The last JSON document is
|
||||
// intentionally truncated.
|
||||
auto json = R"([1,2,3] {"1":1,"2":3,"4":4} [1,2 )"_padded;
|
||||
simdjson::dom::parser parser;
|
||||
size_t count = 0;
|
||||
simdjson::dom::document_stream stream;
|
||||
// We use a window of json.size() though any large value would do.
|
||||
ASSERT_SUCCESS( parser.parse_many(json, json.size()).get(stream) );
|
||||
auto i = stream.begin();
|
||||
for(; i != stream.end(); ++i) {
|
||||
auto doc = *i;
|
||||
if(!doc.error()) {
|
||||
std::cout << "got full document at " << i.current_index() << std::endl;
|
||||
count++;
|
||||
}
|
||||
}
|
||||
if(count != 2) {
|
||||
std::cerr << "Expected to get two full documents " << std::endl;
|
||||
return false;
|
||||
}
|
||||
size_t index = i.current_index();
|
||||
if(index != 29) {
|
||||
std::cerr << "Expected to stop after the two full documents " << std::endl;
|
||||
std::cerr << "index = " << index << std::endl;
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool truncated_window_unclosed_string() {
|
||||
std::cout << "Running " << __func__ << std::endl;
|
||||
// The last JSON document is intentionally truncated. In this instance, we use
|
||||
// a truncated string which will create trouble since stage 1 will recognize the
|
||||
// JSON as invalid and refuse to even start parsing.
|
||||
auto json = R"([1,2,3] {"1":1,"2":3,"4":4} "intentionally unclosed string )"_padded;
|
||||
simdjson::dom::parser parser;
|
||||
simdjson::dom::document_stream stream;
|
||||
// We use a window of json.size() though any large value would do.
|
||||
ASSERT_SUCCESS( parser.parse_many(json,json.size()).get(stream) );
|
||||
// Rest is ineffective because stage 1 fails.
|
||||
auto i = stream.begin();
|
||||
for(; i != stream.end(); ++i) {
|
||||
auto doc = *i;
|
||||
if(!doc.error()) {
|
||||
std::cout << "got full document at " << i.current_index() << std::endl;
|
||||
return false;
|
||||
} else {
|
||||
std::cout << doc.error() << std::endl;
|
||||
return (doc.error() == simdjson::UNCLOSED_STRING);
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
bool small_window() {
|
||||
std::cout << "Running " << __func__ << std::endl;
|
||||
char input[2049];
|
||||
|
@ -502,7 +588,10 @@ namespace document_stream_tests {
|
|||
}
|
||||
|
||||
bool run() {
|
||||
return issue1307() &&
|
||||
return simple_example() &&
|
||||
truncated_window() &&
|
||||
truncated_window_unclosed_string() &&
|
||||
issue1307() &&
|
||||
issue1308() &&
|
||||
issue1309() &&
|
||||
issue1310() &&
|
||||
|
|
Loading…
Reference in New Issue