Document stream: truncate final unfinished document and give access to the number of truncated bytes. (#1534)
* Truncate final unclosed string.
* Adding more precise remarks.
* Better documentation and more robust code.
* ARM + PPC corrections.
* Patching ARM implementation with new stage1_mode parameter.
* Fixed most problems.
* Correcting white spaces and adding a remark.
* This adds the truncated_bytes() method to the stream instances.
This commit is contained in:
parent 48f5e8b6c3
commit 8eed8f5155
@ -331,7 +331,7 @@ struct benchmarker {
|
|||
|
||||
// Stage 1 (find structurals)
|
||||
collector.start();
|
||||
error = parser.implementation->stage1(reinterpret_cast<const uint8_t *>(json.data()), json.size(), false);
|
||||
error = parser.implementation->stage1(reinterpret_cast<const uint8_t *>(json.data()), json.size(), stage1_mode::regular);
|
||||
event_count stage1_count = collector.end();
|
||||
stage1 << stage1_count;
|
||||
if (error) {
|
||||
|
|
|
@ -183,7 +183,7 @@ int main(int argc, char *argv[]) {
|
|||
for (uint32_t i = 0; i < iterations; i++) {
|
||||
unified.start();
|
||||
// The default template is simdjson::architecture::NATIVE.
|
||||
bool isok = (parser.implementation->stage1((const uint8_t *)p.data(), p.size(), false) == simdjson::SUCCESS);
|
||||
bool isok = (parser.implementation->stage1((const uint8_t *)p.data(), p.size(), simdjson::stage1_mode::regular) == simdjson::SUCCESS);
|
||||
unified.end(results);
|
||||
|
||||
cy1 += results[0];
|
||||
|
|
|
@ -168,6 +168,7 @@ Tracking your position
|
|||
Some users would like to know where the document they parsed is in the input array of bytes.
|
||||
It is possible to do so by directly accessing the iterator and calling its `current_index()`
|
||||
method which reports the location (in bytes) of the current document in the input stream.
|
||||
You may also call the `source()` method to get a `std::string_view` instance on the document.
|
||||
|
||||
Let us illustrate the idea with code:
|
||||
|
||||
|
@ -182,36 +183,46 @@ Let us illustrate the idea with code:
|
|||
auto doc = *i;
|
||||
if(!doc.error()) {
|
||||
std::cout << "got full document at " << i.current_index() << std::endl;
|
||||
std::cout << i.source() << std::endl;
|
||||
count++;
|
||||
} else {
|
||||
std::cout << "got broken document at " << i.current_index() << std::endl;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
size_t index = i.current_index();
|
||||
if(index != 38) {
|
||||
std::cerr << "Expected to stop after the three full documents " << std::endl;
|
||||
std::cerr << "index = " << index << std::endl;
|
||||
return false;
|
||||
}
|
||||
|
||||
```
|
||||
|
||||
This code will print:
|
||||
```
|
||||
got full document at 0
|
||||
[1,2,3]
|
||||
got full document at 9
|
||||
{"1":1,"2":3,"4":4}
|
||||
got full document at 29
|
||||
[1,2,3]
|
||||
```
|
||||
|
||||
The last call to `i.current_index()` returns the byte index 38, which is just beyond
|
||||
the last document.
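
As a small added illustration (a hedged sketch, not part of the original documentation), consecutive values of `current_index()` can be combined with `source()` to report the byte range covered by each document; this assumes `stream` was obtained from `parse_many` as in the example above:

```C++
auto i = stream.begin();
for (; i != stream.end(); ++i) {
  auto doc = *i;
  if (!doc.error()) {
    // source() is a std::string_view over the document's bytes, so its size,
    // together with current_index(), gives the document's byte range.
    std::cout << "document spans bytes [" << i.current_index() << ", "
              << (i.current_index() + i.source().size()) << ")" << std::endl;
  }
}
```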
|
||||
|
||||
Incomplete streams
|
||||
-----------
|
||||
|
||||
Some users may need to work with truncated streams while tracking their location in the stream.
|
||||
The same code, with `current_index()`, will work. However, if the last block (by default 1MB)
|
||||
terminates with an unclosed string, then no JSON document within this last block will validate.
|
||||
In particular, it means that if your input string is `[1,2,3] {"1":1,"2":3,"4":4} [1,2` then
|
||||
no JSON document will be successfully parsed. The error `simdjson::UNCLOSED_STRING` will be
|
||||
given (even with the first JSON document). It is then your responsibility to terminate the input,
|
||||
maybe by appending the missing data at the end of the truncated string, or by copying the truncated
|
||||
data before the continuing input.
|
||||
Some users may need to work with truncated streams. The simdjson library may truncate documents at the very end of the stream when they cannot possibly be valid JSON (e.g., they contain unclosed strings, unmatched brackets, or unmatched braces). After iterating through the stream, you may query the `truncated_bytes()` method, which tells you how many bytes were truncated. If the stream is made of full (whole) documents, then you should expect `truncated_bytes()` to return zero.
|
||||
|
||||
|
||||
Consider the following example where a truncated document (`{"key":"intentionally unclosed string `) containing 39 bytes has been left within the stream. In such cases, the first two whole documents are parsed and returned, and the `truncated_bytes()` method returns 39.
|
||||
|
||||
```C++
|
||||
auto json = R"([1,2,3] {"1":1,"2":3,"4":4} {"key":"intentionally unclosed string )"_padded;
|
||||
simdjson::dom::parser parser;
|
||||
simdjson::dom::document_stream stream;
|
||||
auto error = parser.parse_many(json,json.size()).get(stream);
|
||||
if(error) { std::cerr << error << std::endl; return; }
|
||||
for(auto doc : stream) {
|
||||
std::cout << doc << std::endl;
|
||||
}
|
||||
std::cout << stream.truncated_bytes() << " bytes "<< std::endl; // returns 39 bytes
|
||||
```
|
||||
|
||||
|
||||
Importantly, you should only call `truncated_bytes()` after iterating through all of the documents, since the stream cannot tell whether documents at the very end are truncated before it has accessed that part of the data.
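
As a hedged follow-up sketch (not from the library's documentation), the unparsed bytes occupy the interval `[size_in_bytes()-truncated_bytes(), size_in_bytes())`, so one possible strategy is to recover that tail and prepend it to the next chunk of input before calling `parse_many` again:

```C++
size_t truncated = stream.truncated_bytes();
if (truncated != 0) {
  // The unparsed tail of the input; `json` is the padded_string from the example above.
  std::string_view tail(json.data() + stream.size_in_bytes() - truncated, truncated);
  std::cout << "unparsed tail: " << tail << std::endl;
  // You might copy `tail` in front of the next buffer of data before calling
  // parse_many() again, once the rest of the document becomes available.
}
```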
|
|
@ -213,8 +213,15 @@ simdjson_really_inline size_t document_stream::iterator::current_index() const n
|
|||
}
|
||||
|
||||
simdjson_really_inline std::string_view document_stream::iterator::source() const noexcept {
|
||||
size_t next_doc_index = stream->batch_start + stream->parser->implementation->structural_indexes[stream->parser->implementation->next_structural_index];
|
||||
return std::string_view(reinterpret_cast<const char*>(stream->buf) + current_index(), next_doc_index - current_index() - 1);
|
||||
const char* start = reinterpret_cast<const char*>(stream->buf) + current_index();
|
||||
bool object_or_array = ((*start == '[') || (*start == '{'));
|
||||
if(object_or_array) {
|
||||
size_t next_doc_index = stream->batch_start + stream->parser->implementation->structural_indexes[stream->parser->implementation->next_structural_index - 1];
|
||||
return std::string_view(start, next_doc_index - current_index() + 1);
|
||||
} else {
|
||||
size_t next_doc_index = stream->batch_start + stream->parser->implementation->structural_indexes[stream->parser->implementation->next_structural_index];
|
||||
return std::string_view(reinterpret_cast<const char*>(stream->buf) + current_index(), next_doc_index - current_index() - 1);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
@ -245,18 +252,24 @@ inline void document_stream::next() noexcept {
|
|||
error = parser->implementation->stage2_next(parser->doc);
|
||||
}
|
||||
}
|
||||
inline size_t document_stream::size_in_bytes() const noexcept {
|
||||
return len;
|
||||
}
|
||||
|
||||
inline size_t document_stream::truncated_bytes() const noexcept {
|
||||
return parser->implementation->structural_indexes[parser->implementation->n_structural_indexes] - parser->implementation->structural_indexes[parser->implementation->n_structural_indexes + 1];
|
||||
}
|
||||
|
||||
inline size_t document_stream::next_batch_start() const noexcept {
|
||||
return batch_start + parser->implementation->structural_indexes[parser->implementation->n_structural_indexes];
|
||||
}
|
||||
|
||||
inline error_code document_stream::run_stage1(dom::parser &p, size_t _batch_start) noexcept {
|
||||
// If this is the final batch, pass partial = false
|
||||
size_t remaining = len - _batch_start;
|
||||
if (remaining <= batch_size) {
|
||||
return p.implementation->stage1(&buf[_batch_start], remaining, false);
|
||||
return p.implementation->stage1(&buf[_batch_start], remaining, stage1_mode::streaming_final);
|
||||
} else {
|
||||
return p.implementation->stage1(&buf[_batch_start], batch_size, true);
|
||||
return p.implementation->stage1(&buf[_batch_start], batch_size, stage1_mode::streaming_partial);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -87,7 +87,29 @@ public:
|
|||
simdjson_really_inline document_stream &operator=(document_stream &&other) noexcept = default;
|
||||
|
||||
simdjson_really_inline ~document_stream() noexcept;
|
||||
|
||||
/**
|
||||
* Returns the input size in bytes.
|
||||
*/
|
||||
inline size_t size_in_bytes() const noexcept;
|
||||
/**
|
||||
* After iterating through the stream, this method
|
||||
* returns the number of bytes that were not parsed at the end
|
||||
* of the stream. If truncated_bytes() differs from zero,
|
||||
* then the input was truncated, maybe because incomplete JSON
|
||||
* documents were found at the end of the stream. You
|
||||
* may need to process the bytes in the interval [size_in_bytes()-truncated_bytes(), size_in_bytes()).
|
||||
*
|
||||
* You should only call truncated_bytes() after streaming through all
|
||||
* documents, like so:
|
||||
*
|
||||
* document_stream stream = parser.parse_many(json,window);
|
||||
* for(auto doc : stream) {
|
||||
* // do something with doc
|
||||
* }
|
||||
* size_t truncated = stream.truncated_bytes();
|
||||
*
|
||||
*/
|
||||
inline size_t truncated_bytes() const noexcept;
|
||||
/**
|
||||
* An iterator through a forward-only stream of documents.
|
||||
*/
|
||||
|
@ -245,7 +267,6 @@ private:
|
|||
error_code error;
|
||||
size_t batch_start{0};
|
||||
size_t doc_index{};
|
||||
|
||||
#ifdef SIMDJSON_THREADS_ENABLED
|
||||
/** Indicates whether we use threads. Note that this needs to be a constant during the execution of the parsing. */
|
||||
bool use_thread;
|
||||
|
|
|
@ -32,7 +32,7 @@ public:
|
|||
dom_parser_implementation &operator=(const dom_parser_implementation &) = delete;
|
||||
|
||||
simdjson_warn_unused error_code parse(const uint8_t *buf, size_t len, dom::document &doc) noexcept final;
|
||||
simdjson_warn_unused error_code stage1(const uint8_t *buf, size_t len, bool partial) noexcept final;
|
||||
simdjson_warn_unused error_code stage1(const uint8_t *buf, size_t len, stage1_mode partial) noexcept final;
|
||||
simdjson_warn_unused error_code stage2(dom::document &doc) noexcept final;
|
||||
simdjson_warn_unused error_code stage2_next(dom::document &doc) noexcept final;
|
||||
inline simdjson_warn_unused error_code set_capacity(size_t capacity) noexcept final;
|
||||
|
|
|
@ -32,7 +32,7 @@ simdjson_warn_unused simdjson_really_inline simdjson_result<document> parser::it
|
|||
}
|
||||
|
||||
// Run stage 1.
|
||||
SIMDJSON_TRY( implementation->stage1(reinterpret_cast<const uint8_t *>(json.data()), json.length(), false) );
|
||||
SIMDJSON_TRY( implementation->stage1(reinterpret_cast<const uint8_t *>(json.data()), json.length(), stage1_mode::regular) );
|
||||
return document::start({ reinterpret_cast<const uint8_t *>(json.data()), this });
|
||||
}
|
||||
|
||||
|
@ -75,7 +75,7 @@ simdjson_warn_unused simdjson_really_inline simdjson_result<json_iterator> parse
|
|||
}
|
||||
|
||||
// Run stage 1.
|
||||
SIMDJSON_TRY( implementation->stage1(reinterpret_cast<const uint8_t *>(json.data()), json.length(), false) );
|
||||
SIMDJSON_TRY( implementation->stage1(reinterpret_cast<const uint8_t *>(json.data()), json.length(), stage1_mode::regular) );
|
||||
return json_iterator(reinterpret_cast<const uint8_t *>(json.data()), this);
|
||||
}
|
||||
|
||||
|
|
|
@ -11,8 +11,30 @@ namespace dom {
|
|||
class document;
|
||||
} // namespace dom
|
||||
|
||||
/**
|
||||
* This enum is used with the dom_parser_implementation::stage1 function.
|
||||
* 1) The regular mode expects a fully formed JSON document.
|
||||
* 2) The streaming_partial mode expects a possibly truncated
|
||||
* input within a stream of JSON documents.
|
||||
* 3) The streaming_final mode allows us to truncate final
|
||||
* unterminated strings. It is useful in conjunction with streaming_partial.
|
||||
*/
|
||||
enum class stage1_mode { regular, streaming_partial, streaming_final};
|
||||
|
||||
/**
|
||||
* Returns true if mode == streaming_partial or mode == streaming_final
|
||||
*/
|
||||
inline bool is_streaming(stage1_mode mode) {
|
||||
// performance note: it is probably faster to check that mode is different
|
||||
// from regular than checking that it is either streaming_partial or streaming_final.
|
||||
return (mode != stage1_mode::regular);
|
||||
// return (mode == stage1_mode::streaming_partial || mode == stage1_mode::streaming_final);
|
||||
}
|
||||
|
||||
|
||||
namespace internal {
|
||||
|
||||
|
||||
/**
|
||||
* An implementation of simdjson's DOM parser for a particular CPU architecture.
|
||||
*
|
||||
|
@ -51,7 +73,7 @@ public:
|
|||
* @param streaming Whether this is being called by parser::parse_many.
|
||||
* @return The error code, or SUCCESS if there was no error.
|
||||
*/
|
||||
simdjson_warn_unused virtual error_code stage1(const uint8_t *buf, size_t len, bool streaming) noexcept = 0;
|
||||
simdjson_warn_unused virtual error_code stage1(const uint8_t *buf, size_t len, stage1_mode streaming) noexcept = 0;
|
||||
|
||||
/**
|
||||
* @private For internal implementation use
|
||||
|
|
|
@ -133,7 +133,7 @@ simdjson_warn_unused error_code implementation::minify(const uint8_t *buf, size_
|
|||
return arm64::stage1::json_minifier::minify<64>(buf, len, dst, dst_len);
|
||||
}
|
||||
|
||||
simdjson_warn_unused error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, bool streaming) noexcept {
|
||||
simdjson_warn_unused error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, stage1_mode streaming) noexcept {
|
||||
this->buf = _buf;
|
||||
this->len = _len;
|
||||
return arm64::stage1::json_structural_indexer::index<64>(buf, len, *this, streaming);
|
||||
|
@ -152,7 +152,7 @@ simdjson_warn_unused error_code dom_parser_implementation::stage2_next(dom::docu
|
|||
}
|
||||
|
||||
simdjson_warn_unused error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept {
|
||||
auto error = stage1(_buf, _len, false);
|
||||
auto error = stage1(_buf, _len, stage1_mode::regular);
|
||||
if (error) { return error; }
|
||||
return stage2(_doc);
|
||||
}
|
||||
|
|
|
@ -13,7 +13,7 @@ namespace stage1 {
|
|||
class structural_scanner {
|
||||
public:
|
||||
|
||||
simdjson_really_inline structural_scanner(dom_parser_implementation &_parser, bool _partial)
|
||||
simdjson_really_inline structural_scanner(dom_parser_implementation &_parser, stage1_mode _partial)
|
||||
: buf{_parser.buf},
|
||||
next_structural_index{_parser.structural_indexes.get()},
|
||||
parser{_parser},
|
||||
|
@ -43,7 +43,7 @@ simdjson_really_inline void validate_utf8_character() {
|
|||
if ((buf[idx] & 0b00100000) == 0) {
|
||||
// missing continuation
|
||||
if (simdjson_unlikely(idx+1 > len || !is_continuation(buf[idx+1]))) {
|
||||
if (idx+1 > len && partial) { idx = len; return; }
|
||||
if (idx+1 > len && is_streaming(partial)) { idx = len; return; }
|
||||
error = UTF8_ERROR;
|
||||
idx++;
|
||||
return;
|
||||
|
@ -58,7 +58,7 @@ simdjson_really_inline void validate_utf8_character() {
|
|||
if ((buf[idx] & 0b00010000) == 0) {
|
||||
// missing continuation
|
||||
if (simdjson_unlikely(idx+2 > len || !is_continuation(buf[idx+1]) || !is_continuation(buf[idx+2]))) {
|
||||
if (idx+2 > len && partial) { idx = len; return; }
|
||||
if (idx+2 > len && is_streaming(partial)) { idx = len; return; }
|
||||
error = UTF8_ERROR;
|
||||
idx++;
|
||||
return;
|
||||
|
@ -74,7 +74,7 @@ simdjson_really_inline void validate_utf8_character() {
|
|||
// 4-byte
|
||||
// missing continuation
|
||||
if (simdjson_unlikely(idx+3 > len || !is_continuation(buf[idx+1]) || !is_continuation(buf[idx+2]) || !is_continuation(buf[idx+3]))) {
|
||||
if (idx+2 > len && partial) { idx = len; return; }
|
||||
if (idx+2 > len && is_streaming(partial)) { idx = len; return; }
|
||||
error = UTF8_ERROR;
|
||||
idx++;
|
||||
return;
|
||||
|
@ -147,24 +147,47 @@ simdjson_really_inline error_code scan() {
|
|||
break;
|
||||
}
|
||||
}
|
||||
*next_structural_index = len;
|
||||
// We pad beyond.
|
||||
// https://github.com/simdjson/simdjson/issues/906
|
||||
// See json_structural_indexer.h for an explanation.
|
||||
*next_structural_index = len; // assumed later when partial == stage1_mode::streaming_final
|
||||
next_structural_index[1] = len;
|
||||
next_structural_index[2] = 0;
|
||||
parser.n_structural_indexes = uint32_t(next_structural_index - parser.structural_indexes.get());
|
||||
if (simdjson_unlikely(parser.n_structural_indexes == 0)) { return EMPTY; }
|
||||
parser.next_structural_index = 0;
|
||||
if (partial) {
|
||||
if (partial == stage1_mode::streaming_partial) {
|
||||
if(unclosed_string) {
|
||||
parser.n_structural_indexes--;
|
||||
if (simdjson_unlikely(parser.n_structural_indexes == 0)) { return CAPACITY; }
|
||||
}
|
||||
// We truncate the input to the end of the last complete document (or zero).
|
||||
auto new_structural_indexes = find_next_document_index(parser);
|
||||
if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) {
|
||||
return CAPACITY; // If the buffer is partial but the document is incomplete, it's too big to parse.
|
||||
}
|
||||
parser.n_structural_indexes = new_structural_indexes;
|
||||
} else if(partial == stage1_mode::streaming_final) {
|
||||
if(unclosed_string) { parser.n_structural_indexes--; }
|
||||
// We truncate the input to the end of the last complete document (or zero).
|
||||
// Because partial == stage1_mode::streaming_final, it means that we may
|
||||
// silently ignore trailing garbage. Though it sounds bad, we do it
|
||||
// deliberately because many people who have streams of JSON documents
|
||||
// will truncate them for processing. E.g., imagine that you are uncompressing
|
||||
// the data from a sizable file or receiving it in chunks from the network. You
|
||||
// may not know where exactly the last document will be. Meanwhile the
|
||||
// document_stream instances allow people to know the JSON documents they are
|
||||
// parsing (see the iterator.source() method).
|
||||
parser.n_structural_indexes = find_next_document_index(parser);
|
||||
// We store the initial n_structural_indexes so that the client can see
|
||||
// whether we used truncation. If initial_n_structural_indexes == parser.n_structural_indexes,
|
||||
// then this will query parser.structural_indexes[parser.n_structural_indexes] which is len,
|
||||
// otherwise, it will copy some prior index.
|
||||
parser.structural_indexes[parser.n_structural_indexes + 1] = parser.structural_indexes[parser.n_structural_indexes];
|
||||
// This next line is critical, do not change it unless you understand what you are
|
||||
// doing.
|
||||
parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len);
|
||||
if (parser.n_structural_indexes == 0) { return EMPTY; }
|
||||
} else if(unclosed_string) { error = UNCLOSED_STRING; }
|
||||
return error;
|
||||
}
|
||||
|
@ -176,13 +199,13 @@ private:
|
|||
uint32_t len;
|
||||
uint32_t idx{0};
|
||||
error_code error{SUCCESS};
|
||||
bool partial;
|
||||
stage1_mode partial;
|
||||
}; // structural_scanner
|
||||
|
||||
} // namespace stage1
|
||||
} // unnamed namespace
|
||||
|
||||
simdjson_warn_unused error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, bool partial) noexcept {
|
||||
simdjson_warn_unused error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, stage1_mode partial) noexcept {
|
||||
this->buf = _buf;
|
||||
this->len = _len;
|
||||
stage1::structural_scanner scanner(*this, partial);
|
||||
|
@ -328,7 +351,7 @@ simdjson_warn_unused error_code dom_parser_implementation::stage2_next(dom::docu
|
|||
}
|
||||
|
||||
simdjson_warn_unused error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept {
|
||||
auto error = stage1(_buf, _len, false);
|
||||
auto error = stage1(_buf, _len, stage1_mode::regular);
|
||||
if (error) { return error; }
|
||||
return stage2(_doc);
|
||||
}
|
||||
|
|
|
@ -13,8 +13,8 @@ namespace {
|
|||
*
|
||||
* Simply put, we iterate over the structural characters, starting from
|
||||
* the end. We consider that we found the end of a JSON document when the
|
||||
* first element of the pair is NOT one of these characters: '{' '[' ';' ','
|
||||
* and when the second element is NOT one of these characters: '}' '}' ';' ','.
|
||||
* first element of the pair is NOT one of these characters: '{' '[' ':' ','
|
||||
* and when the second element is NOT one of these characters: '}' ']' ':' ','.
|
||||
*
|
||||
* This simple comparison works most of the time, but it does not cover cases
|
||||
* where the batch's structural indexes contain a perfect amount of documents.
|
||||
|
@ -28,7 +28,8 @@ namespace {
|
|||
* batch.
|
||||
*/
|
||||
simdjson_really_inline uint32_t find_next_document_index(dom_parser_implementation &parser) {
|
||||
// TODO don't count separately, just figure out depth
|
||||
// Variant: do not count separately, just figure out depth
|
||||
if(parser.n_structural_indexes == 0) { return 0; }
|
||||
auto arr_cnt = 0;
|
||||
auto obj_cnt = 0;
|
||||
for (auto i = parser.n_structural_indexes - 1; i > 0; i--) {
|
||||
|
@ -65,6 +66,25 @@ simdjson_really_inline uint32_t find_next_document_index(dom_parser_implementati
|
|||
// Last document is incomplete; mark the document at i + 1 as the next one
|
||||
return i;
|
||||
}
|
||||
// If we made it to the end, we want to finish counting to see if we have a full document.
|
||||
switch (parser.buf[parser.structural_indexes[0]]) {
|
||||
case '}':
|
||||
obj_cnt--;
|
||||
break;
|
||||
case ']':
|
||||
arr_cnt--;
|
||||
break;
|
||||
case '{':
|
||||
obj_cnt++;
|
||||
break;
|
||||
case '[':
|
||||
arr_cnt++;
|
||||
break;
|
||||
}
|
||||
if (!arr_cnt && !obj_cnt) {
|
||||
// We have a complete document.
|
||||
return parser.n_structural_indexes;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
|
|
@ -74,14 +74,14 @@ public:
|
|||
* you are processing substrings, you may want to call on a function like trimmed_length_safe_utf8.
|
||||
*/
|
||||
template<size_t STEP_SIZE>
|
||||
static error_code index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, bool partial) noexcept;
|
||||
static error_code index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, stage1_mode partial) noexcept;
|
||||
|
||||
private:
|
||||
simdjson_really_inline json_structural_indexer(uint32_t *structural_indexes);
|
||||
template<size_t STEP_SIZE>
|
||||
simdjson_really_inline void step(const uint8_t *block, buf_block_reader<STEP_SIZE> &reader) noexcept;
|
||||
simdjson_really_inline void next(const simd::simd8x64<uint8_t>& in, const json_block& block, size_t idx);
|
||||
simdjson_really_inline error_code finish(dom_parser_implementation &parser, size_t idx, size_t len, bool partial);
|
||||
simdjson_really_inline error_code finish(dom_parser_implementation &parser, size_t idx, size_t len, stage1_mode partial);
|
||||
|
||||
json_scanner scanner{};
|
||||
utf8_checker checker{};
|
||||
|
@ -131,9 +131,9 @@ simdjson_really_inline size_t trim_partial_utf8(const uint8_t *buf, size_t len)
|
|||
// workout.
|
||||
//
|
||||
template<size_t STEP_SIZE>
|
||||
error_code json_structural_indexer::index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, bool partial) noexcept {
|
||||
error_code json_structural_indexer::index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, stage1_mode partial) noexcept {
|
||||
if (simdjson_unlikely(len > parser.capacity())) { return CAPACITY; }
|
||||
if (partial) { len = trim_partial_utf8(buf, len); }
|
||||
if (is_streaming(partial)) { len = trim_partial_utf8(buf, len); }
|
||||
|
||||
buf_block_reader<STEP_SIZE> reader(buf, len);
|
||||
json_structural_indexer indexer(parser.structural_indexes.get());
|
||||
|
@ -178,14 +178,14 @@ simdjson_really_inline void json_structural_indexer::next(const simd::simd8x64<u
|
|||
unescaped_chars_error |= block.non_quote_inside_string(unescaped);
|
||||
}
|
||||
|
||||
simdjson_really_inline error_code json_structural_indexer::finish(dom_parser_implementation &parser, size_t idx, size_t len, bool partial) {
|
||||
simdjson_really_inline error_code json_structural_indexer::finish(dom_parser_implementation &parser, size_t idx, size_t len, stage1_mode partial) {
|
||||
// Write out the final iteration's structurals
|
||||
indexer.write(uint32_t(idx-64), prev_structurals);
|
||||
|
||||
error_code error = scanner.finish();
|
||||
// We deliberately break down the next expression so that it is
|
||||
// human readable.
|
||||
const bool should_we_exit = partial ?
|
||||
const bool should_we_exit = is_streaming(partial) ?
|
||||
((error != SUCCESS) && (error != UNCLOSED_STRING)) // when partial we tolerate UNCLOSED_STRING
|
||||
: (error != SUCCESS); // if partial is false, we must have SUCCESS
|
||||
const bool have_unclosed_string = (error == UNCLOSED_STRING);
|
||||
|
@ -194,9 +194,10 @@ simdjson_really_inline error_code json_structural_indexer::finish(dom_parser_imp
|
|||
if (unescaped_chars_error) {
|
||||
return UNESCAPED_CHARS;
|
||||
}
|
||||
|
||||
parser.n_structural_indexes = uint32_t(indexer.tail - parser.structural_indexes.get());
|
||||
/***
|
||||
* The On Demand API requires special padding.
|
||||
*
|
||||
* This is related to https://github.com/simdjson/simdjson/issues/906
|
||||
* Basically, we want to make sure that if the parsing continues beyond the last (valid)
|
||||
* structural character, it quickly stops.
|
||||
|
@ -209,8 +210,11 @@ simdjson_really_inline error_code json_structural_indexer::finish(dom_parser_imp
|
|||
* if the repeated character is [. But if so, the document must start with [. But if the document
|
||||
* starts with [, it should end with ]. If we enforce that rule, then we would get
|
||||
* ][[ which is invalid.
|
||||
*
|
||||
* This is illustrated with the test array_iterate_unclosed_error() on the following input:
|
||||
* R"({ "a": [,,)"
|
||||
**/
|
||||
parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len);
|
||||
parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len); // used later when partial == stage1_mode::streaming_final
|
||||
parser.structural_indexes[parser.n_structural_indexes + 1] = uint32_t(len);
|
||||
parser.structural_indexes[parser.n_structural_indexes + 2] = 0;
|
||||
parser.next_structural_index = 0;
|
||||
|
@ -221,7 +225,7 @@ simdjson_really_inline error_code json_structural_indexer::finish(dom_parser_imp
|
|||
if (simdjson_unlikely(parser.structural_indexes[parser.n_structural_indexes - 1] > len)) {
|
||||
return UNEXPECTED_ERROR;
|
||||
}
|
||||
if (partial) {
|
||||
if (partial == stage1_mode::streaming_partial) {
|
||||
// If we have an unclosed string, then the last structural
|
||||
// will be the quote and we want to make sure to omit it.
|
||||
if(have_unclosed_string) {
|
||||
|
@ -229,11 +233,39 @@ simdjson_really_inline error_code json_structural_indexer::finish(dom_parser_imp
|
|||
// a valid JSON file cannot have zero structural indexes - we should have found something
|
||||
if (simdjson_unlikely(parser.n_structural_indexes == 0u)) { return CAPACITY; }
|
||||
}
|
||||
// We truncate the input to the end of the last complete document (or zero).
|
||||
auto new_structural_indexes = find_next_document_index(parser);
|
||||
if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) {
|
||||
return CAPACITY; // If the buffer is partial but the document is incomplete, it's too big to parse.
|
||||
}
|
||||
|
||||
parser.n_structural_indexes = new_structural_indexes;
|
||||
} else if (partial == stage1_mode::streaming_final) {
|
||||
if(have_unclosed_string) { parser.n_structural_indexes--; }
|
||||
// We truncate the input to the end of the last complete document (or zero).
|
||||
// Because partial == stage1_mode::streaming_final, it means that we may
|
||||
// silently ignore trailing garbage. Though it sounds bad, we do it
|
||||
// deliberately because many people who have streams of JSON documents
|
||||
// will truncate them for processing. E.g., imagine that you are uncompressing
|
||||
// the data from a sizable file or receiving it in chunks from the network. You
|
||||
// may not know where exactly the last document will be. Meanwhile the
|
||||
// document_stream instances allow people to know the JSON documents they are
|
||||
// parsing (see the iterator.source() method).
|
||||
parser.n_structural_indexes = find_next_document_index(parser);
|
||||
// We store the initial n_structural_indexes so that the client can see
|
||||
// whether we used truncation. If initial_n_structural_indexes == parser.n_structural_indexes,
|
||||
// then this will query parser.structural_indexes[parser.n_structural_indexes] which is len,
|
||||
// otherwise, it will copy some prior index.
|
||||
parser.structural_indexes[parser.n_structural_indexes + 1] = parser.structural_indexes[parser.n_structural_indexes];
|
||||
// This next line is critical, do not change it unless you understand what you are
|
||||
// doing.
|
||||
parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len);
|
||||
if (simdjson_unlikely(parser.n_structural_indexes == 0u)) {
|
||||
// We tolerate an unclosed string at the very end of the stream. Indeed, users
|
||||
// often load their data in bulk without being careful and they want us to ignore
|
||||
// the trailing garbage.
|
||||
return EMPTY;
|
||||
}
|
||||
}
|
||||
checker.check_eof();
|
||||
return checker.errors();
|
||||
|
|
|
@ -134,7 +134,7 @@ simdjson_warn_unused error_code implementation::minify(const uint8_t *buf, size_
|
|||
return haswell::stage1::json_minifier::minify<128>(buf, len, dst, dst_len);
|
||||
}
|
||||
|
||||
simdjson_warn_unused error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, bool streaming) noexcept {
|
||||
simdjson_warn_unused error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, stage1_mode streaming) noexcept {
|
||||
this->buf = _buf;
|
||||
this->len = _len;
|
||||
return haswell::stage1::json_structural_indexer::index<128>(_buf, _len, *this, streaming);
|
||||
|
@ -153,7 +153,7 @@ simdjson_warn_unused error_code dom_parser_implementation::stage2_next(dom::docu
|
|||
}
|
||||
|
||||
simdjson_warn_unused error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept {
|
||||
auto error = stage1(_buf, _len, false);
|
||||
auto error = stage1(_buf, _len, stage1_mode::regular);
|
||||
if (error) { return error; }
|
||||
return stage2(_doc);
|
||||
}
|
||||
|
|
|
@ -104,7 +104,7 @@ simdjson_warn_unused error_code implementation::minify(const uint8_t *buf, size_
|
|||
return ppc64::stage1::json_minifier::minify<64>(buf, len, dst, dst_len);
|
||||
}
|
||||
|
||||
simdjson_warn_unused error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, bool streaming) noexcept {
|
||||
simdjson_warn_unused error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, stage1_mode streaming) noexcept {
|
||||
this->buf = _buf;
|
||||
this->len = _len;
|
||||
return ppc64::stage1::json_structural_indexer::index<64>(buf, len, *this, streaming);
|
||||
|
@ -123,7 +123,7 @@ simdjson_warn_unused error_code dom_parser_implementation::stage2_next(dom::docu
|
|||
}
|
||||
|
||||
simdjson_warn_unused error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept {
|
||||
auto error = stage1(_buf, _len, false);
|
||||
auto error = stage1(_buf, _len, stage1_mode::regular);
|
||||
if (error) { return error; }
|
||||
return stage2(_doc);
|
||||
}
|
||||
|
|
|
@ -133,7 +133,7 @@ simdjson_warn_unused error_code implementation::minify(const uint8_t *buf, size_
|
|||
return westmere::stage1::json_minifier::minify<64>(buf, len, dst, dst_len);
|
||||
}
|
||||
|
||||
simdjson_warn_unused error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, bool streaming) noexcept {
|
||||
simdjson_warn_unused error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, stage1_mode streaming) noexcept {
|
||||
this->buf = _buf;
|
||||
this->len = _len;
|
||||
return westmere::stage1::json_structural_indexer::index<64>(_buf, _len, *this, streaming);
|
||||
|
@ -152,7 +152,7 @@ simdjson_warn_unused error_code dom_parser_implementation::stage2_next(dom::docu
|
|||
}
|
||||
|
||||
simdjson_warn_unused error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept {
|
||||
auto error = stage1(_buf, _len, false);
|
||||
auto error = stage1(_buf, _len, stage1_mode::regular);
|
||||
if (error) { return error; }
|
||||
return stage2(_doc);
|
||||
}
|
||||
|
|
|
@ -2149,7 +2149,7 @@ int main(int argc, char *argv[]) {
|
|||
}
|
||||
// We want to know what we are testing.
|
||||
std::cout << "Running tests against this implementation: " << simdjson::active_implementation->name();
|
||||
std::cout << "(" << simdjson::active_implementation->description() << ")" << std::endl;
|
||||
std::cout << " (" << simdjson::active_implementation->description() << ")" << std::endl;
|
||||
std::cout << "------------------------------------------------------------" << std::endl;
|
||||
|
||||
std::cout << "Running basic tests." << std::endl;
|
||||
|
|
|
@ -86,17 +86,14 @@ namespace document_stream_tests {
|
|||
std::cout << "Running " << __func__ << std::endl;
|
||||
// Correct JSON.
|
||||
const simdjson::padded_string input = R"([1,23] [1,23] [1,23] [1,23] [1,23] [1,23] [1,23] [1,23] [1,23] [1,23] [1,23] [1,23] [1,23] [1,23] [1,23] )"_padded;;
|
||||
// This will spin up and tear down 1000 worker threads.
|
||||
for(size_t i = 0; i < 1; i++) {
|
||||
simdjson::dom::parser parser;
|
||||
simdjson::dom::document_stream stream;
|
||||
ASSERT_SUCCESS(parser.parse_many(input, 32).get(stream));
|
||||
for(auto doc: stream) {
|
||||
auto error = doc.error();
|
||||
if(error) {
|
||||
std::cout << "Expected no error but got " << error << std::endl;
|
||||
return false;
|
||||
}
|
||||
simdjson::dom::parser parser;
|
||||
simdjson::dom::document_stream stream;
|
||||
ASSERT_SUCCESS(parser.parse_many(input, 32).get(stream));
|
||||
for(auto doc: stream) {
|
||||
auto error = doc.error();
|
||||
if(error) {
|
||||
std::cout << "Expected no error but got " << error << std::endl;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
|
@ -106,13 +103,11 @@ namespace document_stream_tests {
|
|||
std::cout << "Running " << __func__ << std::endl;
|
||||
// Intentionally broken
|
||||
const simdjson::padded_string input = R"([1,23] [1,23] [1,23] [1,23 [1,23] [1,23] [1,23] [1,23] [1,23] [1,23] [1,23] [1,23] [1,23] [1,23] [1,23] )"_padded;;
|
||||
// This will spin up and tear down 1000 worker threads.
|
||||
for(size_t i = 0; i < 1; i++) {
|
||||
simdjson::dom::parser parser;
|
||||
simdjson::dom::document_stream stream;
|
||||
ASSERT_SUCCESS(parser.parse_many(input, 32).get(stream));
|
||||
size_t count = 0;
|
||||
for(auto doc: stream) {
|
||||
simdjson::dom::parser parser;
|
||||
simdjson::dom::document_stream stream;
|
||||
ASSERT_SUCCESS(parser.parse_many(input, 32).get(stream));
|
||||
size_t count = 0;
|
||||
for(auto doc: stream) {
|
||||
auto error = doc.error();
|
||||
if(count <= 2) {
|
||||
if(error) {
|
||||
|
@ -127,7 +122,6 @@ namespace document_stream_tests {
|
|||
break;
|
||||
}
|
||||
count++;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
@ -319,7 +313,42 @@ namespace document_stream_tests {
|
|||
count++;
|
||||
previous_i = i;
|
||||
}
|
||||
return count == 1;
|
||||
// We should have two documents
|
||||
if(count != 2) {
|
||||
std::cout << "finished with count = " << count << std::endl;
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool adversarial_single_document() {
|
||||
std::cout << "Running " << __func__ << std::endl;
|
||||
simdjson::dom::parser parser;
|
||||
auto json = R"({"f[)"_padded;
|
||||
simdjson::dom::document_stream stream;
|
||||
ASSERT_SUCCESS(parser.parse_many(json).get(stream));
|
||||
size_t count = 0;
|
||||
for (auto doc : stream) {
|
||||
(void)doc;
|
||||
count += 1;
|
||||
}
|
||||
std::cout << "number of documents (0 expected) = " << count << std::endl;
|
||||
return count == 0;
|
||||
}
|
||||
|
||||
bool adversarial_single_document_array() {
|
||||
std::cout << "Running " << __func__ << std::endl;
|
||||
simdjson::dom::parser parser;
|
||||
auto json = R"(["this is an unclosed string ])"_padded;
|
||||
simdjson::dom::document_stream stream;
|
||||
ASSERT_SUCCESS(parser.parse_many(json).get(stream));
|
||||
size_t count = 0;
|
||||
for (auto doc : stream) {
|
||||
(void)doc;
|
||||
count += 1;
|
||||
}
|
||||
std::cout << "number of documents (0 expected) = " << count << std::endl;
|
||||
return count == 0;
|
||||
}
|
||||
|
||||
bool single_document() {
|
||||
|
@ -345,6 +374,7 @@ namespace document_stream_tests {
|
|||
}
|
||||
count += 1;
|
||||
}
|
||||
std::cout << "number of documents " << count << std::endl;
|
||||
return count == 1;
|
||||
}
|
||||
#if SIMDJSON_EXCEPTIONS
|
||||
|
@ -382,6 +412,10 @@ namespace document_stream_tests {
|
|||
}
|
||||
count += 1;
|
||||
}
|
||||
if(docs.truncated_bytes() != 0) {
|
||||
std::cerr << "Unexpected truncation : " << docs.truncated_bytes() << std::endl;
|
||||
return false;
|
||||
}
|
||||
return count == 1;
|
||||
}
|
||||
#endif
|
||||
|
@ -398,13 +432,21 @@ namespace document_stream_tests {
|
|||
auto doc = *i;
|
||||
if(!doc.error()) {
|
||||
std::cout << "got full document at " << i.current_index() << std::endl;
|
||||
std::cout << i.source() << std::endl;
|
||||
count++;
|
||||
} else {
|
||||
std::cout << "got broken document at " << i.current_index() << std::endl;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
if(count != 3) {
|
||||
std::cerr << "Expected to get three full documents " << std::endl;
|
||||
return false;
|
||||
}
|
||||
if(stream.truncated_bytes() != 0) {
|
||||
std::cerr << "Unexpected truncation : " << stream.truncated_bytes() << std::endl;
|
||||
return false;
|
||||
}
|
||||
size_t index = i.current_index();
|
||||
if(index != 38) {
|
||||
std::cerr << "Expected to stop after the three full documents " << std::endl;
|
||||
|
@ -414,12 +456,35 @@ namespace document_stream_tests {
|
|||
return true;
|
||||
}
|
||||
|
||||
bool unquoted_key() {
|
||||
std::cout << "Running " << __func__ << std::endl;
|
||||
auto json = R"({unquoted_key: "keys must be quoted"})"_padded;
|
||||
simdjson::dom::parser parser;
|
||||
simdjson::dom::document_stream stream;
|
||||
// We use a window of json.size() though any large value would do.
|
||||
ASSERT_SUCCESS( parser.parse_many(json, json.size()).get(stream) );
|
||||
auto i = stream.begin();
|
||||
for(; i != stream.end(); ++i) {
|
||||
auto doc = *i;
|
||||
if(!doc.error()) {
|
||||
std::cout << "got full document at " << i.current_index() << std::endl;
|
||||
std::cout << i.source() << std::endl;
|
||||
return false;
|
||||
} else {
|
||||
std::cout << "got broken document at " << i.current_index() << std::endl;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
bool truncated_window() {
|
||||
std::cout << "Running " << __func__ << std::endl;
|
||||
// The last JSON document is
|
||||
// intentionally truncated.
|
||||
auto json = R"([1,2,3] {"1":1,"2":3,"4":4} [1,2 )"_padded;
|
||||
std::cout << "input size " << json.size() << std::endl;
|
||||
simdjson::dom::parser parser;
|
||||
size_t count = 0;
|
||||
simdjson::dom::document_stream stream;
|
||||
|
@ -430,17 +495,22 @@ namespace document_stream_tests {
|
|||
auto doc = *i;
|
||||
if(!doc.error()) {
|
||||
std::cout << "got full document at " << i.current_index() << std::endl;
|
||||
std::cout << i.source() << std::endl;
|
||||
count++;
|
||||
} else {
|
||||
std::cout << "got broken document at " << i.current_index() << std::endl;
|
||||
}
|
||||
}
|
||||
if(count != 2) {
|
||||
std::cerr << "Expected to get two full documents " << std::endl;
|
||||
return false;
|
||||
}
|
||||
size_t index = i.current_index();
|
||||
if(index != 29) {
|
||||
std::cerr << "Expected to stop after the two full documents " << std::endl;
|
||||
std::cerr << "index = " << index << std::endl;
|
||||
if(stream.truncated_bytes() == 0) {
|
||||
std::cerr << "Expected truncation : " << stream.truncated_bytes() << std::endl;
|
||||
return false;
|
||||
}
|
||||
if(stream.truncated_bytes() != 6) {
|
||||
std::cerr << "Expected truncation of 6 bytes got " << stream.truncated_bytes() << std::endl;
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
|
@ -448,28 +518,79 @@ namespace document_stream_tests {
|
|||
|
||||
bool truncated_window_unclosed_string() {
|
||||
std::cout << "Running " << __func__ << std::endl;
|
||||
// The last JSON document is intentionally truncated. In this instance, we use
|
||||
// a truncated string which will create trouble since stage 1 will recognize the
|
||||
// JSON as invalid and refuse to even start parsing.
|
||||
// The last JSON document is intentionally truncated.
|
||||
auto json = R"([1,2,3] {"1":1,"2":3,"4":4} "intentionally unclosed string )"_padded;
|
||||
simdjson::dom::parser parser;
|
||||
simdjson::dom::document_stream stream;
|
||||
// We use a window of json.size() though any large value would do.
|
||||
ASSERT_SUCCESS( parser.parse_many(json,json.size()).get(stream) );
|
||||
// Rest is ineffective because stage 1 fails.
|
||||
auto i = stream.begin();
|
||||
size_t counter{0};
|
||||
for(; i != stream.end(); ++i) {
|
||||
auto doc = *i;
|
||||
if(!doc.error()) {
|
||||
std::cout << "got full document at " << i.current_index() << std::endl;
|
||||
return false;
|
||||
std::cout << "the document is " << i.source() << std::endl;
|
||||
} else {
|
||||
std::cout << "got broken document at " << i.current_index() << std::endl;
|
||||
std::cout << doc.error() << std::endl;
|
||||
return (doc.error() == simdjson::UNCLOSED_STRING);
|
||||
}
|
||||
counter++;
|
||||
}
|
||||
return false;
|
||||
std::cout << "final index is " << i.current_index() << std::endl;
|
||||
if(counter != 2) {
|
||||
std::cerr << "You should have parsed two documents. I found " << counter << "." << std::endl;
|
||||
return false;
|
||||
}
|
||||
if(stream.truncated_bytes() == 0) {
|
||||
std::cerr << "Expected truncation : " << stream.truncated_bytes() << std::endl;
|
||||
return false;
|
||||
}
|
||||
if(stream.truncated_bytes() != 32) {
|
||||
std::cerr << "Expected truncation of 32 bytes got " << stream.truncated_bytes() << std::endl;
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
bool truncated_window_unclosed_string_in_object() {
|
||||
std::cout << "Running " << __func__ << std::endl;
|
||||
// The last JSON document is intentionally truncated.
|
||||
auto json = R"([1,2,3] {"1":1,"2":3,"4":4} {"key":"intentionally unclosed string )"_padded;
|
||||
simdjson::dom::parser parser;
|
||||
simdjson::dom::document_stream stream;
|
||||
// We use a window of json.size() though any large value would do.
|
||||
ASSERT_SUCCESS( parser.parse_many(json,json.size()).get(stream) );
|
||||
auto i = stream.begin();
|
||||
size_t counter{0};
|
||||
for(; i != stream.end(); ++i) {
|
||||
auto doc = *i;
|
||||
if(!doc.error()) {
|
||||
std::cout << "got full document at " << i.current_index() << std::endl;
|
||||
std::cout << "the document is " << i.source() << std::endl;
|
||||
} else {
|
||||
std::cout << "got broken document at " << i.current_index() << std::endl;
|
||||
std::cout << doc.error() << std::endl;
|
||||
}
|
||||
counter++;
|
||||
}
|
||||
std::cout << "final index is " << i.current_index() << std::endl;
|
||||
if(counter != 2) {
|
||||
std::cerr << "You should have parsed two documents. I found " << counter << "." << std::endl;
|
||||
return false;
|
||||
}
|
||||
if(stream.truncated_bytes() == 0) {
|
||||
std::cerr << "Expected truncation : " << stream.truncated_bytes() << std::endl;
|
||||
return false;
|
||||
}
|
||||
if(stream.truncated_bytes() != 39) {
|
||||
std::cerr << "Expected truncation of 39 bytes got " << stream.truncated_bytes() << std::endl;
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool small_window() {
|
||||
std::cout << "Running " << __func__ << std::endl;
|
||||
std::vector<char> input;
|
||||
|
@ -668,11 +789,15 @@ namespace document_stream_tests {
|
|||
}
|
||||
|
||||
bool run() {
|
||||
return stress_data_race() &&
|
||||
return adversarial_single_document_array() &&
|
||||
adversarial_single_document() &&
|
||||
unquoted_key() &&
|
||||
stress_data_race() &&
|
||||
stress_data_race_with_error() &&
|
||||
test_leading_spaces() &&
|
||||
simple_example() &&
|
||||
truncated_window() &&
|
||||
truncated_window_unclosed_string_in_object() &&
|
||||
truncated_window_unclosed_string() &&
|
||||
issue1307() &&
|
||||
issue1308() &&
|
||||
|
@ -728,7 +853,7 @@ int main(int argc, char *argv[]) {
|
|||
}
|
||||
// We want to know what we are testing.
|
||||
std::cout << "Running tests against this implementation: " << simdjson::active_implementation->name();
|
||||
std::cout << "(" << simdjson::active_implementation->description() << ")" << std::endl;
|
||||
std::cout << " (" << simdjson::active_implementation->description() << ")" << std::endl;
|
||||
std::cout << "------------------------------------------------------------" << std::endl;
|
||||
|
||||
std::cout << "Running document_stream tests." << std::endl;
|
||||
|
|
|
@ -222,7 +222,7 @@ int main(int argc, char *argv[]) {
|
|||
}
|
||||
// We want to know what we are testing.
|
||||
std::cout << "Running tests against this implementation: " << simdjson::active_implementation->name();
|
||||
std::cout << "(" << simdjson::active_implementation->description() << ")" << std::endl;
|
||||
std::cout << " (" << simdjson::active_implementation->description() << ")" << std::endl;
|
||||
std::cout << "------------------------------------------------------------" << std::endl;
|
||||
|
||||
std::cout << "Running document tests." << std::endl;
|
||||
|
|
|
@ -7,7 +7,7 @@
|
|||
#include <vector>
|
||||
#include <cmath>
|
||||
#include <set>
|
||||
#include <string_view>
|
||||
#include <unistd.h>
|
||||
|
||||
#include "simdjson.h"
|
||||
|
||||
|
@ -70,7 +70,7 @@ namespace parser_load {
|
|||
count++;
|
||||
ASSERT_ERROR(doc.error(), TAPE_ERROR);
|
||||
}
|
||||
ASSERT_EQUAL(count, 1);
|
||||
ASSERT_EQUAL(count, 0);
|
||||
TEST_SUCCEED();
|
||||
}
|
||||
|
||||
|
@ -92,7 +92,7 @@ namespace parser_load {
|
|||
ASSERT_EQUAL(val, count);
|
||||
}
|
||||
}
|
||||
ASSERT_EQUAL(count, 3);
|
||||
ASSERT_EQUAL(count, 2);
|
||||
TEST_SUCCEED();
|
||||
}
|
||||
|
||||
|
@ -188,12 +188,39 @@ namespace adversarial {
|
|||
}
|
||||
}
|
||||
|
||||
int main() {
|
||||
int main(int argc, char *argv[]) {
|
||||
std::cout << std::unitbuf;
|
||||
int c;
|
||||
while ((c = getopt(argc, argv, "a:")) != -1) {
|
||||
switch (c) {
|
||||
case 'a': {
|
||||
const simdjson::implementation *impl = simdjson::available_implementations[optarg];
|
||||
if (!impl) {
|
||||
fprintf(stderr, "Unsupported architecture value -a %s\n", optarg);
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
if(!impl->supported_by_runtime_system()) {
|
||||
fprintf(stderr, "The selected implementation does not match your current CPU: -a %s\n", optarg);
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
simdjson::active_implementation = impl;
|
||||
break;
|
||||
}
|
||||
default:
|
||||
fprintf(stderr, "Unexpected argument %c\n", c);
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
}
|
||||
|
||||
// this is put here deliberately to check that the documentation is correct (README),
|
||||
// should this fail to compile, you should update the documentation:
|
||||
if (simdjson::active_implementation->name() == "unsupported") {
|
||||
printf("unsupported CPU\n");
|
||||
}
|
||||
// We want to know what we are testing.
|
||||
std::cout << "Running tests against this implementation: " << simdjson::active_implementation->name();
|
||||
std::cout << " (" << simdjson::active_implementation->description() << ")" << std::endl;
|
||||
std::cout << "------------------------------------------------------------" << std::endl;
|
||||
std::cout << "Running error tests." << std::endl;
|
||||
if (!(true
|
||||
&& parser_load::run()
|
||||
|
|
|
@ -30,6 +30,30 @@ bool contains(const char *pre, const char *str) {
|
|||
return (strstr(str, pre) != nullptr);
|
||||
}
|
||||
|
||||
bool is_skip_listed(const char *name) {
|
||||
std::vector<const char*> white_list = {"fail36.json", "fail62.json", "fail63.json", "fail64.json"};
|
||||
for(const char* x : white_list) {
|
||||
if(starts_with(x, name)) {
|
||||
std::cout << " Though the file " << x << " is not valid JSON, whether it should pass as ndjson after truncation is undefined" << std::endl;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
bool is_white_listed(const char *name) {
|
||||
std::vector<const char*> white_list = {"fail02.json", "fail08.json", "fail10.json", "fail32.json", "fail33.json",
|
||||
"fail52.json", "fail53.json", "fail54.json", "fail70.json", "fail74.json",
|
||||
"fail78.json", "fail79.json", "fail80.json"};
|
||||
for(const char* x : white_list) {
|
||||
if(starts_with(x, name)) {
|
||||
std::cout << " Though the file " << x << " is not valid JSON, we expect parse_many to succeed." << std::endl;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
bool validate(const char *dirname) {
|
||||
bool everything_fine = true;
|
||||
const char *extension1 = ".ndjson";
|
||||
|
@ -59,6 +83,7 @@ bool validate(const char *dirname) {
|
|||
for (int i = 0; i < c; i++) {
|
||||
const char *name = entry_list[i]->d_name;
|
||||
if (has_extension(name, extension1) || has_extension(name, extension2) || has_extension(name, extension3)) {
|
||||
if(is_skip_listed(name)) { continue; }
|
||||
|
||||
/* Finding the file path */
|
||||
printf("validating: file %s ", name);
|
||||
|
@ -77,15 +102,15 @@ bool validate(const char *dirname) {
|
|||
simdjson::dom::document_stream docs;
|
||||
error = parser.parse_many(json).get(docs);
|
||||
for (auto doc : docs) {
|
||||
error = doc.error();
|
||||
error = doc.error();
|
||||
}
|
||||
}
|
||||
printf("%s\n", error ? "ok" : "invalid");
|
||||
std::cout << "error status: " << error << std::endl;
|
||||
/* Check if the file is supposed to pass or not. Print the results */
|
||||
if (contains("EXCLUDE", name)) {
|
||||
// skipping
|
||||
how_many--;
|
||||
} else if (starts_with("pass", name) or starts_with("fail10.json", name) or starts_with("fail70.json", name)) {
|
||||
} else if (starts_with("pass", name) or is_white_listed(name)) {
|
||||
if (error) {
|
||||
is_file_as_expected[i] = false;
|
||||
printf("warning: file %s should pass but it fails. Error is: %s\n",
|
||||
|
@ -127,7 +152,39 @@ bool validate(const char *dirname) {
|
|||
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
if (argc != 2) {
|
||||
std::cout << std::unitbuf;
|
||||
int c;
|
||||
while ((c = getopt(argc, argv, "a:")) != -1) {
|
||||
switch (c) {
|
||||
case 'a': {
|
||||
const simdjson::implementation *impl = simdjson::available_implementations[optarg];
|
||||
if (!impl) {
|
||||
fprintf(stderr, "Unsupported architecture value -a %s\n", optarg);
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
if(!impl->supported_by_runtime_system()) {
|
||||
fprintf(stderr, "The selected implementation does not match your current CPU: -a %s\n", optarg);
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
simdjson::active_implementation = impl;
|
||||
break;
|
||||
}
|
||||
default:
|
||||
fprintf(stderr, "Unexpected argument %c\n", c);
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
}
|
||||
|
||||
// this is put here deliberately to check that the documentation is correct (README),
|
||||
// should this fail to compile, you should update the documentation:
|
||||
if (simdjson::active_implementation->name() == "unsupported") {
|
||||
printf("unsupported CPU\n");
|
||||
}
|
||||
// We want to know what we are testing.
|
||||
std::cout << "Running tests against this implementation: " << simdjson::active_implementation->name();
|
||||
std::cout << " (" << simdjson::active_implementation->description() << ")" << std::endl;
|
||||
std::cout << "------------------------------------------------------------" << std::endl;
|
||||
if(optind >= argc) {
|
||||
std::cerr << "Usage: " << argv[0] << " <directorywithjsonfiles>"
|
||||
<< std::endl;
|
||||
#ifndef SIMDJSON_TEST_DATA_DIR
|
||||
|
@ -140,6 +197,6 @@ int main(int argc, char *argv[]) {
|
|||
<< SIMDJSON_TEST_DATA_DIR << "' directory." << std::endl;
|
||||
return validate(SIMDJSON_TEST_DATA_DIR) ? EXIT_SUCCESS : EXIT_FAILURE;
|
||||
#endif
|
||||
}
|
||||
return validate(argv[1]) ? EXIT_SUCCESS : EXIT_FAILURE;
|
||||
}
|
||||
return validate(argv[optind]) ? EXIT_SUCCESS : EXIT_FAILURE;
|
||||
}
|
||||
|
|
|
@ -52,7 +52,17 @@ void basics_dom_1() {
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
void parse_many_truncated() {
|
||||
auto json = R"([1,2,3] {"1":1,"2":3,"4":4} {"key":"intentionally unclosed string )"_padded;
|
||||
simdjson::dom::parser parser;
|
||||
simdjson::dom::document_stream stream;
|
||||
auto error = parser.parse_many(json,json.size()).get(stream);
|
||||
if(error) { std::cerr << error << std::endl; return; }
|
||||
for(auto doc : stream) {
|
||||
std::cout << doc << std::endl;
|
||||
}
|
||||
std::cout << stream.truncated_bytes() << " bytes "<< std::endl; // returns 39 bytes
|
||||
}
|
||||
|
||||
void basics_dom_2() {
|
||||
auto cars_json = R"( [
|
||||
|
|