From 04a19f9813b377826dad3afe92aace65917d1f90 Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Wed, 17 Jun 2020 18:06:13 -0400 Subject: [PATCH 1/2] Fixes https://github.com/simdjson/simdjson/issues/937 --- include/simdjson/dom/document_stream.h | 16 +++++++++++- include/simdjson/inline/document_stream.h | 5 ++++ tests/basictests.cpp | 31 +++++++++++++++++++++-- 3 files changed, 49 insertions(+), 3 deletions(-) diff --git a/include/simdjson/dom/document_stream.h b/include/simdjson/dom/document_stream.h index fee73ac8..e4a86292 100644 --- a/include/simdjson/dom/document_stream.h +++ b/include/simdjson/dom/document_stream.h @@ -15,6 +15,7 @@ namespace dom { #ifdef SIMDJSON_THREADS_ENABLED +/** @private **/ struct stage1_worker { stage1_worker() noexcept = default; stage1_worker(const stage1_worker&) = delete; @@ -93,7 +94,17 @@ public: * @param other the end iterator to compare to. */ really_inline bool operator!=(const iterator &other) const noexcept; - + /** + * Gives the current index in the input document in bytes. + * + * auto stream = parser.parse_many(json,window); + * auto i = stream.begin(); + * for(; i != stream.end(); ++i) { + * auto doc = *i; + * size_t index = i.current_index(); + * } + */ + really_inline size_t current_index() noexcept; private: really_inline iterator(document_stream &s, bool finished) noexcept; /** The document_stream we're iterating through. */ @@ -204,6 +215,9 @@ private: #endif // SIMDJSON_THREADS_ENABLED friend class dom::parser; + + size_t doc_index{}; + }; // class document_stream } // namespace dom diff --git a/include/simdjson/inline/document_stream.h b/include/simdjson/inline/document_stream.h index 04b2230c..93ab4436 100644 --- a/include/simdjson/inline/document_stream.h +++ b/include/simdjson/inline/document_stream.h @@ -143,10 +143,14 @@ inline void document_stream::start() noexcept { next(); } +really_inline size_t document_stream::iterator::current_index() noexcept { + return stream.doc_index; +} inline void document_stream::next() noexcept { if (error) { return; } // Load the next document from the batch + doc_index = batch_start + parser.implementation->structural_indexes[parser.implementation->next_structural_index]; error = parser.implementation->stage2_next(parser.doc); // If that was the last document in the batch, load another batch (if available) while (error == EMPTY) { @@ -160,6 +164,7 @@ inline void document_stream::next() noexcept { #endif if (error) { continue; } // If the error was EMPTY, we may want to load another batch. // Run stage 2 on the first document in the batch + doc_index = batch_start + parser.implementation->structural_indexes[parser.implementation->next_structural_index]; error = parser.implementation->stage2_next(parser.doc); } } diff --git a/tests/basictests.cpp b/tests/basictests.cpp index 5a8ae77f..49ce4ee9 100644 --- a/tests/basictests.cpp +++ b/tests/basictests.cpp @@ -371,7 +371,33 @@ namespace document_stream_tests { simdjson::padded_string str("{}",2); simdjson::dom::document_stream s1 = parse_many_stream_return(parser, str); } - + bool test_current_index() { + std::cout << "Running " << __func__ << std::endl; + std::string base("1 ");// one JSON! + std::string json; + for(size_t k = 0; k < 1000; k++) { + json += base; + } + simdjson::dom::parser parser; + const size_t window = 32; // deliberately small + auto stream = parser.parse_many(json,window); + auto i = stream.begin(); + size_t count = 0; + for(; i != stream.end(); ++i) { + auto doc = *i; + if (doc.error()) { + std::cerr << doc.error() << std::endl; + return false; + } + if( i.current_index() != count) { + std::cout << "index:" << i.current_index() << std::endl; + std::cout << "expected index:" << count << std::endl; + return false; + } + count += base.size(); + } + return true; + } bool small_window() { std::cout << "Running " << __func__ << std::endl; auto json = R"({"error":[],"result":{"token":"xxx"}}{"error":[],"result":{"token":"xxx"}})"_padded; @@ -541,7 +567,8 @@ namespace document_stream_tests { } bool run() { - return small_window() && + return test_current_index() && + small_window() && large_window() && json_issue467() && document_stream_test() && From ef688a74feeb3ec18faad7e927ed93dd47190a89 Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Thu, 18 Jun 2020 18:18:12 -0400 Subject: [PATCH 2/2] Minor tweak to the documentation. --- include/simdjson/dom/document_stream.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/include/simdjson/dom/document_stream.h b/include/simdjson/dom/document_stream.h index e4a86292..10f89843 100644 --- a/include/simdjson/dom/document_stream.h +++ b/include/simdjson/dom/document_stream.h @@ -15,7 +15,7 @@ namespace dom { #ifdef SIMDJSON_THREADS_ENABLED -/** @private **/ +/** @private Custom worker class **/ struct stage1_worker { stage1_worker() noexcept = default; stage1_worker(const stage1_worker&) = delete; @@ -95,6 +95,8 @@ public: */ really_inline bool operator!=(const iterator &other) const noexcept; /** + * @private + * * Gives the current index in the input document in bytes. * * auto stream = parser.parse_many(json,window); @@ -103,6 +105,10 @@ public: * auto doc = *i; * size_t index = i.current_index(); * } + * + * This function (current_index()) is experimental and the usage + * may change in future versions of simdjson: we find the API somewhat + * awkward and we would like to offer something friendlier. */ really_inline size_t current_index() noexcept; private: