diff --git a/include/simdjson/dom/document_stream.h b/include/simdjson/dom/document_stream.h index fee73ac8..10f89843 100644 --- a/include/simdjson/dom/document_stream.h +++ b/include/simdjson/dom/document_stream.h @@ -15,6 +15,7 @@ namespace dom { #ifdef SIMDJSON_THREADS_ENABLED +/** @private Custom worker class **/ struct stage1_worker { stage1_worker() noexcept = default; stage1_worker(const stage1_worker&) = delete; @@ -93,7 +94,23 @@ public: * @param other the end iterator to compare to. */ really_inline bool operator!=(const iterator &other) const noexcept; - + /** + * @private + * + * Returns the byte offset, within the input given to parse_many(), at which the current document begins. + * + * auto stream = parser.parse_many(json,window); + * auto i = stream.begin(); + * for(; i != stream.end(); ++i) { + * auto doc = *i; + * size_t index = i.current_index(); + * } + * + * This function (current_index()) is experimental and the usage + * may change in future versions of simdjson: we find the API somewhat + * awkward and we would like to offer something friendlier. + */ + really_inline size_t current_index() const noexcept; private: really_inline iterator(document_stream &s, bool finished) noexcept; /** The document_stream we're iterating through. 
*/ @@ -204,6 +221,9 @@ private: #endif // SIMDJSON_THREADS_ENABLED friend class dom::parser; + /** Byte offset in the input where the document most recently loaded by next() begins; read by iterator::current_index(). */ + size_t doc_index{}; + }; // class document_stream } // namespace dom diff --git a/include/simdjson/inline/document_stream.h b/include/simdjson/inline/document_stream.h index 04b2230c..93ab4436 100644 --- a/include/simdjson/inline/document_stream.h +++ b/include/simdjson/inline/document_stream.h @@ -143,10 +143,14 @@ inline void document_stream::start() noexcept { next(); } +really_inline size_t document_stream::iterator::current_index() const noexcept { + return stream.doc_index; +} inline void document_stream::next() noexcept { if (error) { return; } // Load the next document from the batch + doc_index = batch_start + parser.implementation->structural_indexes[parser.implementation->next_structural_index]; error = parser.implementation->stage2_next(parser.doc); // If that was the last document in the batch, load another batch (if available) while (error == EMPTY) { @@ -160,6 +164,7 @@ inline void document_stream::next() noexcept { #endif if (error) { continue; } // If the error was EMPTY, we may want to load another batch. // Run stage 2 on the first document in the batch + doc_index = batch_start + parser.implementation->structural_indexes[parser.implementation->next_structural_index]; error = parser.implementation->stage2_next(parser.doc); } } diff --git a/tests/basictests.cpp b/tests/basictests.cpp index 5a8ae77f..49ce4ee9 100644 --- a/tests/basictests.cpp +++ b/tests/basictests.cpp @@ -371,7 +371,33 @@ namespace document_stream_tests { simdjson::padded_string str("{}",2); simdjson::dom::document_stream s1 = parse_many_stream_return(parser, str); } - + bool test_current_index() { + std::cout << "Running " << __func__ << std::endl; + std::string base("1 ");// one JSON! 
+ std::string json; + for(size_t k = 0; k < 1000; k++) { + json += base; + } + simdjson::dom::parser parser; + const size_t window = 32; // deliberately small + auto stream = parser.parse_many(json,window); + auto i = stream.begin(); + size_t count = 0; + for(; i != stream.end(); ++i) { + auto doc = *i; + if (doc.error()) { + std::cerr << doc.error() << std::endl; + return false; + } + if( i.current_index() != count) { + std::cout << "index:" << i.current_index() << std::endl; + std::cout << "expected index:" << count << std::endl; + return false; + } + count += base.size(); + } + return true; + } bool small_window() { std::cout << "Running " << __func__ << std::endl; auto json = R"({"error":[],"result":{"token":"xxx"}}{"error":[],"result":{"token":"xxx"}})"_padded; @@ -541,7 +567,8 @@ namespace document_stream_tests { } bool run() { - return small_window() && + return test_current_index() && + small_window() && large_window() && json_issue467() && document_stream_test() &&