From 83615ff351be6e8aa50494c9eddd6dfba3a0e81e Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Thu, 6 Aug 2020 11:42:13 -0400 Subject: [PATCH] Fixes issue 1088 (#1096) --- include/simdjson/dom/document_stream.h | 23 +++++++++++++++- include/simdjson/inline/document_stream.h | 9 ++++++- tests/document_stream_tests.cpp | 33 ++++++++++++++++++++--- 3 files changed, 60 insertions(+), 5 deletions(-) diff --git a/include/simdjson/dom/document_stream.h b/include/simdjson/dom/document_stream.h index eb4f53c5..4ce6f2e8 100644 --- a/include/simdjson/dom/document_stream.h +++ b/include/simdjson/dom/document_stream.h @@ -121,7 +121,28 @@ public: * may change in future versions of simdjson: we find the API somewhat * awkward and we would like to offer something friendlier. */ - really_inline size_t current_index() noexcept; + really_inline size_t current_index() const noexcept; + /** + * @private + * + * Gives a view of the current document. + * + * document_stream stream = parser.parse_many(json,window); + * for(auto i = stream.begin(); i != stream.end(); ++i) { + * auto doc = *i; + * std::string_view v = i.source(); + * } + * + * The returned string_view instance is simply a map to the (unparsed) + * source string: it may thus include white-space characters and all manner + * of padding. + * + * This function (source()) is experimental and the usage + * may change in future versions of simdjson: we find the API somewhat + * awkward and we would like to offer something friendlier. + */ + really_inline std::string_view source() const noexcept; + private: really_inline iterator(document_stream &s, bool finished) noexcept; /** The document_stream we're iterating through. 
*/ diff --git a/include/simdjson/inline/document_stream.h b/include/simdjson/inline/document_stream.h index ccd4d9e3..2b554684 100644 --- a/include/simdjson/inline/document_stream.h +++ b/include/simdjson/inline/document_stream.h @@ -150,9 +150,16 @@ inline void document_stream::start() noexcept { next(); } -really_inline size_t document_stream::iterator::current_index() noexcept { +really_inline size_t document_stream::iterator::current_index() const noexcept { return stream.doc_index; } + +really_inline std::string_view document_stream::iterator::source() const noexcept { + size_t next_doc_index = stream.batch_start + stream.parser->implementation->structural_indexes[stream.parser->implementation->next_structural_index]; + return std::string_view(reinterpret_cast<const char*>(stream.buf) + current_index(), next_doc_index - current_index() - 1); +} + + inline void document_stream::next() noexcept { if (error) { return; } diff --git a/tests/document_stream_tests.cpp b/tests/document_stream_tests.cpp index 9142c00e..19ffb111 100644 --- a/tests/document_stream_tests.cpp +++ b/tests/document_stream_tests.cpp @@ -5,6 +5,19 @@ #include "simdjson.h" #include "test_macros.h" + +std::string trim(const std::string s) { + auto start = s.begin(); + auto end = s.end(); + while (start != s.end() && std::isspace(*start)) { + start++; + } + do { + end--; + } while (std::distance(start, end) > 0 && std::isspace(*end)); + return std::string(start, end + 1); +} + namespace document_stream_tests { static simdjson::dom::document_stream parse_many_stream_return(simdjson::dom::parser &parser, simdjson::padded_string &str) { simdjson::dom::document_stream stream; @@ -19,10 +32,18 @@ namespace document_stream_tests { } bool test_current_index() { std::cout << "Running " << __func__ << std::endl; - std::string base("1 ");// one JSON! + std::string base1("1         ");// one JSON! + std::string base2("{\"k\":1}   ");// one JSON! + std::string base3("[1,2]     ");// one JSON! 
+ assert(base1.size() == base2.size()); + assert(base2.size() == base3.size()); + std::vector<std::string> source_strings = {base1, base2, base3}; + std::string json; for(size_t k = 0; k < 1000; k++) { - json += base; + json += base1; + json += base2; + json += base3; } simdjson::dom::parser parser; const size_t window = 32; // deliberately small @@ -38,7 +59,13 @@ namespace document_stream_tests { std::cout << "expected index:" << count << std::endl; return false; } - count += base.size(); + std::string answer = source_strings[(count / base1.size()) % source_strings.size()]; + if(trim(std::string(i.source())) != trim(answer)) { + std::cout << "got: '" << i.source() << "'" << std::endl; + std::cout << "expected : '" << answer << "'" << std::endl; + return false; + } + count += base1.size(); } return true; }