This makes it possible to have a document instance (DOM) that is separate from the parser if you would like. (#1430)
* This makes it possible to have a document instance that is separate from the parser if you would like.
This commit is contained in:
parent
0f72ff3a57
commit
4c63a929bc
|
@ -19,27 +19,41 @@ namespace dom {
|
|||
inline element document::root() const noexcept {
|
||||
return element(internal::tape_ref(this, 1));
|
||||
}
|
||||
simdjson_warn_unused
|
||||
inline size_t document::capacity() const noexcept {
|
||||
return allocated_capacity;
|
||||
}
|
||||
|
||||
simdjson_warn_unused
|
||||
inline error_code document::allocate(size_t capacity) noexcept {
|
||||
if (capacity == 0) {
|
||||
string_buf.reset();
|
||||
tape.reset();
|
||||
allocated_capacity = 0;
|
||||
return SUCCESS;
|
||||
}
|
||||
|
||||
// a pathological input like "[[[[..." would generate len tape elements, so
|
||||
// need a capacity of at least len + 1, but it is also possible to do
|
||||
// a pathological input like "[[[[..." would generate capacity tape elements, so
|
||||
// need a capacity of at least capacity + 1, but it is also possible to do
|
||||
// worse with "[7,7,7,7,6,7,7,7,6,7,7,6,[7,7,7,7,6,7,7,7,6,7,7,6,7,7,7,7,7,7,6"
|
||||
//where len + 1 tape elements are
|
||||
//where capacity + 1 tape elements are
|
||||
// generated, see issue https://github.com/lemire/simdjson/issues/345
|
||||
size_t tape_capacity = SIMDJSON_ROUNDUP_N(capacity + 3, 64);
|
||||
// a document with only zero-length strings... could have len/3 string
|
||||
// and we would need len/3 * 5 bytes on the string buffer
|
||||
// a document with only zero-length strings... could have capacity/3 string
|
||||
// and we would need capacity/3 * 5 bytes on the string buffer
|
||||
size_t string_capacity = SIMDJSON_ROUNDUP_N(5 * capacity / 3 + SIMDJSON_PADDING, 64);
|
||||
string_buf.reset( new (std::nothrow) uint8_t[string_capacity]);
|
||||
tape.reset(new (std::nothrow) uint64_t[tape_capacity]);
|
||||
return string_buf && tape ? SUCCESS : MEMALLOC;
|
||||
if(!(string_buf && tape)) {
|
||||
allocated_capacity = 0;
|
||||
string_buf.reset();
|
||||
tape.reset();
|
||||
return MEMALLOC;
|
||||
}
|
||||
// Technically the allocated_capacity might be larger than capacity
|
||||
// so the next line is pessimistic.
|
||||
allocated_capacity = capacity;
|
||||
return SUCCESS;
|
||||
}
|
||||
|
||||
inline bool document::dump_raw_tape(std::ostream &os) const noexcept {
|
||||
|
|
|
@ -63,9 +63,27 @@ public:
|
|||
* Should be at least byte_capacity.
|
||||
*/
|
||||
std::unique_ptr<uint8_t[]> string_buf{};
|
||||
/** @private Allocate memory to support
|
||||
* input JSON documents of up to len bytes.
|
||||
*
|
||||
* When calling this function, you lose
|
||||
* all the data.
|
||||
*
|
||||
* The memory allocation is strict: you
|
||||
* can use this function to increase
|
||||
* or lower the amount of allocated memory.
|
||||
* Passing zero clears the memory.
|
||||
*/
|
||||
error_code allocate(size_t len) noexcept;
|
||||
/** @private Capacity in bytes, in terms
|
||||
* of how many bytes of input JSON we can
|
||||
* support.
|
||||
*/
|
||||
size_t capacity() const noexcept;
|
||||
|
||||
|
||||
private:
|
||||
inline error_code allocate(size_t len) noexcept;
|
||||
size_t allocated_capacity{0};
|
||||
friend class parser;
|
||||
}; // class document
|
||||
|
||||
|
|
|
@ -180,7 +180,6 @@ inline void document_stream::start() noexcept {
|
|||
if (error) { return; }
|
||||
error = parser->ensure_capacity(batch_size);
|
||||
if (error) { return; }
|
||||
|
||||
// Always run the first stage 1 parse immediately
|
||||
batch_start = 0;
|
||||
error = run_stage1(*parser, batch_start);
|
||||
|
|
|
@ -87,21 +87,39 @@ inline simdjson_result<document_stream> parser::load_many(const std::string &pat
|
|||
return document_stream(*this, reinterpret_cast<const uint8_t*>(loaded_bytes.get()), len, batch_size);
|
||||
}
|
||||
|
||||
inline simdjson_result<element> parser::parse(const uint8_t *buf, size_t len, bool realloc_if_needed) & noexcept {
|
||||
error_code _error = ensure_capacity(len);
|
||||
inline simdjson_result<element> parser::parse_into_document(document& provided_doc, const uint8_t *buf, size_t len, bool realloc_if_needed) & noexcept {
|
||||
// Important: we need to ensure that document has enough capacity.
|
||||
// Important: It is possible that provided_doc is actually the internal 'doc' within the parser!!!
|
||||
error_code _error = ensure_capacity(provided_doc, len);
|
||||
if (_error) { return _error; }
|
||||
std::unique_ptr<uint8_t[]> tmp_buf;
|
||||
|
||||
if (realloc_if_needed) {
|
||||
tmp_buf.reset(reinterpret_cast<uint8_t *>( internal::allocate_padded_buffer(len) ));
|
||||
if (tmp_buf.get() == nullptr) { return MEMALLOC; }
|
||||
std::memcpy(static_cast<void *>(tmp_buf.get()), buf, len);
|
||||
}
|
||||
_error = implementation->parse(realloc_if_needed ? tmp_buf.get() : buf, len, doc);
|
||||
_error = implementation->parse(realloc_if_needed ? tmp_buf.get() : buf, len, provided_doc);
|
||||
|
||||
if (_error) { return _error; }
|
||||
|
||||
return doc.root();
|
||||
return provided_doc.root();
|
||||
}
|
||||
|
||||
simdjson_really_inline simdjson_result<element> parser::parse_into_document(document& provided_doc, const char *buf, size_t len, bool realloc_if_needed) & noexcept {
|
||||
return parse_into_document(provided_doc, reinterpret_cast<const uint8_t *>(buf), len, realloc_if_needed);
|
||||
}
|
||||
simdjson_really_inline simdjson_result<element> parser::parse_into_document(document& provided_doc, const std::string &s) & noexcept {
|
||||
return parse_into_document(provided_doc, s.data(), s.length(), s.capacity() - s.length() < SIMDJSON_PADDING);
|
||||
}
|
||||
simdjson_really_inline simdjson_result<element> parser::parse_into_document(document& provided_doc, const padded_string &s) & noexcept {
|
||||
return parse_into_document(provided_doc, s.data(), s.length(), false);
|
||||
}
|
||||
|
||||
|
||||
inline simdjson_result<element> parser::parse(const uint8_t *buf, size_t len, bool realloc_if_needed) & noexcept {
|
||||
return parse_into_document(doc, buf, len, realloc_if_needed);
|
||||
}
|
||||
|
||||
simdjson_really_inline simdjson_result<element> parser::parse(const char *buf, size_t len, bool realloc_if_needed) & noexcept {
|
||||
return parse(reinterpret_cast<const uint8_t *>(buf), len, realloc_if_needed);
|
||||
}
|
||||
|
@ -139,26 +157,15 @@ simdjson_really_inline size_t parser::max_depth() const noexcept {
|
|||
simdjson_warn_unused
|
||||
inline error_code parser::allocate(size_t capacity, size_t max_depth) noexcept {
|
||||
//
|
||||
// Reallocate implementation and document if needed
|
||||
// Reallocate implementation if needed
|
||||
//
|
||||
error_code err;
|
||||
//
|
||||
// It is possible that we change max_depth without touching capacity, in
|
||||
// which case, we do not want to reallocate the document buffers.
|
||||
//
|
||||
bool need_doc_allocation{false};
|
||||
if (implementation) {
|
||||
need_doc_allocation = implementation->capacity() != capacity || !doc.tape;
|
||||
err = implementation->allocate(capacity, max_depth);
|
||||
} else {
|
||||
need_doc_allocation = true;
|
||||
err = simdjson::active_implementation->create_dom_parser_implementation(capacity, max_depth, implementation);
|
||||
}
|
||||
if (err) { return err; }
|
||||
if (need_doc_allocation) {
|
||||
err = doc.allocate(capacity);
|
||||
if (err) { return err; }
|
||||
}
|
||||
return SUCCESS;
|
||||
}
|
||||
|
||||
|
@ -168,22 +175,40 @@ inline bool parser::allocate_capacity(size_t capacity, size_t max_depth) noexcep
|
|||
return !allocate(capacity, max_depth);
|
||||
}
|
||||
#endif // SIMDJSON_DISABLE_DEPRECATED_API
|
||||
|
||||
inline error_code parser::ensure_capacity(size_t desired_capacity) noexcept {
|
||||
return ensure_capacity(doc, desired_capacity);
|
||||
}
|
||||
|
||||
|
||||
inline error_code parser::ensure_capacity(document& target_document, size_t desired_capacity) noexcept {
|
||||
// 1. It is wasteful to allocate a document and a parser for documents spanning less than MINIMAL_DOCUMENT_CAPACITY bytes.
|
||||
// 2. If we allow desired_capacity = 0 then it is possible to exit this function with implementation == nullptr.
|
||||
if(desired_capacity < MINIMAL_DOCUMENT_CAPACITY) { desired_capacity = MINIMAL_DOCUMENT_CAPACITY; }
|
||||
// If we don't have enough capacity, (try to) automatically bump it.
|
||||
// If the document was taken, reallocate that too.
|
||||
// If the document needs allocation, do it too.
|
||||
// Both in one if statement to minimize unlikely branching.
|
||||
if (simdjson_unlikely(capacity() < desired_capacity || !doc.tape)) {
|
||||
//
|
||||
// Note: we must make sure that this function is called if capacity() == 0. We do so because we
|
||||
// ensure that desired_capacity > 0.
|
||||
if (simdjson_unlikely(capacity() < desired_capacity || target_document.capacity() < desired_capacity)) {
|
||||
if (desired_capacity > max_capacity()) {
|
||||
return error = CAPACITY;
|
||||
}
|
||||
return allocate(desired_capacity, max_depth());
|
||||
error_code err1 = target_document.capacity() < desired_capacity ? target_document.allocate(desired_capacity) : SUCCESS;
|
||||
error_code err2 = capacity() < desired_capacity ? allocate(desired_capacity, max_depth()) : SUCCESS;
|
||||
if(err1 != SUCCESS) { return error = err1; }
|
||||
if(err2 != SUCCESS) { return error = err2; }
|
||||
}
|
||||
|
||||
return SUCCESS;
|
||||
}
|
||||
|
||||
/**
 * Set the largest capacity (in bytes of input JSON) the parser is allowed to grow to.
 *
 * A request below MINIMAL_DOCUMENT_CAPACITY is raised to MINIMAL_DOCUMENT_CAPACITY,
 * so that the limit can never be set to an absurdly small value (e.g., zero or two bytes).
 * Any other request is stored as-is.
 *
 * This function only records the limit in _max_capacity.
 *
 * @param max_capacity The requested maximum capacity, in bytes.
 */
simdjson_really_inline void parser::set_max_capacity(size_t max_capacity) noexcept {
  // Clamp from below. The previous code had the comparison inverted: it kept
  // tiny values unchanged and forced every value >= MINIMAL_DOCUMENT_CAPACITY
  // down to MINIMAL_DOCUMENT_CAPACITY, which made ensure_capacity() report
  // CAPACITY for any input larger than MINIMAL_DOCUMENT_CAPACITY bytes.
  _max_capacity = (max_capacity < MINIMAL_DOCUMENT_CAPACITY) ? MINIMAL_DOCUMENT_CAPACITY
                                                              : max_capacity;
}
|
||||
|
||||
} // namespace dom
|
||||
|
|
|
@ -31,6 +31,11 @@ static constexpr size_t DEFAULT_BATCH_SIZE = 1000000;
|
|||
*/
|
||||
static constexpr size_t MINIMAL_BATCH_SIZE = 32;
|
||||
|
||||
/**
|
||||
* It is wasteful to allocate memory for tiny documents (e.g., 4 bytes).
|
||||
*/
|
||||
static constexpr size_t MINIMAL_DOCUMENT_CAPACITY = 32;
|
||||
|
||||
/**
|
||||
* A persistent document parser.
|
||||
*
|
||||
|
@ -120,7 +125,7 @@ public:
|
|||
* Parse a JSON document and return a temporary reference to it.
|
||||
*
|
||||
* dom::parser parser;
|
||||
* element doc = parser.parse(buf, len);
|
||||
* element doc_root = parser.parse(buf, len);
|
||||
*
|
||||
* The function eagerly parses the input: the input can be modified and discarded after
|
||||
* the `parser.parse(buf, len)` call has completed.
|
||||
|
@ -174,7 +179,7 @@ public:
|
|||
* realloc_if_needed is true.
|
||||
* @param len The length of the JSON.
|
||||
* @param realloc_if_needed Whether to reallocate and enlarge the JSON buffer to add padding.
|
||||
* @return The document, or an error:
|
||||
* @return An element pointing at the root of the document, or an error:
|
||||
* - MEMALLOC if realloc_if_needed is true or the parser does not have enough capacity,
|
||||
* and memory allocation fails.
|
||||
* - CAPACITY if the parser does not have enough capacity and len > max_capacity.
|
||||
|
@ -196,6 +201,65 @@ public:
|
|||
/** @private We do not want to allow implicit conversion from C string to std::string. */
|
||||
simdjson_really_inline simdjson_result<element> parse(const char *buf) noexcept = delete;
|
||||
|
||||
/**
|
||||
* Parse a JSON document into a provided document instance and return a temporary reference to it.
|
||||
* It is similar to the function `parse` except that instead of parsing into the internal
|
||||
* `document` instance associated with the parser, it allows the user to provide a document
|
||||
* instance.
|
||||
*
|
||||
* dom::parser parser;
|
||||
* dom::document doc;
|
||||
* element doc_root = parser.parse_into_document(doc, buf, len);
|
||||
*
|
||||
* The function eagerly parses the input: the input can be modified and discarded after
|
||||
* the `parser.parse_into_document(doc, buf, len)` call has completed.
|
||||
*
|
||||
* ### IMPORTANT: Document Lifetime
|
||||
*
|
||||
* After the call to parse_into_document, the parser is no longer needed.
|
||||
*
|
||||
* The JSON document lives in the document instance: you must keep the document
|
||||
* instance alive while you navigate through it (i.e., use the returned value from
|
||||
* parse_into_document). You are encouraged to reuse the document instance
|
||||
* many times with new data to avoid reallocations:
|
||||
*
|
||||
* dom::document doc;
|
||||
* element doc_root1 = parser.parse_into_document(doc, buf1, len);
|
||||
* //... doc_root1 is a pointer inside doc
|
||||
* element doc_root2 = parser.parse_into_document(doc, buf2, len);
|
||||
* //... doc_root2 is a pointer inside doc
|
||||
* // at this point doc_root1 is no longer safe
|
||||
*
|
||||
* Moving the document instance is safe, but it invalidates the element instances. After
|
||||
* moving a document, you can recover safe access to the document root with its `root()` method.
|
||||
*
|
||||
* @param doc The document instance where the parsed data will be stored (on success).
|
||||
* @param buf The JSON to parse. Must have at least len + SIMDJSON_PADDING allocated bytes, unless
|
||||
* realloc_if_needed is true.
|
||||
* @param len The length of the JSON.
|
||||
* @param realloc_if_needed Whether to reallocate and enlarge the JSON buffer to add padding.
|
||||
* @return An element pointing at the root of the document, or an error:
|
||||
* - MEMALLOC if realloc_if_needed is true or the parser does not have enough capacity,
|
||||
* and memory allocation fails.
|
||||
* - CAPACITY if the parser does not have enough capacity and len > max_capacity.
|
||||
* - other json errors if parsing fails. You should not rely on these errors to always be the same for the
|
||||
* same document: they may vary under runtime dispatch (so they may vary depending on your system and hardware).
|
||||
*/
|
||||
inline simdjson_result<element> parse_into_document(document& doc, const uint8_t *buf, size_t len, bool realloc_if_needed = true) & noexcept;
|
||||
inline simdjson_result<element> parse_into_document(document& doc, const uint8_t *buf, size_t len, bool realloc_if_needed = true) && =delete;
|
||||
/** @overload parse_into_document(document& doc, const uint8_t *buf, size_t len, bool realloc_if_needed) */
|
||||
simdjson_really_inline simdjson_result<element> parse_into_document(document& doc, const char *buf, size_t len, bool realloc_if_needed = true) & noexcept;
|
||||
simdjson_really_inline simdjson_result<element> parse_into_document(document& doc, const char *buf, size_t len, bool realloc_if_needed = true) && =delete;
|
||||
/** @overload parse_into_document(document& doc, const uint8_t *buf, size_t len, bool realloc_if_needed) */
|
||||
simdjson_really_inline simdjson_result<element> parse_into_document(document& doc, const std::string &s) & noexcept;
|
||||
simdjson_really_inline simdjson_result<element> parse_into_document(document& doc, const std::string &s) && =delete;
|
||||
/** @overload parse_into_document(document& doc, const uint8_t *buf, size_t len, bool realloc_if_needed) */
|
||||
simdjson_really_inline simdjson_result<element> parse_into_document(document& doc, const padded_string &s) & noexcept;
|
||||
simdjson_really_inline simdjson_result<element> parse_into_document(document& doc, const padded_string &s) && =delete;
|
||||
|
||||
/** @private We do not want to allow implicit conversion from C string to std::string. */
|
||||
simdjson_really_inline simdjson_result<element> parse_into_document(document& doc, const char *buf) noexcept = delete;
|
||||
|
||||
/**
|
||||
* Load a file containing many JSON documents.
|
||||
*
|
||||
|
@ -430,6 +494,10 @@ public:
|
|||
* The parser may reallocate internal buffers as needed up to this amount as documents are passed
|
||||
* to it.
|
||||
*
|
||||
* Note: To avoid limiting the memory to an absurd value, such as zero or two bytes,
|
||||
* iff you try to set max_capacity to a value lower than MINIMAL_DOCUMENT_CAPACITY,
|
||||
* then the maximal capacity is set to MINIMAL_DOCUMENT_CAPACITY.
|
||||
*
|
||||
* This call will not allocate or deallocate, even if capacity is currently above max_capacity.
|
||||
*
|
||||
* @param max_capacity The new maximum capacity, in bytes.
|
||||
|
@ -515,9 +583,16 @@ private:
|
|||
|
||||
/**
|
||||
* Ensure we have enough capacity to handle at least desired_capacity bytes,
|
||||
* and auto-allocate if not.
|
||||
* and auto-allocate if not. This also allocates memory if needed in the
|
||||
* internal document.
|
||||
*/
|
||||
inline error_code ensure_capacity(size_t desired_capacity) noexcept;
|
||||
/**
|
||||
* Ensure we have enough capacity to handle at least desired_capacity bytes,
|
||||
* and auto-allocate if not. This also allocates memory if needed in the
|
||||
* provided document.
|
||||
*/
|
||||
inline error_code ensure_capacity(document& doc, size_t desired_capacity) noexcept;
|
||||
|
||||
/** Read the file into loaded_bytes */
|
||||
inline simdjson_result<size_t> read_file(const std::string &path) noexcept;
|
||||
|
|
|
@ -346,6 +346,73 @@ namespace parse_api_tests {
|
|||
}
|
||||
return true;
|
||||
}
|
||||
#if SIMDJSON_EXCEPTIONS
|
||||
bool issue679() {
|
||||
std::cout << "Running " << __func__ << std::endl;
|
||||
auto input = "[1, 2, 3]"_padded;
|
||||
dom::document doc;
|
||||
{
|
||||
dom::parser parser;
|
||||
element doc_root = parser.parse_into_document(doc, input);
|
||||
if(simdjson::to_string(doc_root) != "[1,2,3]") { return false; }
|
||||
// parser will go out of scope here.
|
||||
}
|
||||
if(simdjson::to_string(doc.root()) != "[1,2,3]") { return false; }
|
||||
dom::parser parser; // new parser
|
||||
element doc_root1 = parser.parse_into_document(doc, input);
|
||||
if(simdjson::to_string(doc_root1) != "[1,2,3]") { return false; }
|
||||
//... doc_root1 is a pointer inside doc
|
||||
element doc_root2 = parser.parse_into_document(doc, input);
|
||||
//... doc_root2 is a pointer inside doc
|
||||
if(simdjson::to_string(doc_root2) != "[1,2,3]") { return false; }
|
||||
|
||||
// Here let us try moving the document:
|
||||
dom::document docm = std::move(doc);
|
||||
element doc_root3 = docm.root();
|
||||
if(simdjson::to_string(doc_root3) != "[1,2,3]") { return false; }
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
//See https://github.com/simdjson/simdjson/issues/1332
|
||||
bool parser_moving_parser_and_recovering() {
|
||||
std::cout << "Running " << __func__ << std::endl;
|
||||
auto input = "[1, 2, 3]"_padded;
|
||||
auto parser = dom::parser{};
|
||||
dom::element root = parser.parse(input); // might throw
|
||||
auto parser2 = std::move(parser);
|
||||
root = parser2.doc.root();
|
||||
std::cout << simdjson::to_string(root) << std::endl;
|
||||
return simdjson::to_string(root) == "[1,2,3]";
|
||||
}
|
||||
// Some users want to parse the document and keep it for later.
|
||||
// Such users can then keep track of the state of the parser's document.
|
||||
struct moving_parser {
|
||||
dom::parser parser{};
|
||||
bool is_valid{false};
|
||||
simdjson::error_code parse(const padded_string & input) {
|
||||
auto answer = parser.parse(input).error();
|
||||
is_valid = !answer;
|
||||
return answer;
|
||||
}
|
||||
// result is invalidated when moving_parser is moved.
|
||||
dom::element get_root() {
|
||||
if(is_valid) { return parser.doc.root(); }
|
||||
throw std::runtime_error("no document");
|
||||
}
|
||||
};
|
||||
// Shows how to use moving_parser
|
||||
bool parser_moving_parser_and_recovering_struct() {
|
||||
std::cout << "Running " << __func__ << std::endl;
|
||||
auto input = "[1, 2, 3]"_padded;
|
||||
moving_parser mp{};
|
||||
mp.parse(input);// I could check the error here if I want
|
||||
auto mp2 = std::move(mp);
|
||||
auto root = mp2.get_root();// might throw if document was invalid
|
||||
std::cout << simdjson::to_string(root) << std::endl;
|
||||
return simdjson::to_string(root) == "[1,2,3]";
|
||||
}
|
||||
#endif
|
||||
bool parser_parse() {
|
||||
std::cout << "Running " << __func__ << std::endl;
|
||||
dom::parser parser;
|
||||
|
@ -530,10 +597,13 @@ namespace parse_api_tests {
|
|||
parser_load_many_deprecated() &&
|
||||
#endif
|
||||
#if SIMDJSON_EXCEPTIONS
|
||||
parser_moving_parser_and_recovering_struct() &&
|
||||
parser_moving_parser_and_recovering() &&
|
||||
parser_parse_exception() &&
|
||||
parser_parse_many_exception() &&
|
||||
parser_load_exception() &&
|
||||
parser_load_many_exception() &&
|
||||
issue679() &&
|
||||
#endif
|
||||
true;
|
||||
}
|
||||
|
|
|
@ -76,7 +76,7 @@ void basics_error_3() {
|
|||
for (dom::element elem : array) {
|
||||
dom::object obj;
|
||||
if ((error = elem.get(obj))) { cerr << error << endl; exit(1); }
|
||||
for (auto & key_value : obj) {
|
||||
for (auto key_value : obj) {
|
||||
cout << "key: " << key_value.key << " : ";
|
||||
dom::object innerobj;
|
||||
if ((error = key_value.value.get(innerobj))) { cerr << error << endl; exit(1); }
|
||||
|
|
Loading…
Reference in New Issue