Keep loaded_bytes in parser to reduce allocation
Also centralized memory ownership to make it easy to keep data around
parent c34b1a1b2a
commit 36ceaa4452
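Not part of the diff, but a sketch of the usage this change is aimed at: because loaded_bytes now lives in the parser, repeated load() calls reuse one padded buffer and only reallocate when a larger file shows up. The file names below are placeholders, and the structured-binding and error-streaming style simply mirrors the examples in the header docs further down; the exact result type may expose its document and error slightly differently.

```cpp
#include "simdjson/document.h"
#include <iostream>

int main() {
  simdjson::document::parser parser; // owns loaded_bytes across calls
  for (const char *path : {"a.json", "b.json", "c.json"}) { // placeholder file names
    // load() reads the file into the parser-owned loaded_bytes buffer and parses it
    // in place (realloc_if_needed = false); the buffer only grows when a file is
    // larger than everything seen so far, so steady-state loads allocate nothing.
    auto [doc, error] = parser.load(path);
    if (error) { std::cerr << path << ": " << error << std::endl; continue; }
    // The document lives inside the parser: use it before the next load().
    std::cout << path << " parsed OK" << std::endl;
  }
}
```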
@@ -1522,6 +1522,14 @@ private:
   //
   size_t _max_depth;
 
+  //
+  // The loaded buffer (reused each time load() is called)
+  //
+  std::unique_ptr<char[], decltype(&aligned_free_char)> loaded_bytes;
+
+  // Capacity of loaded_bytes buffer.
+  size_t _loaded_bytes_capacity{0};
+
   // all nodes are stored on the doc.tape using a 64-bit word.
   //
   // strings, double and ints are stored as
@@ -1543,6 +1551,11 @@ private:
   // and auto-allocate if not.
   inline error_code ensure_capacity(size_t desired_capacity) noexcept;
 
+  //
+  // Read the file into loaded_bytes
+  //
+  inline simdjson_result<size_t> read_file(const std::string &path) noexcept;
+
 #if SIMDJSON_EXCEPTIONS
   // Used internally to get the document
   inline const document &get_document() const noexcept(false);
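The new member pairs std::unique_ptr with a function-pointer deleter (decltype(&aligned_free_char)), which is also why the constructor hunk further down has to initialize it as loaded_bytes(nullptr, &aligned_free_char): a unique_ptr whose deleter is a function pointer cannot be default-constructed. A standalone sketch of the pattern, with stand-in allocation helpers (only the name aligned_free_char comes from the diff):

```cpp
#include <cstdlib>
#include <memory>

// Stand-ins for simdjson's aligned allocation helpers: the real buffer must be
// released with aligned_free_char(), never with delete[].
inline void aligned_free_char(char *p) { std::free(p); }
inline char *allocate_padded_buffer_demo(size_t len) {  // hypothetical helper
  return static_cast<char *>(std::malloc(len + 32));    // 32 stands in for SIMDJSON_PADDING
}

struct buffer_owner {
  // The deleter is part of the type, so the pointer can never be freed the wrong way.
  std::unique_ptr<char[], decltype(&aligned_free_char)> bytes;
  size_t capacity{0};

  // A function-pointer deleter has no default, so it must be supplied here,
  // exactly as the parser constructor now does for loaded_bytes.
  buffer_owner() : bytes(nullptr, &aligned_free_char) {}

  // Grow-only reuse: reallocate only when the requested size exceeds capacity.
  bool reserve(size_t len) {
    if (capacity >= len) { return true; }
    bytes.reset(allocate_padded_buffer_demo(len));
    if (!bytes) { return false; }
    capacity = len;
    return true;
  }
};
```

Keeping the deleter in the type means callers of load() never need to know how the padded buffer was allocated.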
@@ -1,522 +0,0 @@
-#ifndef SIMDJSON_DOCUMENT_PARSER_H
-#define SIMDJSON_DOCUMENT_PARSER_H
-
-#include "simdjson/document.h"
-#include "simdjson/common_defs.h"
-#include "simdjson/error.h"
-#include "simdjson/padded_string.h"
-#include <string>
-
-namespace simdjson {
-
-/**
- * A persistent document parser.
- *
- * Use this if you intend to parse more than one document. It holds the internal memory necessary
- * to do parsing, as well as memory for a single document that is overwritten on each parse.
- *
- * This class cannot be copied, only moved, to avoid unintended allocations.
- *
- * @note This is not thread safe: one parser cannot produce two documents at the same time!
- */
-class document::parser {
-public:
-  /**
-   * Create a JSON parser with zero capacity. Call allocate_capacity() to initialize it.
-   */
-  parser()=default;
-  ~parser()=default;
-
-  /**
-   * Take another parser's buffers and state.
-   *
-   * @param other The parser to take. Its capacity is zeroed.
-   */
-  parser(document::parser &&other) = default;
-  parser(const document::parser &) = delete; // Disallow copying
-  /**
-   * Take another parser's buffers and state.
-   *
-   * @param other The parser to take. Its capacity is zeroed.
-   */
-  parser &operator=(document::parser &&other) = default;
-  parser &operator=(const document::parser &) = delete; // Disallow copying
-
-  /**
-   * Parse a JSON document and return a reference to it.
-   *
-   * The JSON document still lives in the parser: this is the most efficient way to parse JSON
-   * documents because it reuses the same buffers, but you *must* use the document before you
-   * destroy the parser or call parse() again.
-   *
-   * The buffer must have at least SIMDJSON_PADDING extra allocated bytes. It does not matter what
-   * those bytes are initialized to, as long as they are allocated. If realloc_if_needed is true,
-   * it is assumed that the buffer does *not* have enough padding, and it is reallocated, enlarged
-   * and copied before parsing.
-   *
-   * @param buf The JSON to parse. Must have at least len + SIMDJSON_PADDING allocated bytes, unless
-   *            realloc_if_needed is true.
-   * @param len The length of the JSON.
-   * @param realloc_if_needed Whether to reallocate and enlarge the JSON buffer to add padding.
-   * @return the document, or an error if the JSON is invalid.
-   */
-  inline doc_ref_result parse(const uint8_t *buf, size_t len, bool realloc_if_needed = true) noexcept;
-
-  /**
-   * Parse a JSON document and return a reference to it.
-   *
-   * The JSON document still lives in the parser: this is the most efficient way to parse JSON
-   * documents because it reuses the same buffers, but you *must* use the document before you
-   * destroy the parser or call parse() again.
-   *
-   * The buffer must have at least SIMDJSON_PADDING extra allocated bytes. It does not matter what
-   * those bytes are initialized to, as long as they are allocated. If realloc_if_needed is true,
-   * it is assumed that the buffer does *not* have enough padding, and it is reallocated, enlarged
-   * and copied before parsing.
-   *
-   * @param buf The JSON to parse. Must have at least len + SIMDJSON_PADDING allocated bytes, unless
-   *            realloc_if_needed is true.
-   * @param len The length of the JSON.
-   * @param realloc_if_needed Whether to reallocate and enlarge the JSON buffer to add padding.
-   * @return the document, or an error if the JSON is invalid.
-   */
-  really_inline doc_ref_result parse(const char *buf, size_t len, bool realloc_if_needed = true) noexcept;
-
-  /**
-   * Parse a JSON document and return a reference to it.
-   *
-   * The JSON document still lives in the parser: this is the most efficient way to parse JSON
-   * documents because it reuses the same buffers, but you *must* use the document before you
-   * destroy the parser or call parse() again.
-   *
-   * The buffer must have at least SIMDJSON_PADDING extra allocated bytes. It does not matter what
-   * those bytes are initialized to, as long as they are allocated. If `str.capacity() - str.size()
-   * < SIMDJSON_PADDING`, the string will be copied to a string with larger capacity before parsing.
-   *
-   * @param s The JSON to parse. Must have at least len + SIMDJSON_PADDING allocated bytes, or
-   *          a new string will be created with the extra padding.
-   * @return the document, or an error if the JSON is invalid.
-   */
-  really_inline doc_ref_result parse(const std::string &s) noexcept;
-
-  /**
-   * Parse a JSON document and return a reference to it.
-   *
-   * The JSON document still lives in the parser: this is the most efficient way to parse JSON
-   * documents because it reuses the same buffers, but you *must* use the document before you
-   * destroy the parser or call parse() again.
-   *
-   * @param s The JSON to parse.
-   * @return the document, or an error if the JSON is invalid.
-   */
-  really_inline doc_ref_result parse(const padded_string &s) noexcept;
-
-  // We do not want to allow implicit conversion from C string to std::string.
-  really_inline doc_ref_result parse(const char *buf) noexcept = delete;
-
-  /**
-   * Parse a buffer containing many JSON documents.
-   *
-   *   document::parser parser;
-   *   for (const document &doc : parser.parse_many(buf, len)) {
-   *     cout << std::string(doc["title"]) << endl;
-   *   }
-   *
-   * ### Format
-   *
-   * The buffer must contain a series of one or more JSON documents, concatenated into a single
-   * buffer, separated by whitespace. It effectively parses until it has a fully valid document,
-   * then starts parsing the next document at that point. (It does this with more parallelism and
-   * lookahead than you might think, though.)
-   *
-   * documents that consist of an object or array may omit the whitespace between them, concatenating
-   * with no separator. documents that consist of a single primitive (i.e. documents that are not
-   * arrays or objects) MUST be separated with whitespace.
-   *
-   * ### Error Handling
-   *
-   * All errors are returned during iteration: if there is a global error such as memory allocation,
-   * it will be yielded as the first result. Iteration always stops after the first error.
-   *
-   * As with all other simdjson methods, non-exception error handling is readily available through
-   * the same interface, requiring you to check the error before using the document:
-   *
-   *   document::parser parser;
-   *   for (auto [doc, error] : parser.parse_many(buf, len)) {
-   *     if (error) { cerr << error << endl; exit(1); }
-   *     cout << std::string(doc["title"]) << endl;
-   *   }
-   *
-   * ### REQUIRED: Buffer Padding
-   *
-   * The buffer must have at least SIMDJSON_PADDING extra allocated bytes. It does not matter what
-   * those bytes are initialized to, as long as they are allocated.
-   *
-   * ### Threads
-   *
-   * When compiled with SIMDJSON_THREADS_ENABLED, this method will use a single thread under the
-   * hood to do some lookahead.
-   *
-   * ### Parser Capacity
-   *
-   * If the parser is unallocated, it will be auto-allocated to batch_size. If it is already
-   * allocated, it must have a capacity at least as large as batch_size.
-   *
-   * @param buf The concatenated JSON to parse. Must have at least len + SIMDJSON_PADDING allocated bytes.
-   * @param len The length of the concatenated JSON.
-   * @param batch_size The batch size to use. MUST be larger than the largest document. The sweet
-   *                   spot is cache-related: small enough to fit in cache, yet big enough to
-   *                   parse as many documents as possible in one tight loop.
-   *                   Defaults to 10MB, which has been a reasonable sweet spot in our tests.
-   * @return The stream. If there is an error, it will be returned during iteration. An empty input
-   *         will yield 0 documents rather than an EMPTY error. Errors:
-   *         - MEMALLOC if the parser is unallocated and memory allocation fails
-   *         - CAPACITY if the parser already has a capacity, and it is less than batch_size
-   *         - other json errors if parsing fails.
-   */
-  inline stream parse_many(const uint8_t *buf, size_t len, size_t batch_size = 1000000) noexcept;
-
-  /**
-   * Parse a buffer containing many JSON documents.
-   *
-   *   document::parser parser;
-   *   for (const document &doc : parser.parse_many(buf, len)) {
-   *     cout << std::string(doc["title"]) << endl;
-   *   }
-   *
-   * ### Format
-   *
-   * The buffer must contain a series of one or more JSON documents, concatenated into a single
-   * buffer, separated by whitespace. It effectively parses until it has a fully valid document,
-   * then starts parsing the next document at that point. (It does this with more parallelism and
-   * lookahead than you might think, though.)
-   *
-   * documents that consist of an object or array may omit the whitespace between them, concatenating
-   * with no separator. documents that consist of a single primitive (i.e. documents that are not
-   * arrays or objects) MUST be separated with whitespace.
-   *
-   * ### Error Handling
-   *
-   * All errors are returned during iteration: if there is a global error such as memory allocation,
-   * it will be yielded as the first result. Iteration always stops after the first error.
-   *
-   * As with all other simdjson methods, non-exception error handling is readily available through
-   * the same interface, requiring you to check the error before using the document:
-   *
-   *   document::parser parser;
-   *   for (auto [doc, error] : parser.parse_many(buf, len)) {
-   *     if (error) { cerr << error << endl; exit(1); }
-   *     cout << std::string(doc["title"]) << endl;
-   *   }
-   *
-   * ### REQUIRED: Buffer Padding
-   *
-   * The buffer must have at least SIMDJSON_PADDING extra allocated bytes. It does not matter what
-   * those bytes are initialized to, as long as they are allocated.
-   *
-   * ### Threads
-   *
-   * When compiled with SIMDJSON_THREADS_ENABLED, this method will use a single thread under the
-   * hood to do some lookahead.
-   *
-   * ### Parser Capacity
-   *
-   * If the parser is unallocated, it will be auto-allocated to batch_size. If it is already
-   * allocated, it must have a capacity at least as large as batch_size.
-   *
-   * @param buf The concatenated JSON to parse. Must have at least len + SIMDJSON_PADDING allocated bytes.
-   * @param len The length of the concatenated JSON.
-   * @param batch_size The batch size to use. MUST be larger than the largest document. The sweet
-   *                   spot is cache-related: small enough to fit in cache, yet big enough to
-   *                   parse as many documents as possible in one tight loop.
-   *                   Defaults to 10MB, which has been a reasonable sweet spot in our tests.
-   * @return The stream. If there is an error, it will be returned during iteration. An empty input
-   *         will yield 0 documents rather than an EMPTY error. Errors:
-   *         - MEMALLOC if the parser is unallocated and memory allocation fails
-   *         - CAPACITY if the parser already has a capacity, and it is less than batch_size
-   *         - other json errors if parsing fails
-   */
-  inline stream parse_many(const char *buf, size_t len, size_t batch_size = 1000000) noexcept;
-
-  /**
-   * Parse a buffer containing many JSON documents.
-   *
-   *   document::parser parser;
-   *   for (const document &doc : parser.parse_many(buf, len)) {
-   *     cout << std::string(doc["title"]) << endl;
-   *   }
-   *
-   * ### Format
-   *
-   * The buffer must contain a series of one or more JSON documents, concatenated into a single
-   * buffer, separated by whitespace. It effectively parses until it has a fully valid document,
-   * then starts parsing the next document at that point. (It does this with more parallelism and
-   * lookahead than you might think, though.)
-   *
-   * documents that consist of an object or array may omit the whitespace between them, concatenating
-   * with no separator. documents that consist of a single primitive (i.e. documents that are not
-   * arrays or objects) MUST be separated with whitespace.
-   *
-   * ### Error Handling
-   *
-   * All errors are returned during iteration: if there is a global error such as memory allocation,
-   * it will be yielded as the first result. Iteration always stops after the first error.
-   *
-   * As with all other simdjson methods, non-exception error handling is readily available through
-   * the same interface, requiring you to check the error before using the document:
-   *
-   *   document::parser parser;
-   *   for (auto [doc, error] : parser.parse_many(buf, len)) {
-   *     if (error) { cerr << error << endl; exit(1); }
-   *     cout << std::string(doc["title"]) << endl;
-   *   }
-   *
-   * ### REQUIRED: Buffer Padding
-   *
-   * The buffer must have at least SIMDJSON_PADDING extra allocated bytes. It does not matter what
-   * those bytes are initialized to, as long as they are allocated.
-   *
-   * ### Threads
-   *
-   * When compiled with SIMDJSON_THREADS_ENABLED, this method will use a single thread under the
-   * hood to do some lookahead.
-   *
-   * ### Parser Capacity
-   *
-   * If the parser is unallocated, it will be auto-allocated to batch_size. If it is already
-   * allocated, it must have a capacity at least as large as batch_size.
-   *
-   * @param s The concatenated JSON to parse. Must have at least len + SIMDJSON_PADDING allocated bytes.
-   * @param batch_size The batch size to use. MUST be larger than the largest document. The sweet
-   *                   spot is cache-related: small enough to fit in cache, yet big enough to
-   *                   parse as many documents as possible in one tight loop.
-   *                   Defaults to 10MB, which has been a reasonable sweet spot in our tests.
-   * @return The stream. If there is an error, it will be returned during iteration. An empty input
-   *         will yield 0 documents rather than an EMPTY error. Errors:
-   *         - MEMALLOC if the parser is unallocated and memory allocation fails
-   *         - CAPACITY if the parser already has a capacity, and it is less than batch_size
-   *         - other json errors if parsing fails
-   */
-  inline stream parse_many(const std::string &s, size_t batch_size = 1000000) noexcept;
-
-  /**
-   * Parse a buffer containing many JSON documents.
-   *
-   *   document::parser parser;
-   *   for (const document &doc : parser.parse_many(buf, len)) {
-   *     cout << std::string(doc["title"]) << endl;
-   *   }
-   *
-   * ### Format
-   *
-   * The buffer must contain a series of one or more JSON documents, concatenated into a single
-   * buffer, separated by whitespace. It effectively parses until it has a fully valid document,
-   * then starts parsing the next document at that point. (It does this with more parallelism and
-   * lookahead than you might think, though.)
-   *
-   * documents that consist of an object or array may omit the whitespace between them, concatenating
-   * with no separator. documents that consist of a single primitive (i.e. documents that are not
-   * arrays or objects) MUST be separated with whitespace.
-   *
-   * ### Error Handling
-   *
-   * All errors are returned during iteration: if there is a global error such as memory allocation,
-   * it will be yielded as the first result. Iteration always stops after the first error.
-   *
-   * As with all other simdjson methods, non-exception error handling is readily available through
-   * the same interface, requiring you to check the error before using the document:
-   *
-   *   document::parser parser;
-   *   for (auto [doc, error] : parser.parse_many(buf, len)) {
-   *     if (error) { cerr << error << endl; exit(1); }
-   *     cout << std::string(doc["title"]) << endl;
-   *   }
-   *
-   * ### REQUIRED: Buffer Padding
-   *
-   * The buffer must have at least SIMDJSON_PADDING extra allocated bytes. It does not matter what
-   * those bytes are initialized to, as long as they are allocated.
-   *
-   * ### Threads
-   *
-   * When compiled with SIMDJSON_THREADS_ENABLED, this method will use a single thread under the
-   * hood to do some lookahead.
-   *
-   * ### Parser Capacity
-   *
-   * If the parser is unallocated, it will be auto-allocated to batch_size. If it is already
-   * allocated, it must have a capacity at least as large as batch_size.
-   *
-   * @param s The concatenated JSON to parse.
-   * @param batch_size The batch size to use. MUST be larger than the largest document. The sweet
-   *                   spot is cache-related: small enough to fit in cache, yet big enough to
-   *                   parse as many documents as possible in one tight loop.
-   *                   Defaults to 10MB, which has been a reasonable sweet spot in our tests.
-   * @return The stream. If there is an error, it will be returned during iteration. An empty input
-   *         will yield 0 documents rather than an EMPTY error. Errors:
-   *         - MEMALLOC if the parser is unallocated and memory allocation fails
-   *         - CAPACITY if the parser already has a capacity, and it is less than batch_size
-   *         - other json errors if parsing fails
-   */
-  inline stream parse_many(const padded_string &s, size_t batch_size = 1000000) noexcept;
-
-  // We do not want to allow implicit conversion from C string to std::string.
-  really_inline doc_ref_result parse_many(const char *buf, size_t batch_size = 1000000) noexcept = delete;
-
-  /**
-   * Current capacity: the largest document this parser can support without reallocating.
-   */
-  really_inline size_t capacity() const noexcept;
-
-  /**
-   * The maximum level of nested object and arrays supported by this parser.
-   */
-  really_inline size_t max_depth() const noexcept;
-
-  /**
-   * Ensure this parser has enough memory to process JSON documents up to `capacity` bytes in length
-   * and `max_depth` depth.
-   */
-  WARN_UNUSED inline bool allocate_capacity(size_t capacity, size_t max_depth = DEFAULT_MAX_DEPTH);
-
-  // type aliases for backcompat
-  using Iterator = document::iterator;
-  using InvalidJSON = simdjson_error;
-
-  // Next location to write to in the tape
-  uint32_t current_loc{0};
-
-  // structural indices passed from stage 1 to stage 2
-  uint32_t n_structural_indexes{0};
-  std::unique_ptr<uint32_t[]> structural_indexes;
-
-  // location and return address of each open { or [
-  std::unique_ptr<uint32_t[]> containing_scope_offset;
-#ifdef SIMDJSON_USE_COMPUTED_GOTO
-  std::unique_ptr<void*[]> ret_address;
-#else
-  std::unique_ptr<char[]> ret_address;
-#endif
-
-  // Next place to write a string
-  uint8_t *current_string_buf_loc;
-
-  bool valid{false};
-  error_code error{UNINITIALIZED};
-
-  // Document we're writing to
-  document doc;
-
-  //
-  // TODO these are deprecated; use the results of parse instead.
-  //
-
-  // returns true if the document parsed was valid
-  inline bool is_valid() const noexcept;
-
-  // return an error code corresponding to the last parsing attempt, see
-  // simdjson.h; will return UNINITIALIZED if no parsing was attempted
-  inline int get_error_code() const noexcept;
-
-  // return the string equivalent of "get_error_code"
-  inline std::string get_error_message() const noexcept;
-
-  // print the json to std::ostream (should be valid)
-  // return false if the tape is likely wrong (e.g., you did not parse a valid
-  // JSON).
-  /** @deprecated Use cout << parser.parse() */
-  inline bool print_json(std::ostream &os) const noexcept;
-  inline bool dump_raw_tape(std::ostream &os) const noexcept;
-
-  //
-  // Parser callbacks: these are internal!
-  //
-  // TODO find a way to do this without exposing the interface or crippling performance
-  //
-
-  // this should be called when parsing (right before writing the tapes)
-  inline void init_stage2() noexcept;
-  really_inline error_code on_error(error_code new_error_code) noexcept;
-  really_inline error_code on_success(error_code success_code) noexcept;
-  really_inline bool on_start_document(uint32_t depth) noexcept;
-  really_inline bool on_start_object(uint32_t depth) noexcept;
-  really_inline bool on_start_array(uint32_t depth) noexcept;
-  // TODO we're not checking this bool
-  really_inline bool on_end_document(uint32_t depth) noexcept;
-  really_inline bool on_end_object(uint32_t depth) noexcept;
-  really_inline bool on_end_array(uint32_t depth) noexcept;
-  really_inline bool on_true_atom() noexcept;
-  really_inline bool on_false_atom() noexcept;
-  really_inline bool on_null_atom() noexcept;
-  really_inline uint8_t *on_start_string() noexcept;
-  really_inline bool on_end_string(uint8_t *dst) noexcept;
-  really_inline bool on_number_s64(int64_t value) noexcept;
-  really_inline bool on_number_u64(uint64_t value) noexcept;
-  really_inline bool on_number_double(double value) noexcept;
-  //
-  // Called before a parse is initiated.
-  //
-  // - Returns CAPACITY if the document is too large
-  // - Returns MEMALLOC if we needed to allocate memory and could not
-  //
-  WARN_UNUSED inline error_code init_parse(size_t len) noexcept;
-
-private:
-  //
-  // The maximum document length this parser supports.
-  //
-  // Buffers are large enough to handle any document up to this length.
-  //
-  size_t _capacity{0};
-
-  //
-  // The maximum depth (number of nested objects and arrays) supported by this parser.
-  //
-  // Defaults to DEFAULT_MAX_DEPTH.
-  //
-  size_t _max_depth{0};
-
-  // all nodes are stored on the doc.tape using a 64-bit word.
-  //
-  // strings, double and ints are stored as
-  // a 64-bit word with a pointer to the actual value
-  //
-  //
-  //
-  // for objects or arrays, store [ or { at the beginning and } and ] at the
-  // end. For the openings ([ or {), we annotate them with a reference to the
-  // location on the doc.tape of the end, and for the closings (} and ]), we
-  // annotate them with a reference to the location of the opening
-  //
-  //
-
-  inline void write_tape(uint64_t val, tape_type t) noexcept;
-  inline void annotate_previous_loc(uint32_t saved_loc, uint64_t val) noexcept;
-
-  //
-  // Set the current capacity: the largest document this parser can support without reallocating.
-  //
-  // This will allocate *or deallocate* as necessary.
-  //
-  // Returns false if allocation fails.
-  //
-  inline WARN_UNUSED bool set_capacity(size_t capacity);
-
-  //
-  // Set the maximum level of nested object and arrays supported by this parser.
-  //
-  // This will allocate *or deallocate* as necessary.
-  //
-  // Returns false if allocation fails.
-  //
-  inline WARN_UNUSED bool set_max_depth(size_t max_depth);
-
-  // Used internally to get the document
-  inline const document &get_document() const noexcept(false);
-
-  template<size_t max_depth> friend class document_iterator;
-}; // class parser
-
-} // namespace simdjson
-
-#endif // SIMDJSON_DOCUMENT_PARSER_H
@@ -9,6 +9,7 @@
 #include "simdjson/padded_string.h"
 #include "simdjson/internal/jsonformatutils.h"
 #include <iostream>
+#include <climits>
 
 namespace simdjson {
 
@@ -361,7 +362,7 @@ inline document::element_result document::doc_move_result::operator[](const char
 // document::parser inline implementation
 //
 really_inline document::parser::parser(size_t max_capacity, size_t max_depth) noexcept
-  : _max_capacity{max_capacity}, _max_depth{max_depth} {
+  : _max_capacity{max_capacity}, _max_depth{max_depth}, loaded_bytes(nullptr, &aligned_free_char) {
 
 }
 inline bool document::parser::is_valid() const noexcept { return valid; }
@@ -387,15 +388,54 @@ inline const document &document::parser::get_document() const noexcept(false) {
 
 #endif // SIMDJSON_EXCEPTIONS
 
+inline simdjson_result<size_t> document::parser::read_file(const std::string &path) noexcept {
+  // Open the file
+  std::FILE *fp = std::fopen(path.c_str(), "rb");
+  if (fp == nullptr) {
+    return IO_ERROR;
+  }
+
+  // Get the file size
+  if(std::fseek(fp, 0, SEEK_END) < 0) {
+    std::fclose(fp);
+    return IO_ERROR;
+  }
+  long len = std::ftell(fp);
+  if((len < 0) || (len == LONG_MAX)) {
+    std::fclose(fp);
+    return IO_ERROR;
+  }
+
+  // Make sure we have enough capacity to load the file
+  if (_loaded_bytes_capacity < size_t(len)) {
+    loaded_bytes.reset( internal::allocate_padded_buffer(len) );
+    if (!loaded_bytes) {
+      std::fclose(fp);
+      return MEMALLOC;
+    }
+    _loaded_bytes_capacity = len;
+  }
+
+  // Read the string
+  std::rewind(fp);
+  size_t bytes_read = std::fread(loaded_bytes.get(), 1, len, fp);
+  if (std::fclose(fp) != 0 || bytes_read != size_t(len)) {
+    return IO_ERROR;
+  }
+
+  return bytes_read;
+}
+
 inline document::doc_result document::parser::load(const std::string &path) noexcept {
-  auto [json, _error] = padded_string::load(path);
-  if (_error) { return doc_result(doc, _error); }
-  return parse(json);
+  auto [len, code] = read_file(path);
+  if (code) { return doc_result(doc, code); }
+
+  return parse(loaded_bytes.get(), len, false);
 }
 
 inline document::stream document::parser::load_many(const std::string &path, size_t batch_size) noexcept {
-  auto [json, _error] = padded_string::load(path);
-  return stream(*this, reinterpret_cast<const uint8_t*>(json.data()), json.length(), batch_size, _error);
+  auto [len, code] = read_file(path);
+  return stream(*this, (const uint8_t*)loaded_bytes.get(), len, batch_size, code);
 }
 
 inline document::doc_result document::parser::parse(const uint8_t *buf, size_t len, bool realloc_if_needed) noexcept {
@@ -480,7 +520,7 @@ inline error_code document::parser::set_capacity(size_t capacity) noexcept {
   // Initialize stage 1 output
   //
   uint32_t max_structures = ROUNDUP_N(capacity, 64) + 2 + 7;
-  structural_indexes.reset( new (std::nothrow) uint32_t[max_structures]); // TODO realloc
+  structural_indexes.reset( new (std::nothrow) uint32_t[max_structures] ); // TODO realloc
   if (!structural_indexes) {
     return MEMALLOC;
   }
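A closing usage note, not taken from the diff: load_many() now funnels an entire file of concatenated documents through the same parser-owned buffer via read_file(). The loop below mirrors the parse_many() examples in the removed header's docs; the exact iteration and result types may differ slightly, and records.ndjson is a placeholder file name.

```cpp
#include "simdjson/document.h"
#include <iostream>

int main() {
  simdjson::document::parser parser;
  // The file is read once into the parser's loaded_bytes buffer, then parsed in
  // batches; the buffer is reused by the next load()/load_many() call.
  for (auto [doc, error] : parser.load_many("records.ndjson")) {
    if (error) { std::cerr << error << std::endl; break; }
    std::cout << std::string(doc["title"]) << std::endl;
  }
}
```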