Create parser.parse_many() API
This commit is contained in:
parent b2220d6157
commit cfef4ff2ad
Makefile
@@ -65,7 +65,7 @@ SRCHEADERS_WESTMERE=src/westmere/bitmanipulation.h src/westmere/bitmask.h src/we
 SRCHEADERS_SRC=src/isadetection.h src/jsoncharutils.h src/simdprune_tables.h src/error.cpp src/jsonioutil.cpp src/implementation.cpp src/stage1_find_marks.cpp src/stage2_build_tape.cpp src/document_parser_callbacks.h
 SRCHEADERS=$(SRCHEADERS_SRC) $(SRCHEADERS_GENERIC) $(SRCHEADERS_ARM64) $(SRCHEADERS_HASWELL) $(SRCHEADERS_WESTMERE)

-INCLUDEHEADERS=include/simdjson.h include/simdjson/common_defs.h include/simdjson/internal/jsonformatutils.h include/simdjson/jsonioutil.h include/simdjson/jsonminifier.h include/simdjson/jsonparser.h include/simdjson/padded_string.h include/simdjson/document.h include/simdjson/inline/document.h include/simdjson/document_iterator.h include/simdjson/inline/document_iterator.h include/simdjson/implementation.h include/simdjson/parsedjson.h include/simdjson/jsonstream.h include/simdjson/inline/jsonstream.h include/simdjson/portability.h include/simdjson/error.h include/simdjson/simdjson.h include/simdjson/simdjson_version.h
+INCLUDEHEADERS=include/simdjson.h include/simdjson/common_defs.h include/simdjson/internal/jsonformatutils.h include/simdjson/jsonioutil.h include/simdjson/jsonminifier.h include/simdjson/jsonparser.h include/simdjson/padded_string.h include/simdjson/document.h include/simdjson/inline/document.h include/simdjson/document_iterator.h include/simdjson/inline/document_iterator.h include/simdjson/document_stream.h include/simdjson/inline/document_stream.h include/simdjson/implementation.h include/simdjson/parsedjson.h include/simdjson/jsonstream.h include/simdjson/inline/jsonstream.h include/simdjson/portability.h include/simdjson/error.h include/simdjson/simdjson.h include/simdjson/simdjson_version.h

 ifeq ($(SIMDJSON_TEST_AMALGAMATED_HEADERS),1)
 HEADERS=singleheader/simdjson.h
include/CMakeLists.txt
@@ -4,11 +4,13 @@ set(SIMDJSON_INCLUDE
 ${SIMDJSON_INCLUDE_DIR}/simdjson/common_defs.h
 ${SIMDJSON_INCLUDE_DIR}/simdjson/compiler_check.h
 ${SIMDJSON_INCLUDE_DIR}/simdjson/document_iterator.h
+${SIMDJSON_INCLUDE_DIR}/simdjson/document_stream.h
 ${SIMDJSON_INCLUDE_DIR}/simdjson/document.h
 ${SIMDJSON_INCLUDE_DIR}/simdjson/error.h
 ${SIMDJSON_INCLUDE_DIR}/simdjson/implementation.h
-${SIMDJSON_INCLUDE_DIR}/simdjson/inline/document.h
+${SIMDJSON_INCLUDE_DIR}/simdjson/inline/document_stream.h
 ${SIMDJSON_INCLUDE_DIR}/simdjson/inline/document_iterator.h
+${SIMDJSON_INCLUDE_DIR}/simdjson/inline/document.h
 ${SIMDJSON_INCLUDE_DIR}/simdjson/inline/jsonstream.h
 ${SIMDJSON_INCLUDE_DIR}/simdjson/internal/jsonformatutils.h
 ${SIMDJSON_INCLUDE_DIR}/simdjson/jsonioutil.h
include/simdjson.h
@@ -9,18 +9,20 @@
 #include "simdjson/padded_string.h"
 #include "simdjson/implementation.h"
 #include "simdjson/document.h"
-#include "simdjson/jsonstream.h"
+#include "simdjson/document_stream.h"
 #include "simdjson/jsonminifier.h"

 // Deprecated API
 #include "simdjson/parsedjsoniterator.h"
 #include "simdjson/jsonparser.h"
 #include "simdjson/parsedjson.h"
+#include "simdjson/jsonstream.h"
 #include "simdjson/document_iterator.h"

 // Inline functions
 #include "simdjson/inline/document.h"
 #include "simdjson/inline/document_iterator.h"
+#include "simdjson/inline/document_stream.h"
 #include "simdjson/inline/jsonstream.h"

 #endif // SIMDJSON_H
include/simdjson/document.h
@@ -52,11 +52,13 @@ public:
 class object;
 class key_value_pair;
 class parser;
+class stream;

 template<typename T=element>
 class element_result;
 class doc_result;
 class doc_ref_result;
+class stream_result;

 // Nested classes. See definitions later in file.
 using iterator = document_iterator<DEFAULT_MAX_DEPTH>;
@@ -315,6 +317,7 @@ public:
 private:
 doc_ref_result(document &_doc, error_code _error) noexcept;
 friend class document::parser;
+friend class document::stream;
 }; // class document::doc_ref_result

 /**
@@ -927,6 +930,255 @@ public:
 // We do not want to allow implicit conversion from C string to std::string.
 really_inline doc_ref_result parse(const char *buf) noexcept = delete;

+/**
+ * Parse a buffer containing many JSON documents.
+ *
+ *   document::parser parser;
+ *   for (const document &doc : parser.parse_many(buf, len)) {
+ *     cout << std::string(doc["title"]) << endl;
+ *   }
+ *
+ * ### Format
+ *
+ * The buffer must contain a series of one or more JSON documents, concatenated into a single
+ * buffer, separated by whitespace. It effectively parses until it has a fully valid document,
+ * then starts parsing the next document at that point. (It does this with more parallelism and
+ * lookahead than you might think, though.)
+ *
+ * Documents that consist of an object or array may omit the whitespace between them, concatenating
+ * with no separator. Documents that consist of a single primitive (i.e. documents that are not
+ * arrays or objects) MUST be separated with whitespace.
+ *
+ * ### Error Handling
+ *
+ * All errors are returned during iteration: if there is a global error such as memory allocation,
+ * it will be yielded as the first result. Iteration always stops after the first error.
+ *
+ * As with all other simdjson methods, non-exception error handling is readily available through
+ * the same interface, requiring you to check the error before using the document:
+ *
+ *   document::parser parser;
+ *   for (auto [doc, error] : parser.parse_many(buf, len)) {
+ *     if (error) { cerr << error_message(error) << endl; exit(1); }
+ *     cout << std::string(doc["title"]) << endl;
+ *   }
+ *
+ * ### REQUIRED: Buffer Padding
+ *
+ * The buffer must have at least SIMDJSON_PADDING extra allocated bytes. It does not matter what
+ * those bytes are initialized to, as long as they are allocated.
+ *
+ * ### Threads
+ *
+ * When compiled with SIMDJSON_THREADS_ENABLED, this method will use a single thread under the
+ * hood to do some lookahead.
+ *
+ * ### Parser Capacity
+ *
+ * If the parser is unallocated, it will be auto-allocated to batch_size. If it is already
+ * allocated, it must have a capacity at least as large as batch_size.
+ *
+ * @param buf The concatenated JSON to parse. Must have at least len + SIMDJSON_PADDING allocated bytes.
+ * @param len The length of the concatenated JSON.
+ * @param batch_size The batch size to use. MUST be larger than the largest document. The sweet
+ *                   spot is cache-related: small enough to fit in cache, yet big enough to
+ *                   parse as many documents as possible in one tight loop.
+ *                   Defaults to 10MB, which has been a reasonable sweet spot in our tests.
+ * @return The stream. If there is an error, it will be returned during iteration. An empty input
+ *         will yield 0 documents rather than an EMPTY error. Errors:
+ *         - MEMALLOC if the parser is unallocated and memory allocation fails
+ *         - CAPACITY if the parser already has a capacity, and it is less than batch_size
+ *         - other json errors if parsing fails.
+ */
+inline stream parse_many(const uint8_t *buf, size_t len, size_t batch_size = 1000000) noexcept;
+
+/**
+ * Parse a buffer containing many JSON documents.
+ *
+ *   document::parser parser;
+ *   for (const document &doc : parser.parse_many(buf, len)) {
+ *     cout << std::string(doc["title"]) << endl;
+ *   }
+ *
+ * ### Format
+ *
+ * The buffer must contain a series of one or more JSON documents, concatenated into a single
+ * buffer, separated by whitespace. It effectively parses until it has a fully valid document,
+ * then starts parsing the next document at that point. (It does this with more parallelism and
+ * lookahead than you might think, though.)
+ *
+ * Documents that consist of an object or array may omit the whitespace between them, concatenating
+ * with no separator. Documents that consist of a single primitive (i.e. documents that are not
+ * arrays or objects) MUST be separated with whitespace.
+ *
+ * ### Error Handling
+ *
+ * All errors are returned during iteration: if there is a global error such as memory allocation,
+ * it will be yielded as the first result. Iteration always stops after the first error.
+ *
+ * As with all other simdjson methods, non-exception error handling is readily available through
+ * the same interface, requiring you to check the error before using the document:
+ *
+ *   document::parser parser;
+ *   for (auto [doc, error] : parser.parse_many(buf, len)) {
+ *     if (error) { cerr << error_message(error) << endl; exit(1); }
+ *     cout << std::string(doc["title"]) << endl;
+ *   }
+ *
+ * ### REQUIRED: Buffer Padding
+ *
+ * The buffer must have at least SIMDJSON_PADDING extra allocated bytes. It does not matter what
+ * those bytes are initialized to, as long as they are allocated.
+ *
+ * ### Threads
+ *
+ * When compiled with SIMDJSON_THREADS_ENABLED, this method will use a single thread under the
+ * hood to do some lookahead.
+ *
+ * ### Parser Capacity
+ *
+ * If the parser is unallocated, it will be auto-allocated to batch_size. If it is already
+ * allocated, it must have a capacity at least as large as batch_size.
+ *
+ * @param buf The concatenated JSON to parse. Must have at least len + SIMDJSON_PADDING allocated bytes.
+ * @param len The length of the concatenated JSON.
+ * @param batch_size The batch size to use. MUST be larger than the largest document. The sweet
+ *                   spot is cache-related: small enough to fit in cache, yet big enough to
+ *                   parse as many documents as possible in one tight loop.
+ *                   Defaults to 10MB, which has been a reasonable sweet spot in our tests.
+ * @return The stream. If there is an error, it will be returned during iteration. An empty input
+ *         will yield 0 documents rather than an EMPTY error. Errors:
+ *         - MEMALLOC if the parser is unallocated and memory allocation fails
+ *         - CAPACITY if the parser already has a capacity, and it is less than batch_size
+ *         - other json errors if parsing fails.
+ */
+inline stream parse_many(const char *buf, size_t len, size_t batch_size = 1000000) noexcept;
+
+/**
+ * Parse a buffer containing many JSON documents.
+ *
+ *   document::parser parser;
+ *   for (const document &doc : parser.parse_many(buf, len)) {
+ *     cout << std::string(doc["title"]) << endl;
+ *   }
+ *
+ * ### Format
+ *
+ * The buffer must contain a series of one or more JSON documents, concatenated into a single
+ * buffer, separated by whitespace. It effectively parses until it has a fully valid document,
+ * then starts parsing the next document at that point. (It does this with more parallelism and
+ * lookahead than you might think, though.)
+ *
+ * Documents that consist of an object or array may omit the whitespace between them, concatenating
+ * with no separator. Documents that consist of a single primitive (i.e. documents that are not
+ * arrays or objects) MUST be separated with whitespace.
+ *
+ * ### Error Handling
+ *
+ * All errors are returned during iteration: if there is a global error such as memory allocation,
+ * it will be yielded as the first result. Iteration always stops after the first error.
+ *
+ * As with all other simdjson methods, non-exception error handling is readily available through
+ * the same interface, requiring you to check the error before using the document:
+ *
+ *   document::parser parser;
+ *   for (auto [doc, error] : parser.parse_many(buf, len)) {
+ *     if (error) { cerr << error_message(error) << endl; exit(1); }
+ *     cout << std::string(doc["title"]) << endl;
+ *   }
+ *
+ * ### REQUIRED: Buffer Padding
+ *
+ * The buffer must have at least SIMDJSON_PADDING extra allocated bytes. It does not matter what
+ * those bytes are initialized to, as long as they are allocated.
+ *
+ * ### Threads
+ *
+ * When compiled with SIMDJSON_THREADS_ENABLED, this method will use a single thread under the
+ * hood to do some lookahead.
+ *
+ * ### Parser Capacity
+ *
+ * If the parser is unallocated, it will be auto-allocated to batch_size. If it is already
+ * allocated, it must have a capacity at least as large as batch_size.
+ *
+ * @param s The concatenated JSON to parse. Must have at least len + SIMDJSON_PADDING allocated bytes.
+ * @param batch_size The batch size to use. MUST be larger than the largest document. The sweet
+ *                   spot is cache-related: small enough to fit in cache, yet big enough to
+ *                   parse as many documents as possible in one tight loop.
+ *                   Defaults to 10MB, which has been a reasonable sweet spot in our tests.
+ * @return The stream. If there is an error, it will be returned during iteration. An empty input
+ *         will yield 0 documents rather than an EMPTY error. Errors:
+ *         - MEMALLOC if the parser is unallocated and memory allocation fails
+ *         - CAPACITY if the parser already has a capacity, and it is less than batch_size
+ *         - other json errors if parsing fails.
+ */
+inline stream parse_many(const std::string &s, size_t batch_size = 1000000) noexcept;
+
+/**
+ * Parse a buffer containing many JSON documents.
+ *
+ *   document::parser parser;
+ *   for (const document &doc : parser.parse_many(buf, len)) {
+ *     cout << std::string(doc["title"]) << endl;
+ *   }
+ *
+ * ### Format
+ *
+ * The buffer must contain a series of one or more JSON documents, concatenated into a single
+ * buffer, separated by whitespace. It effectively parses until it has a fully valid document,
+ * then starts parsing the next document at that point. (It does this with more parallelism and
+ * lookahead than you might think, though.)
+ *
+ * Documents that consist of an object or array may omit the whitespace between them, concatenating
+ * with no separator. Documents that consist of a single primitive (i.e. documents that are not
+ * arrays or objects) MUST be separated with whitespace.
+ *
+ * ### Error Handling
+ *
+ * All errors are returned during iteration: if there is a global error such as memory allocation,
+ * it will be yielded as the first result. Iteration always stops after the first error.
+ *
+ * As with all other simdjson methods, non-exception error handling is readily available through
+ * the same interface, requiring you to check the error before using the document:
+ *
+ *   document::parser parser;
+ *   for (auto [doc, error] : parser.parse_many(buf, len)) {
+ *     if (error) { cerr << error_message(error) << endl; exit(1); }
+ *     cout << std::string(doc["title"]) << endl;
+ *   }
+ *
+ * ### REQUIRED: Buffer Padding
+ *
+ * The buffer must have at least SIMDJSON_PADDING extra allocated bytes. It does not matter what
+ * those bytes are initialized to, as long as they are allocated.
+ *
+ * ### Threads
+ *
+ * When compiled with SIMDJSON_THREADS_ENABLED, this method will use a single thread under the
+ * hood to do some lookahead.
+ *
+ * ### Parser Capacity
+ *
+ * If the parser is unallocated, it will be auto-allocated to batch_size. If it is already
+ * allocated, it must have a capacity at least as large as batch_size.
+ *
+ * @param s The concatenated JSON to parse.
+ * @param batch_size The batch size to use. MUST be larger than the largest document. The sweet
+ *                   spot is cache-related: small enough to fit in cache, yet big enough to
+ *                   parse as many documents as possible in one tight loop.
+ *                   Defaults to 10MB, which has been a reasonable sweet spot in our tests.
+ * @return The stream. If there is an error, it will be returned during iteration. An empty input
+ *         will yield 0 documents rather than an EMPTY error. Errors:
+ *         - MEMALLOC if the parser is unallocated and memory allocation fails
+ *         - CAPACITY if the parser already has a capacity, and it is less than batch_size
+ *         - other json errors if parsing fails.
+ */
+inline stream parse_many(const padded_string &s, size_t batch_size = 1000000) noexcept;
+
+// We do not want to allow implicit conversion from C string to std::string.
+really_inline doc_ref_result parse_many(const char *buf, size_t batch_size = 1000000) noexcept = delete;
+
 /**
  * Current capacity: the largest document this parser can support without reallocating.
  */
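For orientation, here is a complete minimal program in the style of the doxygen examples above. It is a sketch of intended usage, not code from the commit; the input string and its "title" fields are made up for illustration.

#include "simdjson.h"
#include <iostream>
#include <string>

int main() {
  // Three concatenated JSON documents separated by whitespace (illustrative input).
  simdjson::padded_string json(std::string("{\"title\":\"a\"} {\"title\":\"b\"} {\"title\":\"c\"}"));
  simdjson::document::parser parser;
  // Non-exception error handling, exactly as in the comment block above.
  for (auto [doc, error] : parser.parse_many(json)) {
    if (error) { std::cerr << simdjson::error_message(error) << std::endl; return 1; }
    std::cout << std::string(doc["title"]) << std::endl;
  }
  return 0;
}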
include/simdjson/document_stream.h (new file)
@@ -0,0 +1,155 @@
+#ifndef SIMDJSON_DOCUMENT_STREAM_H
+#define SIMDJSON_DOCUMENT_STREAM_H
+
+#include <thread>
+#include "simdjson/document.h"
+
+namespace simdjson {
+
+template <class string_container = padded_string> class JsonStream;
+
+/**
+ * A forward-only stream of documents.
+ *
+ * Produced by document::parser::parse_many.
+ */
+class document::stream {
+public:
+  really_inline ~stream() noexcept;
+
+  /**
+   * Take another stream's buffers and state.
+   *
+   * @param other The stream to take. Its capacity is zeroed.
+   */
+  stream(document::stream &&other) = default;
+  stream(const document::stream &) = delete; // Disallow copying
+  /**
+   * Take another stream's buffers and state.
+   *
+   * @param other The stream to take. Its capacity is zeroed.
+   */
+  stream &operator=(document::stream &&other) = default;
+  stream &operator=(const document::stream &) = delete; // Disallow copying
+
+  /**
+   * An iterator through a forward-only stream of documents.
+   */
+  class iterator {
+  public:
+    /**
+     * Get the current document (or error).
+     */
+    really_inline doc_ref_result operator*() noexcept;
+    /**
+     * Advance to the next document.
+     */
+    inline iterator& operator++() noexcept;
+    /**
+     * Check if we're at the end yet.
+     * @param other the end iterator to compare to.
+     */
+    really_inline bool operator!=(const iterator &other) const noexcept;
+  private:
+    iterator(stream& stream, bool finished) noexcept;
+    /** The stream parser we're iterating through. */
+    stream& _stream;
+    /** Whether we're finished or not. */
+    bool finished;
+    friend class stream;
+  };
+
+  /**
+   * Start iterating the documents in the stream.
+   */
+  really_inline iterator begin() noexcept;
+  /**
+   * The end of the stream, for iterator comparison purposes.
+   */
+  really_inline iterator end() noexcept;
+
+private:
+  really_inline stream(document::parser &parser, const uint8_t *buf, size_t len, size_t batch_size = 1000000) noexcept;
+
+  /**
+   * Parse the next document found in the buffer previously given to stream.
+   *
+   * The content should be a valid JSON document encoded as UTF-8. If there is a
+   * UTF-8 BOM, the caller is responsible for omitting it; UTF-8 BOMs are
+   * discouraged.
+   *
+   * You do NOT need to pre-allocate a parser. This function takes care of
+   * pre-allocating a capacity defined by the batch_size defined when creating the
+   * stream object.
+   *
+   * The function returns simdjson::SUCCESS_AND_HAS_MORE (an integer = 1) in case
+   * of success and indicates that the buffer still contains more data to be parsed,
+   * meaning this function can be called again to return the next JSON document
+   * after this one.
+   *
+   * The function returns simdjson::SUCCESS (as integer = 0) in case of success
+   * and indicates that the buffer has successfully been parsed to the end.
+   * Every document it contained has been parsed without error.
+   *
+   * The function returns an error code from simdjson/simdjson.h in case of failure
+   * such as simdjson::CAPACITY, simdjson::MEMALLOC, simdjson::DEPTH_ERROR and so forth;
+   * the simdjson::error_message function converts these error codes into a string.
+   *
+   * You can also check validity by calling parser.is_valid(). The same parser can
+   * and should be reused for the other documents in the buffer. */
+  inline error_code json_parse() noexcept;
+
+  /**
+   * Returns the location (index) of where the next document should be in the buffer.
+   * Can be used for debugging, it tells the user the position of the end of the
+   * last valid JSON document parsed.
+   */
+  inline size_t get_current_buffer_loc() const { return current_buffer_loc; }
+
+  /**
+   * Returns the total number of complete documents parsed by the JsonStream,
+   * in the current buffer, at the given time.
+   */
+  inline size_t get_n_parsed_docs() const { return n_parsed_docs; }
+
+  /**
+   * Returns the total amount of data (in bytes) parsed by the JsonStream,
+   * in the current buffer, at the given time.
+   */
+  inline size_t get_n_bytes_parsed() const { return n_bytes_parsed; }
+
+  inline const uint8_t *buf() const { return _buf + buf_start; }
+
+  inline void advance(size_t offset) { buf_start += offset; }
+
+  inline size_t remaining() const { return _len - buf_start; }
+
+  document::parser &parser;
+  const uint8_t *_buf;
+  const size_t _len;
+  size_t _batch_size; // this is actually variable!
+  size_t buf_start{0};
+  size_t next_json{0};
+  bool load_next_batch{true};
+  size_t current_buffer_loc{0};
+#ifdef SIMDJSON_THREADS_ENABLED
+  size_t last_json_buffer_loc{0};
+#endif
+  size_t n_parsed_docs{0};
+  size_t n_bytes_parsed{0};
+  error_code error{SUCCESS_AND_HAS_MORE};
+#ifdef SIMDJSON_THREADS_ENABLED
+  error_code stage1_is_ok_thread{SUCCESS};
+  std::thread stage_1_thread;
+  document::parser parser_thread;
+#endif
+  template <class string_container> friend class JsonStream;
+  friend class document::parser;
+}; // end of class document::stream
+
+} // end of namespace simdjson
+#endif // SIMDJSON_DOCUMENT_STREAM_H
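Since document::stream only exposes begin() and end() publicly, the range-for loops in the examples are the intended way to consume it. Roughly, such a loop desugars as follows. This is an illustrative sketch only; buf and len are assumed to be a suitably padded buffer and its length.

simdjson::document::parser parser;
simdjson::document::stream docs = parser.parse_many(buf, len);
for (auto it = docs.begin(); it != docs.end(); ++it) {
  simdjson::document::doc_ref_result result = *it;  // current document, or the first error
  // operator++ triggers json_parse() for the next document; operator!= only compares the
  // "finished" flags, so iteration ends right after the first result that is not
  // SUCCESS_AND_HAS_MORE.
}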
include/simdjson/inline/document.h
@@ -4,6 +4,7 @@
 // Inline implementations go in here.

 #include "simdjson/document.h"
+#include "simdjson/document_stream.h"
 #include "simdjson/implementation.h"
 #include "simdjson/internal/jsonformatutils.h"
 #include <iostream>
@@ -515,6 +516,19 @@ really_inline document::doc_ref_result document::parser::parse(const padded_stri
   return parse(s.data(), s.length(), false);
 }

+inline document::stream document::parser::parse_many(const uint8_t *buf, size_t len, size_t batch_size) noexcept {
+  return stream(*this, buf, len, batch_size);
+}
+inline document::stream document::parser::parse_many(const char *buf, size_t len, size_t batch_size) noexcept {
+  return parse_many((const uint8_t *)buf, len, batch_size);
+}
+inline document::stream document::parser::parse_many(const std::string &s, size_t batch_size) noexcept {
+  return parse_many(s.data(), s.length(), batch_size);
+}
+inline document::stream document::parser::parse_many(const padded_string &s, size_t batch_size) noexcept {
+  return parse_many(s.data(), s.length(), batch_size);
+}
+
 really_inline size_t document::parser::capacity() const noexcept {
   return _capacity;
 }
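As the forwarding above shows, all four overloads end up in the (const uint8_t *, size_t, size_t) version, so the calls below are interchangeable apart from who guarantees the SIMDJSON_PADDING bytes. A sketch, with `parser` and the string contents assumed:

simdjson::document::parser parser;
std::string s = "{\"a\":1} {\"a\":2}";             // caller must over-allocate by SIMDJSON_PADDING
simdjson::padded_string p(s);                      // padded_string provides the padding itself
auto stream_from_string  = parser.parse_many(s);
auto stream_from_padded  = parser.parse_many(p);
auto stream_from_pointer = parser.parse_many(p.data(), p.length());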
include/simdjson/inline/document_stream.h (new file)
@@ -0,0 +1,288 @@
+#ifndef SIMDJSON_INLINE_DOCUMENT_STREAM_H
+#define SIMDJSON_INLINE_DOCUMENT_STREAM_H
+
+#include "simdjson/jsonstream.h"
+#include <algorithm>
+#include <limits>
+#include <stdexcept>
+#include <thread>
+
+namespace simdjson::internal {
+
+/**
+ * This algorithm is used to quickly identify the buffer position of
+ * the last JSON document inside the current batch.
+ *
+ * It does its work by finding the last pair of structural characters
+ * that represent the end followed by the start of a document.
+ *
+ * Simply put, we iterate over the structural characters, starting from
+ * the end. We consider that we found the end of a JSON document when the
+ * first element of the pair is NOT one of these characters: '{' '[' ';' ','
+ * and when the second element is NOT one of these characters: '}' '}' ';' ','.
+ *
+ * This simple comparison works most of the time, but it does not cover cases
+ * where the batch's structural indexes contain a perfect amount of documents.
+ * In such a case, we do not have access to the structural index which follows
+ * the last document, therefore, we do not have access to the second element in
+ * the pair, and means that we cannot identify the last document. To fix this
+ * issue, we keep a count of the open and closed curly/square braces we found
+ * while searching for the pair. When we find a pair AND the count of open and
+ * closed curly/square braces is the same, we know that we just passed a complete
+ * document, therefore the last json buffer location is the end of the batch.
+ */
+inline size_t find_last_json_buf_idx(const uint8_t *buf, size_t size, const document::parser &parser) {
+  // this function can be generally useful
+  if (parser.n_structural_indexes == 0)
+    return 0;
+  auto last_i = parser.n_structural_indexes - 1;
+  if (parser.structural_indexes[last_i] == size) {
+    if (last_i == 0)
+      return 0;
+    last_i = parser.n_structural_indexes - 2;
+  }
+  auto arr_cnt = 0;
+  auto obj_cnt = 0;
+  for (auto i = last_i; i > 0; i--) {
+    auto idxb = parser.structural_indexes[i];
+    switch (buf[idxb]) {
+    case ':':
+    case ',':
+      continue;
+    case '}':
+      obj_cnt--;
+      continue;
+    case ']':
+      arr_cnt--;
+      continue;
+    case '{':
+      obj_cnt++;
+      break;
+    case '[':
+      arr_cnt++;
+      break;
+    }
+    auto idxa = parser.structural_indexes[i - 1];
+    switch (buf[idxa]) {
+    case '{':
+    case '[':
+    case ':':
+    case ',':
+      continue;
+    }
+    if (!arr_cnt && !obj_cnt) {
+      return last_i + 1;
+    }
+    return i;
+  }
+  return 0;
+}
+
+// returns true if the provided byte value is an ASCII character
+static inline bool is_ascii(char c) {
+  return ((unsigned char)c) <= 127;
+}
+
+// if the string ends with UTF-8 values, backtrack
+// up to the first ASCII character. May return 0.
+static inline size_t trimmed_length_safe_utf8(const char * c, size_t len) {
+  while ((len > 0) and (not is_ascii(c[len - 1]))) {
+    len--;
+  }
+  return len;
+}
+
+} // namespace simdjson::internal
+
+namespace simdjson {
+
+really_inline document::stream::stream(
+  document::parser &_parser,
+  const uint8_t *buf,
+  size_t len,
+  size_t batch_size
+) noexcept : parser{_parser}, _buf{buf}, _len{len}, _batch_size(batch_size) {
+  error = json_parse();
+}
+
+inline document::stream::~stream() noexcept {
+#ifdef SIMDJSON_THREADS_ENABLED
+  if (stage_1_thread.joinable()) {
+    stage_1_thread.join();
+  }
+#endif
+}
+
+really_inline document::stream::iterator document::stream::begin() noexcept {
+  return iterator(*this, false);
+}
+
+really_inline document::stream::iterator document::stream::end() noexcept {
+  return iterator(*this, true);
+}
+
+really_inline document::stream::iterator::iterator(stream& stream, bool _is_end) noexcept
+  : _stream{stream}, finished{_is_end} {
+}
+
+really_inline document::doc_ref_result document::stream::iterator::operator*() noexcept {
+  return doc_ref_result(_stream.parser.doc, _stream.error == SUCCESS_AND_HAS_MORE ? SUCCESS : _stream.error);
+}
+
+really_inline document::stream::iterator& document::stream::iterator::operator++() noexcept {
+  if (_stream.error == SUCCESS_AND_HAS_MORE) {
+    _stream.error = _stream.json_parse();
+  } else {
+    finished = true;
+  }
+  return *this;
+}
+
+really_inline bool document::stream::iterator::operator!=(const document::stream::iterator &other) const noexcept {
+  return finished != other.finished;
+}
+
+#ifdef SIMDJSON_THREADS_ENABLED
+
+// threaded version of json_parse
+// todo: simplify this code further
+inline error_code document::stream::json_parse() noexcept {
+  // TODO we should bump the parser *anytime* capacity is less than batch size, not just 0.
+  if (unlikely(parser.capacity() == 0)) {
+    const bool allocok = parser.allocate_capacity(_batch_size);
+    if (!allocok) {
+      return simdjson::MEMALLOC;
+    }
+  } else if (unlikely(parser.capacity() < _batch_size)) {
+    return simdjson::CAPACITY;
+  }
+  if (unlikely(parser_thread.capacity() < _batch_size)) {
+    const bool allocok_thread = parser_thread.allocate_capacity(_batch_size);
+    if (!allocok_thread) {
+      return simdjson::MEMALLOC;
+    }
+  }
+  if (unlikely(load_next_batch)) {
+    // First time loading
+    if (!stage_1_thread.joinable()) {
+      _batch_size = (std::min)(_batch_size, remaining());
+      _batch_size = internal::trimmed_length_safe_utf8((const char *)buf(), _batch_size);
+      if (_batch_size == 0) {
+        return simdjson::UTF8_ERROR;
+      }
+      auto stage1_is_ok = error_code(simdjson::active_implementation->stage1(buf(), _batch_size, parser, true));
+      if (stage1_is_ok != simdjson::SUCCESS) {
+        return stage1_is_ok;
+      }
+      size_t last_index = internal::find_last_json_buf_idx(buf(), _batch_size, parser);
+      if (last_index == 0) {
+        if (parser.n_structural_indexes == 0) {
+          return simdjson::EMPTY;
+        }
+      } else {
+        parser.n_structural_indexes = last_index + 1;
+      }
+    }
+    // the second thread is running or done.
+    else {
+      stage_1_thread.join();
+      if (stage1_is_ok_thread != simdjson::SUCCESS) {
+        return stage1_is_ok_thread;
+      }
+      std::swap(parser.structural_indexes, parser_thread.structural_indexes);
+      parser.n_structural_indexes = parser_thread.n_structural_indexes;
+      advance(last_json_buffer_loc);
+      n_bytes_parsed += last_json_buffer_loc;
+    }
+    // let us decide whether we will start a new thread
+    if (remaining() - _batch_size > 0) {
+      last_json_buffer_loc =
+          parser.structural_indexes[internal::find_last_json_buf_idx(buf(), _batch_size, parser)];
+      _batch_size = (std::min)(_batch_size, remaining() - last_json_buffer_loc);
+      if (_batch_size > 0) {
+        _batch_size = internal::trimmed_length_safe_utf8(
+            (const char *)(buf() + last_json_buffer_loc), _batch_size);
+        if (_batch_size == 0) {
+          return simdjson::UTF8_ERROR;
+        }
+        // let us capture read-only variables
+        const uint8_t *const b = buf() + last_json_buffer_loc;
+        const size_t bs = _batch_size;
+        // we call the thread on a lambda that will update
+        // this->stage1_is_ok_thread
+        // there is only one thread that may write to this value
+        stage_1_thread = std::thread([this, b, bs] {
+          this->stage1_is_ok_thread = error_code(simdjson::active_implementation->stage1(b, bs, this->parser_thread, true));
+        });
+      }
+    }
+    next_json = 0;
+    load_next_batch = false;
+  } // load_next_batch
+  error_code res = simdjson::active_implementation->stage2(buf(), remaining(), parser, next_json);
+  if (res == simdjson::SUCCESS_AND_HAS_MORE) {
+    n_parsed_docs++;
+    current_buffer_loc = parser.structural_indexes[next_json];
+    load_next_batch = (current_buffer_loc == last_json_buffer_loc);
+  } else if (res == simdjson::SUCCESS) {
+    n_parsed_docs++;
+    if (remaining() > _batch_size) {
+      current_buffer_loc = parser.structural_indexes[next_json - 1];
+      load_next_batch = true;
+      res = simdjson::SUCCESS_AND_HAS_MORE;
+    }
+  }
+  return res;
+}
+
+#else // SIMDJSON_THREADS_ENABLED
+
+// single-threaded version of json_parse
+inline error_code document::stream::json_parse() noexcept {
+  if (unlikely(parser.capacity() == 0)) {
+    const bool allocok = parser.allocate_capacity(_batch_size);
+    if (!allocok) {
+      return MEMALLOC;
+    }
+  } else if (unlikely(parser.capacity() < _batch_size)) {
+    return CAPACITY;
+  }
+  if (unlikely(load_next_batch)) {
+    advance(current_buffer_loc);
+    n_bytes_parsed += current_buffer_loc;
+    _batch_size = (std::min)(_batch_size, remaining());
+    _batch_size = internal::trimmed_length_safe_utf8((const char *)buf(), _batch_size);
+    auto stage1_is_ok = (error_code)simdjson::active_implementation->stage1(buf(), _batch_size, parser, true);
+    if (stage1_is_ok != simdjson::SUCCESS) {
+      return stage1_is_ok;
+    }
+    size_t last_index = internal::find_last_json_buf_idx(buf(), _batch_size, parser);
+    if (last_index == 0) {
+      if (parser.n_structural_indexes == 0) {
+        return EMPTY;
+      }
+    } else {
+      parser.n_structural_indexes = last_index + 1;
+    }
+    load_next_batch = false;
+  } // load_next_batch
+  error_code res = simdjson::active_implementation->stage2(buf(), remaining(), parser, next_json);
+  if (likely(res == simdjson::SUCCESS_AND_HAS_MORE)) {
+    n_parsed_docs++;
+    current_buffer_loc = parser.structural_indexes[next_json];
+  } else if (res == simdjson::SUCCESS) {
+    n_parsed_docs++;
+    if (remaining() > _batch_size) {
+      current_buffer_loc = parser.structural_indexes[next_json - 1];
+      next_json = 1;
+      load_next_batch = true;
+      res = simdjson::SUCCESS_AND_HAS_MORE;
+    }
+  }
+  return res;
+}
+#endif // SIMDJSON_THREADS_ENABLED
+
+} // end of namespace simdjson
+#endif // SIMDJSON_INLINE_DOCUMENT_STREAM_H
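To make the UTF-8 trimming above concrete, here is a tiny stand-alone illustration. The two helpers are adapted from the block above (spelled with && and ! instead of and/not); the input string is made up.

#include <cstddef>
#include <cstdio>

static inline bool is_ascii(char c) { return ((unsigned char)c) <= 127; }

static inline size_t trimmed_length_safe_utf8(const char *c, size_t len) {
  while ((len > 0) && !is_ascii(c[len - 1])) { len--; }
  return len;
}

int main() {
  // "é" is the two-byte sequence 0xC3 0xA9; a batch cut at byte 7 would split it,
  // so the batch is trimmed back to the last ASCII byte and the remaining bytes
  // are handled with the next batch.
  const char json[] = "{\"k\":\"é\"}";
  std::printf("%zu\n", trimmed_length_safe_utf8(json, 7)); // prints 6
  return 0;
}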
include/simdjson/inline/jsonstream.h
@@ -1,262 +1,35 @@
Removed (old contents of the file):
-#ifndef SIMDJSON_INLINE_JSONSTREAM_H
-#define SIMDJSON_INLINE_JSONSTREAM_H
-
-#include "simdjson/jsonstream.h"
-#include <algorithm>
-#include <limits>
-#include <stdexcept>
-#include <thread>
-
-namespace simdjson::internal {
-
-/* This algorithm is used to quickly identify the buffer position of
- * the last JSON document inside the current batch.
- *
- * It does its work by finding the last pair of structural characters
- * that represent the end followed by the start of a document.
- *
- * Simply put, we iterate over the structural characters, starting from
- * the end. We consider that we found the end of a JSON document when the
- * first element of the pair is NOT one of these characters: '{' '[' ';' ','
- * and when the second element is NOT one of these characters: '}' '}' ';' ','.
- *
- * This simple comparison works most of the time, but it does not cover cases
- * where the batch's structural indexes contain a perfect amount of documents.
- * In such a case, we do not have access to the structural index which follows
- * the last document, therefore, we do not have access to the second element in
- * the pair, and means that we cannot identify the last document. To fix this
- * issue, we keep a count of the open and closed curly/square braces we found
- * while searching for the pair. When we find a pair AND the count of open and
- * closed curly/square braces is the same, we know that we just passed a complete
- * document, therefore the last json buffer location is the end of the batch.
- * */
-inline size_t find_last_json_buf_idx(const uint8_t *buf, size_t size, const document::parser &parser) {
-  // this function can be generally useful
-  if (parser.n_structural_indexes == 0)
-    return 0;
-  auto last_i = parser.n_structural_indexes - 1;
-  if (parser.structural_indexes[last_i] == size) {
-    if (last_i == 0)
-      return 0;
-    last_i = parser.n_structural_indexes - 2;
-  }
-  auto arr_cnt = 0;
-  auto obj_cnt = 0;
-  for (auto i = last_i; i > 0; i--) {
-    auto idxb = parser.structural_indexes[i];
-    switch (buf[idxb]) {
-    case ':':
-    case ',':
-      continue;
-    case '}':
-      obj_cnt--;
-      continue;
-    case ']':
-      arr_cnt--;
-      continue;
-    case '{':
-      obj_cnt++;
-      break;
-    case '[':
-      arr_cnt++;
-      break;
-    }
-    auto idxa = parser.structural_indexes[i - 1];
-    switch (buf[idxa]) {
-    case '{':
-    case '[':
-    case ':':
-    case ',':
-      continue;
-    }
-    if (!arr_cnt && !obj_cnt) {
-      return last_i + 1;
-    }
-    return i;
-  }
-  return 0;
-}
-
-// returns true if the provided byte value is an ASCII character
-static inline bool is_ascii(char c) {
-  return ((unsigned char)c) <= 127;
-}
-
-// if the string ends with UTF-8 values, backtrack
-// up to the first ASCII character. May return 0.
-static inline size_t trimmed_length_safe_utf8(const char * c, size_t len) {
-  while ((len > 0) and (not is_ascii(c[len - 1]))) {
-    len--;
-  }
-  return len;
-}
-
-} // namespace simdjson::internal
-
-namespace simdjson {
-
-template <class string_container>
-JsonStream<string_container>::JsonStream(const string_container &s,
-                                         size_t batchSize)
-    : str(s), _batch_size(batchSize) {
-}
-
-template <class string_container> JsonStream<string_container>::~JsonStream() {
-#ifdef SIMDJSON_THREADS_ENABLED
-  if (stage_1_thread.joinable()) {
-    stage_1_thread.join();
-  }
-#endif
-}
-
-#ifdef SIMDJSON_THREADS_ENABLED
-
-// threaded version of json_parse
-// todo: simplify this code further
-template <class string_container>
-int JsonStream<string_container>::json_parse(document::parser &parser) {
-  if (unlikely(parser.capacity() == 0)) {
-    const bool allocok = parser.allocate_capacity(_batch_size);
-    if (!allocok) {
-      return parser.error = simdjson::MEMALLOC;
-    }
-  } else if (unlikely(parser.capacity() < _batch_size)) {
-    return parser.error = simdjson::CAPACITY;
-  }
-  if (unlikely(parser_thread.capacity() < _batch_size)) {
-    const bool allocok_thread = parser_thread.allocate_capacity(_batch_size);
-    if (!allocok_thread) {
-      return parser.error = simdjson::MEMALLOC;
-    }
-  }
-  if (unlikely(load_next_batch)) {
-    // First time loading
-    if (!stage_1_thread.joinable()) {
-      _batch_size = (std::min)(_batch_size, remaining());
-      _batch_size = internal::trimmed_length_safe_utf8((const char *)buf(), _batch_size);
-      if (_batch_size == 0) {
-        return parser.error = simdjson::UTF8_ERROR;
-      }
-      auto stage1_is_ok = error_code(simdjson::active_implementation->stage1(buf(), _batch_size, parser, true));
-      if (stage1_is_ok != simdjson::SUCCESS) {
-        return parser.error = stage1_is_ok;
-      }
-      size_t last_index = internal::find_last_json_buf_idx(buf(), _batch_size, parser);
-      if (last_index == 0) {
-        if (parser.n_structural_indexes == 0) {
-          return parser.error = simdjson::EMPTY;
-        }
-      } else {
-        parser.n_structural_indexes = last_index + 1;
-      }
-    }
-    // the second thread is running or done.
-    else {
-      stage_1_thread.join();
-      if (stage1_is_ok_thread != simdjson::SUCCESS) {
-        return parser.error = stage1_is_ok_thread;
-      }
-      std::swap(parser.structural_indexes, parser_thread.structural_indexes);
-      parser.n_structural_indexes = parser_thread.n_structural_indexes;
-      advance(last_json_buffer_loc);
-      n_bytes_parsed += last_json_buffer_loc;
-    }
-    // let us decide whether we will start a new thread
-    if (remaining() - _batch_size > 0) {
-      last_json_buffer_loc =
-          parser.structural_indexes[internal::find_last_json_buf_idx(buf(), _batch_size, parser)];
-      _batch_size = (std::min)(_batch_size, remaining() - last_json_buffer_loc);
-      if (_batch_size > 0) {
-        _batch_size = internal::trimmed_length_safe_utf8(
-            (const char *)(buf() + last_json_buffer_loc), _batch_size);
-        if (_batch_size == 0) {
-          return parser.error = simdjson::UTF8_ERROR;
-        }
-        // let us capture read-only variables
-        const uint8_t *const b = buf() + last_json_buffer_loc;
-        const size_t bs = _batch_size;
-        // we call the thread on a lambda that will update
-        // this->stage1_is_ok_thread
-        // there is only one thread that may write to this value
-        stage_1_thread = std::thread([this, b, bs] {
-          this->stage1_is_ok_thread = error_code(simdjson::active_implementation->stage1(b, bs, this->parser_thread, true));
-        });
-      }
-    }
-    next_json = 0;
-    load_next_batch = false;
-  } // load_next_batch
-  int res = simdjson::active_implementation->stage2(buf(), remaining(), parser, next_json);
-  if (res == simdjson::SUCCESS_AND_HAS_MORE) {
-    n_parsed_docs++;
-    current_buffer_loc = parser.structural_indexes[next_json];
-    load_next_batch = (current_buffer_loc == last_json_buffer_loc);
-  } else if (res == simdjson::SUCCESS) {
-    n_parsed_docs++;
-    if (remaining() > _batch_size) {
-      current_buffer_loc = parser.structural_indexes[next_json - 1];
-      load_next_batch = true;
-      res = simdjson::SUCCESS_AND_HAS_MORE;
-    }
-  }
-  return res;
-}
-
-#else // SIMDJSON_THREADS_ENABLED
-
-// single-threaded version of json_parse
-template <class string_container>
-int JsonStream<string_container>::json_parse(document::parser &parser) {
-  if (unlikely(parser.capacity() == 0)) {
-    const bool allocok = parser.allocate_capacity(_batch_size);
-    if (!allocok) {
-      parser.valid = false;
-      return parser.error = MEMALLOC;
-    }
-  } else if (unlikely(parser.capacity() < _batch_size)) {
-    parser.valid = false;
-    return parser.error = CAPACITY;
-  }
-  if (unlikely(load_next_batch)) {
-    advance(current_buffer_loc);
-    n_bytes_parsed += current_buffer_loc;
-    _batch_size = (std::min)(_batch_size, remaining());
-    _batch_size = internal::trimmed_length_safe_utf8((const char *)buf(), _batch_size);
-    auto stage1_is_ok = (error_code)simdjson::active_implementation->stage1(buf(), _batch_size, parser, true);
-    if (stage1_is_ok != simdjson::SUCCESS) {
-      parser.valid = false;
-      return parser.error = stage1_is_ok;
-    }
-    size_t last_index = internal::find_last_json_buf_idx(buf(), _batch_size, parser);
-    if (last_index == 0) {
-      if (parser.n_structural_indexes == 0) {
-        parser.valid = false;
-        return parser.error = EMPTY;
-      }
-    } else {
-      parser.n_structural_indexes = last_index + 1;
-    }
-    load_next_batch = false;
-  } // load_next_batch
-  int res = simdjson::active_implementation->stage2(buf(), remaining(), parser, next_json);
-  if (likely(res == simdjson::SUCCESS_AND_HAS_MORE)) {
-    n_parsed_docs++;
-    current_buffer_loc = parser.structural_indexes[next_json];
-  } else if (res == simdjson::SUCCESS) {
-    n_parsed_docs++;
-    if (remaining() > _batch_size) {
-      current_buffer_loc = parser.structural_indexes[next_json - 1];
-      next_json = 1;
-      load_next_batch = true;
-      res = simdjson::SUCCESS_AND_HAS_MORE;
-    }
-  } else {
-    printf("E\n");
-  }
-  return res;
-}
-#endif // SIMDJSON_THREADS_ENABLED
-
-} // end of namespace simdjson
-
-#endif // SIMDJSON_INLINE_JSONSTREAM_H
Added (new contents of the file):
+// TODO Remove this -- deprecated API and files
+
+#ifndef SIMDJSON_INLINE_JSONSTREAM_H
+#define SIMDJSON_INLINE_JSONSTREAM_H
+
+#include "simdjson/jsonstream.h"
+#include "simdjson/document.h"
+#include "simdjson/document_stream.h"
+
+namespace simdjson {
+
+template <class string_container>
+inline JsonStream<string_container>::JsonStream(const string_container &s, size_t _batch_size) noexcept
+  : str(s), batch_size(_batch_size) {
+}
+
+template <class string_container>
+inline JsonStream<string_container>::~JsonStream() noexcept {
+  if (stream) { delete stream; }
+}
+
+template <class string_container>
+inline int JsonStream<string_container>::json_parse(document::parser &parser) noexcept {
+  if (unlikely(stream == nullptr)) {
+    stream = new document::stream(parser, reinterpret_cast<const uint8_t*>(str.data()), str.length(), batch_size);
+  } else {
+    if (&parser != &stream->parser) { return stream->error = TAPE_ERROR; }
+    stream->error = stream->json_parse();
+  }
+  return stream->error;
+}
+
+} // namespace simdjson
+
+#endif // SIMDJSON_INLINE_JSONSTREAM_H
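For callers moving off the deprecated class, the before/after looks roughly like this. A sketch only: `json` is assumed to be a simdjson::padded_string holding whitespace-separated documents, and error handling is abbreviated.

simdjson::document::parser parser;

// Old pattern: JsonStream + json_parse(), now a thin wrapper over document::stream.
simdjson::JsonStream<simdjson::padded_string> js(json);
int code = simdjson::SUCCESS_AND_HAS_MORE;
while (code == simdjson::SUCCESS_AND_HAS_MORE) {
  code = js.json_parse(parser);
  // inspect parser / handle the current document here
}

// New pattern: document::parser::parse_many().
for (auto [doc, error] : parser.parse_many(json)) {
  if (error) { break; }
  // use doc here
}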
@ -1,51 +1,46 @@
|
||||||
|
// TODO Remove this -- deprecated API and files
|
||||||
|
|
||||||
#ifndef SIMDJSON_JSONSTREAM_H
|
#ifndef SIMDJSON_JSONSTREAM_H
|
||||||
#define SIMDJSON_JSONSTREAM_H
|
#define SIMDJSON_JSONSTREAM_H
|
||||||
|
|
||||||
#include <thread>
|
#include "simdjson/document_stream.h"
|
||||||
#include "simdjson/padded_string.h"
|
#include "simdjson/padded_string.h"
|
||||||
#include "simdjson/simdjson.h"
|
|
||||||
|
|
||||||
namespace simdjson {
|
namespace simdjson {
|
||||||
|
|
||||||
-/*************************************************************************************
- * The main motivation for this piece of software is to achieve maximum speed
- *and offer
+/**
+ * @deprecated use document::stream instead.
+ *
+ * The main motivation for this piece of software is to achieve maximum speed and offer
  * good quality of life while parsing files containing multiple JSON documents.
  *
- * Since we want to offer flexibility and not restrict ourselves to a specific
- *file
- * format, we support any file that contains any valid JSON documents separated
- *by one
+ * Since we want to offer flexibility and not restrict ourselves to a specific file
+ * format, we support any file that contains any valid JSON documents separated by one
  * or more characters that are considered whitespace by the JSON spec.
  * Namely: space, nothing, linefeed, carriage return, horizontal tab.
- * Anything that is not whitespace will be parsed as a JSON document and could
- *lead
+ * Anything that is not whitespace will be parsed as a JSON document and could lead
  * to failure.
  *
- * To offer maximum parsing speed, our implementation processes the data inside
- *the
+ * To offer maximum parsing speed, our implementation processes the data inside the
  * buffer by batches and their size is defined by the parameter "batch_size".
- * By loading data in batches, we can optimize the time spent allocating data in
- *the
+ * By loading data in batches, we can optimize the time spent allocating data in the
  * parser and can also open the possibility of multi-threading.
- * The batch_size must be at least as large as the biggest document in the file,
- *but
+ * The batch_size must be at least as large as the biggest document in the file, but
  * not too large in order to submerge the cached memory. We found that 1MB is
  * somewhat a sweet spot for now. Eventually, this batch_size could be fully
  * automated and be optimal at all times.
- ************************************************************************************/
-/**
+ *
  * The template parameter (string_container) must
  * support the data() and size() methods, returning a pointer
  * to a char* and to the number of bytes respectively.
  * The simdjson parser may read up to SIMDJSON_PADDING bytes beyond the end
  * of the string, so if you do not use a padded_string container,
  * you have the responsibility to overallocate. If you fail to
  * do so, your software may crash if you cross a page boundary,
  * and you should expect memory checkers to object.
  * Most users should use a simdjson::padded_string.
  */
-template <class string_container = padded_string> class JsonStream {
+template <class string_container> class JsonStream {
 public:
   /* Create a JsonStream object that can be used to parse sequentially the valid
    * JSON documents found in the buffer "buf".

@@ -66,9 +61,9 @@ public:
    * get_n_parsed_docs, get_n_bytes_parsed, etc.
    *
    * */
-  JsonStream(const string_container &s, size_t batch_size = 1000000);
+  JsonStream(const string_container &s, size_t _batch_size = 1000000) noexcept;
 
-  ~JsonStream();
+  ~JsonStream() noexcept;
 
   /* Parse the next document found in the buffer previously given to JsonStream.

@@ -102,48 +97,29 @@ public:
    * You can also check validity by calling parser.is_valid(). The same parser can
    * and should be reused for the other documents in the buffer. */
-  int json_parse(document::parser &parser);
+  int json_parse(document::parser &parser) noexcept;
 
   /* Returns the location (index) of where the next document should be in the
    * buffer.
    * Can be used for debugging, it tells the user the position of the end of the
    * last valid JSON document parsed. */
-  inline size_t get_current_buffer_loc() const { return current_buffer_loc; }
+  inline size_t get_current_buffer_loc() const noexcept { return stream ? stream->current_buffer_loc : 0; }
 
   /* Returns the total number of complete documents parsed by the JsonStream,
    * in the current buffer, at the given time. */
-  inline size_t get_n_parsed_docs() const { return n_parsed_docs; }
+  inline size_t get_n_parsed_docs() const noexcept { return stream ? stream->n_parsed_docs : 0; }
 
   /* Returns the total amount of data (in bytes) parsed by the JsonStream,
    * in the current buffer, at the given time. */
-  inline size_t get_n_bytes_parsed() const { return n_bytes_parsed; }
+  inline size_t get_n_bytes_parsed() const noexcept { return stream ? stream->n_bytes_parsed : 0; }
 
 private:
-  inline const uint8_t *buf() const { return reinterpret_cast<uint8_t*>(str.data()) + str_start; }
-  inline void advance(size_t offset) { str_start += offset; }
-  inline size_t remaining() const { return str.size() - str_start; }
-
   const string_container &str;
-  size_t _batch_size; // this is actually variable!
-  size_t str_start{0};
-  size_t next_json{0};
-  bool load_next_batch{true};
-  size_t current_buffer_loc{0};
-#ifdef SIMDJSON_THREADS_ENABLED
-  size_t last_json_buffer_loc{0};
-#endif
-  size_t n_parsed_docs{0};
-  size_t n_bytes_parsed{0};
-  simdjson::implementation *stage_parser;
-#ifdef SIMDJSON_THREADS_ENABLED
-  error_code stage1_is_ok_thread{SUCCESS};
-  std::thread stage_1_thread;
-  document::parser parser_thread;
-#endif
+  const size_t batch_size;
+  document::stream *stream{nullptr};
 }; // end of class JsonStream
 
 } // end of namespace simdjson
 
 #endif // SIMDJSON_JSONSTREAM_H
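For orientation, here is a minimal sketch of how the deprecated JsonStream API above is driven. It is not part of the commit: the include path and the tiny concatenated payload are assumptions, but the loop mirrors the SUCCESS_AND_HAS_MORE pattern the tests further down use.

#include <string>
#include "simdjson.h" // assumed amalgamated include

int main() {
  // Three whitespace-separated JSON documents in one padded buffer (illustrative data).
  simdjson::padded_string str = std::string("{\"a\":1} {\"a\":2} {\"a\":3}");
  simdjson::document::parser parser;
  simdjson::JsonStream<simdjson::padded_string> js{str, 1000000}; // batch_size of ~1MB
  int parse_res = simdjson::SUCCESS_AND_HAS_MORE;
  while (parse_res == simdjson::SUCCESS_AND_HAS_MORE) {
    parse_res = js.json_parse(parser); // fills `parser` with the next document
    // ... inspect the freshly parsed document through `parser` here ...
  }
  return (parse_res == simdjson::SUCCESS) ? 0 : 1;
}

The tests below sweep batch_size from 1000 to 2000 precisely to exercise this window parameter.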
@@ -91,7 +91,7 @@ struct padded_string final {
   }
 
   ~padded_string() {
     aligned_free_char(data_ptr);
   }
 
   size_t size() const { return viable_size; }
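As a side note (not from the diff): padded_string is the container that satisfies the data()/size()-plus-SIMDJSON_PADDING contract spelled out in jsonstream.h above. A tiny sketch, with the literal JSON made up:

#include <string>
#include "simdjson.h" // assumed amalgamated include

int main() {
  std::string raw = "{\"key\":\"value\"}";  // illustrative document
  simdjson::padded_string padded(raw);      // copies raw and over-allocates the padding bytes
  // size() reports the logical length (viable_size), not the padded allocation.
  return (padded.size() == raw.size() && padded.data() != nullptr) ? 0 : 1;
}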
@@ -270,6 +270,7 @@ static bool parse_json_message_issue467(char const* message, std::size_t len, si
 }
 
 bool json_issue467() {
+  printf("Running json_issue467.\n");
   const char * single_message = "{\"error\":[],\"result\":{\"token\":\"xxx\"}}";
   const char* two_messages = "{\"error\":[],\"result\":{\"token\":\"xxx\"}}{\"error\":[],\"result\":{\"token\":\"xxx\"}}";
 
@@ -393,8 +394,8 @@ bool navigate_test() {
 }
 
 // returns true if successful
-bool stream_utf8_test() {
-  printf("Running stream_utf8_test");
+bool JsonStream_utf8_test() {
+  printf("Running JsonStream_utf8_test");
   fflush(NULL);
   const size_t n_records = 10000;
   std::string data;

@@ -406,11 +407,11 @@ bool stream_utf8_test() {
                      i, i, (i % 2) ? "⺃" : "⺕", i % 10, i % 10);
     data += std::string(buf, n);
   }
-  for(size_t i = 1000; i < 2000; i += (i>1050?10:1)) {
+  for(size_t batch_size = 1000; batch_size < 2000; batch_size += (batch_size>1050?10:1)) {
     printf(".");
     fflush(NULL);
     simdjson::padded_string str(data);
-    simdjson::JsonStream<simdjson::padded_string> js{str, i};
+    simdjson::JsonStream<simdjson::padded_string> js{str, batch_size};
     int parse_res = simdjson::SUCCESS_AND_HAS_MORE;
     size_t count = 0;
     simdjson::document::parser parser;

@@ -446,7 +447,7 @@ bool stream_utf8_test() {
       count++;
     }
     if(count != n_records) {
-      printf("Something is wrong in stream_test at window size = %zu.\n", i);
+      printf("Something is wrong in JsonStream_utf8_test at window size = %zu.\n", batch_size);
       return false;
     }
   }

@@ -455,8 +456,8 @@ bool stream_utf8_test() {
 }
 
 // returns true if successful
-bool stream_test() {
-  printf("Running stream_test");
+bool JsonStream_test() {
+  printf("Running JsonStream_test");
   fflush(NULL);
   const size_t n_records = 10000;
   std::string data;

@@ -468,11 +469,11 @@ bool stream_test() {
                      i, i, (i % 2) ? "homme" : "femme", i % 10, i % 10);
     data += std::string(buf, n);
   }
-  for(size_t i = 1000; i < 2000; i += (i>1050?10:1)) {
+  for(size_t batch_size = 1000; batch_size < 2000; batch_size += (batch_size>1050?10:1)) {
     printf(".");
     fflush(NULL);
     simdjson::padded_string str(data);
-    simdjson::JsonStream<simdjson::padded_string> js{str, i};
+    simdjson::JsonStream<simdjson::padded_string> js{str, batch_size};
     int parse_res = simdjson::SUCCESS_AND_HAS_MORE;
     size_t count = 0;
     simdjson::document::parser parser;
@@ -508,7 +509,55 @@ bool stream_test() {
       count++;
     }
     if(count != n_records) {
-      printf("Something is wrong in stream_test at window size = %zu.\n", i);
+      printf("Something is wrong in JsonStream_test at window size = %zu.\n", batch_size);
+      return false;
+    }
+  }
+  printf("ok\n");
+  return true;
+}
+
+// returns true if successful
+bool document_stream_test() {
+  printf("Running document_stream_test");
+  fflush(NULL);
+  const size_t n_records = 10000;
+  std::string data;
+  char buf[1024];
+  for (size_t i = 0; i < n_records; ++i) {
+    auto n = sprintf(buf,
+                     "{\"id\": %zu, \"name\": \"name%zu\", \"gender\": \"%s\", "
+                     "\"ete\": {\"id\": %zu, \"name\": \"eventail%zu\"}}",
+                     i, i, (i % 2) ? "homme" : "femme", i % 10, i % 10);
+    data += std::string(buf, n);
+  }
+  for(size_t batch_size = 1000; batch_size < 2000; batch_size += (batch_size>1050?10:1)) {
+    printf(".");
+    fflush(NULL);
+    simdjson::padded_string str(data);
+    simdjson::document::parser parser;
+    size_t count = 0;
+    for (auto [doc, error] : parser.parse_many(str, batch_size)) {
+      if (error) {
+        printf("Error on document %zd at batch size %zu: %s\n", count, batch_size, simdjson::error_message(error).c_str());
+        return false;
+      }
+
+      auto [keyid, error2] = doc["id"].as_int64_t();
+      if (error2) {
+        printf("Error getting id as int64 on document %zd at batch size %zu: %s\n", count, batch_size, simdjson::error_message(error2).c_str());
+        return false;
+      }
+
+      if (keyid != int64_t(count)) {
+        printf("key does not match %ld, expected %zd on document %zd at batch size %zu\n", keyid, count, count, batch_size);
+        return false;
+      }
+
+      count++;
+    }
+    if(count != n_records) {
+      printf("Found wrong number of documents %zd, expected %zd at batch size %zu\n", count, n_records, batch_size);
       return false;
     }
   }
@@ -781,9 +830,11 @@ int main() {
   std::cout << "Running basic tests." << std::endl;
   if(!json_issue467())
     return EXIT_FAILURE;
-  if(!stream_test())
+  if(!JsonStream_test())
     return EXIT_FAILURE;
-  if(!stream_utf8_test())
+  if(!JsonStream_utf8_test())
+    return EXIT_FAILURE;
+  if(!document_stream_test())
     return EXIT_FAILURE;
   if(!number_test_small_integers())
     return EXIT_FAILURE;
@@ -57,6 +57,33 @@ void parser_parse() {
   }
 }
 
+void parser_parse_many_error_code() {
+  cout << __func__ << endl;
+
+  // Read files with the parser
+  padded_string json = string("[1, 2, 3] true [ true, false ]");
+  cout << "Parsing " << json.data() << " ..." << endl;
+  document::parser parser;
+  for (auto [doc, error] : parser.parse_many(json)) {
+    if (error) { cerr << "Error: " << error_message(error) << endl; exit(1); }
+    if (!doc.print_json(cout)) { exit(1); }
+    cout << endl;
+  }
+}
+
+void parser_parse_many_exception() {
+  cout << __func__ << endl;
+
+  // Read files with the parser
+  padded_string json = string("[1, 2, 3] true [ true, false ]");
+  cout << "Parsing " << json.data() << " ..." << endl;
+  document::parser parser;
+  for (const document &doc : parser.parse_many(json)) {
+    if (!doc.print_json(cout)) { exit(1); }
+    cout << endl;
+  }
+}
+
 int main() {
   cout << "Running examples." << endl;
   document_parse_error_code();

@@ -64,6 +91,8 @@ int main() {
   document_parse_padded_string();
   document_parse_get_corpus();
   parser_parse();
+  parser_parse_many_error_code();
+  parser_parse_many_exception();
   cout << "Ran to completion!" << endl;
   return 0;
 }
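A possible follow-up sketch, not part of the commit: feeding a whole newline-delimited file to the new parse_many(), reusing get_corpus() from jsonioutil.h that document_parse_get_corpus above already relies on. The filename is hypothetical and errors surface as exceptions, as in parser_parse_many_exception.

#include <iostream>
#include "simdjson.h" // assumed amalgamated include

int main() {
  // Hypothetical ndjson input file; get_corpus() throws if it cannot be read.
  simdjson::padded_string json = simdjson::get_corpus("records.ndjson");
  simdjson::document::parser parser;
  size_t count = 0;
  for (const simdjson::document &doc : parser.parse_many(json)) {
    (void)doc; // each iteration yields one fully parsed document
    count++;
  }
  std::cout << "parsed " << count << " documents" << std::endl;
  return 0;
}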