Make valstat-ish parse APIs

This commit is contained in:
John Keiser 2020-02-13 13:30:12 -08:00
parent bc8bc7d1a8
commit 1f76737510
16 changed files with 687 additions and 767 deletions

View File

@ -70,10 +70,10 @@ LIBHEADERS_HASWELL= src/haswell/bitmanipulation.h src/haswell/bitmask.h src/h
LIBHEADERS_WESTMERE=src/westmere/bitmanipulation.h src/westmere/bitmask.h src/westmere/intrinsics.h src/westmere/numberparsing.h src/westmere/simd.h src/westmere/stage1_find_marks.h src/westmere/stage2_build_tape.h src/westmere/stringparsing.h
LIBHEADERS=src/jsoncharutils.h src/simdprune_tables.h $(LIBHEADERS_GENERIC) $(LIBHEADERS_ARM64) $(LIBHEADERS_HASWELL) $(LIBHEADERS_WESTMERE)
PUBHEADERS=include/simdjson/common_defs.h include/simdjson/isadetection.h include/simdjson/jsonformatutils.h include/simdjson/jsonioutil.h include/simdjson/jsonminifier.h include/simdjson/jsonparser.h include/simdjson/padded_string.h include/simdjson/document.h include/simdjson/document/iterator.h include/simdjson/document/parser.h include/simdjson/parsedjson.h include/simdjson/jsonstream.h include/simdjson/portability.h include/simdjson/architecture.h include/simdjson/error.h include/simdjson/simdjson.h include/simdjson/simdjson_version.h include/simdjson/stage1_find_marks.h include/simdjson/stage2_build_tape.h
PUBHEADERS=include/simdjson/common_defs.h include/simdjson/isadetection.h include/simdjson/jsonformatutils.h include/simdjson/jsonioutil.h include/simdjson/jsonminifier.h include/simdjson/jsonparser.h include/simdjson/padded_string.h include/simdjson/document.h include/simdjson/document_iterator.h include/simdjson/document_parser.h include/simdjson/parsedjson.h include/simdjson/jsonstream.h include/simdjson/portability.h include/simdjson/architecture.h include/simdjson/error.h include/simdjson/simdjson.h include/simdjson/simdjson_version.h include/simdjson/stage1_find_marks.h include/simdjson/stage2_build_tape.h
HEADERS=$(PUBHEADERS) $(LIBHEADERS)
LIBFILES=src/jsonioutil.cpp src/jsonparser.cpp src/error.cpp src/stage1_find_marks.cpp src/stage2_build_tape.cpp src/document.cpp src/document/parser.cpp
LIBFILES=src/jsonioutil.cpp src/jsonparser.cpp src/error.cpp src/stage1_find_marks.cpp src/stage2_build_tape.cpp src/document.cpp src/document_parser.cpp
MINIFIERHEADERS=include/simdjson/jsonminifier.h
MINIFIERLIBFILES=src/jsonminifier.cpp
@ -205,7 +205,7 @@ basictests:tests/basictests.cpp $(HEADERS) $(LIBFILES)
numberparsingcheck:tests/numberparsingcheck.cpp $(HEADERS) $(LIBFILES)
$(CXX) $(CXXFLAGS) -o numberparsingcheck src/jsonioutil.cpp src/jsonparser.cpp src/error.cpp src/stage1_find_marks.cpp src/document.cpp src/document/parser.cpp tests/numberparsingcheck.cpp -I. $(LIBFLAGS) -DJSON_TEST_NUMBERS
$(CXX) $(CXXFLAGS) -o numberparsingcheck src/jsonioutil.cpp src/jsonparser.cpp src/error.cpp src/stage1_find_marks.cpp src/document.cpp src/document_parser.cpp tests/numberparsingcheck.cpp -I. $(LIBFLAGS) -DJSON_TEST_NUMBERS
integer_tests:tests/integer_tests.cpp $(HEADERS) $(LIBFILES)
$(CXX) $(CXXFLAGS) -o integer_tests $(LIBFILES) tests/integer_tests.cpp -I. $(LIBFLAGS)
@ -213,7 +213,7 @@ integer_tests:tests/integer_tests.cpp $(HEADERS) $(LIBFILES)
stringparsingcheck:tests/stringparsingcheck.cpp $(HEADERS) $(LIBFILES)
$(CXX) $(CXXFLAGS) -o stringparsingcheck src/jsonioutil.cpp src/jsonparser.cpp src/error.cpp src/stage1_find_marks.cpp src/document.cpp src/document/parser.cpp tests/stringparsingcheck.cpp -I. $(LIBFLAGS) -DJSON_TEST_STRINGS
$(CXX) $(CXXFLAGS) -o stringparsingcheck src/jsonioutil.cpp src/jsonparser.cpp src/error.cpp src/stage1_find_marks.cpp src/document.cpp src/document_parser.cpp tests/stringparsingcheck.cpp -I. $(LIBFLAGS) -DJSON_TEST_STRINGS
pointercheck:tests/pointercheck.cpp $(HEADERS) $(LIBFILES)
$(CXX) $(CXXFLAGS) -o pointercheck $(LIBFILES) tests/pointercheck.cpp -I. $(LIBFLAGS)

View File

@ -121,35 +121,29 @@ Another strategy is to reuse pre-allocated buffers. That is, you avoid reallocat
The main API involves populating a `ParsedJson` object which hosts a fully navigable document-object-model (DOM) view of the JSON document. The DOM can be accessed using [JSON Pointer](https://tools.ietf.org/html/rfc6901) paths, for example. The main function is `json_parse`, which takes a string containing the JSON document as well as a reference to a pre-allocated `ParsedJson` object (which can be reused multiple times). Once you have populated the `ParsedJson` object, you can navigate through the DOM with an iterator (e.g., created by `ParsedJson::Iterator pjh(pj)`, see 'Navigating the parsed document').
```C
#include "simdjson/jsonparser.h"
using namespace simdjson;
// Samples:
// Load a document from a file
// Read a particular key / value from the document
// Iterate over an array of things
/...
const char * filename = ... //
// use whatever means you want to get a string (UTF-8) of your JSON document
padded_string p = get_corpus(filename);
ParsedJson pj;
pj.allocate_capacity(p.size()); // allocate memory for parsing up to p.size() bytes
const int res = json_parse(p, pj); // do the parsing, return 0 on success
// parsing is done!
if (res != 0) {
// You can use the "simdjson/simdjson.h" header to access the error message
std::cout << "Error parsing:" << simdjson::error_message(res) << std::endl;
```c++
#include "simdjson.h"
auto doc = simdjson::document::load("myfile.json");
cout << doc;
for (auto i=doc.begin(); i<doc.end(); i++) {
cout << doc[i];
}
// the ParsedJson document can be used here
// pj can be reused with other json_parse calls.
```
It is also possible to use a simpler API if you do not mind having the overhead
A slightly simpler API is available if you don't mind having the overhead
of memory allocation with each new JSON document:
```C
#include "simdjson/jsonparser.h"
using namespace simdjson;
document doc = document::parse("myfile.json");
cout << doc;
/...
const char * filename = ... //

View File

@ -23,7 +23,7 @@ jsonparser.cpp
stage1_find_marks.cpp
stage2_build_tape.cpp
document.cpp
document/parser.cpp
document_parser.cpp
"
# order matters
@ -38,8 +38,8 @@ simdjson/padded_string.h
simdjson/jsonioutil.h
simdjson/jsonminifier.h
simdjson/document.h
simdjson/document/iterator.h
simdjson/document/parser.h
simdjson/document_iterator.h
simdjson/document_parser.h
simdjson/parsedjson.h
simdjson/stage1_find_marks.h
simdjson/stage2_build_tape.h
@ -151,11 +151,11 @@ int main(int argc, char *argv[]) {
}
const char * filename = argv[1];
simdjson::padded_string p = simdjson::get_corpus(filename);
simdjson::document doc;
if (!simdjson::document::try_parse(p, doc)) { // do the parsing
std::cout << "document::try_parse not valid" << std::endl;
auto [doc, error] = simdjson::document::parse(p); // do the parsing
if (error) {
std::cout << "document::parse not valid" << std::endl;
} else {
std::cout << "document::try_parse valid" << std::endl;
std::cout << "document::parse valid" << std::endl;
}
if(argc == 2) {
return EXIT_SUCCESS;

View File

@ -68,8 +68,7 @@ simdjson_compute_stats(const simdjson::padded_string &p) {
// Parse `p` once with simdjson and report whether the parse succeeded.
//
// Returns true when the document parsed without error, false otherwise.
// The function is explicitly marked noinline.
__attribute__((noinline)) bool
simdjson_just_parse(const simdjson::padded_string &p) {
  // document::parse() returns a result object whose `error` member equals
  // SUCCESS for a valid document. The comparison must be `==` to keep the
  // "true on success" contract of the try_parse()-based version; `!=` would
  // report every valid document as a failure.
  return simdjson::document::parse(p).error == simdjson::SUCCESS;
}
void sajson_traverse(std::vector<int64_t> &answer, const sajson::value &node) {

View File

@ -2,8 +2,8 @@ set(SIMDJSON_INCLUDE_DIR ${PROJECT_SOURCE_DIR}/include)
set(SIMDJSON_INCLUDE
${SIMDJSON_INCLUDE_DIR}/simdjson/common_defs.h
${SIMDJSON_INCLUDE_DIR}/simdjson/document.h
${SIMDJSON_INCLUDE_DIR}/simdjson/document/iterator.h
${SIMDJSON_INCLUDE_DIR}/simdjson/document/parser.h
${SIMDJSON_INCLUDE_DIR}/simdjson/document_iterator.h
${SIMDJSON_INCLUDE_DIR}/simdjson/document_parser.h
${SIMDJSON_INCLUDE_DIR}/simdjson/isadetection.h
${SIMDJSON_INCLUDE_DIR}/simdjson/jsonformatutils.h
${SIMDJSON_INCLUDE_DIR}/simdjson/jsonioutil.h

View File

@ -13,6 +13,7 @@
namespace simdjson {
template<size_t max_depth> class document_iterator;
class document_parser;
class document {
public:
@ -27,6 +28,9 @@ public:
document &operator=(const document &o) = delete;
using iterator = document_iterator<DEFAULT_MAX_DEPTH>;
class parser;
class doc_result;
class doc_ref_result;
//
// Tell whether this document has been parsed, or is just empty.
@ -43,8 +47,6 @@ public:
WARN_UNUSED
bool dump_raw_tape(std::ostream &os) const;
class parser;
//
// Parse a JSON document.
//
@ -53,35 +55,10 @@ public:
//
// Throws invalid_json if the JSON is invalid.
//
static document parse(const uint8_t *buf, size_t len, bool realloc_if_needed = true);
static document parse(const char *buf, size_t len, bool realloc_if_needed = true) {
return parse((const uint8_t *)buf, len, realloc_if_needed);
}
static document parse(const std::string &s, bool realloc_if_needed = true) {
return parse(s.data(), s.length(), realloc_if_needed);
}
static document parse(const padded_string &s) {
return parse(s.data(), s.length(), false);
}
//
// Parse a JSON document.
//
// If you will be parsing more than one JSON document, it's recommended to create a
// document::parser object instead, keeping internal buffers around for efficiency reasons.
//
// Returns != SUCCESS if the JSON is invalid.
//
static WARN_UNUSED error_code try_parse(const uint8_t *buf, size_t len, document &dst, bool realloc_if_needed = true) noexcept;
static WARN_UNUSED error_code try_parse(const char *buf, size_t len, document &dst, bool realloc_if_needed = true) {
return try_parse((const uint8_t *)buf, len, dst, realloc_if_needed);
}
static WARN_UNUSED error_code try_parse(const std::string &s, document &dst, bool realloc_if_needed = true) {
return try_parse(s.data(), s.length(), dst, realloc_if_needed);
}
static WARN_UNUSED error_code try_parse(const padded_string &s, document &dst) {
return try_parse(s.data(), s.length(), dst, false);
}
static doc_result parse(const uint8_t *buf, size_t len, bool realloc_if_needed = true);
static doc_result parse(const char *buf, size_t len, bool realloc_if_needed = true);
static doc_result parse(const std::string &s, bool realloc_if_needed = true);
static doc_result parse(const padded_string &s);
std::unique_ptr<uint64_t[]> tape;
std::unique_ptr<uint8_t[]> string_buf;// should be at least byte_capacity
@ -90,30 +67,46 @@ private:
bool set_capacity(size_t len);
};
//
// Result of a parse that yields a new document owned by the result itself.
//
// Carries the parsed document and the error code of the parse. Both members are public, so the
// outcome can be inspected directly; the result can also be converted to bool (true on SUCCESS)
// or to document (which throws invalid_json when the parse failed).
//
class document::doc_result {
private:
  // Constructors are private; document is a friend and can therefore create results.
  doc_result(document &&_doc, error_code _error) : doc(std::move(_doc)), error(_error) { }
  doc_result(document &&_doc) : doc(std::move(_doc)), error(SUCCESS) { }
  doc_result(error_code _error) : doc(), error(_error) { }
  friend class document;
public:
  ~doc_result()=default;

  // True when error == SUCCESS. Declared const so that const results can be tested as well.
  operator bool() const noexcept { return error == SUCCESS; }

  // Take the document out of the result.
  //
  // Throws invalid_json(error) if the parse did not succeed. On success the document is moved
  // out, leaving the `doc` member in a moved-from state.
  operator document() {
    if (!*this) {
      throw invalid_json(error);
    }
    // `doc` is a data member, so it is never moved implicitly on return; the explicit std::move
    // is what transfers ownership to the caller.
    return std::move(doc);
  }

  // The parsed document (default-constructed when only an error code was supplied).
  document doc;
  // Outcome of the parse; SUCCESS when `doc` holds a valid document.
  error_code error;
};
//
// Result of a parse that refers to a document stored elsewhere.
//
// Holds a reference to the document plus the error code of the parse; it does not own the
// document, so the referenced document must outlive the result. The result can be converted to
// bool (true on SUCCESS) or to document& (which throws invalid_json when the parse failed).
//
class document::doc_ref_result {
public:
  doc_ref_result(document &_doc, error_code _error) : doc(_doc), error(_error) { }
  ~doc_ref_result()=default;

  // True when error == SUCCESS. Declared const so that const results can be tested as well.
  operator bool() const noexcept { return error == SUCCESS; }

  // Access the referenced document; throws invalid_json(error) if the parse did not succeed.
  operator document&() {
    if (!*this) {
      throw invalid_json(error);
    }
    return doc;
  }

  // The document this result refers to (not owned).
  document& doc;
  // Outcome of the parse; SUCCESS when `doc` refers to a valid document.
  error_code error;
};
} // namespace simdjson
#include "simdjson/document/parser.h"
#include "simdjson/document/iterator.h"
// Implementations
namespace simdjson {
// Parse `len` bytes at `buf` with a temporary parser and return the resulting document.
//
// Throws invalid_json(MEMALLOC) when the temporary parser cannot allocate its buffers; otherwise
// the outcome is whatever parser::parse_new() produces for this input.
inline WARN_UNUSED document document::parse(const uint8_t *buf, size_t len, bool realloc_if_needed) {
  document::parser tmp_parser;
  const bool allocated = tmp_parser.allocate_capacity(len);
  if (allocated) {
    return tmp_parser.parse_new(buf, len, realloc_if_needed);
  }
  // Record the failure on the parser before reporting it, as the one-line form did.
  tmp_parser.error = MEMALLOC;
  throw invalid_json(tmp_parser.error);
}
// Parse `len` bytes at `buf` into `dst` with a temporary parser, reporting failure by error code.
//
// Returns MEMALLOC when the temporary parser cannot allocate its buffers; otherwise returns
// whatever parser::try_parse_into() reports for this input.
inline WARN_UNUSED error_code document::try_parse(const uint8_t *buf, size_t len, document &dst, bool realloc_if_needed) noexcept {
  document::parser tmp_parser;
  if (tmp_parser.allocate_capacity(len)) {
    return tmp_parser.try_parse_into(buf, len, dst, realloc_if_needed);
  }
  // Record the failure on the parser, then hand the same code back to the caller.
  tmp_parser.error = MEMALLOC;
  return tmp_parser.error;
}
} // namespace simdjson
#include "simdjson/document_parser.h"
#include "simdjson/document_iterator.h"
#endif // SIMDJSON_DOCUMENT_H

View File

@ -1,349 +0,0 @@
#ifndef SIMDJSON_DOCUMENT_PARSER_H
#define SIMDJSON_DOCUMENT_PARSER_H
#include <cstring>
#include <memory>
#include "simdjson/common_defs.h"
#include "simdjson/simdjson.h"
#include "simdjson/document.h"
#include "simdjson/padded_string.h"
namespace simdjson {
//
// A reusable JSON parser.
//
// The parser owns the scratch buffers used while parsing as well as the document being written
// (`doc`). The on_*() members are the callbacks through which parsed values are appended to the
// document's tape and string buffer.
//
class document::parser {
public:
  //
  // Create a JSON parser with zero capacity. Call allocate_capacity() to initialize it.
  //
  parser()=default;
  ~parser()=default;

  // this is a move only class
  parser(parser &&p) = default;
  parser(const parser &p) = delete;
  parser &operator=(parser &&o) = default;
  parser &operator=(const parser &o) = delete;

  //
  // Parse a JSON document and return a reference to it.
  //
  // The JSON document still lives in the parser: this is the most efficient way to parse JSON
  // documents because it reuses the same buffers, but you *must* use the document before you
  // destroy the parser or call parse() again.
  //
  // Throws invalid_json if the JSON is invalid.
  //
  const document &parse(const uint8_t *buf, size_t len, bool realloc_if_needed = true);
  const document &parse(const char *buf, size_t len, bool realloc_if_needed = true) {
    return parse(reinterpret_cast<const uint8_t *>(buf), len, realloc_if_needed);
  }
  const document &parse(const std::string &s, bool realloc_if_needed = true) {
    return parse(s.data(), s.length(), realloc_if_needed);
  }
  // A padded_string is passed with realloc_if_needed = false.
  const document &parse(const padded_string &s) {
    return parse(s.data(), s.length(), false);
  }

  //
  // Parse a JSON document and take the result.
  //
  // The document can be used even after the parser is deallocated or parse() is called again.
  //
  // Throws invalid_json if the JSON is invalid.
  //
  document parse_new(const uint8_t *buf, size_t len, bool realloc_if_needed = true);
  document parse_new(const char *buf, size_t len, bool realloc_if_needed = true) {
    return parse_new(reinterpret_cast<const uint8_t *>(buf), len, realloc_if_needed);
  }
  document parse_new(const std::string &s, bool realloc_if_needed = true) {
    return parse_new(s.data(), s.length(), realloc_if_needed);
  }
  document parse_new(const padded_string &s) {
    return parse_new(s.data(), s.length(), false);
  }

  //
  // Parse a JSON document and set dst to a pointer to it.
  //
  // The JSON document still lives in the parser: this is the most efficient way to parse JSON
  // documents because it reuses the same buffers, but you *must* use the document before you
  // destroy the parser or call parse() again.
  //
  // Returns != SUCCESS if the JSON is invalid.
  //
  WARN_UNUSED error_code try_parse(const uint8_t *buf, size_t len, const document *& dst, bool realloc_if_needed = true) noexcept;
  WARN_UNUSED error_code try_parse(const char *buf, size_t len, const document *& dst, bool realloc_if_needed = true) noexcept {
    return try_parse(reinterpret_cast<const uint8_t *>(buf), len, dst, realloc_if_needed);
  }
  WARN_UNUSED error_code try_parse(const std::string &s, const document *&dst, bool realloc_if_needed = true) noexcept {
    return try_parse(s.data(), s.length(), dst, realloc_if_needed);
  }
  WARN_UNUSED error_code try_parse(const padded_string &s, const document *&dst) noexcept {
    return try_parse(s.data(), s.length(), dst, false);
  }

  //
  // Parse a JSON document and fill in dst.
  //
  // The document can be used even after the parser is deallocated or parse() is called again.
  //
  // Returns != SUCCESS if the JSON is invalid.
  //
  WARN_UNUSED error_code try_parse_into(const uint8_t *buf, size_t len, document &dst, bool realloc_if_needed = true) noexcept;
  WARN_UNUSED error_code try_parse_into(const char *buf, size_t len, document &dst, bool realloc_if_needed = true) noexcept {
    return try_parse_into(reinterpret_cast<const uint8_t *>(buf), len, dst, realloc_if_needed);
  }
  WARN_UNUSED error_code try_parse_into(const std::string &s, document &dst, bool realloc_if_needed = true) noexcept {
    return try_parse_into(s.data(), s.length(), dst, realloc_if_needed);
  }
  WARN_UNUSED error_code try_parse_into(const padded_string &s, document &dst) noexcept {
    return try_parse_into(s.data(), s.length(), dst, false);
  }

  //
  // Current capacity: the largest document this parser can support without reallocating.
  //
  size_t capacity() const {
    return _capacity;
  }

  //
  // The maximum level of nested object and arrays supported by this parser.
  //
  size_t max_depth() const {
    return _max_depth;
  }

  // if needed, allocate memory so that the object is able to process JSON
  // documents having up to capacity bytes and max_depth "depth"
  //
  // Returns false if either set_capacity() or set_max_depth() fails.
  WARN_UNUSED bool allocate_capacity(size_t capacity, size_t max_depth = DEFAULT_MAX_DEPTH) {
    return set_capacity(capacity) && set_max_depth(max_depth);
  }

  // type aliases for backcompat
  using Iterator = document::iterator;
  using InvalidJSON = invalid_json;

  // Next location to write to in the tape
  uint32_t current_loc{0};

  // structural indices passed from stage 1 to stage 2
  uint32_t n_structural_indexes{0};
  std::unique_ptr<uint32_t[]> structural_indexes;

  // location and return address of each open { or [
  std::unique_ptr<uint32_t[]> containing_scope_offset;
#ifdef SIMDJSON_USE_COMPUTED_GOTO
  std::unique_ptr<void*[]> ret_address;
#else
  std::unique_ptr<char[]> ret_address;
#endif

  // Next place to write a string.
  // Initialized like every other member so that a default-constructed parser never holds an
  // indeterminate pointer (the defaulted move operations copy this member).
  uint8_t *current_string_buf_loc{nullptr};

  bool valid{false};
  error_code error{simdjson::UNINITIALIZED};

  // Document we're writing to
  document doc;

  // returns true if the document parsed was valid
  bool is_valid() const;

  // return an error code corresponding to the last parsing attempt, see
  // simdjson.h will return simdjson::UNINITIALIZED if no parsing was attempted
  int get_error_code() const;

  // return the string equivalent of "get_error_code"
  std::string get_error_message() const;

  //
  // for backcompat with ParsedJson
  //

  // print the json to std::ostream (should be valid)
  // return false if the tape is likely wrong (e.g., you did not parse a valid
  // JSON).
  WARN_UNUSED
  bool print_json(std::ostream &os) const;
  WARN_UNUSED
  bool dump_raw_tape(std::ostream &os) const;

  // this should be called when parsing (right before writing the tapes)
  void init_stage2();

  // Record a failed parse: stores the error code and returns it unchanged.
  really_inline error_code on_error(error_code new_error_code) {
    error = new_error_code;
    return new_error_code;
  }
  // Record a successful parse: stores the code, marks the parser valid and returns the code.
  really_inline error_code on_success(error_code success_code) {
    error = success_code;
    valid = true;
    return success_code;
  }

  // The on_start_*() callbacks remember where the opening tape entry lives (per depth) so the
  // matching on_end_*() callback can link the opening and closing entries to each other.
  really_inline bool on_start_document(uint32_t depth) {
    containing_scope_offset[depth] = current_loc;
    write_tape(0, 'r');
    return true;
  }
  really_inline bool on_start_object(uint32_t depth) {
    containing_scope_offset[depth] = current_loc;
    write_tape(0, '{');
    return true;
  }
  really_inline bool on_start_array(uint32_t depth) {
    containing_scope_offset[depth] = current_loc;
    write_tape(0, '[');
    return true;
  }
  // TODO we're not checking this bool
  really_inline bool on_end_document(uint32_t depth) {
    // write our doc.tape location to the header scope
    // The root scope gets written *at* the previous location.
    // (annotate first: current_loc is still the index of the closing entry itself)
    annotate_previous_loc(containing_scope_offset[depth], current_loc);
    write_tape(containing_scope_offset[depth], 'r');
    return true;
  }
  really_inline bool on_end_object(uint32_t depth) {
    // write our doc.tape location to the header scope
    // (annotate after writing: current_loc is now one past the closing entry)
    write_tape(containing_scope_offset[depth], '}');
    annotate_previous_loc(containing_scope_offset[depth], current_loc);
    return true;
  }
  really_inline bool on_end_array(uint32_t depth) {
    // write our doc.tape location to the header scope
    // (annotate after writing: current_loc is now one past the closing entry)
    write_tape(containing_scope_offset[depth], ']');
    annotate_previous_loc(containing_scope_offset[depth], current_loc);
    return true;
  }

  really_inline bool on_true_atom() {
    write_tape(0, 't');
    return true;
  }
  really_inline bool on_false_atom() {
    write_tape(0, 'f');
    return true;
  }
  really_inline bool on_null_atom() {
    write_tape(0, 'n');
    return true;
  }

  // Begin a string: the tape entry stores the string's offset within doc.string_buf, and the
  // returned pointer skips the uint32_t length prefix that on_end_string() fills in.
  really_inline uint8_t *on_start_string() {
    /* we advance the point, accounting for the fact that we have a NULL
     * termination */
    write_tape(current_string_buf_loc - doc.string_buf.get(), '"');
    return current_string_buf_loc + sizeof(uint32_t);
  }

  // Finish a string whose bytes end at dst: writes the length prefix, NULL-terminates the string
  // and moves current_string_buf_loc past the terminator.
  really_inline bool on_end_string(uint8_t *dst) {
    uint32_t str_length = dst - (current_string_buf_loc + sizeof(uint32_t));
    // TODO check for overflow in case someone has a crazy string (>=4GB?)
    // But only add the overflow check when the document itself exceeds 4GB
    // Currently unneeded because we refuse to parse docs larger or equal to 4GB.
    std::memcpy(current_string_buf_loc, &str_length, sizeof(uint32_t));
    // NULL termination is still handy if you expect all your strings to
    // be NULL terminated? It comes at a small cost
    *dst = 0;
    current_string_buf_loc = dst + 1;
    return true;
  }

  // Numbers take two tape words: a type entry followed by the raw 64-bit value.
  really_inline bool on_number_s64(int64_t value) {
    write_tape(0, 'l');
    std::memcpy(&doc.tape[current_loc], &value, sizeof(value));
    ++current_loc;
    return true;
  }
  really_inline bool on_number_u64(uint64_t value) {
    write_tape(0, 'u');
    doc.tape[current_loc++] = value;
    return true;
  }
  really_inline bool on_number_double(double value) {
    write_tape(0, 'd');
    static_assert(sizeof(value) == sizeof(doc.tape[current_loc]), "mismatch size");
    std::memcpy(&doc.tape[current_loc++], &value, sizeof(double));
    // doc.tape[doc.current_loc++] = *((uint64_t *)&d);
    return true;
  }

  //
  // Called before a parse is initiated.
  //
  // - Returns CAPACITY if the document is too large
  // - Returns MEMALLOC if we needed to allocate memory and could not
  //
  WARN_UNUSED error_code init_parse(size_t len);

  // Access the parser's document; throws invalid_json(error) unless is_valid() is true.
  const document &get_document() const {
    if (!is_valid()) {
      throw invalid_json(error);
    }
    return doc;
  }

private:
  //
  // The maximum document length this parser supports.
  //
  // Buffers are large enough to handle any document up to this length.
  //
  size_t _capacity{0};

  //
  // The maximum depth (number of nested objects and arrays) supported by this parser.
  //
  // Starts at 0; allocate_capacity() sets it, using DEFAULT_MAX_DEPTH unless told otherwise.
  //
  size_t _max_depth{0};

  // all nodes are stored on the doc.tape using a 64-bit word.
  //
  // strings, double and ints are stored as
  // a 64-bit word with a pointer to the actual value
  //
  //
  //
  // for objects or arrays, store [ or { at the beginning and } and ] at the
  // end. For the openings ([ or {), we annotate them with a reference to the
  // location on the doc.tape of the end, and for then closings (} and ]), we
  // annotate them with a reference to the location of the opening
  //
  //

  // this should be considered a private function
  // Appends one tape word: the type character goes in the top 8 bits, val in the remainder.
  really_inline void write_tape(uint64_t val, uint8_t c) {
    doc.tape[current_loc++] = val | ((static_cast<uint64_t>(c)) << 56);
  }

  // ORs val into an already-written tape word (used to link an opening entry to its end).
  really_inline void annotate_previous_loc(uint32_t saved_loc, uint64_t val) {
    doc.tape[saved_loc] |= val;
  }

  WARN_UNUSED error_code try_parse(const uint8_t *buf, size_t len, bool realloc_if_needed) noexcept;

  //
  // Set the current capacity: the largest document this parser can support without reallocating.
  //
  // This will allocate *or deallocate* as necessary.
  //
  // Returns false if allocation fails.
  //
  WARN_UNUSED bool set_capacity(size_t capacity);

  //
  // Set the maximum level of nested object and arrays supported by this parser.
  //
  // This will allocate *or deallocate* as necessary.
  //
  // Returns false if allocation fails.
  //
  WARN_UNUSED bool set_max_depth(size_t max_depth);
};
} // namespace simdjson
#endif // SIMDJSON_DOCUMENT_PARSER_H

View File

@ -0,0 +1,597 @@
#ifndef SIMDJSON_DOCUMENT_PARSER_H
#define SIMDJSON_DOCUMENT_PARSER_H
#include <cstring>
#include <memory>
#include "simdjson/common_defs.h"
#include "simdjson/simdjson.h"
#include "simdjson/document.h"
#include "simdjson/padded_string.h"
namespace simdjson {
//
// A reusable JSON parser.
//
// The parser owns the scratch buffers used while parsing as well as the document being written
// (`doc`). The on_*() members are the callbacks through which parsed values are appended to the
// document's tape and string buffer.
//
class document::parser {
public:
  //
  // Create a JSON parser with zero capacity. Call allocate_capacity() to initialize it.
  //
  parser()=default;
  ~parser()=default;

  // this is a move only class
  parser(document::parser &&p) = default;
  parser(const document::parser &p) = delete;
  parser &operator=(document::parser &&o) = default;
  parser &operator=(const document::parser &o) = delete;

  //
  // Parse a JSON document and return a result that refers to it.
  //
  // The JSON document still lives in the parser: this is the most efficient way to parse JSON
  // documents because it reuses the same buffers, but you *must* use the document before you
  // destroy the parser or call parse() again.
  //
  // The outcome is reported through the returned doc_ref_result: check its `error` member or
  // convert it to bool. Converting the result to document& throws invalid_json when the parse
  // failed.
  //
  inline doc_ref_result parse(const uint8_t *buf, size_t len, bool realloc_if_needed = true);
  inline doc_ref_result parse(const char *buf, size_t len, bool realloc_if_needed = true);
  inline doc_ref_result parse(const std::string &s, bool realloc_if_needed = true);
  inline doc_ref_result parse(const padded_string &s);

  //
  // Current capacity: the largest document this parser can support without reallocating.
  //
  size_t capacity() const {
    return _capacity;
  }

  //
  // The maximum level of nested object and arrays supported by this parser.
  //
  size_t max_depth() const {
    return _max_depth;
  }

  // if needed, allocate memory so that the object is able to process JSON
  // documents having up to capacity bytes and max_depth "depth"
  //
  // Returns false if either set_capacity() or set_max_depth() fails.
  WARN_UNUSED bool allocate_capacity(size_t capacity, size_t max_depth = DEFAULT_MAX_DEPTH) {
    return set_capacity(capacity) && set_max_depth(max_depth);
  }

  // type aliases for backcompat
  using Iterator = document::iterator;
  using InvalidJSON = invalid_json;

  // NOTE(review): this declares a nested document::parser::doc_result, which is a different
  // type from document::doc_result; nothing in this class defines or uses it — confirm whether
  // the declaration is still needed.
  class doc_result;

  // Next location to write to in the tape
  uint32_t current_loc{0};

  // structural indices passed from stage 1 to stage 2
  uint32_t n_structural_indexes{0};
  std::unique_ptr<uint32_t[]> structural_indexes;

  // location and return address of each open { or [
  std::unique_ptr<uint32_t[]> containing_scope_offset;
#ifdef SIMDJSON_USE_COMPUTED_GOTO
  std::unique_ptr<void*[]> ret_address;
#else
  std::unique_ptr<char[]> ret_address;
#endif

  // Next place to write a string.
  // Initialized like every other member so that a default-constructed parser never holds an
  // indeterminate pointer (the defaulted move operations copy this member).
  uint8_t *current_string_buf_loc{nullptr};

  bool valid{false};
  error_code error{simdjson::UNINITIALIZED};

  // Document we're writing to
  document doc;

  // returns true if the document parsed was valid
  bool is_valid() const;

  // return an error code corresponding to the last parsing attempt, see
  // simdjson.h will return simdjson::UNINITIALIZED if no parsing was attempted
  int get_error_code() const;

  // return the string equivalent of "get_error_code"
  std::string get_error_message() const;

  //
  // for backcompat with ParsedJson
  //

  // print the json to std::ostream (should be valid)
  // return false if the tape is likely wrong (e.g., you did not parse a valid
  // JSON).
  WARN_UNUSED
  bool print_json(std::ostream &os) const;
  WARN_UNUSED
  bool dump_raw_tape(std::ostream &os) const;

  // this should be called when parsing (right before writing the tapes)
  void init_stage2();

  // Record a failed parse: stores the error code and returns it unchanged.
  really_inline error_code on_error(error_code new_error_code) {
    error = new_error_code;
    return new_error_code;
  }
  // Record a successful parse: stores the code, marks the parser valid and returns the code.
  really_inline error_code on_success(error_code success_code) {
    error = success_code;
    valid = true;
    return success_code;
  }

  // The on_start_*() callbacks remember where the opening tape entry lives (per depth) so the
  // matching on_end_*() callback can link the opening and closing entries to each other.
  really_inline bool on_start_document(uint32_t depth) {
    containing_scope_offset[depth] = current_loc;
    write_tape(0, 'r');
    return true;
  }
  really_inline bool on_start_object(uint32_t depth) {
    containing_scope_offset[depth] = current_loc;
    write_tape(0, '{');
    return true;
  }
  really_inline bool on_start_array(uint32_t depth) {
    containing_scope_offset[depth] = current_loc;
    write_tape(0, '[');
    return true;
  }
  // TODO we're not checking this bool
  really_inline bool on_end_document(uint32_t depth) {
    // write our doc.tape location to the header scope
    // The root scope gets written *at* the previous location.
    // (annotate first: current_loc is still the index of the closing entry itself)
    annotate_previous_loc(containing_scope_offset[depth], current_loc);
    write_tape(containing_scope_offset[depth], 'r');
    return true;
  }
  really_inline bool on_end_object(uint32_t depth) {
    // write our doc.tape location to the header scope
    // (annotate after writing: current_loc is now one past the closing entry)
    write_tape(containing_scope_offset[depth], '}');
    annotate_previous_loc(containing_scope_offset[depth], current_loc);
    return true;
  }
  really_inline bool on_end_array(uint32_t depth) {
    // write our doc.tape location to the header scope
    // (annotate after writing: current_loc is now one past the closing entry)
    write_tape(containing_scope_offset[depth], ']');
    annotate_previous_loc(containing_scope_offset[depth], current_loc);
    return true;
  }

  really_inline bool on_true_atom() {
    write_tape(0, 't');
    return true;
  }
  really_inline bool on_false_atom() {
    write_tape(0, 'f');
    return true;
  }
  really_inline bool on_null_atom() {
    write_tape(0, 'n');
    return true;
  }

  // Begin a string: the tape entry stores the string's offset within doc.string_buf, and the
  // returned pointer skips the uint32_t length prefix that on_end_string() fills in.
  really_inline uint8_t *on_start_string() {
    /* we advance the point, accounting for the fact that we have a NULL
     * termination */
    write_tape(current_string_buf_loc - doc.string_buf.get(), '"');
    return current_string_buf_loc + sizeof(uint32_t);
  }

  // Finish a string whose bytes end at dst: writes the length prefix, NULL-terminates the string
  // and moves current_string_buf_loc past the terminator.
  really_inline bool on_end_string(uint8_t *dst) {
    uint32_t str_length = dst - (current_string_buf_loc + sizeof(uint32_t));
    // TODO check for overflow in case someone has a crazy string (>=4GB?)
    // But only add the overflow check when the document itself exceeds 4GB
    // Currently unneeded because we refuse to parse docs larger or equal to 4GB.
    std::memcpy(current_string_buf_loc, &str_length, sizeof(uint32_t));
    // NULL termination is still handy if you expect all your strings to
    // be NULL terminated? It comes at a small cost
    *dst = 0;
    current_string_buf_loc = dst + 1;
    return true;
  }

  // Numbers take two tape words: a type entry followed by the raw 64-bit value.
  really_inline bool on_number_s64(int64_t value) {
    write_tape(0, 'l');
    std::memcpy(&doc.tape[current_loc], &value, sizeof(value));
    ++current_loc;
    return true;
  }
  really_inline bool on_number_u64(uint64_t value) {
    write_tape(0, 'u');
    doc.tape[current_loc++] = value;
    return true;
  }
  really_inline bool on_number_double(double value) {
    write_tape(0, 'd');
    static_assert(sizeof(value) == sizeof(doc.tape[current_loc]), "mismatch size");
    std::memcpy(&doc.tape[current_loc++], &value, sizeof(double));
    // doc.tape[doc.current_loc++] = *((uint64_t *)&d);
    return true;
  }

  //
  // Called before a parse is initiated.
  //
  // - Returns CAPACITY if the document is too large
  // - Returns MEMALLOC if we needed to allocate memory and could not
  //
  WARN_UNUSED error_code init_parse(size_t len);

  // Access the parser's document; throws invalid_json(error) unless is_valid() is true.
  const document &get_document() const {
    if (!is_valid()) {
      throw invalid_json(error);
    }
    return doc;
  }

private:
  //
  // The maximum document length this parser supports.
  //
  // Buffers are large enough to handle any document up to this length.
  //
  size_t _capacity{0};

  //
  // The maximum depth (number of nested objects and arrays) supported by this parser.
  //
  // Starts at 0; allocate_capacity() sets it, using DEFAULT_MAX_DEPTH unless told otherwise.
  //
  size_t _max_depth{0};

  // all nodes are stored on the doc.tape using a 64-bit word.
  //
  // strings, double and ints are stored as
  // a 64-bit word with a pointer to the actual value
  //
  //
  //
  // for objects or arrays, store [ or { at the beginning and } and ] at the
  // end. For the openings ([ or {), we annotate them with a reference to the
  // location on the doc.tape of the end, and for then closings (} and ]), we
  // annotate them with a reference to the location of the opening
  //
  //

  // this should be considered a private function
  // Appends one tape word: the type character goes in the top 8 bits, val in the remainder.
  really_inline void write_tape(uint64_t val, uint8_t c) {
    doc.tape[current_loc++] = val | ((static_cast<uint64_t>(c)) << 56);
  }

  // ORs val into an already-written tape word (used to link an opening entry to its end).
  really_inline void annotate_previous_loc(uint32_t saved_loc, uint64_t val) {
    doc.tape[saved_loc] |= val;
  }

  //
  // Set the current capacity: the largest document this parser can support without reallocating.
  //
  // This will allocate *or deallocate* as necessary.
  //
  // Returns false if allocation fails.
  //
  WARN_UNUSED bool set_capacity(size_t capacity);

  //
  // Set the maximum level of nested object and arrays supported by this parser.
  //
  // This will allocate *or deallocate* as necessary.
  //
  // Returns false if allocation fails.
  //
  WARN_UNUSED bool set_max_depth(size_t max_depth);
};
//
// C API (json_parse and build_parsed_json) declarations
//
// Parse a document found in buf.
//
// The content should be a valid JSON document encoded as UTF-8. If there is a
// UTF-8 BOM, the caller is responsible for omitting it, UTF-8 BOM are
// discouraged.
//
// You need to preallocate document::parser with a capacity of len (e.g.,
// parser.allocate_capacity(len)).
//
// The function returns simdjson::SUCCESS (an integer = 0) in case of a success
// or an error code from simdjson/simdjson.h in case of failure such as
// simdjson::CAPACITY, simdjson::MEMALLOC, simdjson::DEPTH_ERROR and so forth;
// the simdjson::error_message function converts these error codes into a
// string).
//
// You can also check validity by calling parser.is_valid(). The same document::parser can
// be reused for other documents.
//
// If realloc_if_needed is true (default), then a temporary buffer is created
// when needed during processing (a copy of the input string is made).
// If realloc_if_needed is false, the input buf must be readable up to
// buf + len + SIMDJSON_PADDING; all bytes at and after buf + len are ignored
// (they can be garbage). The document::parser object can be reused.
int json_parse(const uint8_t *buf, size_t len, document::parser &parser,
bool realloc_if_needed = true);
// Parse the document found in buf (len chars) into parser.
// This is the const char * counterpart of the const uint8_t * overload.
//
// The content should be a valid JSON document encoded as UTF-8. If there is a
// UTF-8 BOM, the caller is responsible for omitting it; UTF-8 BOMs are
// discouraged.
//
// You need to preallocate document::parser with a capacity of len (e.g.,
// parser.allocate_capacity(len)).
//
// The function returns simdjson::SUCCESS (an integer = 0) on success, or an
// error code from simdjson/simdjson.h on failure, such as simdjson::CAPACITY,
// simdjson::MEMALLOC, simdjson::DEPTH_ERROR and so forth. The
// simdjson::error_message function converts these error codes into a string.
//
// You can also check validity by calling parser.is_valid(). The same
// document::parser can be reused for other documents.
//
// If realloc_if_needed is true (the default), a temporary buffer is created
// when needed during processing (a copy of the input string is made).
// If realloc_if_needed is false, buf should be readable up to
// buf + len + SIMDJSON_PADDING; all bytes at and after buf + len are ignored
// (they can be garbage).
int json_parse(const char *buf, size_t len, document::parser &parser,
bool realloc_if_needed = true);
// We do not want to allow implicit conversion from C string to std::string.
int json_parse(const char *buf, document::parser &parser) = delete;
// Parse the document found in string s into parser.
//
// You need to preallocate document::parser with a capacity of s.length()
// (e.g., parser.allocate_capacity(s.length())).
//
// Returns simdjson::SUCCESS (an integer = 0) on success, or an error code from
// simdjson/simdjson.h on failure, such as simdjson::CAPACITY,
// simdjson::MEMALLOC, simdjson::DEPTH_ERROR and so forth. The
// simdjson::error_message function converts these error codes into a string.
//
// A temporary buffer is created when needed during processing
// (a copy of the input string is made).
inline int json_parse(const std::string &s, document::parser &parser) {
  const char *const text = s.data();
  const size_t text_len = s.length();
  // Always forward realloc_if_needed = true for std::string input.
  return json_parse(text, text_len, parser, true);
}
// Parse the document found in padded string s into parser.
//
// The content should be a valid JSON document encoded as UTF-8. If there is a
// UTF-8 BOM, the caller is responsible for omitting it; UTF-8 BOMs are
// discouraged.
//
// You need to preallocate document::parser with a capacity of s.length()
// (e.g., parser.allocate_capacity(s.length())).
//
// Returns simdjson::SUCCESS (an integer = 0) on success, or an error code from
// simdjson/simdjson.h on failure, such as simdjson::CAPACITY,
// simdjson::MEMALLOC, simdjson::DEPTH_ERROR and so forth. The
// simdjson::error_message function converts these error codes into a string.
//
// You can also check validity by calling parser.is_valid(). The same
// document::parser can be reused for other documents.
inline int json_parse(const padded_string &s, document::parser &parser) {
  auto text = s.data();
  auto text_len = s.length();
  // Forward realloc_if_needed = false: the input is parsed in place.
  return json_parse(text, text_len, parser, false);
}
// Build a document::parser object from the len bytes at buf and return it.
// You can check validity by calling is_valid() on the returned parser. This
// does the memory allocation needed for document::parser.
//
// If realloc_if_needed is true (the default), a temporary buffer is created
// when needed during processing (a copy of the input string is made).
// If realloc_if_needed is false, buf should be readable up to
// buf + len + SIMDJSON_PADDING; all bytes at and after buf + len are ignored
// (they can be garbage).
//
// The content should be a valid JSON document encoded as UTF-8. If there is a
// UTF-8 BOM, the caller is responsible for omitting it; UTF-8 BOMs are
// discouraged.
//
// This is a convenience function which calls json_parse.
WARN_UNUSED
document::parser build_parsed_json(const uint8_t *buf, size_t len,
bool realloc_if_needed = true);
// Build a document::parser object from the len chars at buf and return it.
// You can check validity by calling is_valid() on the returned parser. This
// does the memory allocation needed for document::parser.
//
// If realloc_if_needed is true (the default), a temporary buffer is created
// when needed during processing (a copy of the input string is made).
// If realloc_if_needed is false, buf should be readable up to
// buf + len + SIMDJSON_PADDING; all bytes at and after buf + len are ignored
// (they can be garbage).
//
// The content should be a valid JSON document encoded as UTF-8. If there is a
// UTF-8 BOM, the caller is responsible for omitting it; UTF-8 BOMs are
// discouraged.
//
// This is a convenience function which forwards to the const uint8_t *
// overload.
WARN_UNUSED
inline document::parser build_parsed_json(const char *buf, size_t len,
                                          bool realloc_if_needed = true) {
  const uint8_t *bytes = reinterpret_cast<const uint8_t *>(buf);
  return build_parsed_json(bytes, len, realloc_if_needed);
}
// We do not want to allow implicit conversion from C string to std::string.
document::parser build_parsed_json(const char *buf) = delete;
// Build a document::parser object from the document found in string s and
// return it. You can check validity by calling is_valid() on the returned
// parser.
//
// A temporary buffer is created when needed during processing
// (a copy of the input string is made).
//
// The content should be a valid JSON document encoded as UTF-8. If there is a
// UTF-8 BOM, the caller is responsible for omitting it; UTF-8 BOMs are
// discouraged.
//
// This is a convenience function which forwards to the
// (buf, len, realloc_if_needed) overload with realloc_if_needed = true.
WARN_UNUSED
inline document::parser build_parsed_json(const std::string &s) {
  const char *const text = s.data();
  const size_t text_len = s.length();
  return build_parsed_json(text, text_len, true);
}
// Build a document::parser object from the document found in padded string s
// and return it. You can check validity by calling is_valid() on the returned
// parser.
//
// The content should be a valid JSON document encoded as UTF-8. If there is a
// UTF-8 BOM, the caller is responsible for omitting it; UTF-8 BOMs are
// discouraged.
//
// This is a convenience function which forwards to the
// (buf, len, realloc_if_needed) overload with realloc_if_needed = false.
WARN_UNUSED
inline document::parser build_parsed_json(const padded_string &s) {
  auto text = s.data();
  auto text_len = s.length();
  return build_parsed_json(text, text_len, false);
}
//
// Stage 1 implementation declarations
//
// Stage 1 entry point for architecture T, taking the input as bytes.
//
// Setting the streaming parameter to true allows find_structural_bits to
// tolerate unclosed strings. The caller should still ensure that the input is
// valid UTF-8. If you are processing substrings, you may want to call a
// function like trimmed_length_safe_utf8. A function like
// find_last_json_buf_idx may also prove useful.
//
// Returns an int status code; callers in this file compare it against
// simdjson::SUCCESS.
template <architecture T = architecture::NATIVE>
int find_structural_bits(const uint8_t *buf, size_t len, document::parser &parser, bool streaming);
// const char * convenience overload: reinterprets buf as bytes and forwards
// to the const uint8_t * overload with the same len, parser and streaming.
//
// Setting the streaming parameter to true allows find_structural_bits to
// tolerate unclosed strings. The caller should still ensure that the input is
// valid UTF-8. If you are processing substrings, you may want to call a
// function like trimmed_length_safe_utf8. A function like
// find_last_json_buf_idx may also prove useful.
template <architecture T = architecture::NATIVE>
int find_structural_bits(const char *buf, size_t len, document::parser &parser, bool streaming) {
  const uint8_t *bytes = reinterpret_cast<const uint8_t *>(buf);
  return find_structural_bits<T>(bytes, len, parser, streaming);
}
// Non-streaming convenience overload: forwards to the four-argument overload
// with streaming = false.
template <architecture T = architecture::NATIVE>
int find_structural_bits(const uint8_t *buf, size_t len, document::parser &parser) {
  const bool streaming = false;
  return find_structural_bits<T>(buf, len, parser, streaming);
}
// const char * convenience overload: reinterprets buf as bytes and forwards
// to the three-argument const uint8_t * overload.
template <architecture T = architecture::NATIVE>
int find_structural_bits(const char *buf, size_t len, document::parser &parser) {
  const uint8_t *bytes = reinterpret_cast<const uint8_t *>(buf);
  return find_structural_bits<T>(bytes, len, parser);
}
//
// Stage 2 implementation declarations
//
// Stage 2 entry point for architecture T, taking the input as bytes.
// Only declared here; the definition is not part of this header section.
// Takes the input buffer, its length and the parser, and returns an int
// status code (marked WARN_UNUSED).
template <architecture T = architecture::NATIVE>
WARN_UNUSED int
unified_machine(const uint8_t *buf, size_t len, document::parser &parser);
// const char * convenience overload: reinterprets buf as bytes and forwards
// to the const uint8_t * overload, returning its status code unchanged.
template <architecture T = architecture::NATIVE>
WARN_UNUSED int
unified_machine(const char *buf, size_t len, document::parser &parser) {
  const uint8_t *bytes = reinterpret_cast<const uint8_t *>(buf);
  return unified_machine<T>(bytes, len, parser);
}
// Streaming variant of the stage 2 entry point for architecture T.
// Only declared here; the definition is not part of this header section.
// Returns an int status code (marked WARN_UNUSED).
// NOTE(review): next_json is passed by non-const reference, so it is
// presumably updated by the call; its exact meaning is not visible here,
// confirm against the definition.
template <architecture T = architecture::NATIVE>
WARN_UNUSED int
unified_machine(const uint8_t *buf, size_t len, document::parser &parser, size_t &next_json);
// const char * convenience overload of the streaming variant: reinterprets
// buf as bytes and forwards to the const uint8_t * overload.
template <architecture T = architecture::NATIVE>
int unified_machine(const char *buf, size_t len, document::parser &parser, size_t &next_json) {
  const uint8_t *bytes = reinterpret_cast<const uint8_t *>(buf);
  return unified_machine<T>(bytes, len, parser, next_json);
}
} // namespace simdjson
//
// Inline implementation
//
#include "simdjson/document_parser.h"
#include "simdjson/stage1_find_marks.h"
#include "simdjson/stage2_build_tape.h"
namespace simdjson {
// Parses the len bytes at buf with this parser by calling json_parse, and
// returns a doc_ref_result built from a reference to this parser's doc and
// the error code json_parse reported.
//
// realloc_if_needed is forwarded to json_parse unchanged.
//
// The parser's valid/error members are left untouched after json_parse
// returns. The document is handed out by reference, not moved out of the
// parser, so resetting the parser to "invalid / UNINITIALIZED" here would
// discard the status of a document that is still owned by the parser.
inline document::doc_ref_result document::parser::parse(const uint8_t *buf, size_t len, bool realloc_if_needed) {
  const error_code code = static_cast<error_code>(json_parse(buf, len, *this, realloc_if_needed));
  return document::doc_ref_result(doc, code);
}
// const char * convenience overload: reinterprets buf as bytes and forwards
// to the const uint8_t * overload.
really_inline document::doc_ref_result document::parser::parse(const char *buf, size_t len, bool realloc_if_needed) {
  const uint8_t *bytes = reinterpret_cast<const uint8_t *>(buf);
  return parse(bytes, len, realloc_if_needed);
}
// std::string convenience overload: forwards the string's data pointer and
// length to the (const char *, size_t, bool) overload.
really_inline document::doc_ref_result document::parser::parse(const std::string &s, bool realloc_if_needed) {
  const char *const text = s.data();
  const size_t text_len = s.length();
  return parse(text, text_len, realloc_if_needed);
}
// padded_string convenience overload: forwards the data pointer and length
// with realloc_if_needed = false.
really_inline document::doc_ref_result document::parser::parse(const padded_string &s) {
  auto text = s.data();
  auto text_len = s.length();
  return parse(text, text_len, false);
}
// One-shot parse: creates a local document::parser, allocates capacity for
// len bytes, parses buf with it, and returns a doc_result built from the
// parsed document (moved out of the local parser) and the error code.
// Returns MEMALLOC if the capacity allocation fails.
inline document::doc_result document::parse(const uint8_t *buf, size_t len, bool realloc_if_needed) {
  document::parser parser;
  const bool allocated = parser.allocate_capacity(len);
  if (!allocated) {
    return MEMALLOC;
  }
  auto [parsed_doc, parse_error] = parser.parse(buf, len, realloc_if_needed);
  // The local parser is about to be destroyed, so its document is moved out.
  return document::doc_result(static_cast<document &&>(parsed_doc), parse_error);
}
// const char * convenience overload: reinterprets buf as bytes and forwards
// to the const uint8_t * overload.
inline document::doc_result document::parse(const char *buf, size_t len, bool realloc_if_needed) {
  const uint8_t *bytes = reinterpret_cast<const uint8_t *>(buf);
  return parse(bytes, len, realloc_if_needed);
}
// std::string convenience overload: forwards the string's data pointer and
// length to the (const char *, size_t, bool) overload.
inline document::doc_result document::parse(const std::string &s, bool realloc_if_needed) {
  const char *const text = s.data();
  const size_t text_len = s.length();
  return parse(text, text_len, realloc_if_needed);
}
// padded_string convenience overload: forwards the data pointer and length
// with realloc_if_needed = false.
inline document::doc_result document::parse(const padded_string &s) {
  auto text = s.data();
  auto text_len = s.length();
  return parse(text, text_len, false);
}
// json_parse_implementation is the generic function; it is specialized for
// various architectures, e.g., as
// json_parse_implementation<architecture::HASWELL> or
// json_parse_implementation<architecture::ARM64>.
//
// Steps: prepare the parser with init_parse(len); optionally copy the input
// into a buffer obtained from allocate_padded_buffer; run stage 1
// (find_structural_bits) and, if that succeeds, stage 2 (unified_machine);
// free the copy if one was made. Returns the first non-SUCCESS status
// encountered, simdjson::MEMALLOC if the copy cannot be allocated, or the
// stage 2 result.
template <architecture T>
int json_parse_implementation(const uint8_t *buf, size_t len, document::parser &parser,
                              bool realloc_if_needed = true) {
  const int init_status = parser.init_parse(len);
  if (init_status != SUCCESS) {
    return init_status;
  }
  // Stays null unless we had to duplicate the caller's input.
  uint8_t *padded_copy = nullptr;
  if (realloc_if_needed) {
    padded_copy = (uint8_t *)allocate_padded_buffer(len);
    if (padded_copy == nullptr) {
      return simdjson::MEMALLOC;
    }
    memcpy(static_cast<void *>(padded_copy), buf, len);
    buf = padded_copy;
  }
  int status = simdjson::find_structural_bits<T>(buf, len, parser);
  if (status == simdjson::SUCCESS) {
    // Stage 2 only runs when stage 1 succeeded.
    status = unified_machine<T>(buf, len, parser);
  }
  if (padded_copy != nullptr) { // must free before we exit
    aligned_free(static_cast<void *>(padded_copy));
  }
  return status;
}
} // namespace simdjson
#endif // SIMDJSON_DOCUMENT_PARSER_H

View File

@ -1,219 +1,8 @@
#ifndef SIMDJSON_JSONPARSER_H
#define SIMDJSON_JSONPARSER_H
#include "simdjson/common_defs.h"
#include "simdjson/jsonioutil.h"
#include "simdjson/padded_string.h"
#include "simdjson/document.h"
#include "simdjson/parsedjson.h"
#include "simdjson/simdjson.h"
#include "simdjson/stage1_find_marks.h"
#include "simdjson/stage2_build_tape.h"
#include <string>
#include "simdjson/jsonioutil.h"
namespace simdjson {
// json_parse_implementation is the generic function; it is specialized for
// various architectures, e.g., as
// json_parse_implementation<architecture::HASWELL> or
// json_parse_implementation<architecture::ARM64>.
//
// Steps: prepare the parser with init_parse(len); optionally copy the input
// into a buffer obtained from allocate_padded_buffer; run stage 1
// (find_structural_bits) and, if that succeeds, stage 2 (unified_machine);
// free the copy if one was made. Returns the first non-SUCCESS status
// encountered, simdjson::MEMALLOC if the copy cannot be allocated, or the
// stage 2 result.
template <architecture T>
int json_parse_implementation(const uint8_t *buf, size_t len, document::parser &parser,
                              bool realloc_if_needed = true) {
  const int init_status = parser.init_parse(len);
  if (init_status != SUCCESS) {
    return init_status;
  }
  // Stays null unless we had to duplicate the caller's input.
  uint8_t *padded_copy = nullptr;
  if (realloc_if_needed) {
    padded_copy = (uint8_t *)allocate_padded_buffer(len);
    if (padded_copy == nullptr) {
      return simdjson::MEMALLOC;
    }
    memcpy(static_cast<void *>(padded_copy), buf, len);
    buf = padded_copy;
  }
  int status = simdjson::find_structural_bits<T>(buf, len, parser);
  if (status == simdjson::SUCCESS) {
    // Stage 2 only runs when stage 1 succeeded.
    status = unified_machine<T>(buf, len, parser);
  }
  if (padded_copy != nullptr) { // must free before we exit
    aligned_free(static_cast<void *>(padded_copy));
  }
  return status;
}
// Parse the document found in buf (len bytes) into parser.
//
// The content should be a valid JSON document encoded as UTF-8. If there is a
// UTF-8 BOM, the caller is responsible for omitting it; UTF-8 BOMs are
// discouraged.
//
// You need to preallocate document::parser with a capacity of len (e.g.,
// parser.allocate_capacity(len)).
//
// The function returns simdjson::SUCCESS (an integer = 0) on success, or an
// error code from simdjson/simdjson.h on failure, such as simdjson::CAPACITY,
// simdjson::MEMALLOC, simdjson::DEPTH_ERROR and so forth. The
// simdjson::error_message function converts these error codes into a string.
//
// You can also check validity by calling parser.is_valid(). The same
// document::parser can be reused for other documents.
//
// If realloc_if_needed is true (the default), a temporary buffer is created
// when needed during processing (a copy of the input string is made).
// If realloc_if_needed is false, buf should be readable up to
// buf + len + SIMDJSON_PADDING; all bytes at and after buf + len are ignored
// (they can be garbage).
int json_parse(const uint8_t *buf, size_t len, document::parser &parser,
bool realloc_if_needed = true);
// Parse the document found in buf (len chars) into parser.
// This is the const char * counterpart of the const uint8_t * overload.
//
// The content should be a valid JSON document encoded as UTF-8. If there is a
// UTF-8 BOM, the caller is responsible for omitting it; UTF-8 BOMs are
// discouraged.
//
// You need to preallocate document::parser with a capacity of len (e.g.,
// parser.allocate_capacity(len)).
//
// The function returns simdjson::SUCCESS (an integer = 0) on success, or an
// error code from simdjson/simdjson.h on failure, such as simdjson::CAPACITY,
// simdjson::MEMALLOC, simdjson::DEPTH_ERROR and so forth. The
// simdjson::error_message function converts these error codes into a string.
//
// You can also check validity by calling parser.is_valid(). The same
// document::parser can be reused for other documents.
//
// If realloc_if_needed is true (the default), a temporary buffer is created
// when needed during processing (a copy of the input string is made).
// If realloc_if_needed is false, buf should be readable up to
// buf + len + SIMDJSON_PADDING; all bytes at and after buf + len are ignored
// (they can be garbage).
int json_parse(const char *buf, size_t len, document::parser &parser,
bool realloc_if_needed = true);
// We do not want to allow implicit conversion from C string to std::string.
int json_parse(const char *buf, document::parser &parser) = delete;
// Parse the document found in string s into parser.
//
// You need to preallocate document::parser with a capacity of s.length()
// (e.g., parser.allocate_capacity(s.length())).
//
// Returns simdjson::SUCCESS (an integer = 0) on success, or an error code from
// simdjson/simdjson.h on failure, such as simdjson::CAPACITY,
// simdjson::MEMALLOC, simdjson::DEPTH_ERROR and so forth. The
// simdjson::error_message function converts these error codes into a string.
//
// A temporary buffer is created when needed during processing
// (a copy of the input string is made).
inline int json_parse(const std::string &s, document::parser &parser) {
  const char *const text = s.data();
  const size_t text_len = s.length();
  // Always forward realloc_if_needed = true for std::string input.
  return json_parse(text, text_len, parser, true);
}
// Parse the document found in padded string s into parser.
//
// The content should be a valid JSON document encoded as UTF-8. If there is a
// UTF-8 BOM, the caller is responsible for omitting it; UTF-8 BOMs are
// discouraged.
//
// You need to preallocate document::parser with a capacity of s.length()
// (e.g., parser.allocate_capacity(s.length())).
//
// Returns simdjson::SUCCESS (an integer = 0) on success, or an error code from
// simdjson/simdjson.h on failure, such as simdjson::CAPACITY,
// simdjson::MEMALLOC, simdjson::DEPTH_ERROR and so forth. The
// simdjson::error_message function converts these error codes into a string.
//
// You can also check validity by calling parser.is_valid(). The same
// document::parser can be reused for other documents.
inline int json_parse(const padded_string &s, document::parser &parser) {
  auto text = s.data();
  auto text_len = s.length();
  // Forward realloc_if_needed = false: the input is parsed in place.
  return json_parse(text, text_len, parser, false);
}
// Build a document::parser object from the len bytes at buf and return it.
// You can check validity by calling is_valid() on the returned parser. This
// does the memory allocation needed for document::parser.
//
// If realloc_if_needed is true (the default), a temporary buffer is created
// when needed during processing (a copy of the input string is made).
// If realloc_if_needed is false, buf should be readable up to
// buf + len + SIMDJSON_PADDING; all bytes at and after buf + len are ignored
// (they can be garbage).
//
// The content should be a valid JSON document encoded as UTF-8. If there is a
// UTF-8 BOM, the caller is responsible for omitting it; UTF-8 BOMs are
// discouraged.
//
// This is a convenience function which calls json_parse.
WARN_UNUSED
document::parser build_parsed_json(const uint8_t *buf, size_t len,
bool realloc_if_needed = true);
// Build a document::parser object from the len chars at buf and return it.
// You can check validity by calling is_valid() on the returned parser. This
// does the memory allocation needed for document::parser.
//
// If realloc_if_needed is true (the default), a temporary buffer is created
// when needed during processing (a copy of the input string is made).
// If realloc_if_needed is false, buf should be readable up to
// buf + len + SIMDJSON_PADDING; all bytes at and after buf + len are ignored
// (they can be garbage).
//
// The content should be a valid JSON document encoded as UTF-8. If there is a
// UTF-8 BOM, the caller is responsible for omitting it; UTF-8 BOMs are
// discouraged.
//
// This is a convenience function which forwards to the const uint8_t *
// overload.
WARN_UNUSED
inline document::parser build_parsed_json(const char *buf, size_t len,
                                          bool realloc_if_needed = true) {
  const uint8_t *bytes = reinterpret_cast<const uint8_t *>(buf);
  return build_parsed_json(bytes, len, realloc_if_needed);
}
// We do not want to allow implicit conversion from C string to std::string.
document::parser build_parsed_json(const char *buf) = delete;
// Build a document::parser object from the document found in string s and
// return it. You can check validity by calling is_valid() on the returned
// parser.
//
// A temporary buffer is created when needed during processing
// (a copy of the input string is made).
//
// The content should be a valid JSON document encoded as UTF-8. If there is a
// UTF-8 BOM, the caller is responsible for omitting it; UTF-8 BOMs are
// discouraged.
//
// This is a convenience function which forwards to the
// (buf, len, realloc_if_needed) overload with realloc_if_needed = true.
WARN_UNUSED
inline document::parser build_parsed_json(const std::string &s) {
  const char *const text = s.data();
  const size_t text_len = s.length();
  return build_parsed_json(text, text_len, true);
}
// Build a document::parser object from the document found in padded string s
// and return it. You can check validity by calling is_valid() on the returned
// parser.
//
// The content should be a valid JSON document encoded as UTF-8. If there is a
// UTF-8 BOM, the caller is responsible for omitting it; UTF-8 BOMs are
// discouraged.
//
// This is a convenience function which forwards to the
// (buf, len, realloc_if_needed) overload with realloc_if_needed = false.
WARN_UNUSED
inline document::parser build_parsed_json(const padded_string &s) {
  auto text = s.data();
  auto text_len = s.length();
  return build_parsed_json(text, text_len, false);
}
} // namespace simdjson
#endif

View File

@ -1,6 +1,6 @@
#ifndef SIMDJSON_PARSEDJSONITERATOR_H
#define SIMDJSON_PARSEDJSONITERATOR_H
#include "document/iterator.h"
#include "document_iterator.h"
#endif

View File

@ -1,39 +1,6 @@
#ifndef SIMDJSON_STAGE1_FIND_MARKS_H
#define SIMDJSON_STAGE1_FIND_MARKS_H
#include "simdjson/parsedjson.h"
#include "simdjson/simdjson.h"
namespace simdjson {
// Stage 1 entry point for architecture T, taking the input as bytes.
//
// Setting the streaming parameter to true allows find_structural_bits to
// tolerate unclosed strings. The caller should still ensure that the input is
// valid UTF-8. If you are processing substrings, you may want to call a
// function like trimmed_length_safe_utf8. A function like
// find_last_json_buf_idx may also prove useful.
//
// Returns an int status code.
template <architecture T = architecture::NATIVE>
int find_structural_bits(const uint8_t *buf, size_t len, document::parser &parser, bool streaming);
// const char * convenience overload: reinterprets buf as bytes and forwards
// to the const uint8_t * overload with the same len, parser and streaming.
//
// Setting the streaming parameter to true allows find_structural_bits to
// tolerate unclosed strings. The caller should still ensure that the input is
// valid UTF-8. If you are processing substrings, you may want to call a
// function like trimmed_length_safe_utf8. A function like
// find_last_json_buf_idx may also prove useful.
template <architecture T = architecture::NATIVE>
int find_structural_bits(const char *buf, size_t len, document::parser &parser, bool streaming) {
  const uint8_t *bytes = reinterpret_cast<const uint8_t *>(buf);
  return find_structural_bits<T>(bytes, len, parser, streaming);
}
// Non-streaming convenience overload: forwards to the four-argument overload
// with streaming = false.
template <architecture T = architecture::NATIVE>
int find_structural_bits(const uint8_t *buf, size_t len, document::parser &parser) {
  const bool streaming = false;
  return find_structural_bits<T>(buf, len, parser, streaming);
}
// const char * convenience overload: reinterprets buf as bytes and forwards
// to the three-argument const uint8_t * overload.
template <architecture T = architecture::NATIVE>
int find_structural_bits(const char *buf, size_t len, document::parser &parser) {
  const uint8_t *bytes = reinterpret_cast<const uint8_t *>(buf);
  return find_structural_bits<T>(bytes, len, parser);
}
} // namespace simdjson
#include "simdjson/document.h"
#endif

View File

@ -1,35 +1,6 @@
#ifndef SIMDJSON_STAGE2_BUILD_TAPE_H
#define SIMDJSON_STAGE2_BUILD_TAPE_H
#include "simdjson/common_defs.h"
#include "simdjson/parsedjson.h"
#include "simdjson/simdjson.h"
namespace simdjson {
// Stage 2 entry point for architecture T, taking the input as bytes.
// Only declared here; the definition is not part of this header.
// Takes the input buffer, its length and the parser, and returns an int
// status code (marked WARN_UNUSED).
template <architecture T = architecture::NATIVE>
WARN_UNUSED int
unified_machine(const uint8_t *buf, size_t len, document::parser &parser);
// const char * convenience overload: reinterprets buf as bytes and forwards
// to the const uint8_t * overload, returning its status code unchanged.
template <architecture T = architecture::NATIVE>
WARN_UNUSED int
unified_machine(const char *buf, size_t len, document::parser &parser) {
  const uint8_t *bytes = reinterpret_cast<const uint8_t *>(buf);
  return unified_machine<T>(bytes, len, parser);
}
// Streaming variant of the stage 2 entry point for architecture T.
// Only declared here; the definition is not part of this header.
// Returns an int status code (marked WARN_UNUSED).
// NOTE(review): next_json is passed by non-const reference, so it is
// presumably updated by the call; its exact meaning is not visible here,
// confirm against the definition.
template <architecture T = architecture::NATIVE>
WARN_UNUSED int
unified_machine(const uint8_t *buf, size_t len, document::parser &parser, size_t &next_json);
// const char * convenience overload of the streaming variant: reinterprets
// buf as bytes and forwards to the const uint8_t * overload.
template <architecture T = architecture::NATIVE>
int unified_machine(const char *buf, size_t len, document::parser &parser, size_t &next_json) {
  const uint8_t *bytes = reinterpret_cast<const uint8_t *>(buf);
  return unified_machine<T>(bytes, len, parser, next_json);
}
} // namespace simdjson
#include "simdjson/document.h"
#endif

View File

@ -28,7 +28,7 @@ set(SIMDJSON_SRC
stage1_find_marks.cpp
stage2_build_tape.cpp
document.cpp
document/parser.cpp
document_parser.cpp
error.cpp
)

View File

@ -3,47 +3,6 @@
namespace simdjson {
// This is the internal one all others end up calling.
// Runs json_parse on this parser and returns its int result converted to
// error_code.
error_code document::parser::try_parse(const uint8_t *buf, size_t len, bool realloc_if_needed) noexcept {
  const int raw_status = json_parse(buf, len, *this, realloc_if_needed);
  return static_cast<error_code>(raw_status);
}
// Parses buf and reports the resulting document through dst: on SUCCESS dst
// points at this parser's doc, otherwise dst is set to nullptr. Returns the
// error code of the parse.
error_code document::parser::try_parse(const uint8_t *buf, size_t len, const document *& dst, bool realloc_if_needed) noexcept {
  const error_code status = try_parse(buf, len, realloc_if_needed);
  if (status == SUCCESS) {
    dst = &doc;
  } else {
    dst = nullptr;
  }
  return status;
}
// Parses buf and, on SUCCESS, moves this parser's document into dst. Because
// the document has been taken, the parser is then marked as holding no valid
// document (valid = false, error = UNINITIALIZED). On failure dst and the
// parser state are left as they are. Returns the error code of the parse.
error_code document::parser::try_parse_into(const uint8_t *buf, size_t len, document & dst, bool realloc_if_needed) noexcept {
  const error_code status = try_parse(buf, len, realloc_if_needed);
  if (status == SUCCESS) {
    // Take the document
    dst = static_cast<document &&>(doc);
    // Document has been taken; there is no valid document anymore
    valid = false;
    error = UNINITIALIZED;
  }
  return status;
}
// Throwing variant: parses buf and returns a reference to the document that
// try_parse reported. Throws invalid_json carrying the error code when the
// parse does not succeed.
const document &document::parser::parse(const uint8_t *buf, size_t len, bool realloc_if_needed) {
  const document *parsed;
  error_code status = try_parse(buf, len, parsed, realloc_if_needed);
  if (status) {
    throw invalid_json(status);
  }
  return *parsed;
}
// Throwing variant that returns the document by value: parses buf with
// try_parse_into and returns the taken document. Throws invalid_json carrying
// the error code when the parse does not succeed.
document document::parser::parse_new(const uint8_t *buf, size_t len, bool realloc_if_needed) {
  document taken;
  error_code status = try_parse_into(buf, len, taken, realloc_if_needed);
  if (status) {
    throw invalid_json(status);
  }
  return taken;
}
WARN_UNUSED
error_code document::parser::init_parse(size_t len) {
if (len > capacity()) {

View File

@ -11,17 +11,17 @@ namespace simdjson {
// instruction sets.
// function pointer type for json_parse
using json_parse_functype = int(const uint8_t *buf, size_t len, ParsedJson &pj, bool realloc);
using json_parse_functype = int(const uint8_t *buf, size_t len, document::parser &pj, bool realloc);
// Pointer that holds the json_parse implementation corresponding to the
// available SIMD instruction set
extern std::atomic<json_parse_functype *> json_parse_ptr;
int json_parse(const uint8_t *buf, size_t len, ParsedJson &pj, bool realloc) {
int json_parse(const uint8_t *buf, size_t len, document::parser &pj, bool realloc) {
return json_parse_ptr.load(std::memory_order_relaxed)(buf, len, pj, realloc);
}
int json_parse(const char *buf, size_t len, ParsedJson &pj, bool realloc) {
int json_parse(const char *buf, size_t len, document::parser &pj, bool realloc) {
return json_parse_ptr.load(std::memory_order_relaxed)(reinterpret_cast<const uint8_t *>(buf), len, pj,
realloc);
}
@ -53,7 +53,7 @@ architecture parse_architecture(char *arch) {
}
// Responsible to select the best json_parse implementation
int json_parse_dispatch(const uint8_t *buf, size_t len, ParsedJson &pj, bool realloc) {
int json_parse_dispatch(const uint8_t *buf, size_t len, document::parser &pj, bool realloc) {
architecture best_implementation = find_best_supported_architecture();
// Selecting the best implementation
switch (best_implementation) {
@ -81,12 +81,12 @@ int json_parse_dispatch(const uint8_t *buf, size_t len, ParsedJson &pj, bool rea
std::atomic<json_parse_functype *> json_parse_ptr{&json_parse_dispatch};
WARN_UNUSED
ParsedJson build_parsed_json(const uint8_t *buf, size_t len, bool realloc) {
ParsedJson pj;
bool ok = pj.allocate_capacity(len);
document::parser build_parsed_json(const uint8_t *buf, size_t len, bool realloc) {
document::parser parser;
bool ok = parser.allocate_capacity(len);
if (ok) {
json_parse(buf, len, pj, realloc);
json_parse(buf, len, parser, realloc);
}
return pj;
return parser;
}
} // namespace simdjson