Remove JsonStream. Use parse_many() instead.
This commit is contained in:
parent
ab0e22a316
commit
5aec2671ea
2
Makefile
2
Makefile
|
@ -61,7 +61,7 @@ SRCHEADERS_WESTMERE=src/westmere/bitmanipulation.h src/westmere/bitmask.h src/we
|
|||
SRCHEADERS_SRC=src/isadetection.h src/jsoncharutils.h src/simdprune_tables.h src/implementation.cpp src/stage1_find_marks.cpp src/stage2_build_tape.cpp src/document_parser_callbacks.h
|
||||
SRCHEADERS=$(SRCHEADERS_SRC) $(SRCHEADERS_GENERIC) $(SRCHEADERS_ARM64) $(SRCHEADERS_HASWELL) $(SRCHEADERS_WESTMERE) $(SRCHEADERS_FALLBACK)
|
||||
|
||||
INCLUDEHEADERS=include/simdjson.h include/simdjson/common_defs.h include/simdjson/internal/jsonformatutils.h include/simdjson/jsonioutil.h include/simdjson/jsonparser.h include/simdjson/padded_string.h include/simdjson/inline/padded_string.h include/simdjson/document.h include/simdjson/inline/document.h include/simdjson/parsedjson_iterator.h include/simdjson/inline/parsedjson_iterator.h include/simdjson/document_stream.h include/simdjson/inline/document_stream.h include/simdjson/implementation.h include/simdjson/parsedjson.h include/simdjson/jsonstream.h include/simdjson/inline/jsonstream.h include/simdjson/portability.h include/simdjson/error.h include/simdjson/inline/error.h include/simdjson/simdjson.h include/simdjson/simdjson_version.h
|
||||
INCLUDEHEADERS=include/simdjson.h include/simdjson/common_defs.h include/simdjson/internal/jsonformatutils.h include/simdjson/jsonioutil.h include/simdjson/jsonparser.h include/simdjson/padded_string.h include/simdjson/inline/padded_string.h include/simdjson/document.h include/simdjson/inline/document.h include/simdjson/parsedjson_iterator.h include/simdjson/inline/parsedjson_iterator.h include/simdjson/document_stream.h include/simdjson/inline/document_stream.h include/simdjson/implementation.h include/simdjson/parsedjson.h include/simdjson/portability.h include/simdjson/error.h include/simdjson/inline/error.h include/simdjson/simdjson.h include/simdjson/simdjson_version.h
|
||||
|
||||
ifeq ($(SIMDJSON_TEST_AMALGAMATED_HEADERS),1)
|
||||
HEADERS=singleheader/simdjson.h
|
||||
|
|
|
@ -10,13 +10,11 @@ set(SIMDJSON_INCLUDE
|
|||
${SIMDJSON_INCLUDE_DIR}/simdjson/inline/document_stream.h
|
||||
${SIMDJSON_INCLUDE_DIR}/simdjson/inline/document.h
|
||||
${SIMDJSON_INCLUDE_DIR}/simdjson/inline/error.h
|
||||
${SIMDJSON_INCLUDE_DIR}/simdjson/inline/jsonstream.h
|
||||
${SIMDJSON_INCLUDE_DIR}/simdjson/inline/padded_string.h
|
||||
${SIMDJSON_INCLUDE_DIR}/simdjson/inline/parsedjson_iterator.h
|
||||
${SIMDJSON_INCLUDE_DIR}/simdjson/internal/jsonformatutils.h
|
||||
${SIMDJSON_INCLUDE_DIR}/simdjson/jsonioutil.h
|
||||
${SIMDJSON_INCLUDE_DIR}/simdjson/jsonparser.h
|
||||
${SIMDJSON_INCLUDE_DIR}/simdjson/jsonstream.h
|
||||
${SIMDJSON_INCLUDE_DIR}/simdjson/padded_string.h
|
||||
${SIMDJSON_INCLUDE_DIR}/simdjson/parsedjson.h
|
||||
${SIMDJSON_INCLUDE_DIR}/simdjson/parsedjson_iterator.h
|
||||
|
|
|
@ -15,13 +15,11 @@
|
|||
#include "simdjson/jsonparser.h"
|
||||
#include "simdjson/parsedjson.h"
|
||||
#include "simdjson/parsedjson_iterator.h"
|
||||
#include "simdjson/jsonstream.h"
|
||||
|
||||
// Inline functions
|
||||
#include "simdjson/inline/document.h"
|
||||
#include "simdjson/inline/document_stream.h"
|
||||
#include "simdjson/inline/error.h"
|
||||
#include "simdjson/inline/jsonstream.h"
|
||||
#include "simdjson/inline/padded_string.h"
|
||||
#include "simdjson/inline/parsedjson_iterator.h"
|
||||
|
||||
|
|
|
@ -6,8 +6,6 @@
|
|||
|
||||
namespace simdjson {
|
||||
|
||||
template <class string_container = padded_string> class JsonStream;
|
||||
|
||||
/**
|
||||
* A forward-only stream of documents.
|
||||
*
|
||||
|
@ -137,7 +135,6 @@ private:
|
|||
std::thread stage_1_thread;
|
||||
document::parser parser_thread;
|
||||
#endif
|
||||
template <class string_container> friend class JsonStream;
|
||||
friend class document::parser;
|
||||
}; // class document::stream
|
||||
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
#ifndef SIMDJSON_INLINE_DOCUMENT_STREAM_H
|
||||
#define SIMDJSON_INLINE_DOCUMENT_STREAM_H
|
||||
|
||||
#include "simdjson/jsonstream.h"
|
||||
#include "simdjson/document_stream.h"
|
||||
#include <algorithm>
|
||||
#include <limits>
|
||||
#include <stdexcept>
|
||||
|
|
|
@ -1,35 +0,0 @@
|
|||
// TODO Remove this -- deprecated API and files
|
||||
|
||||
#ifndef SIMDJSON_INLINE_JSONSTREAM_H
|
||||
#define SIMDJSON_INLINE_JSONSTREAM_H
|
||||
|
||||
#include "simdjson/jsonstream.h"
|
||||
#include "simdjson/document.h"
|
||||
#include "simdjson/document_stream.h"
|
||||
|
||||
namespace simdjson {
|
||||
|
||||
template <class string_container>
|
||||
inline JsonStream<string_container>::JsonStream(const string_container &s, size_t _batch_size) noexcept
|
||||
: str(s), batch_size(_batch_size) {
|
||||
}
|
||||
|
||||
template <class string_container>
|
||||
inline JsonStream<string_container>::~JsonStream() noexcept {
|
||||
if (stream) { delete stream; }
|
||||
}
|
||||
|
||||
template <class string_container>
|
||||
inline int JsonStream<string_container>::json_parse(document::parser &parser) noexcept {
|
||||
if (unlikely(stream == nullptr)) {
|
||||
stream = new document::stream(parser, reinterpret_cast<const uint8_t*>(str.data()), str.length(), batch_size);
|
||||
} else {
|
||||
if (&parser != &stream->parser) { return stream->error = TAPE_ERROR; }
|
||||
stream->error = stream->json_parse();
|
||||
}
|
||||
return stream->error;
|
||||
}
|
||||
|
||||
} // namespace simdjson
|
||||
|
||||
#endif // SIMDJSON_INLINE_JSONSTREAM_H
|
|
@ -1,125 +0,0 @@
|
|||
// TODO Remove this -- deprecated API and files
|
||||
|
||||
#ifndef SIMDJSON_JSONSTREAM_H
|
||||
#define SIMDJSON_JSONSTREAM_H
|
||||
|
||||
#include "simdjson/document_stream.h"
|
||||
#include "simdjson/padded_string.h"
|
||||
|
||||
namespace simdjson {
|
||||
|
||||
/**
|
||||
* @deprecated use document::stream instead.
|
||||
*
|
||||
* The main motivation for this piece of software is to achieve maximum speed and offer
|
||||
* good quality of life while parsing files containing multiple JSON documents.
|
||||
*
|
||||
* Since we want to offer flexibility and not restrict ourselves to a specific file
|
||||
* format, we support any file that contains any valid JSON documents separated by one
|
||||
* or more characters that are considered whitespace by the JSON spec.
|
||||
* Namely: space, nothing, linefeed, carriage return, horizontal tab.
|
||||
* Anything that is not whitespace will be parsed as a JSON document and could lead
|
||||
* to failure.
|
||||
*
|
||||
* To offer maximum parsing speed, our implementation processes the data inside the
|
||||
* buffer by batches and their size is defined by the parameter "batch_size".
|
||||
* By loading data in batches, we can optimize the time spent allocating data in the
|
||||
* parser and can also open the possibility of multi-threading.
|
||||
* The batch_size must be at least as large as the biggest document in the file, but
|
||||
* not so large as to submerge the cached memory. We found that 1MB is
|
||||
* somewhat a sweet spot for now. Eventually, this batch_size could be fully
|
||||
* automated and be optimal at all times.
|
||||
*
|
||||
* The template parameter (string_container) must
|
||||
* support the data() and length() methods, returning a pointer
|
||||
* to a char* and to the number of bytes respectively.
|
||||
* The simdjson parser may read up to SIMDJSON_PADDING bytes beyond the end
|
||||
* of the string, so if you do not use a padded_string container,
|
||||
* you have the responsibility to overallocate. If you fail to
|
||||
* do so, your software may crash if you cross a page boundary,
|
||||
* and you should expect memory checkers to object.
|
||||
* Most users should use a simdjson::padded_string.
|
||||
*/
|
||||
template <class string_container> class JsonStream {
|
||||
public:
|
||||
/* Create a JsonStream object that can be used to parse sequentially the valid
|
||||
* JSON documents found in the buffer "buf".
|
||||
*
|
||||
* The batch_size must be at least as large as the biggest document in the
|
||||
* file, but
|
||||
* not too large to submerge the cached memory. We found that 1MB is
|
||||
* somewhat a sweet spot for now.
|
||||
*
|
||||
* The user is expected to call the following json_parse method to parse the
|
||||
* next
|
||||
* valid JSON document found in the buffer. This method can and is expected
|
||||
* to be
|
||||
* called in a loop.
|
||||
*
|
||||
* Various methods are offered to keep track of the status, like
|
||||
* get_current_buffer_loc,
|
||||
* get_n_parsed_docs, get_n_bytes_parsed, etc.
|
||||
*
|
||||
* */
|
||||
JsonStream(const string_container &s, size_t _batch_size = 1000000) noexcept;
|
||||
|
||||
~JsonStream() noexcept;
|
||||
|
||||
/* Parse the next document found in the buffer previously given to JsonStream.
|
||||
|
||||
* The content should be a valid JSON document encoded as UTF-8. If there is a
|
||||
* UTF-8 BOM, the caller is responsible for omitting it, UTF-8 BOM are
|
||||
* discouraged.
|
||||
*
|
||||
* You do NOT need to pre-allocate a parser. This function takes care of
|
||||
* pre-allocating a capacity defined by the batch_size defined when creating
|
||||
the
|
||||
* JsonStream object.
|
||||
*
|
||||
* The function returns simdjson::SUCCESS_AND_HAS_MORE (an integer = 1) in
|
||||
case
|
||||
* of success and indicates that the buffer still contains more data to be
|
||||
parsed,
|
||||
* meaning this function can be called again to return the next JSON document
|
||||
* after this one.
|
||||
*
|
||||
* The function returns simdjson::SUCCESS (an integer = 0) in case of success
|
||||
* and indicates that the buffer has successfully been parsed to the end.
|
||||
* Every document it contained has been parsed without error.
|
||||
*
|
||||
* The function returns an error code from simdjson/simdjson.h in case of
|
||||
failure
|
||||
* such as simdjson::CAPACITY, simdjson::MEMALLOC, simdjson::DEPTH_ERROR and
|
||||
so forth;
|
||||
* the simdjson::error_message function converts these error codes into a
|
||||
* string.
|
||||
*
|
||||
* You can also check validity by calling parser.is_valid(). The same parser
|
||||
can
|
||||
* and should be reused for the other documents in the buffer. */
|
||||
int json_parse(document::parser &parser) noexcept;
|
||||
|
||||
/* Returns the location (index) of where the next document should be in the
|
||||
* buffer.
|
||||
* Can be used for debugging, it tells the user the position of the end of the
|
||||
* last
|
||||
* valid JSON document parsed*/
|
||||
inline size_t get_current_buffer_loc() const noexcept { return stream ? stream->current_buffer_loc : 0; }
|
||||
|
||||
/* Returns the total amount of complete documents parsed by the JsonStream,
|
||||
* in the current buffer, at the given time.*/
|
||||
inline size_t get_n_parsed_docs() const noexcept { return stream ? stream->n_parsed_docs : 0; }
|
||||
|
||||
/* Returns the total amount of data (in bytes) parsed by the JsonStream,
|
||||
* in the current buffer, at the given time.*/
|
||||
inline size_t get_n_bytes_parsed() const noexcept { return stream ? stream->n_bytes_parsed : 0; }
|
||||
|
||||
private:
|
||||
const string_container &str;
|
||||
const size_t batch_size;
|
||||
document::stream *stream{nullptr};
|
||||
}; // end of class JsonStream
|
||||
|
||||
} // end of namespace simdjson
|
||||
|
||||
#endif // SIMDJSON_JSONSTREAM_H
|
|
@ -440,134 +440,6 @@ namespace document_stream_tests {
|
|||
return true;
|
||||
}
|
||||
|
||||
// returns true if successful
|
||||
bool JsonStream_utf8_test() {
|
||||
std::cout << "Running " << __func__ << std::endl;
|
||||
fflush(NULL);
|
||||
const size_t n_records = 10000;
|
||||
std::string data;
|
||||
char buf[1024];
|
||||
for (size_t i = 0; i < n_records; ++i) {
|
||||
auto n = sprintf(buf,
|
||||
"{\"id\": %zu, \"name\": \"name%zu\", \"gender\": \"%s\", "
|
||||
"\"été\": {\"id\": %zu, \"name\": \"éventail%zu\"}}",
|
||||
i, i, (i % 2) ? "⺃" : "⺕", i % 10, i % 10);
|
||||
data += std::string(buf, n);
|
||||
}
|
||||
const size_t batch_size = 1000;
|
||||
printf(".");
|
||||
fflush(NULL);
|
||||
simdjson::padded_string str(data);
|
||||
simdjson::JsonStream<simdjson::padded_string> js{str, batch_size};
|
||||
int parse_res = simdjson::SUCCESS_AND_HAS_MORE;
|
||||
size_t count = 0;
|
||||
simdjson::ParsedJson pj;
|
||||
while (parse_res == simdjson::SUCCESS_AND_HAS_MORE) {
|
||||
parse_res = js.json_parse(pj);
|
||||
if (parse_res != simdjson::SUCCESS && parse_res != simdjson::SUCCESS_AND_HAS_MORE) {
|
||||
break;
|
||||
}
|
||||
simdjson::ParsedJson::Iterator iter(pj);
|
||||
if(!iter.is_object()) {
|
||||
printf("Root should be object\n");
|
||||
return false;
|
||||
}
|
||||
if(!iter.down()) {
|
||||
printf("Root should not be emtpy\n");
|
||||
return false;
|
||||
}
|
||||
if(!iter.is_string()) {
|
||||
printf("Object should start with string key\n");
|
||||
return false;
|
||||
}
|
||||
if(strcmp(iter.get_string(),"id")!=0) {
|
||||
printf("There should a single key, id.\n");
|
||||
return false;
|
||||
}
|
||||
iter.move_to_value();
|
||||
if(!iter.is_integer()) {
|
||||
printf("Value of image should be integer\n");
|
||||
return false;
|
||||
}
|
||||
int64_t keyid = iter.get_integer();
|
||||
if(keyid != (int64_t)count) {
|
||||
printf("key does not match %d, expected %d\n",(int) keyid, (int) count);
|
||||
return false;
|
||||
}
|
||||
count++;
|
||||
}
|
||||
if(count != n_records) {
|
||||
printf("Something is wrong in JsonStream_utf8_test at window size = %zu.\n", batch_size);
|
||||
return false;
|
||||
}
|
||||
printf("ok\n");
|
||||
return true;
|
||||
}
|
||||
|
||||
// returns true if successful
|
||||
bool JsonStream_test() {
|
||||
std::cout << "Running " << __func__ << std::endl;
|
||||
fflush(NULL);
|
||||
const size_t n_records = 10000;
|
||||
std::string data;
|
||||
char buf[1024];
|
||||
for (size_t i = 0; i < n_records; ++i) {
|
||||
auto n = sprintf(buf,
|
||||
"{\"id\": %zu, \"name\": \"name%zu\", \"gender\": \"%s\", "
|
||||
"\"ete\": {\"id\": %zu, \"name\": \"eventail%zu\"}}",
|
||||
i, i, (i % 2) ? "homme" : "femme", i % 10, i % 10);
|
||||
data += std::string(buf, n);
|
||||
}
|
||||
const size_t batch_size = 1000;
|
||||
printf(".");
|
||||
fflush(NULL);
|
||||
simdjson::padded_string str(data);
|
||||
simdjson::JsonStream<simdjson::padded_string> js{str, batch_size};
|
||||
int parse_res = simdjson::SUCCESS_AND_HAS_MORE;
|
||||
size_t count = 0;
|
||||
simdjson::ParsedJson pj;
|
||||
while (parse_res == simdjson::SUCCESS_AND_HAS_MORE) {
|
||||
parse_res = js.json_parse(pj);
|
||||
if (parse_res != simdjson::SUCCESS && parse_res != simdjson::SUCCESS_AND_HAS_MORE) {
|
||||
break;
|
||||
}
|
||||
simdjson::ParsedJson::Iterator iter(pj);
|
||||
if(!iter.is_object()) {
|
||||
printf("Root should be object\n");
|
||||
return false;
|
||||
}
|
||||
if(!iter.down()) {
|
||||
printf("Root should not be emtpy\n");
|
||||
return false;
|
||||
}
|
||||
if(!iter.is_string()) {
|
||||
printf("Object should start with string key\n");
|
||||
return false;
|
||||
}
|
||||
if(strcmp(iter.get_string(),"id")!=0) {
|
||||
printf("There should a single key, id.\n");
|
||||
return false;
|
||||
}
|
||||
iter.move_to_value();
|
||||
if(!iter.is_integer()) {
|
||||
printf("Value of image should be integer\n");
|
||||
return false;
|
||||
}
|
||||
int64_t keyid = iter.get_integer();
|
||||
if(keyid != (int64_t)count) {
|
||||
printf("key does not match %d, expected %d\n",(int) keyid, (int) count);
|
||||
return false;
|
||||
}
|
||||
count++;
|
||||
}
|
||||
if(count != n_records) {
|
||||
printf("Something is wrong in JsonStream_test at window size = %zu.\n", batch_size);
|
||||
return false;
|
||||
}
|
||||
printf("ok\n");
|
||||
return true;
|
||||
}
|
||||
|
||||
// returns true if successful
|
||||
bool document_stream_test() {
|
||||
std::cout << "Running " << __func__ << std::endl;
|
||||
|
@ -666,9 +538,7 @@ namespace document_stream_tests {
|
|||
|
||||
bool run() {
|
||||
return document_stream_test() &&
|
||||
document_stream_utf8_test() &&
|
||||
JsonStream_test() &&
|
||||
JsonStream_utf8_test();
|
||||
document_stream_utf8_test();
|
||||
}
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue