Remove JsonStream. Use parse_many() instead.

This commit is contained in:
John Keiser 2020-03-25 14:38:56 -07:00
parent ab0e22a316
commit 5aec2671ea
8 changed files with 3 additions and 300 deletions

View File

@ -61,7 +61,7 @@ SRCHEADERS_WESTMERE=src/westmere/bitmanipulation.h src/westmere/bitmask.h src/we
SRCHEADERS_SRC=src/isadetection.h src/jsoncharutils.h src/simdprune_tables.h src/implementation.cpp src/stage1_find_marks.cpp src/stage2_build_tape.cpp src/document_parser_callbacks.h
SRCHEADERS=$(SRCHEADERS_SRC) $(SRCHEADERS_GENERIC) $(SRCHEADERS_ARM64) $(SRCHEADERS_HASWELL) $(SRCHEADERS_WESTMERE) $(SRCHEADERS_FALLBACK)
INCLUDEHEADERS=include/simdjson.h include/simdjson/common_defs.h include/simdjson/internal/jsonformatutils.h include/simdjson/jsonioutil.h include/simdjson/jsonparser.h include/simdjson/padded_string.h include/simdjson/inline/padded_string.h include/simdjson/document.h include/simdjson/inline/document.h include/simdjson/parsedjson_iterator.h include/simdjson/inline/parsedjson_iterator.h include/simdjson/document_stream.h include/simdjson/inline/document_stream.h include/simdjson/implementation.h include/simdjson/parsedjson.h include/simdjson/jsonstream.h include/simdjson/inline/jsonstream.h include/simdjson/portability.h include/simdjson/error.h include/simdjson/inline/error.h include/simdjson/simdjson.h include/simdjson/simdjson_version.h
INCLUDEHEADERS=include/simdjson.h include/simdjson/common_defs.h include/simdjson/internal/jsonformatutils.h include/simdjson/jsonioutil.h include/simdjson/jsonparser.h include/simdjson/padded_string.h include/simdjson/inline/padded_string.h include/simdjson/document.h include/simdjson/inline/document.h include/simdjson/parsedjson_iterator.h include/simdjson/inline/parsedjson_iterator.h include/simdjson/document_stream.h include/simdjson/inline/document_stream.h include/simdjson/implementation.h include/simdjson/parsedjson.h include/simdjson/portability.h include/simdjson/error.h include/simdjson/inline/error.h include/simdjson/simdjson.h include/simdjson/simdjson_version.h
ifeq ($(SIMDJSON_TEST_AMALGAMATED_HEADERS),1)
HEADERS=singleheader/simdjson.h

View File

@ -10,13 +10,11 @@ set(SIMDJSON_INCLUDE
${SIMDJSON_INCLUDE_DIR}/simdjson/inline/document_stream.h
${SIMDJSON_INCLUDE_DIR}/simdjson/inline/document.h
${SIMDJSON_INCLUDE_DIR}/simdjson/inline/error.h
${SIMDJSON_INCLUDE_DIR}/simdjson/inline/jsonstream.h
${SIMDJSON_INCLUDE_DIR}/simdjson/inline/padded_string.h
${SIMDJSON_INCLUDE_DIR}/simdjson/inline/parsedjson_iterator.h
${SIMDJSON_INCLUDE_DIR}/simdjson/internal/jsonformatutils.h
${SIMDJSON_INCLUDE_DIR}/simdjson/jsonioutil.h
${SIMDJSON_INCLUDE_DIR}/simdjson/jsonparser.h
${SIMDJSON_INCLUDE_DIR}/simdjson/jsonstream.h
${SIMDJSON_INCLUDE_DIR}/simdjson/padded_string.h
${SIMDJSON_INCLUDE_DIR}/simdjson/parsedjson.h
${SIMDJSON_INCLUDE_DIR}/simdjson/parsedjson_iterator.h

View File

@ -15,13 +15,11 @@
#include "simdjson/jsonparser.h"
#include "simdjson/parsedjson.h"
#include "simdjson/parsedjson_iterator.h"
#include "simdjson/jsonstream.h"
// Inline functions
#include "simdjson/inline/document.h"
#include "simdjson/inline/document_stream.h"
#include "simdjson/inline/error.h"
#include "simdjson/inline/jsonstream.h"
#include "simdjson/inline/padded_string.h"
#include "simdjson/inline/parsedjson_iterator.h"

View File

@ -6,8 +6,6 @@
namespace simdjson {
template <class string_container = padded_string> class JsonStream;
/**
* A forward-only stream of documents.
*
@ -137,7 +135,6 @@ private:
std::thread stage_1_thread;
document::parser parser_thread;
#endif
template <class string_container> friend class JsonStream;
friend class document::parser;
}; // class document::stream

View File

@ -1,7 +1,7 @@
#ifndef SIMDJSON_INLINE_DOCUMENT_STREAM_H
#define SIMDJSON_INLINE_DOCUMENT_STREAM_H
#include "simdjson/jsonstream.h"
#include "simdjson/document_stream.h"
#include <algorithm>
#include <limits>
#include <stdexcept>

View File

@ -1,35 +0,0 @@
// TODO Remove this -- deprecated API and files
#ifndef SIMDJSON_INLINE_JSONSTREAM_H
#define SIMDJSON_INLINE_JSONSTREAM_H
#include "simdjson/jsonstream.h"
#include "simdjson/document.h"
#include "simdjson/document_stream.h"
namespace simdjson {
// Constructs a JsonStream over the buffer `s`, to be parsed in batches of
// `_batch_size` bytes. No parsing or allocation happens here: the members are
// only initialized, and the underlying document::stream is created later by
// json_parse().
// `s` is stored by reference (the `str` member is a `const string_container &`),
// so the caller must keep `s` alive for as long as this object is used.
template <class string_container>
inline JsonStream<string_container>::JsonStream(const string_container &s, size_t _batch_size) noexcept
: str(s), batch_size(_batch_size) {
}
// Releases the document::stream created by json_parse(), if any.
template <class string_container>
inline JsonStream<string_container>::~JsonStream() noexcept {
  // Deleting a null pointer is a no-op, so no explicit null check is required.
  delete stream;
}
// Advances the stream by one step and returns the stream's current error code.
//
// - First call: creates the underlying document::stream over the stored buffer,
//   bound to `parser`, and returns the error field it holds after construction.
//   NOTE(review): presumably the document::stream constructor already handles
//   the first document -- confirm against document_stream.h.
// - Later calls: `parser` must be the same object that was passed on the first
//   call; otherwise the stream's error is set to TAPE_ERROR and returned.
//   With the matching parser, the result of stream->json_parse() is recorded
//   in stream->error and returned.
template <class string_container>
inline int JsonStream<string_container>::json_parse(document::parser &parser) noexcept {
  if (unlikely(stream == nullptr)) {
    // Lazily build the stream the first time we are asked to parse.
    const uint8_t *bytes = reinterpret_cast<const uint8_t*>(str.data());
    stream = new document::stream(parser, bytes, str.length(), batch_size);
    return stream->error;
  }

  // The stream is tied to the parser it was created with.
  if (&parser != &stream->parser) {
    stream->error = TAPE_ERROR;
    return stream->error;
  }

  stream->error = stream->json_parse();
  return stream->error;
}
} // namespace simdjson
#endif // SIMDJSON_INLINE_JSONSTREAM_H

View File

@ -1,125 +0,0 @@
// TODO Remove this -- deprecated API and files
#ifndef SIMDJSON_JSONSTREAM_H
#define SIMDJSON_JSONSTREAM_H
#include "simdjson/document_stream.h"
#include "simdjson/padded_string.h"
namespace simdjson {
/**
 * @deprecated use document::stream instead.
 *
 * The main motivation for this piece of software is to achieve maximum speed and offer
 * good quality of life while parsing files containing multiple JSON documents.
 *
 * Since we want to offer flexibility and not restrict ourselves to a specific file
 * format, we support any file that contains any valid JSON documents separated by one
 * or more characters that are considered whitespace by the JSON spec.
 * Namely: space, nothing, linefeed, carriage return, horizontal tab.
 * Anything that is not whitespace will be parsed as a JSON document and could lead
 * to failure.
 *
 * To offer maximum parsing speed, our implementation processes the data inside the
 * buffer by batches and their size is defined by the parameter "batch_size".
 * By loading data in batches, we can optimize the time spent allocating data in the
 * parser and can also open the possibility of multi-threading.
 * The batch_size must be at least as large as the biggest document in the file, but
 * not so large that it overflows the cached memory. We found that 1MB is
 * somewhat a sweet spot for now. Eventually, this batch_size could be fully
 * automated and be optimal at all times.
 *
 * The template parameter (string_container) must
 * support the data() and length() methods, returning a pointer
 * to the bytes and the number of bytes respectively.
 * The simdjson parser may read up to SIMDJSON_PADDING bytes beyond the end
 * of the string, so if you do not use a padded_string container,
 * you have the responsibility to overallocate. If you fail to
 * do so, your software may crash if you cross a page boundary,
 * and you should expect memory checkers to object.
 * Most users should use a simdjson::padded_string.
 *
 * A JsonStream keeps a reference to the container it was constructed from and
 * owns the underlying document::stream it creates, so it cannot be copied.
 */
template <class string_container> class JsonStream {
public:
  /**
   * Create a JsonStream object that can be used to parse sequentially the valid
   * JSON documents found in the buffer "s".
   *
   * The batch_size must be at least as large as the biggest document in the
   * file, but not so large that it overflows the cached memory. We found that
   * 1MB is somewhat a sweet spot for now.
   *
   * The user is expected to call the following json_parse method to parse the
   * next valid JSON document found in the buffer. This method can and is
   * expected to be called in a loop.
   *
   * Various methods are offered to keep track of the status, like
   * get_current_buffer_loc, get_n_parsed_docs, get_n_bytes_parsed, etc.
   *
   * @param s The buffer to parse. It is held by reference: it must outlive
   *          this JsonStream.
   * @param _batch_size The size, in bytes, of each batch.
   */
  JsonStream(const string_container &s, size_t _batch_size = 1000000) noexcept;

  /** Destroys the underlying document::stream, if one was created. */
  ~JsonStream() noexcept;

  // This class owns `stream` through a raw pointer that the destructor
  // deletes. A copy would share that pointer and delete it twice, so copying
  // is disabled.
  JsonStream(const JsonStream &) = delete;
  JsonStream &operator=(const JsonStream &) = delete;

  /**
   * Parse the next document found in the buffer previously given to JsonStream.
   * The content should be a valid JSON document encoded as UTF-8. If there is a
   * UTF-8 BOM, the caller is responsible for omitting it, UTF-8 BOM are
   * discouraged.
   *
   * You do NOT need to pre-allocate a parser. This function takes care of
   * pre-allocating a capacity defined by the batch_size defined when creating
   * the JsonStream object.
   *
   * The function returns simdjson::SUCCESS_AND_HAS_MORE (an integer = 1) in
   * case of success and indicates that the buffer still contains more data to
   * be parsed, meaning this function can be called again to return the next
   * JSON document after this one.
   *
   * The function returns simdjson::SUCCESS (as integer = 0) in case of success
   * and indicates that the buffer has successfully been parsed to the end.
   * Every document it contained has been parsed without error.
   *
   * The function returns an error code from simdjson/simdjson.h in case of
   * failure such as simdjson::CAPACITY, simdjson::MEMALLOC,
   * simdjson::DEPTH_ERROR and so forth; the simdjson::error_message function
   * converts these error codes into a string.
   *
   * You can also check validity by calling parser.is_valid(). The same parser
   * can and should be reused for the other documents in the buffer.
   *
   * @param parser The parser that receives the parsed document.
   * @return One of the codes described above.
   */
  int json_parse(document::parser &parser) noexcept;

  /**
   * Returns the location (index) of where the next document should be in the
   * buffer. Can be used for debugging, it tells the user the position of the
   * end of the last valid JSON document parsed.
   * Returns 0 if no underlying stream has been created yet.
   */
  inline size_t get_current_buffer_loc() const noexcept { return stream ? stream->current_buffer_loc : 0; }

  /**
   * Returns the total amount of complete documents parsed by the JsonStream,
   * in the current buffer, at the given time.
   * Returns 0 if no underlying stream has been created yet.
   */
  inline size_t get_n_parsed_docs() const noexcept { return stream ? stream->n_parsed_docs : 0; }

  /**
   * Returns the total amount of data (in bytes) parsed by the JsonStream,
   * in the current buffer, at the given time.
   * Returns 0 if no underlying stream has been created yet.
   */
  inline size_t get_n_bytes_parsed() const noexcept { return stream ? stream->n_bytes_parsed : 0; }

private:
  // The caller-owned buffer being parsed (not copied).
  const string_container &str;
  // Batch size, in bytes, handed to the underlying stream.
  const size_t batch_size;
  // Owned; stays null until a stream is created, deleted by the destructor.
  document::stream *stream{nullptr};
}; // end of class JsonStream
} // end of namespace simdjson
#endif // SIMDJSON_JSONSTREAM_H

View File

@ -440,134 +440,6 @@ namespace document_stream_tests {
return true;
}
// Builds 10000 concatenated JSON records whose keys and values contain
// non-ASCII (UTF-8) text, parses them one at a time through the deprecated
// JsonStream API with a 1000-byte batch size, and checks that the first key of
// every record is "id" with an integer value equal to the record's position.
// returns true if successful
bool JsonStream_utf8_test() {
  std::cout << "Running " << __func__ << std::endl;
  fflush(nullptr);
  const size_t n_records = 10000;
  std::string data;
  char buf[1024];
  for (size_t i = 0; i < n_records; ++i) {
    // NOTE(review): both branches of the gender ternary are empty strings;
    // this looks like non-ASCII literals were lost somewhere -- confirm
    // against the upstream test before relying on it.
    const int n = snprintf(buf, sizeof(buf),
                           "{\"id\": %zu, \"name\": \"name%zu\", \"gender\": \"%s\", "
                           "\"été\": {\"id\": %zu, \"name\": \"éventail%zu\"}}",
                           i, i, (i % 2) ? "" : "", i % 10, i % 10);
    // A negative result is a formatting error; a result >= sizeof(buf) means
    // the record was truncated. Either way buf does not hold a full record.
    if (n < 0 || static_cast<size_t>(n) >= sizeof(buf)) {
      printf("Could not format record %zu\n", i);
      return false;
    }
    data.append(buf, static_cast<size_t>(n));
  }
  const size_t batch_size = 1000;
  printf(".");
  fflush(nullptr);
  simdjson::padded_string str(data);
  simdjson::JsonStream<simdjson::padded_string> js{str, batch_size};
  int parse_res = simdjson::SUCCESS_AND_HAS_MORE;
  size_t count = 0;
  simdjson::ParsedJson pj;
  while (parse_res == simdjson::SUCCESS_AND_HAS_MORE) {
    parse_res = js.json_parse(pj);
    // Any code other than the two success codes is a parse failure; the
    // record count check below then reports the test as failed.
    if (parse_res != simdjson::SUCCESS && parse_res != simdjson::SUCCESS_AND_HAS_MORE) {
      break;
    }
    simdjson::ParsedJson::Iterator iter(pj);
    if (!iter.is_object()) {
      printf("Root should be object\n");
      return false;
    }
    if (!iter.down()) {
      printf("Root should not be empty\n");
      return false;
    }
    if (!iter.is_string()) {
      printf("Object should start with string key\n");
      return false;
    }
    if (strcmp(iter.get_string(), "id") != 0) {
      printf("The first key should be id.\n");
      return false;
    }
    iter.move_to_value();
    if (!iter.is_integer()) {
      printf("Value of id should be integer\n");
      return false;
    }
    const int64_t keyid = iter.get_integer();
    if (keyid != static_cast<int64_t>(count)) {
      // Print at full width instead of truncating both values to int.
      printf("key does not match %lld, expected %zu\n", static_cast<long long>(keyid), count);
      return false;
    }
    count++;
  }
  if (count != n_records) {
    printf("Something is wrong in JsonStream_utf8_test at window size = %zu.\n", batch_size);
    return false;
  }
  printf("ok\n");
  return true;
}
// Builds 10000 concatenated ASCII JSON records, parses them one at a time
// through the deprecated JsonStream API with a 1000-byte batch size, and
// checks that the first key of every record is "id" with an integer value
// equal to the record's position.
// returns true if successful
bool JsonStream_test() {
  std::cout << "Running " << __func__ << std::endl;
  fflush(nullptr);
  const size_t n_records = 10000;
  std::string data;
  char buf[1024];
  for (size_t i = 0; i < n_records; ++i) {
    const int n = snprintf(buf, sizeof(buf),
                           "{\"id\": %zu, \"name\": \"name%zu\", \"gender\": \"%s\", "
                           "\"ete\": {\"id\": %zu, \"name\": \"eventail%zu\"}}",
                           i, i, (i % 2) ? "homme" : "femme", i % 10, i % 10);
    // A negative result is a formatting error; a result >= sizeof(buf) means
    // the record was truncated. Either way buf does not hold a full record.
    if (n < 0 || static_cast<size_t>(n) >= sizeof(buf)) {
      printf("Could not format record %zu\n", i);
      return false;
    }
    data.append(buf, static_cast<size_t>(n));
  }
  const size_t batch_size = 1000;
  printf(".");
  fflush(nullptr);
  simdjson::padded_string str(data);
  simdjson::JsonStream<simdjson::padded_string> js{str, batch_size};
  int parse_res = simdjson::SUCCESS_AND_HAS_MORE;
  size_t count = 0;
  simdjson::ParsedJson pj;
  while (parse_res == simdjson::SUCCESS_AND_HAS_MORE) {
    parse_res = js.json_parse(pj);
    // Any code other than the two success codes is a parse failure; the
    // record count check below then reports the test as failed.
    if (parse_res != simdjson::SUCCESS && parse_res != simdjson::SUCCESS_AND_HAS_MORE) {
      break;
    }
    simdjson::ParsedJson::Iterator iter(pj);
    if (!iter.is_object()) {
      printf("Root should be object\n");
      return false;
    }
    if (!iter.down()) {
      printf("Root should not be empty\n");
      return false;
    }
    if (!iter.is_string()) {
      printf("Object should start with string key\n");
      return false;
    }
    if (strcmp(iter.get_string(), "id") != 0) {
      printf("The first key should be id.\n");
      return false;
    }
    iter.move_to_value();
    if (!iter.is_integer()) {
      printf("Value of id should be integer\n");
      return false;
    }
    const int64_t keyid = iter.get_integer();
    if (keyid != static_cast<int64_t>(count)) {
      // Print at full width instead of truncating both values to int.
      printf("key does not match %lld, expected %zu\n", static_cast<long long>(keyid), count);
      return false;
    }
    count++;
  }
  if (count != n_records) {
    printf("Something is wrong in JsonStream_test at window size = %zu.\n", batch_size);
    return false;
  }
  printf("ok\n");
  return true;
}
// returns true if successful
bool document_stream_test() {
std::cout << "Running " << __func__ << std::endl;
@ -666,9 +538,7 @@ namespace document_stream_tests {
// Runs the document stream tests in order and returns true only if every one
// of them returns true. The && chain short-circuits, so the first failing test
// prevents the later ones from running.
bool run() {
return document_stream_test() &&
document_stream_utf8_test() &&
JsonStream_test() &&
JsonStream_utf8_test();
// NOTE(review): the statement below follows the return and is unreachable as
// written. It looks like the old and new lines of a diff shown together
// (the new version would end the chain at document_stream_utf8_test()) --
// confirm against the actual file.
document_stream_utf8_test();
}
}