diff --git a/.gitignore b/.gitignore index 1dd1436b..7ca3c4b6 100644 --- a/.gitignore +++ b/.gitignore @@ -218,3 +218,7 @@ _deps # We check in a custom version of root Makefile that is not generated by CMake !/Makefile +singleheader/amalgamation_demo +singleheader/amalgamation_demo.cpp +singleheader/simdjson.cpp +singleheader/simdjson.h \ No newline at end of file diff --git a/HACKING.md b/HACKING.md index 01ec29a1..f2907482 100644 --- a/HACKING.md +++ b/HACKING.md @@ -20,7 +20,7 @@ simdjson's source structure, from the top level, looks like this: implementations). * simdjson.cpp: A "master source" that includes all implementation files from src/. This is equivalent to the distributed simdjson.cpp. - * arm64/|fallback/|haswell/|westmere/: Architecture-specific implementations. All functions are + * arm64/|fallback/|haswell/|westmere/: Architecture-specific implementations. All functions are Each architecture defines its own namespace, e.g. simdjson::haswell. * generic/: Generic implementations of the simdjson parser. These files may be included and compiled multiple times, from whichever architectures use them. They assume they are already @@ -56,7 +56,7 @@ Other important files and directories: * **tools:** Source for executables that can be distributed with simdjson > **Don't modify the files in singleheader/ directly; these are automatically generated.** -> +> > While we distribute those files on release, we *maintain* the files under include/ and src/. While simdjson distributes just two files from the singleheader/ directory, we *maintain* the code in @@ -157,6 +157,19 @@ make make test ``` +linux way: + +``` +mkdir build # if necessary +cd build +cmake .. -DCMAKE_CXX_COMPILER=g++ -DCMAKE_CC_COMPILER=gcc # or +cmake .. -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_CC_COMPILER=clang +make -j +``` + + + + ### Usage (CMake on 64-bit Windows using Visual Studio) We assume you have a common 64-bit Windows PC with at least Visual Studio 2017 and an x64 processor with AVX2 support (2013 Intel Haswell or later) or SSE 4.2 + CLMUL (2010 Westmere or later). @@ -189,7 +202,7 @@ On Windows (64-bit): will build and install `simdjson` as a shared library. ``` -.\vcpkg.exe install simdjson:x64-windows-static +.\vcpkg.exe install simdjson:x64-windows-static ``` will build and install `simdjson` as a static library. diff --git a/include/simdjson/document.h b/include/simdjson/document.h index b2ed94ce..6c32e3c8 100644 --- a/include/simdjson/document.h +++ b/include/simdjson/document.h @@ -124,8 +124,9 @@ public: * Get the next value. * * Part of the std::iterator interface. + * */ - inline void operator++() noexcept; + inline iterator& operator++() noexcept; /** * Check if these values come from the same place in the JSON. * @@ -205,8 +206,9 @@ public: * Get the next key/value pair. * * Part of the std::iterator interface. + * */ - inline void operator++() noexcept; + inline iterator& operator++() noexcept; /** * Check if these key value pairs come from the same place in the JSON. * @@ -373,13 +375,13 @@ public: bool dump_raw_tape(std::ostream &os) const noexcept; /** @private Structural values. */ - std::unique_ptr tape; + std::unique_ptr tape{}; /** @private String values. * * Should be at least byte_capacity. */ - std::unique_ptr string_buf; + std::unique_ptr string_buf{}; private: inline error_code allocate(size_t len) noexcept; @@ -660,8 +662,7 @@ public: * to allocate an initial capacity, call allocate() after constructing the parser. * Defaults to SIMDJSON_MAXSIZE_BYTES (the largest single document simdjson can process). */ - really_inline parser(size_t max_capacity = SIMDJSON_MAXSIZE_BYTES) noexcept; - + really_inline explicit parser(size_t max_capacity = SIMDJSON_MAXSIZE_BYTES) noexcept; /** * Take another parser's buffers and state. * @@ -811,7 +812,7 @@ public: * - CAPACITY if the parser does not have enough capacity and batch_size > max_capacity. * - other json errors if parsing fails. */ - inline document_stream load_many(const std::string &path, size_t batch_size = DEFAULT_BATCH_SIZE) noexcept; + inline document_stream load_many(const std::string &path, size_t batch_size = DEFAULT_BATCH_SIZE) noexcept; /** * Parse a buffer containing many JSON documents. @@ -953,21 +954,21 @@ public: /** @private Number of structural indices passed from stage 1 to stage 2 */ uint32_t n_structural_indexes{0}; /** @private Structural indices passed from stage 1 to stage 2 */ - std::unique_ptr structural_indexes; + std::unique_ptr structural_indexes{}; /** @private Tape location of each open { or [ */ - std::unique_ptr containing_scope; + std::unique_ptr containing_scope{}; #ifdef SIMDJSON_USE_COMPUTED_GOTO /** @private Return address of each open { or [ */ - std::unique_ptr ret_address; + std::unique_ptr ret_address{}; #else /** @private Return address of each open { or [ */ - std::unique_ptr ret_address; + std::unique_ptr ret_address{}; #endif /** @private Next write location in the string buf for stage 2 parsing */ - uint8_t *current_string_buf_loc; + uint8_t *current_string_buf_loc{}; /** @private Use `if (parser.parse(...).error())` instead */ bool valid{false}; @@ -975,7 +976,7 @@ public: error_code error{UNINITIALIZED}; /** @private Use `parser.parse(...).value()` instead */ - document doc; + document doc{}; /** @private returns true if the document parsed was valid */ [[deprecated("Use the result of parser.parse() instead")]] diff --git a/include/simdjson/document_stream.h b/include/simdjson/document_stream.h index b5ebf5fe..31c6af36 100644 --- a/include/simdjson/document_stream.h +++ b/include/simdjson/document_stream.h @@ -135,8 +135,8 @@ private: error_code error{SUCCESS_AND_HAS_MORE}; #ifdef SIMDJSON_THREADS_ENABLED error_code stage1_is_ok_thread{SUCCESS}; - std::thread stage_1_thread; - dom::parser parser_thread; + std::thread stage_1_thread{}; + dom::parser parser_thread{}; #endif friend class dom::parser; }; // class document_stream diff --git a/include/simdjson/implementation.h b/include/simdjson/implementation.h index 60594a4b..9eb8d60b 100644 --- a/include/simdjson/implementation.h +++ b/include/simdjson/implementation.h @@ -17,6 +17,7 @@ namespace simdjson { */ class implementation { public: + /** * The name of this implementation. * @@ -131,6 +132,7 @@ protected: _required_instruction_sets(required_instruction_sets) { } + virtual ~implementation()=default; private: /** diff --git a/include/simdjson/inline/document.h b/include/simdjson/inline/document.h index 81fc9055..6853b9c3 100644 --- a/include/simdjson/inline/document.h +++ b/include/simdjson/inline/document.h @@ -213,7 +213,7 @@ inline error_code document::allocate(size_t capacity) noexcept { // a pathological input like "[[[[..." would generate len tape elements, so // need a capacity of at least len + 1, but it is also possible to do - // worse with "[7,7,7,7,6,7,7,7,6,7,7,6,[7,7,7,7,6,7,7,7,6,7,7,6,7,7,7,7,7,7,6" + // worse with "[7,7,7,7,6,7,7,7,6,7,7,6,[7,7,7,7,6,7,7,7,6,7,7,6,7,7,7,7,7,7,6" //where len + 1 tape elements are // generated, see issue https://github.com/lemire/simdjson/issues/345 size_t tape_capacity = ROUNDUP_N(capacity + 2, 64); @@ -322,7 +322,9 @@ inline bool document::dump_raw_tape(std::ostream &os) const noexcept { // parser inline implementation // really_inline parser::parser(size_t max_capacity) noexcept - : _max_capacity{max_capacity}, loaded_bytes(nullptr, &aligned_free_char) {} + : _max_capacity{max_capacity}, + loaded_bytes(nullptr, &aligned_free_char) + {} inline bool parser::is_valid() const noexcept { return valid; } inline int parser::get_error_code() const noexcept { return error; } inline std::string parser::get_error_message() const noexcept { return error_message(error); } @@ -604,8 +606,9 @@ inline element array::iterator::operator*() const noexcept { inline bool array::iterator::operator!=(const array::iterator& other) const noexcept { return json_index != other.json_index; } -inline void array::iterator::operator++() noexcept { +inline array::iterator& array::iterator::operator++() noexcept { json_index = after_element(); + return *this; } // @@ -703,9 +706,10 @@ inline const key_value_pair object::iterator::operator*() const noexcept { inline bool object::iterator::operator!=(const object::iterator& other) const noexcept { return json_index != other.json_index; } -inline void object::iterator::operator++() noexcept { +inline object::iterator& object::iterator::operator++() noexcept { json_index++; json_index = after_element(); + return *this; } inline std::string_view object::iterator::key() const noexcept { size_t string_buf_index = size_t(tape_value()); diff --git a/include/simdjson/inline/document_stream.h b/include/simdjson/inline/document_stream.h index 2c7918c7..f86aa7df 100644 --- a/include/simdjson/inline/document_stream.h +++ b/include/simdjson/inline/document_stream.h @@ -85,7 +85,7 @@ static inline bool is_ascii(char c) { return ((unsigned char)c) <= 127; } -// if the string ends with UTF-8 values, backtrack +// if the string ends with UTF-8 values, backtrack // up to the first ASCII character. May return 0. static inline size_t trimmed_length_safe_utf8(const char * c, size_t len) { while ((len > 0) and (not is_ascii(c[len - 1]))) { @@ -100,14 +100,19 @@ static inline size_t trimmed_length_safe_utf8(const char * c, size_t len) { namespace simdjson { namespace dom { - really_inline document_stream::document_stream( dom::parser &_parser, const uint8_t *buf, size_t len, size_t batch_size, error_code _error -) noexcept : parser{_parser}, _buf{buf}, _len{len}, _batch_size(batch_size), error{_error} { +) noexcept + : parser{_parser}, + _buf{buf}, + _len{len}, + _batch_size(batch_size), + error(_error) +{ if (!error) { error = json_parse(); } } diff --git a/include/simdjson/inline/parsedjson_iterator.h b/include/simdjson/inline/parsedjson_iterator.h index d82ff1c1..5b36d4b4 100644 --- a/include/simdjson/inline/parsedjson_iterator.h +++ b/include/simdjson/inline/parsedjson_iterator.h @@ -212,9 +212,9 @@ bool dom::parser::Iterator::next() { current_type = next_type; return true; } - dom::parser::Iterator::Iterator(const dom::parser &pj) noexcept(false) - : doc(pj.doc), depth(0), location(0), tape_length(0) { + : doc(pj.doc) +{ #if SIMDJSON_EXCEPTIONS if (!pj.valid) { throw simdjson_error(pj.error); } #else @@ -239,12 +239,17 @@ dom::parser::Iterator::Iterator(const dom::parser &pj) noexcept(false) depth_index[depth].scope_type = current_type; } } - dom::parser::Iterator::Iterator( const dom::parser::Iterator &o) noexcept - : doc(o.doc), max_depth(o.depth), depth(o.depth), location(o.location), - tape_length(o.tape_length), current_type(o.current_type), - current_val(o.current_val) { + : doc(o.doc), + max_depth(o.depth), + depth(o.depth), + location(o.location), + tape_length(o.tape_length), + current_type(o.current_type), + current_val(o.current_val), + depth_index() +{ depth_index = new scopeindex_t[max_depth+1]; memcpy(depth_index, o.depth_index, (depth + 1) * sizeof(depth_index[0])); } diff --git a/include/simdjson/parsedjson_iterator.h b/include/simdjson/parsedjson_iterator.h index 37513089..06c8cc83 100644 --- a/include/simdjson/parsedjson_iterator.h +++ b/include/simdjson/parsedjson_iterator.h @@ -22,6 +22,8 @@ public: inline Iterator(const Iterator &o) noexcept; inline ~Iterator() noexcept; + inline Iterator& operator=(const Iterator&) = delete; + inline bool is_ok() const; // useful for debugging purposes @@ -253,13 +255,13 @@ public: private: const document &doc; - size_t max_depth; - size_t depth; - size_t location; // our current location on a tape - size_t tape_length; - uint8_t current_type; - uint64_t current_val; - scopeindex_t *depth_index; + size_t max_depth{}; + size_t depth{}; + size_t location{}; // our current location on a tape + size_t tape_length{}; + uint8_t current_type{}; + uint64_t current_val{}; + scopeindex_t *depth_index{}; }; } // namespace simdjson diff --git a/src/generic/json_minifier.h b/src/generic/json_minifier.h index bb30ed42..f5a1e8f7 100644 --- a/src/generic/json_minifier.h +++ b/src/generic/json_minifier.h @@ -11,12 +11,14 @@ public: static error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) noexcept; private: - really_inline json_minifier(uint8_t *_dst) : dst{_dst} {} + really_inline json_minifier(uint8_t *_dst) + : dst{_dst} + {} template really_inline void step(const uint8_t *block_buf, buf_block_reader &reader) noexcept; really_inline void next(simd::simd8x64 in, json_block block); really_inline error_code finish(uint8_t *dst_start, size_t &dst_len); - json_scanner scanner; + json_scanner scanner{}; uint8_t *dst; }; @@ -70,4 +72,4 @@ error_code json_minifier::minify(const uint8_t *buf, size_t len, uint8_t *dst, s return minifier.finish(dst, dst_len); } -} // namespace stage1 \ No newline at end of file +} // namespace stage1 diff --git a/src/generic/json_scanner.h b/src/generic/json_scanner.h index c410101a..8f181ae9 100644 --- a/src/generic/json_scanner.h +++ b/src/generic/json_scanner.h @@ -48,6 +48,7 @@ private: */ class json_scanner { public: + json_scanner() {} really_inline json_block next(const simd::simd8x64 in); really_inline error_code finish(bool streaming); @@ -55,7 +56,7 @@ private: // Whether the last character of the previous iteration is part of a scalar token // (anything except whitespace or an operator). uint64_t prev_scalar = 0ULL; - json_string_scanner string_scanner; + json_string_scanner string_scanner{}; }; @@ -74,7 +75,7 @@ really_inline uint64_t follows(const uint64_t match, uint64_t &overflow) { // // Check if the current character follows a matching character, with possible "filler" between. -// For example, this checks for empty curly braces, e.g. +// For example, this checks for empty curly braces, e.g. // // in.eq('}') & follows(in.eq('['), in.eq(' '), prev_empty_array) // { * } // @@ -100,4 +101,4 @@ really_inline error_code json_scanner::finish(bool streaming) { return string_scanner.finish(streaming); } -} // namespace stage1 \ No newline at end of file +} // namespace stage1 diff --git a/src/generic/json_structural_indexer.h b/src/generic/json_structural_indexer.h index 30062beb..99f6ec15 100644 --- a/src/generic/json_structural_indexer.h +++ b/src/generic/json_structural_indexer.h @@ -61,13 +61,14 @@ public: static error_code index(const uint8_t *buf, size_t len, parser &parser, bool streaming) noexcept; private: - really_inline json_structural_indexer(uint32_t *structural_indexes) : indexer{structural_indexes} {} + really_inline json_structural_indexer(uint32_t *structural_indexes) + : indexer{structural_indexes} {} template really_inline void step(const uint8_t *block, buf_block_reader &reader) noexcept; really_inline void next(simd::simd8x64 in, json_block block, size_t idx); really_inline error_code finish(parser &parser, size_t idx, size_t len, bool streaming); - json_scanner scanner; + json_scanner scanner{}; utf8_checker checker{}; bit_indexer indexer; uint64_t prev_structurals = 0; @@ -145,7 +146,7 @@ really_inline void json_structural_indexer::step<64>(const uint8_t *block, buf_b // they can make a lot of progress before they need that information. // 3. Step 1 doesn't use enough capacity, so we run some extra stuff while we're waiting for that // to finish: utf-8 checks and generating the output from the last iteration. -// +// // The reason we run 2 inputs at a time, is steps 2 and 3 are *still* not enough to soak up all // available capacity with just one input. Running 2 at a time seems to give the CPU a good enough // workout. @@ -172,4 +173,4 @@ error_code json_structural_indexer::index(const uint8_t *buf, size_t len, parser return indexer.finish(parser, reader.block_index(), len, streaming); } -} // namespace stage1 \ No newline at end of file +} // namespace stage1 diff --git a/src/generic/stage2_build_tape.h b/src/generic/stage2_build_tape.h index 1175eb81..da714d4f 100644 --- a/src/generic/stage2_build_tape.h +++ b/src/generic/stage2_build_tape.h @@ -50,7 +50,11 @@ struct unified_machine_addresses { class structural_iterator { public: really_inline structural_iterator(const uint8_t* _buf, size_t _len, const uint32_t *_structural_indexes, size_t next_structural_index) - : buf{_buf}, len{_len}, structural_indexes{_structural_indexes}, next_structural{next_structural_index} {} + : buf{_buf}, + len{_len}, + structural_indexes{_structural_indexes}, + next_structural{next_structural_index} + {} really_inline char advance_char() { idx = structural_indexes[next_structural]; next_structural++;