From 5d1e3efce843e50acc6cc7a75c4bbf929b038591 Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Fri, 20 Mar 2020 16:14:47 -0400 Subject: [PATCH] faster minifier (#568) * Fallback should use our scalar code. * parse should have a nicer error message. * Making it so that "minify" can use different architectures. * Let us change the minifier competition so that it tests all implementations. * Documenting the untaken optimization opportunity. Co-authored-by: John Keiser --- Makefile | 4 +- amalgamation.sh | 2 +- benchmark/minifiercompetition.cpp | 18 +- benchmark/parse.cpp | 7 +- include/CMakeLists.txt | 1 - include/simdjson.h | 1 - include/simdjson/error.h | 12 + include/simdjson/implementation.h | 16 + include/simdjson/jsonminifier.h | 32 -- src/CMakeLists.txt | 1 - src/arm64/bitmanipulation.h | 2 +- src/arm64/implementation.h | 1 + src/arm64/simd.h | 47 ++- src/arm64/stage1_find_marks.h | 27 +- src/fallback/implementation.h | 1 + src/fallback/stage1_find_marks.h | 56 +++ src/generic/buf_block_reader.h | 48 +++ src/generic/json_minifier.h | 73 ++++ src/generic/json_scanner.h | 26 +- src/generic/json_string_scanner.h | 4 +- src/generic/json_structural_indexer.h | 61 +--- src/haswell/bitmanipulation.h | 2 +- src/haswell/implementation.h | 1 + src/haswell/simd.h | 63 +++- src/haswell/stage1_find_marks.h | 15 +- src/implementation.cpp | 5 + src/jsonminifier.cpp | 478 -------------------------- src/simdjson.cpp | 1 - src/westmere/bitmanipulation.h | 2 +- src/westmere/implementation.h | 1 + src/westmere/simd.h | 48 ++- src/westmere/stage1_find_marks.h | 15 +- tests/basictests.cpp | 21 +- tools/minify.cpp | 86 ++++- 34 files changed, 558 insertions(+), 620 deletions(-) delete mode 100644 include/simdjson/jsonminifier.h create mode 100644 src/generic/buf_block_reader.h create mode 100644 src/generic/json_minifier.h delete mode 100644 src/jsonminifier.cpp diff --git a/Makefile b/Makefile index 6a155cbf..2b706266 100644 --- a/Makefile +++ b/Makefile @@ -53,7 +53,7 @@ endif # 
ifeq ($(SANITIZE),1) endif # ifeq ($(MEMSANITIZE),1) # Headers and sources -SRCHEADERS_GENERIC=src/generic/atomparsing.h src/generic/numberparsing.h src/generic/json_scanner.h src/generic/json_string_scanner.h src/generic/json_structural_indexer.h src/generic/stage2_build_tape.h src/generic/stringparsing.h src/generic/stage2_streaming_build_tape.h src/generic/utf8_fastvalidate_algorithm.h src/generic/utf8_lookup_algorithm.h src/generic/utf8_lookup2_algorithm.h src/generic/utf8_range_algorithm.h src/generic/utf8_zwegner_algorithm.h +SRCHEADERS_GENERIC=src/generic/atomparsing.h src/generic/numberparsing.h src/generic/json_scanner.h src/generic/json_string_scanner.h src/generic/json_structural_indexer.h src/generic/json_minifier.h src/generic/buf_block_reader.h src/generic/stage2_build_tape.h src/generic/stringparsing.h src/generic/stage2_streaming_build_tape.h src/generic/utf8_fastvalidate_algorithm.h src/generic/utf8_lookup_algorithm.h src/generic/utf8_lookup2_algorithm.h src/generic/utf8_range_algorithm.h src/generic/utf8_zwegner_algorithm.h SRCHEADERS_ARM64= src/arm64/bitmanipulation.h src/arm64/bitmask.h src/arm64/intrinsics.h src/arm64/numberparsing.h src/arm64/simd.h src/arm64/stage1_find_marks.h src/arm64/stage2_build_tape.h src/arm64/stringparsing.h SRCHEADERS_HASWELL= src/haswell/bitmanipulation.h src/haswell/bitmask.h src/haswell/intrinsics.h src/haswell/numberparsing.h src/haswell/simd.h src/haswell/stage1_find_marks.h src/haswell/stage2_build_tape.h src/haswell/stringparsing.h SRCHEADERS_FALLBACK= src/fallback/bitmanipulation.h src/fallback/implementation.h src/fallback/numberparsing.h src/fallback/stage1_find_marks.h src/fallback/stage2_build_tape.h src/fallback/stringparsing.h @@ -61,7 +61,7 @@ SRCHEADERS_WESTMERE=src/westmere/bitmanipulation.h src/westmere/bitmask.h src/we SRCHEADERS_SRC=src/isadetection.h src/jsoncharutils.h src/simdprune_tables.h src/implementation.cpp src/stage1_find_marks.cpp src/stage2_build_tape.cpp 
src/document_parser_callbacks.h SRCHEADERS=$(SRCHEADERS_SRC) $(SRCHEADERS_GENERIC) $(SRCHEADERS_ARM64) $(SRCHEADERS_HASWELL) $(SRCHEADERS_WESTMERE) $(SRCHEADERS_FALLBACK) -INCLUDEHEADERS=include/simdjson.h include/simdjson/common_defs.h include/simdjson/internal/jsonformatutils.h include/simdjson/jsonioutil.h include/simdjson/jsonminifier.h include/simdjson/jsonparser.h include/simdjson/padded_string.h include/simdjson/inline/padded_string.h include/simdjson/document.h include/simdjson/inline/document.h include/simdjson/document_iterator.h include/simdjson/inline/document_iterator.h include/simdjson/document_stream.h include/simdjson/inline/document_stream.h include/simdjson/implementation.h include/simdjson/parsedjson.h include/simdjson/jsonstream.h include/simdjson/inline/jsonstream.h include/simdjson/portability.h include/simdjson/error.h include/simdjson/inline/error.h include/simdjson/simdjson.h include/simdjson/simdjson_version.h +INCLUDEHEADERS=include/simdjson.h include/simdjson/common_defs.h include/simdjson/internal/jsonformatutils.h include/simdjson/jsonioutil.h include/simdjson/jsonparser.h include/simdjson/padded_string.h include/simdjson/inline/padded_string.h include/simdjson/document.h include/simdjson/inline/document.h include/simdjson/document_iterator.h include/simdjson/inline/document_iterator.h include/simdjson/document_stream.h include/simdjson/inline/document_stream.h include/simdjson/implementation.h include/simdjson/parsedjson.h include/simdjson/jsonstream.h include/simdjson/inline/jsonstream.h include/simdjson/portability.h include/simdjson/error.h include/simdjson/inline/error.h include/simdjson/simdjson.h include/simdjson/simdjson_version.h ifeq ($(SIMDJSON_TEST_AMALGAMATED_HEADERS),1) HEADERS=singleheader/simdjson.h diff --git a/amalgamation.sh b/amalgamation.sh index 57504b00..6146c9a5 100755 --- a/amalgamation.sh +++ b/amalgamation.sh @@ -143,7 +143,7 @@ int main(int argc, char *argv[]) { // parse_many const char * filename2 = 
argv[2]; for (auto result : parser.load_many(filename2)) { - error = result.error; + error = result.error(); } if (error) { std::cout << "parse_many failed" << std::endl; diff --git a/benchmark/minifiercompetition.cpp b/benchmark/minifiercompetition.cpp index bcae39aa..48cefe56 100644 --- a/benchmark/minifiercompetition.cpp +++ b/benchmark/minifiercompetition.cpp @@ -98,16 +98,14 @@ int main(int argc, char *argv[]) { "despacing with RapidJSON Insitu", rapid_stringme_insitu((char *)buffer), memcpy(buffer, p.data(), p.size()), repeat, volume, !just_data); memcpy(buffer, p.data(), p.size()); - - size_t outlength = simdjson::json_minify((const uint8_t *)buffer, p.size(), - (uint8_t *)buffer); - if (verbose) - std::cout << "json_minify length is " << outlength << std::endl; - + size_t outlength; uint8_t *cbuffer = (uint8_t *)buffer; - BEST_TIME("json_minify", simdjson::json_minify(cbuffer, p.size(), cbuffer), + for (auto imple : simdjson::available_implementations) { + BEST_TIME((std::string("simdjson->minify+")+imple->name()).c_str(), (imple->minify(cbuffer, p.size(), cbuffer, outlength), outlength), outlength, memcpy(buffer, p.data(), p.size()), repeat, volume, !just_data); + } + printf("minisize = %zu, original size = %zu (minified down to %.2f percent " "of original) \n", outlength, p.size(), outlength * 100.0 / p.size()); @@ -121,8 +119,9 @@ int main(int argc, char *argv[]) { !just_data); char *mini_buffer = simdjson::internal::allocate_padded_buffer(p.size() + 1); - size_t minisize = simdjson::json_minify((const uint8_t *)p.data(), p.size(), - (uint8_t *)mini_buffer); + size_t minisize; + simdjson::active_implementation->minify((const uint8_t *)p.data(), p.size(), + (uint8_t *)mini_buffer, minisize); mini_buffer[minisize] = '\0'; BEST_TIME("RapidJSON Insitu despaced", d.ParseInsitu(buffer).HasParseError(), @@ -171,6 +170,7 @@ int main(int argc, char *argv[]) { automated_reallocation), simdjson::SUCCESS, memcpy(buffer, mini_buffer, p.size()), repeat, volume, 
!just_data); + free(buffer); free(ast_buffer); free(mini_buffer); diff --git a/benchmark/parse.cpp b/benchmark/parse.cpp index b50a95d5..1f86e001 100644 --- a/benchmark/parse.cpp +++ b/benchmark/parse.cpp @@ -109,7 +109,12 @@ struct option_struct { case 'a': { const implementation *impl = simdjson::available_implementations[optarg]; if (!impl) { - exit_usage(string("Unsupported option value -a ") + optarg + ": expected -a haswell, westmere or arm64"); + std::string exit_message = string("Unsupported option value -a ") + optarg + ": expected -a with one of "; + for (auto imple : simdjson::available_implementations) { + exit_message += imple->name(); + exit_message += " "; + } + exit_usage(exit_message); } simdjson::active_implementation = impl; break; diff --git a/include/CMakeLists.txt b/include/CMakeLists.txt index 8d1d30f8..fc717dea 100644 --- a/include/CMakeLists.txt +++ b/include/CMakeLists.txt @@ -16,7 +16,6 @@ set(SIMDJSON_INCLUDE ${SIMDJSON_INCLUDE_DIR}/simdjson/inline/padded_string.h ${SIMDJSON_INCLUDE_DIR}/simdjson/internal/jsonformatutils.h ${SIMDJSON_INCLUDE_DIR}/simdjson/jsonioutil.h - ${SIMDJSON_INCLUDE_DIR}/simdjson/jsonminifier.h ${SIMDJSON_INCLUDE_DIR}/simdjson/jsonparser.h ${SIMDJSON_INCLUDE_DIR}/simdjson/jsonstream.h ${SIMDJSON_INCLUDE_DIR}/simdjson/padded_string.h diff --git a/include/simdjson.h b/include/simdjson.h index d240ed20..7b489f65 100644 --- a/include/simdjson.h +++ b/include/simdjson.h @@ -10,7 +10,6 @@ #include "simdjson/implementation.h" #include "simdjson/document.h" #include "simdjson/document_stream.h" -#include "simdjson/jsonminifier.h" // Deprecated API #include "simdjson/parsedjsoniterator.h" diff --git a/include/simdjson/error.h b/include/simdjson/error.h index 5c88649a..d9163f9c 100644 --- a/include/simdjson/error.h +++ b/include/simdjson/error.h @@ -77,6 +77,17 @@ private: */ template struct simdjson_result : public std::pair { + /** + * Move the value and the error to the provided variables. 
+ */ + void tie(T& t, error_code & e) { + // on the clang compiler that comes with current macOS (Apple clang version 11.0.0), + // tie(width, error) = size["w"].as_uint64_t(); + // fails with "error: no viable overloaded '='" + t = std::move(this->first); + e = std::move(this->second); + } + /** * The error. */ @@ -138,6 +149,7 @@ struct simdjson_move_result : std::pair { t = std::move(this->first); e = std::move(this->second); } + /** * The error. */ diff --git a/include/simdjson/implementation.h b/include/simdjson/implementation.h index 35bc872c..dcb8e4ec 100644 --- a/include/simdjson/implementation.h +++ b/include/simdjson/implementation.h @@ -56,6 +56,19 @@ public: */ WARN_UNUSED virtual error_code parse(const uint8_t *buf, size_t len, document::parser &parser) const noexcept = 0; + /** + * Minify the JSON document in buf into dst, removing all whitespace that is not inside a string. + * + * Overridden by each implementation. + * + * @param buf the json document to minify. *MUST* be allocated up to len + SIMDJSON_PADDING bytes. + * @param len the length of the json document. + * @param dst the buffer to write the minified document to. *MUST* be allocated up to len + SIMDJSON_PADDING bytes. + * @param dst_len the number of bytes written. Output only. + * @return the error code, or SUCCESS if there was no error. + */ + WARN_UNUSED virtual error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept = 0; + /** * Stage 1 of the document parser. 
* @@ -182,6 +195,9 @@ public: WARN_UNUSED error_code parse(const uint8_t *buf, size_t len, document::parser &parser) const noexcept final { return set_best()->parse(buf, len, parser); } + WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final { + return set_best()->minify(buf, len, dst, dst_len); + } WARN_UNUSED error_code stage1(const uint8_t *buf, size_t len, document::parser &parser, bool streaming) const noexcept final { return set_best()->stage1(buf, len, parser, streaming); } diff --git a/include/simdjson/jsonminifier.h b/include/simdjson/jsonminifier.h deleted file mode 100644 index 2a1b460c..00000000 --- a/include/simdjson/jsonminifier.h +++ /dev/null @@ -1,32 +0,0 @@ -#ifndef SIMDJSON_JSONMINIFIER_H -#define SIMDJSON_JSONMINIFIER_H - -#include "simdjson/padded_string.h" -#include -#include -#include - -namespace simdjson { - -// Take input from buf and remove useless whitespace, write it to out; buf and -// out can be the same pointer. Result is null terminated, -// return the string length (minus the null termination). -// The accelerated version of this function only runs on AVX2 hardware. 
-size_t json_minify(const uint8_t *buf, size_t len, uint8_t *out); - -static inline size_t json_minify(const char *buf, size_t len, char *out) { - return json_minify(reinterpret_cast(buf), len, - reinterpret_cast(out)); -} - -static inline size_t json_minify(const std::string_view &p, char *out) { - return json_minify(p.data(), p.size(), out); -} - -static inline size_t json_minify(const padded_string &p, char *out) { - return json_minify(p.data(), p.size(), out); -} - -} // namespace simdjson - -#endif // SIMDJSON_JSONMINIFIER_H diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 52bc745b..4c117d75 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -29,7 +29,6 @@ set(SIMDJSON_SRC set(SIMDJSON_SRC_HEADERS implementation.cpp isadetection.h - jsonminifier.cpp simdprune_tables.h stage1_find_marks.cpp stage2_build_tape.cpp diff --git a/src/arm64/bitmanipulation.h b/src/arm64/bitmanipulation.h index da00e7db..d0616065 100644 --- a/src/arm64/bitmanipulation.h +++ b/src/arm64/bitmanipulation.h @@ -48,7 +48,7 @@ really_inline int leading_zeroes(uint64_t input_num) { } /* result might be undefined when input_num is zero */ -really_inline int hamming(uint64_t input_num) { +really_inline int count_ones(uint64_t input_num) { return vaddv_u8(vcnt_u8((uint8x8_t)input_num)); } diff --git a/src/arm64/implementation.h b/src/arm64/implementation.h index 4766cb4e..a3be4b69 100644 --- a/src/arm64/implementation.h +++ b/src/arm64/implementation.h @@ -10,6 +10,7 @@ class implementation final : public simdjson::implementation { public: really_inline implementation() : simdjson::implementation("arm64", "ARM NEON", instruction_set::NEON) {} WARN_UNUSED error_code parse(const uint8_t *buf, size_t len, document::parser &parser) const noexcept final; + WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final; WARN_UNUSED error_code stage1(const uint8_t *buf, size_t len, document::parser &parser, bool streaming) const 
noexcept final; WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, document::parser &parser) const noexcept final; WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, document::parser &parser, size_t &next_json) const noexcept final; diff --git a/src/arm64/simd.h b/src/arm64/simd.h index bb21b273..78dd9592 100644 --- a/src/arm64/simd.h +++ b/src/arm64/simd.h @@ -2,6 +2,8 @@ #define SIMDJSON_ARM64_SIMD_H #include "simdjson.h" +#include "simdprune_tables.h" +#include "arm64/bitmanipulation.h" #include "arm64/intrinsics.h" namespace simdjson::arm64::simd { @@ -142,6 +144,43 @@ namespace simdjson::arm64::simd { really_inline simd8 lookup_16(simd8 lookup_table) const { return lookup_table.apply_lookup_16_to(*this); } + + + // Copies to 'output" all bytes corresponding to a 0 in the mask (interpreted as a bitset). + // Passing a 0 value for mask would be equivalent to writing out every byte to output. + // Only the first 16 - count_ones(mask) bytes of the result are significant but 16 bytes + // get written. + // Design consideration: it seems like a function with the + // signature simd8 compress(uint16_t mask) would be + // sensible, but the AVX ISA makes this kind of approach difficult. + template + really_inline void compress(uint16_t mask, L * output) const { + // this particular implementation was inspired by work done by @animetosho + // we do it in two steps, first 8 bytes and then second 8 bytes + uint8_t mask1 = static_cast(mask); // least significant 8 bits + uint8_t mask2 = static_cast(mask >> 8); // most significant 8 bits + // next line just loads the 64-bit values thintable_epi8[mask1] and + // thintable_epi8[mask2] into a 128-bit register, using only + // two instructions on most compilers. 
+ uint64x2_t shufmask64 = {thintable_epi8[mask1], thintable_epi8[mask2]}; + uint8x16_t shufmask = vreinterpretq_u8_u64(shufmask64); + // we increment by 0x08 the second half of the mask + uint8x16_t inc = {0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08}; + shufmask = vaddq_u8(shufmask, inc); + // this is the version "nearly pruned" + uint8x16_t pruned = vqtbl1q_u8(*this, shufmask); + // we still need to put the two halves together. + // we compute the popcount of the first half: + int pop1 = BitsSetTable256mul2[mask1]; + // then load the corresponding mask, what it does is to write + // only the first pop1 bytes from the first 8 bytes, and then + // it fills in with the bytes from the second 8 bytes + some filling + // at the end. + uint8x16_t compactmask = vld1q_u8((const uint8_t *)(pshufb_combine_table + pop1 * 8)); + uint8x16_t answer = vqtbl1q_u8(pruned, compactmask); + vst1q_u8((uint8_t*) output, answer); + } + template really_inline simd8 lookup_16( L replace0, L replace1, L replace2, L replace3, @@ -267,6 +306,13 @@ namespace simdjson::arm64::simd { this->chunks[3].store(ptr+sizeof(simd8)*3); } + really_inline void compress(uint64_t mask, T * output) const { + this->chunks[0].compress(mask, output); + this->chunks[1].compress(mask >> 16, output + 16 - count_ones(mask & 0xFFFF)); + this->chunks[2].compress(mask >> 32, output + 32 - count_ones(mask & 0xFFFFFFFF)); + this->chunks[3].compress(mask >> 48, output + 48 - count_ones(mask & 0xFFFFFFFFFFFF)); + } + template static really_inline void each_index(F const& each) { each(0); @@ -339,7 +385,6 @@ namespace simdjson::arm64::simd { const simd8 mask = simd8::splat(m); return this->map( [&](auto a) { return a <= mask; } ).to_bitmask(); } - }; // struct simd8x64 } // namespace simdjson::arm64::simd diff --git a/src/arm64/stage1_find_marks.h b/src/arm64/stage1_find_marks.h index be94a62f..b21400b5 100644 --- a/src/arm64/stage1_find_marks.h +++ b/src/arm64/stage1_find_marks.h @@ -31,6 +31,23 
@@ really_inline json_character_block json_character_block::classify(const simd::si return shuf_lo & shuf_hi; }); + + // We compute whitespace and op separately. If the code later only uses one or the + // other, given the fact that all functions are aggressively inlined, we can + // hope that useless computations will be omitted. This is notably the case when + // minifying (we only need whitespace). *However* if we only need spaces, + // it is likely that we will still compute 'v' above with two lookup_16: one + // could do it a bit cheaper. This is in contrast with the x64 implementations + // where we can, efficiently, do the white space and structural matching + // separately. One reason for this difference is that on ARM NEON, the table + // lookups either zero or leave unchanged the characters exceeding 0xF whereas + // on x64, the equivalent instruction (pshufb) automatically applies a mask, + // ignoring the 4 most significant bits. Thus the x64 implementation is + // optimized differently. This being said, if you use this code strictly + // just for minification (or just to identify the structural characters), + // there is a small untaken optimization opportunity here. We deliberately + // do not pick it up. 
+ uint64_t op = v.map([&](simd8 _v) { return _v.any_bits_set(0x7); }).to_bitmask(); uint64_t whitespace = v.map([&](simd8 _v) { return _v.any_bits_set(0x18); }).to_bitmask(); return { whitespace, op }; @@ -53,11 +70,17 @@ really_inline simd8 must_be_continuation(simd8 prev1, simd8(buf, len, dst, dst_len); +} + +#include "generic/utf8_lookup2_algorithm.h" +#include "generic/json_structural_indexer.h" WARN_UNUSED error_code implementation::stage1(const uint8_t *buf, size_t len, document::parser &parser, bool streaming) const noexcept { return arm64::stage1::json_structural_indexer::index<64>(buf, len, parser, streaming); } diff --git a/src/fallback/implementation.h b/src/fallback/implementation.h index 32e7c9bf..372fde7c 100644 --- a/src/fallback/implementation.h +++ b/src/fallback/implementation.h @@ -14,6 +14,7 @@ public: 0 ) {} WARN_UNUSED error_code parse(const uint8_t *buf, size_t len, document::parser &parser) const noexcept final; + WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final; WARN_UNUSED error_code stage1(const uint8_t *buf, size_t len, document::parser &parser, bool streaming) const noexcept final; WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, document::parser &parser) const noexcept final; WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, document::parser &parser, size_t &next_json) const noexcept final; diff --git a/src/fallback/stage1_find_marks.h b/src/fallback/stage1_find_marks.h index 51e59895..dfc54a9e 100644 --- a/src/fallback/stage1_find_marks.h +++ b/src/fallback/stage1_find_marks.h @@ -151,6 +151,62 @@ WARN_UNUSED error_code implementation::stage1(const uint8_t *buf, size_t len, do return scanner.scan(); } +// big table for the minifier +static uint8_t jump_table[256 * 3] = { + 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, + 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, + 1, 0, 1, 1, 0, 1, 1, 
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, + 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, + 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, + 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, + 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, + 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, + 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, + 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, + 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, + 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, + 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, + 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, + 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, + 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, + 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, + 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, + 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, + 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, + 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, + 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, + 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, + 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, + 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, + 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, + 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, + 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, + 1, 1, 0, 1, 1, 0, 
1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, + 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, + 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, +}; + +WARN_UNUSED error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept { + size_t i = 0, pos = 0; + uint8_t quote = 0; + uint8_t nonescape = 1; + + while (i < len) { + unsigned char c = buf[i]; + uint8_t *meta = jump_table + 3 * c; + + quote = quote ^ (meta[0] & nonescape); + dst[pos] = c; + pos += meta[2] | quote; + + i += 1; + nonescape = (~nonescape) | (meta[1]); + } + dst_len = pos; // we intentionally do not work with a reference + // for fear of aliasing + return SUCCESS; +} + } // namespace simdjson::fallback #endif // SIMDJSON_FALLBACK_STAGE1_FIND_MARKS_H diff --git a/src/generic/buf_block_reader.h b/src/generic/buf_block_reader.h new file mode 100644 index 00000000..f6c7b4f3 --- /dev/null +++ b/src/generic/buf_block_reader.h @@ -0,0 +1,48 @@ +// Walks through a buffer in block-sized increments, loading the last part with spaces +template +struct buf_block_reader { +public: + really_inline buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 
0 : len - STEP_SIZE}, idx{0} {} + really_inline size_t block_index() { return idx; } + really_inline bool has_full_block() const { + return idx < lenminusstep; + } + really_inline const uint8_t *full_block() const { + return &buf[idx]; + } + really_inline bool has_remainder() const { + return idx < len; + } + really_inline void get_remainder(uint8_t *tmp_buf) const { + memset(tmp_buf, 0x20, STEP_SIZE); + memcpy(tmp_buf, buf + idx, len - idx); + } + really_inline void advance() { + idx += STEP_SIZE; + } +private: + const uint8_t *buf; + const size_t len; + const size_t lenminusstep; + size_t idx; +}; + +// Routines to print masks and text for debugging bitmask operations +UNUSED static char * format_input_text(const simd8x64 in) { + static char *buf = (char*)malloc(sizeof(simd8x64) + 1); + in.store((uint8_t*)buf); + for (size_t i=0; i); i++) { + if (buf[i] < ' ') { buf[i] = '_'; } + } + buf[sizeof(simd8x64)] = '\0'; + return buf; +} + +UNUSED static char * format_mask(uint64_t mask) { + static char *buf = (char*)malloc(64 + 1); + for (size_t i=0; i<64; i++) { + buf[i] = (mask & (size_t(1) << i)) ? 
'X' : ' '; + } + buf[64] = '\0'; + return buf; +} diff --git a/src/generic/json_minifier.h b/src/generic/json_minifier.h new file mode 100644 index 00000000..bb30ed42 --- /dev/null +++ b/src/generic/json_minifier.h @@ -0,0 +1,73 @@ +// This file contains the common code every implementation uses in stage1 +// It is intended to be included multiple times and compiled multiple times +// We assume the file in which it is included already includes +// "simdjson/stage1_find_marks.h" (this simplifies amalgamation) + +namespace stage1 { + +class json_minifier { +public: + template + static error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) noexcept; + +private: + really_inline json_minifier(uint8_t *_dst) : dst{_dst} {} + template + really_inline void step(const uint8_t *block_buf, buf_block_reader &reader) noexcept; + really_inline void next(simd::simd8x64 in, json_block block); + really_inline error_code finish(uint8_t *dst_start, size_t &dst_len); + json_scanner scanner; + uint8_t *dst; +}; + +really_inline void json_minifier::next(simd::simd8x64 in, json_block block) { + uint64_t mask = block.whitespace(); + in.compress(mask, dst); + dst += 64 - count_ones(mask); +} + +really_inline error_code json_minifier::finish(uint8_t *dst_start, size_t &dst_len) { + *dst = '\0'; + error_code error = scanner.finish(false); + if (error) { dst_len = 0; return error; } + dst_len = dst - dst_start; + return SUCCESS; +} + +template<> +really_inline void json_minifier::step<128>(const uint8_t *block_buf, buf_block_reader<128> &reader) noexcept { + simd::simd8x64 in_1(block_buf); + simd::simd8x64 in_2(block_buf+64); + json_block block_1 = scanner.next(in_1); + json_block block_2 = scanner.next(in_2); + this->next(in_1, block_1); + this->next(in_2, block_2); + reader.advance(); +} + +template<> +really_inline void json_minifier::step<64>(const uint8_t *block_buf, buf_block_reader<64> &reader) noexcept { + simd::simd8x64 in_1(block_buf); + json_block block_1 = 
scanner.next(in_1); + this->next(block_buf, block_1); + reader.advance(); +} + +template +error_code json_minifier::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) noexcept { + buf_block_reader reader(buf, len); + json_minifier minifier(dst); + while (reader.has_full_block()) { + minifier.step(reader.full_block(), reader); + } + + if (likely(reader.has_remainder())) { + uint8_t block[STEP_SIZE]; + reader.get_remainder(block); + minifier.step(block, reader); + } + + return minifier.finish(dst, dst_len); +} + +} // namespace stage1 \ No newline at end of file diff --git a/src/generic/json_scanner.h b/src/generic/json_scanner.h index 7f1f27ac..c410101a 100644 --- a/src/generic/json_scanner.h +++ b/src/generic/json_scanner.h @@ -5,23 +5,33 @@ namespace stage1 { */ struct json_block { public: - // the start of structurals that are not inside strings + /** The start of structurals */ really_inline uint64_t structural_start() { return potential_structural_start() & ~_string.string_tail(); } + /** All JSON whitespace (i.e. 
not in a string) */ + really_inline uint64_t whitespace() { return non_quote_outside_string(_characters.whitespace()); } - // operators plus scalar starts like 123, true and "abc" - really_inline uint64_t potential_structural_start() { return _characters.op() | potential_scalar_start(); } - // the start of non-operator runs, like 123, true and "abc" - really_inline uint64_t potential_scalar_start() { return _characters.scalar() & ~follows_potential_scalar(); } - // whether the given character is immediately after a non-operator like 123, true or " - really_inline uint64_t follows_potential_scalar() { return _follows_potential_scalar; } - // Return a mask of whether the given characters are inside a string (only works on non-quotes) + // Helpers + + /** Whether the given characters are inside a string (only works on non-quotes) */ really_inline uint64_t non_quote_inside_string(uint64_t mask) { return _string.non_quote_inside_string(mask); } + /** Whether the given characters are outside a string (only works on non-quotes) */ + really_inline uint64_t non_quote_outside_string(uint64_t mask) { return _string.non_quote_outside_string(mask); } + // string and escape characters json_string_block _string; // whitespace, operators, scalars json_character_block _characters; // whether the previous character was a scalar uint64_t _follows_potential_scalar; +private: + // Potential structurals (i.e. 
disregarding strings) + + /** operators plus scalar starts like 123, true and "abc" */ + really_inline uint64_t potential_structural_start() { return _characters.op() | potential_scalar_start(); } + /** the start of non-operator runs, like 123, true and "abc" */ + really_inline uint64_t potential_scalar_start() { return _characters.scalar() & ~follows_potential_scalar(); } + /** whether the given character is immediately after a non-operator like 123, true or " */ + really_inline uint64_t follows_potential_scalar() { return _follows_potential_scalar; } }; /** diff --git a/src/generic/json_string_scanner.h b/src/generic/json_string_scanner.h index 0968131a..f42a6e55 100644 --- a/src/generic/json_string_scanner.h +++ b/src/generic/json_string_scanner.h @@ -14,7 +14,9 @@ struct json_string_block { // Only characters inside the string (not including the quotes) really_inline uint64_t string_content() const { return _in_string & ~_quote; } // Return a mask of whether the given characters are inside a string (only works on non-quotes) - really_inline uint64_t non_quote_inside_string(uint64_t mask) const { return _in_string & mask; } + really_inline uint64_t non_quote_inside_string(uint64_t mask) const { return mask & _in_string; } + // Return a mask of whether the given characters are outside a string (only works on non-quotes) + really_inline uint64_t non_quote_outside_string(uint64_t mask) const { return mask & ~_in_string; } // Tail of string (everything except the start quote) really_inline uint64_t string_tail() const { return _in_string ^ _quote; } diff --git a/src/generic/json_structural_indexer.h b/src/generic/json_structural_indexer.h index 04f90496..d15cb09a 100644 --- a/src/generic/json_structural_indexer.h +++ b/src/generic/json_structural_indexer.h @@ -22,7 +22,7 @@ public: // it helps tremendously. 
if (bits == 0) return; - uint32_t cnt = hamming(bits); + uint32_t cnt = count_ones(bits); // Do the first 8 all together for (int i=0; i<8; i++) { @@ -55,55 +55,6 @@ public: } }; -// Routines to print masks and text for debugging bitmask operations -UNUSED static char * format_input_text(const simd8x64 in) { - static char *buf = (char*)malloc(sizeof(simd8x64) + 1); - in.store((uint8_t*)buf); - for (size_t i=0; i); i++) { - if (buf[i] < ' ') { buf[i] = '_'; } - } - buf[sizeof(simd8x64)] = '\0'; - return buf; -} - -UNUSED static char * format_mask(uint64_t mask) { - static char *buf = (char*)malloc(64 + 1); - for (size_t i=0; i<64; i++) { - buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' '; - } - buf[64] = '\0'; - return buf; -} - -// Walks through a buffer in block-sized increments, loading the last part with spaces -template -struct buf_block_reader { -public: - really_inline buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {} - really_inline size_t block_index() { return idx; } - really_inline bool has_full_block() const { - return idx < lenminusstep; - } - really_inline const uint8_t *full_block() const { - return &buf[idx]; - } - really_inline bool has_remainder() const { - return idx < len; - } - really_inline void get_remainder(uint8_t *tmp_buf) const { - memset(tmp_buf, 0x20, STEP_SIZE); - memcpy(tmp_buf, buf + idx, len - idx); - } - really_inline void advance() { - idx += STEP_SIZE; - } -private: - const uint8_t *buf; - const size_t len; - const size_t lenminusstep; - size_t idx; -}; - class json_structural_indexer { public: template @@ -112,7 +63,7 @@ public: private: really_inline json_structural_indexer(uint32_t *structural_indexes) : indexer{structural_indexes} {} template - really_inline void index_step(const uint8_t *block, buf_block_reader &reader) noexcept; + really_inline void step(const uint8_t *block, buf_block_reader &reader) noexcept; really_inline void 
next(simd::simd8x64 in, json_block block, size_t idx); really_inline error_code finish(document::parser &parser, size_t idx, size_t len, bool streaming); @@ -162,7 +113,7 @@ really_inline error_code json_structural_indexer::finish(document::parser &parse } template<> -really_inline void json_structural_indexer::index_step<128>(const uint8_t *block, buf_block_reader<128> &reader) noexcept { +really_inline void json_structural_indexer::step<128>(const uint8_t *block, buf_block_reader<128> &reader) noexcept { simd::simd8x64 in_1(block); simd::simd8x64 in_2(block+64); json_block block_1 = scanner.next(in_1); @@ -173,7 +124,7 @@ really_inline void json_structural_indexer::index_step<128>(const uint8_t *block } template<> -really_inline void json_structural_indexer::index_step<64>(const uint8_t *block, buf_block_reader<64> &reader) noexcept { +really_inline void json_structural_indexer::step<64>(const uint8_t *block, buf_block_reader<64> &reader) noexcept { simd::simd8x64 in_1(block); json_block block_1 = scanner.next(in_1); this->next(in_1, block_1, reader.block_index()); @@ -209,13 +160,13 @@ error_code json_structural_indexer::index(const uint8_t *buf, size_t len, docume buf_block_reader reader(buf, len); json_structural_indexer indexer(parser.structural_indexes.get()); while (reader.has_full_block()) { - indexer.index_step(reader.full_block(), reader); + indexer.step(reader.full_block(), reader); } if (likely(reader.has_remainder())) { uint8_t block[STEP_SIZE]; reader.get_remainder(block); - indexer.index_step(block, reader); + indexer.step(block, reader); } return indexer.finish(parser, reader.block_index(), len, streaming); diff --git a/src/haswell/bitmanipulation.h b/src/haswell/bitmanipulation.h index 6ef91a97..c46f0733 100644 --- a/src/haswell/bitmanipulation.h +++ b/src/haswell/bitmanipulation.h @@ -37,7 +37,7 @@ really_inline int leading_zeroes(uint64_t input_num) { return static_cast(_lzcnt_u64(input_num)); } -really_inline int hamming(uint64_t input_num) { 
+really_inline int count_ones(uint64_t input_num) { #ifdef _MSC_VER // note: we do not support legacy 32-bit Windows return __popcnt64(input_num);// Visual Studio wants two underscores diff --git a/src/haswell/implementation.h b/src/haswell/implementation.h index d9f50514..fb1ff71c 100644 --- a/src/haswell/implementation.h +++ b/src/haswell/implementation.h @@ -14,6 +14,7 @@ public: instruction_set::AVX2 | instruction_set::PCLMULQDQ | instruction_set::BMI1 | instruction_set::BMI2 ) {} WARN_UNUSED error_code parse(const uint8_t *buf, size_t len, document::parser &parser) const noexcept final; + WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final; WARN_UNUSED error_code stage1(const uint8_t *buf, size_t len, document::parser &parser, bool streaming) const noexcept final; WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, document::parser &parser) const noexcept final; WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, document::parser &parser, size_t &next_json) const noexcept final; diff --git a/src/haswell/simd.h b/src/haswell/simd.h index 4850dabb..472423ee 100644 --- a/src/haswell/simd.h +++ b/src/haswell/simd.h @@ -2,6 +2,8 @@ #define SIMDJSON_HASWELL_SIMD_H #include "simdjson.h" +#include "simdprune_tables.h" +#include "haswell/bitmanipulation.h" #include "haswell/intrinsics.h" TARGET_HASWELL @@ -109,6 +111,57 @@ namespace simdjson::haswell::simd { really_inline simd8 lookup_16(simd8 lookup_table) const { return _mm256_shuffle_epi8(lookup_table, *this); } + + // Copies to 'output' all bytes corresponding to a 0 in the mask (interpreted as a bitset). + // Passing a 0 value for mask would be equivalent to writing out every byte to output. + // Only the first 32 - count_ones(mask) bytes of the result are significant but 32 bytes + // get written. 
+ // Design consideration: it seems like a function with the + // signature simd8 compress(uint32_t mask) would be + // sensible, but the AVX ISA makes this kind of approach difficult. + template + really_inline void compress(uint32_t mask, L * output) const { + // this particular implementation was inspired by work done by @animetosho + // we do it in four steps, first 8 bytes and then second 8 bytes... + uint8_t mask1 = static_cast(mask); // least significant 8 bits + uint8_t mask2 = static_cast(mask >> 8); // second least significant 8 bits + uint8_t mask3 = static_cast(mask >> 16); // ... + uint8_t mask4 = static_cast(mask >> 24); // ... + // next line just loads the 64-bit values thintable_epi8[mask1] through + // thintable_epi8[mask4] into a 256-bit register, using only + // a few instructions on most compilers. + __m256i shufmask = _mm256_set_epi64x(thintable_epi8[mask4], thintable_epi8[mask3], + thintable_epi8[mask2], thintable_epi8[mask1]); + // we increment by 0x08 the second half of the mask and so forth + shufmask = + _mm256_add_epi8(shufmask, _mm256_set_epi32(0x18181818, 0x18181818, + 0x10101010, 0x10101010, 0x08080808, 0x08080808, 0, 0)); + // this is the version "nearly pruned" + __m256i pruned = _mm256_shuffle_epi8(*this, shufmask); + // we still need to put the pieces back together. + // we compute the popcount of the first words: + int pop1 = BitsSetTable256mul2[mask1]; + int pop3 = BitsSetTable256mul2[mask3]; + + // then load the corresponding mask + // could be done with _mm256_loadu2_m128i but many standard libraries omit this intrinsic. + __m256i v256 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(pshufb_combine_table + pop1 * 8))); + __m256i compactmask = _mm256_insertf128_si256(v256, + _mm_loadu_si128((const __m128i *)(pshufb_combine_table + pop3 * 8)), 1); + __m256i almostthere = _mm256_shuffle_epi8(pruned, compactmask); + // We just need to write out the result. 
+ // This is the tricky bit that is hard to do + // if we want to return a SIMD register, since there + // is no single-instruction approach to recombine + // the two 128-bit lanes with an offset. + __m128i v128; + v128 = _mm256_castsi256_si128(almostthere); + _mm_storeu_si128( (__m128i *)output, v128); + v128 = _mm256_extractf128_si256(almostthere, 1); + _mm_storeu_si128( (__m128i *)(output + 16 - count_ones(mask & 0xFFFF)), v128); + } + template really_inline simd8 lookup_16( L replace0, L replace1, L replace2, L replace3, @@ -249,6 +302,13 @@ namespace simdjson::haswell::simd { each(1); } + really_inline void compress(uint64_t mask, T * output) const { + uint32_t mask1 = static_cast(mask); + uint32_t mask2 = static_cast(mask >> 32); + this->chunks[0].compress(mask1, output); + this->chunks[1].compress(mask2, output + 32 - count_ones(mask1)); + } + really_inline void store(T ptr[64]) const { this->chunks[0].store(ptr+sizeof(simd8)*0); this->chunks[1].store(ptr+sizeof(simd8)*1); @@ -269,6 +329,8 @@ namespace simdjson::haswell::simd { ); } + + template really_inline simd8x64 map(const simd8x64 b, F const& map_chunk) const { return simd8x64( @@ -302,7 +364,6 @@ namespace simdjson::haswell::simd { const simd8 mask = simd8::splat(m); return this->map( [&](auto a) { return a <= mask; } ).to_bitmask(); } - }; // struct simd8x64 } // namespace simdjson::haswell::simd diff --git a/src/haswell/stage1_find_marks.h b/src/haswell/stage1_find_marks.h index 1f995555..40fcf6b0 100644 --- a/src/haswell/stage1_find_marks.h +++ b/src/haswell/stage1_find_marks.h @@ -30,6 +30,11 @@ really_inline json_character_block json_character_block::classify(const simd::si auto whitespace_table = simd8::repeat_16(' ', 100, 100, 100, 17, 100, 113, 2, 100, '\t', '\n', 112, 100, '\r', 100, 100); auto op_table = simd8::repeat_16(',', '}', 0, 0, 0xc0u, 0, 0, 0, 0, 0, 0, 0, 0, 0, ':', '{'); + // We compute whitespace and op separately. 
If the code later only uses one or the + // other, given the fact that all functions are aggressively inlined, we can + // hope that useless computations will be omitted. This is namely the case when + // minifying (we only need whitespace). + uint64_t whitespace = in.map([&](simd8 _in) { return _in == simd8(_mm256_shuffle_epi8(whitespace_table, _in)); }).to_bitmask(); @@ -54,11 +59,17 @@ really_inline simd8 must_be_continuation(simd8 prev1, simd8(is_second_byte | is_third_byte | is_fourth_byte) > int8_t(0); } -#include "generic/utf8_lookup2_algorithm.h" +#include "generic/buf_block_reader.h" #include "generic/json_string_scanner.h" #include "generic/json_scanner.h" -#include "generic/json_structural_indexer.h" +#include "generic/json_minifier.h" +WARN_UNUSED error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept { + return haswell::stage1::json_minifier::minify<128>(buf, len, dst, dst_len); +} + +#include "generic/utf8_lookup2_algorithm.h" +#include "generic/json_structural_indexer.h" WARN_UNUSED error_code implementation::stage1(const uint8_t *buf, size_t len, document::parser &parser, bool streaming) const noexcept { return haswell::stage1::json_structural_indexer::index<128>(buf, len, parser, streaming); } diff --git a/src/implementation.cpp b/src/implementation.cpp index 5d40df29..9831ee46 100644 --- a/src/implementation.cpp +++ b/src/implementation.cpp @@ -1,5 +1,7 @@ #include "simdjson.h" #include "isadetection.h" +#include "simdprune_tables.h" + #include // Static array of known implementations. 
We're hoping these get baked into the executable @@ -48,6 +50,9 @@ public: WARN_UNUSED error_code parse(const uint8_t *, size_t, document::parser &) const noexcept final { return UNSUPPORTED_ARCHITECTURE; } + WARN_UNUSED error_code minify(const uint8_t *, size_t, uint8_t *, size_t &) const noexcept final { + return UNSUPPORTED_ARCHITECTURE; + } WARN_UNUSED error_code stage1(const uint8_t *, size_t, document::parser &, bool) const noexcept final { return UNSUPPORTED_ARCHITECTURE; } diff --git a/src/jsonminifier.cpp b/src/jsonminifier.cpp deleted file mode 100644 index 02c921bf..00000000 --- a/src/jsonminifier.cpp +++ /dev/null @@ -1,478 +0,0 @@ -#include "simdjson.h" -#include - -#ifndef SIMDJSON_ISSUE384RESOLVED // to avoid tripping users - -namespace simdjson { -static uint8_t jump_table[256 * 3] = { - 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, - 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, - 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, - 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, - 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, - 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, - 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, - 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, - 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, - 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, - 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, - 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, - 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, - 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, - 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, - 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 
1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, - 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, - 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, - 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, - 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, - 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, - 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, - 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, - 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, - 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, - 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, - 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, - 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, - 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, - 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, - 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, -}; - -size_t json_minify(const unsigned char *bytes, size_t how_many, - unsigned char *out) { - size_t i = 0, pos = 0; - uint8_t quote = 0; - uint8_t nonescape = 1; - - while (i < how_many) { - unsigned char c = bytes[i]; - uint8_t *meta = jump_table + 3 * c; - - quote = quote ^ (meta[0] & nonescape); - out[pos] = c; - pos += meta[2] | quote; - - i += 1; - nonescape = (~nonescape) | (meta[1]); - } - return pos; -} -} // namespace simdjson -#else - -// -// This fast code is disabled. -// See issue https://github.com/lemire/simdjson/issues/384 -// -#include "simdprune_tables.h" -#include -#include // currently, there is no runtime dispatch for the minifier - -namespace simdjson { - -// a straightforward comparison of a mask against input. 
-static uint64_t cmp_mask_against_input_mini(__m256i input_lo, __m256i input_hi, - __m256i mask) { - __m256i cmp_res_0 = _mm256_cmpeq_epi8(input_lo, mask); - uint64_t res_0 = static_cast(_mm256_movemask_epi8(cmp_res_0)); - __m256i cmp_res_1 = _mm256_cmpeq_epi8(input_hi, mask); - uint64_t res_1 = _mm256_movemask_epi8(cmp_res_1); - return res_0 | (res_1 << 32); -} - -// Write up to 16 bytes, only the bytes corresponding to a 1-bit are written -// out. credit: Anime Tosho -static __m128i skinnycleanm128(__m128i x, int mask) { - int mask1 = mask & 0xFF; - int mask2 = (mask >> 8) & 0xFF; - __m128i shufmask = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(_mm_loadl_epi64( - (const __m128i *)(thintable_epi8 + mask1))), - (const __m64 *)(thintable_epi8 + mask2))); - shufmask = - _mm_add_epi8(shufmask, _mm_set_epi32(0x08080808, 0x08080808, 0, 0)); - __m128i pruned = _mm_shuffle_epi8(x, shufmask); - intptr_t popx2 = BitsSetTable256mul2[mask1]; - __m128i compactmask = - _mm_loadu_si128((const __m128i *)(pshufb_combine_table + popx2 * 8)); - return _mm_shuffle_epi8(pruned, compactmask); -} - -// take input from buf and remove useless whitespace, input and output can be -// the same, result is null terminated, return the string length (minus the null -// termination) -size_t json_minify(const uint8_t *buf, size_t len, uint8_t *out) { - // Useful constant masks - const uint64_t even_bits = 0x5555555555555555ULL; - const uint64_t odd_bits = ~even_bits; - uint8_t *initout(out); - uint64_t prev_iter_ends_odd_backslash = - 0ULL; // either 0 or 1, but a 64-bit value - uint64_t prev_iter_inside_quote = 0ULL; // either all zeros or all ones - size_t idx = 0; - if (len >= 64) { - size_t avx_len = len - 63; - - for (; idx < avx_len; idx += 64) { - __m256i input_lo = - _mm256_loadu_si256(reinterpret_cast(buf + idx + 0)); - __m256i input_hi = - _mm256_loadu_si256(reinterpret_cast(buf + idx + 32)); - uint64_t bs_bits = cmp_mask_against_input_mini(input_lo, input_hi, - 
_mm256_set1_epi8('\\')); - uint64_t start_edges = bs_bits & ~(bs_bits << 1); - uint64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash; - uint64_t even_starts = start_edges & even_start_mask; - uint64_t odd_starts = start_edges & ~even_start_mask; - uint64_t even_carries = bs_bits + even_starts; - uint64_t odd_carries; - bool iter_ends_odd_backslash = - add_overflow(bs_bits, odd_starts, &odd_carries); - odd_carries |= prev_iter_ends_odd_backslash; - prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL; - uint64_t even_carry_ends = even_carries & ~bs_bits; - uint64_t odd_carry_ends = odd_carries & ~bs_bits; - uint64_t even_start_odd_end = even_carry_ends & odd_bits; - uint64_t odd_start_even_end = odd_carry_ends & even_bits; - uint64_t odd_ends = even_start_odd_end | odd_start_even_end; - uint64_t quote_bits = cmp_mask_against_input_mini(input_lo, input_hi, - _mm256_set1_epi8('"')); - quote_bits = quote_bits & ~odd_ends; - uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128( - _mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFF), 0)); - quote_mask ^= prev_iter_inside_quote; - prev_iter_inside_quote = static_cast( - static_cast(quote_mask) >> - 63); // might be undefined behavior, should be fully defined in C++20, - // ok according to John Regher from Utah University - const __m256i low_nibble_mask = _mm256_setr_epi8( - // 0 9 a b c d - 16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0, 16, 0, 0, 0, 0, 0, - 0, 0, 0, 8, 12, 1, 2, 9, 0, 0); - const __m256i high_nibble_mask = _mm256_setr_epi8( - // 0 2 3 5 7 - 8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0, 8, 0, 18, 4, 0, 1, 0, - 1, 0, 0, 0, 3, 2, 1, 0, 0); - __m256i whitespace_shufti_mask = _mm256_set1_epi8(0x18); - __m256i v_lo = _mm256_and_si256( - _mm256_shuffle_epi8(low_nibble_mask, input_lo), - _mm256_shuffle_epi8(high_nibble_mask, - _mm256_and_si256(_mm256_srli_epi32(input_lo, 4), - _mm256_set1_epi8(0x7f)))); - - __m256i v_hi = _mm256_and_si256( - 
_mm256_shuffle_epi8(low_nibble_mask, input_hi), - _mm256_shuffle_epi8(high_nibble_mask, - _mm256_and_si256(_mm256_srli_epi32(input_hi, 4), - _mm256_set1_epi8(0x7f)))); - __m256i tmp_ws_lo = _mm256_cmpeq_epi8( - _mm256_and_si256(v_lo, whitespace_shufti_mask), _mm256_set1_epi8(0)); - __m256i tmp_ws_hi = _mm256_cmpeq_epi8( - _mm256_and_si256(v_hi, whitespace_shufti_mask), _mm256_set1_epi8(0)); - - uint64_t ws_res_0 = - static_cast(_mm256_movemask_epi8(tmp_ws_lo)); - uint64_t ws_res_1 = _mm256_movemask_epi8(tmp_ws_hi); - uint64_t whitespace = ~(ws_res_0 | (ws_res_1 << 32)); - whitespace &= ~quote_mask; - - uint64_t non_whitespace = ~whitespace; - - __m128i x1 = _mm256_extracti128_si256(input_lo, 0); - __m128i x2 = _mm256_extracti128_si256(input_lo, 1); - __m128i x3 = _mm256_extracti128_si256(input_hi, 0); - __m128i x4 = _mm256_extracti128_si256(input_hi, 1); - - int mask1 = non_whitespace & 0xFFFF; - int mask2 = (non_whitespace >> 16) & 0xFFFF; - int mask3 = (non_whitespace >> 32) & 0xFFFF; - int mask4 = (non_whitespace >> 48) & 0xFFFF; - - x1 = skinnycleanm128(x1, mask1); - x2 = skinnycleanm128(x2, mask2); - x3 = skinnycleanm128(x3, mask3); - x4 = skinnycleanm128(x4, mask4); - int pop1 = hamming(non_whitespace & 0xFFFF); - int pop2 = hamming(non_whitespace & UINT64_C(0xFFFFFFFF)); - int pop3 = hamming(non_whitespace & UINT64_C(0xFFFFFFFFFFFF)); - int pop4 = hamming(non_whitespace); - _mm_storeu_si128(reinterpret_cast<__m128i *>(out), x1); - _mm_storeu_si128(reinterpret_cast<__m128i *>(out + pop1), x2); - _mm_storeu_si128(reinterpret_cast<__m128i *>(out + pop2), x3); - _mm_storeu_si128(reinterpret_cast<__m128i *>(out + pop3), x4); - out += pop4; - } - } - // we finish off the job... copying and pasting the code is not ideal here, - // but it gets the job done. 
- if (idx < len) { - uint8_t buffer[64]; - memset(buffer, 0, 64); - memcpy(buffer, buf + idx, len - idx); - __m256i input_lo = - _mm256_loadu_si256(reinterpret_cast(buffer)); - __m256i input_hi = - _mm256_loadu_si256(reinterpret_cast(buffer + 32)); - uint64_t bs_bits = - cmp_mask_against_input_mini(input_lo, input_hi, _mm256_set1_epi8('\\')); - uint64_t start_edges = bs_bits & ~(bs_bits << 1); - uint64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash; - uint64_t even_starts = start_edges & even_start_mask; - uint64_t odd_starts = start_edges & ~even_start_mask; - uint64_t even_carries = bs_bits + even_starts; - uint64_t odd_carries; - // bool iter_ends_odd_backslash = - add_overflow(bs_bits, odd_starts, &odd_carries); - odd_carries |= prev_iter_ends_odd_backslash; - // prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL; - // // we never use it - uint64_t even_carry_ends = even_carries & ~bs_bits; - uint64_t odd_carry_ends = odd_carries & ~bs_bits; - uint64_t even_start_odd_end = even_carry_ends & odd_bits; - uint64_t odd_start_even_end = odd_carry_ends & even_bits; - uint64_t odd_ends = even_start_odd_end | odd_start_even_end; - uint64_t quote_bits = - cmp_mask_against_input_mini(input_lo, input_hi, _mm256_set1_epi8('"')); - quote_bits = quote_bits & ~odd_ends; - uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128( - _mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFF), 0)); - quote_mask ^= prev_iter_inside_quote; - // prev_iter_inside_quote = (uint64_t)((int64_t)quote_mask >> 63);// we - // don't need this anymore - - __m256i mask_20 = _mm256_set1_epi8(0x20); // c==32 - __m256i mask_70 = - _mm256_set1_epi8(0x70); // adding 0x70 does not check low 4-bits - // but moves any value >= 16 above 128 - - __m256i lut_cntrl = _mm256_setr_epi8( - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0x00, - 0x00, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0xFF, 0xFF, 0x00, 0x00, 0xFF, 
0x00, 0x00); - - __m256i tmp_ws_lo = _mm256_or_si256( - _mm256_cmpeq_epi8(mask_20, input_lo), - _mm256_shuffle_epi8(lut_cntrl, _mm256_adds_epu8(mask_70, input_lo))); - __m256i tmp_ws_hi = _mm256_or_si256( - _mm256_cmpeq_epi8(mask_20, input_hi), - _mm256_shuffle_epi8(lut_cntrl, _mm256_adds_epu8(mask_70, input_hi))); - uint64_t ws_res_0 = static_cast(_mm256_movemask_epi8(tmp_ws_lo)); - uint64_t ws_res_1 = _mm256_movemask_epi8(tmp_ws_hi); - uint64_t whitespace = (ws_res_0 | (ws_res_1 << 32)); - whitespace &= ~quote_mask; - - if (len - idx < 64) { - whitespace |= UINT64_C(0xFFFFFFFFFFFFFFFF) << (len - idx); - } - uint64_t non_whitespace = ~whitespace; - - int mask1 = non_whitespace & 0xFFFF; - int mask2 = (non_whitespace >> 16) & 0xFFFF; - int mask3 = (non_whitespace >> 32) & 0xFFFF; - int mask4 = (non_whitespace >> 48) & 0xFFFF; - - x1 = skinnycleanm128(x1, mask1); - x2 = skinnycleanm128(x2, mask2); - x3 = skinnycleanm128(x3, mask3); - x4 = skinnycleanm128(x4, mask4); - int pop1 = hamming(non_whitespace & 0xFFFF); - int pop2 = hamming(non_whitespace & UINT64_C(0xFFFFFFFF)); - int pop3 = hamming(non_whitespace & UINT64_C(0xFFFFFFFFFFFF)); - int pop4 = hamming(non_whitespace); - _mm_storeu_si128(reinterpret_cast<__m128i *>(out), x1); - _mm_storeu_si128(reinterpret_cast<__m128i *>(out + pop1), x2); - _mm_storeu_si128(reinterpret_cast<__m128i *>(out + pop2), x3); - _mm_storeu_si128(reinterpret_cast<__m128i *>(out + pop3), x4); - out += pop4; - } - *out = '\0'; // NULL termination - return out - initout; -} - -size_t oldjson_minify(const uint8_t *buf, size_t len, uint8_t *out) { - // Useful constant masks - const uint64_t even_bits = 0x5555555555555555ULL; - const uint64_t odd_bits = ~even_bits; - uint8_t *initout(out); - uint64_t prev_iter_ends_odd_backslash = - 0ULL; // either 0 or 1, but a 64-bit value - uint64_t prev_iter_inside_quote = 0ULL; // either all zeros or all ones - size_t idx = 0; - if (len >= 64) { - size_t avx_len = len - 63; - - for (; idx < avx_len; idx 
+= 64) { - __m256i input_lo = - _mm256_loadu_si256(reinterpret_cast(buf + idx + 0)); - __m256i input_hi = - _mm256_loadu_si256(reinterpret_cast(buf + idx + 32)); - uint64_t bs_bits = cmp_mask_against_input_mini(input_lo, input_hi, - _mm256_set1_epi8('\\')); - uint64_t start_edges = bs_bits & ~(bs_bits << 1); - uint64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash; - uint64_t even_starts = start_edges & even_start_mask; - uint64_t odd_starts = start_edges & ~even_start_mask; - uint64_t even_carries = bs_bits + even_starts; - uint64_t odd_carries; - bool iter_ends_odd_backslash = - add_overflow(bs_bits, odd_starts, &odd_carries); - odd_carries |= prev_iter_ends_odd_backslash; - prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL; - uint64_t even_carry_ends = even_carries & ~bs_bits; - uint64_t odd_carry_ends = odd_carries & ~bs_bits; - uint64_t even_start_odd_end = even_carry_ends & odd_bits; - uint64_t odd_start_even_end = odd_carry_ends & even_bits; - uint64_t odd_ends = even_start_odd_end | odd_start_even_end; - uint64_t quote_bits = cmp_mask_against_input_mini(input_lo, input_hi, - _mm256_set1_epi8('"')); - quote_bits = quote_bits & ~odd_ends; - uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128( - _mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFF), 0)); - quote_mask ^= prev_iter_inside_quote; - prev_iter_inside_quote = static_cast( - static_cast(quote_mask) >> - 63); // might be undefined behavior, should be fully defined in C++20, - // ok according to John Regher from Utah University - const __m256i low_nibble_mask = _mm256_setr_epi8( - // 0 9 a b c d - 16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0, 16, 0, 0, 0, 0, 0, - 0, 0, 0, 8, 12, 1, 2, 9, 0, 0); - const __m256i high_nibble_mask = _mm256_setr_epi8( - // 0 2 3 5 7 - 8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0, 8, 0, 18, 4, 0, 1, 0, - 1, 0, 0, 0, 3, 2, 1, 0, 0); - __m256i whitespace_shufti_mask = _mm256_set1_epi8(0x18); - __m256i v_lo = _mm256_and_si256( 
- _mm256_shuffle_epi8(low_nibble_mask, input_lo), - _mm256_shuffle_epi8(high_nibble_mask, - _mm256_and_si256(_mm256_srli_epi32(input_lo, 4), - _mm256_set1_epi8(0x7f)))); - - __m256i v_hi = _mm256_and_si256( - _mm256_shuffle_epi8(low_nibble_mask, input_hi), - _mm256_shuffle_epi8(high_nibble_mask, - _mm256_and_si256(_mm256_srli_epi32(input_hi, 4), - _mm256_set1_epi8(0x7f)))); - __m256i tmp_ws_lo = _mm256_cmpeq_epi8( - _mm256_and_si256(v_lo, whitespace_shufti_mask), _mm256_set1_epi8(0)); - __m256i tmp_ws_hi = _mm256_cmpeq_epi8( - _mm256_and_si256(v_hi, whitespace_shufti_mask), _mm256_set1_epi8(0)); - - uint64_t ws_res_0 = - static_cast(_mm256_movemask_epi8(tmp_ws_lo)); - uint64_t ws_res_1 = _mm256_movemask_epi8(tmp_ws_hi); - uint64_t whitespace = ~(ws_res_0 | (ws_res_1 << 32)); - whitespace &= ~quote_mask; - int mask1 = whitespace & 0xFFFF; - int mask2 = (whitespace >> 16) & 0xFFFF; - int mask3 = (whitespace >> 32) & 0xFFFF; - int mask4 = (whitespace >> 48) & 0xFFFF; - int pop1 = hamming((~whitespace) & 0xFFFF); - int pop2 = hamming((~whitespace) & UINT64_C(0xFFFFFFFF)); - int pop3 = hamming((~whitespace) & UINT64_C(0xFFFFFFFFFFFF)); - int pop4 = hamming((~whitespace)); - __m128i x1 = _mm256_extracti128_si256(input_lo, 0); - __m128i x2 = _mm256_extracti128_si256(input_lo, 1); - __m128i x3 = _mm256_extracti128_si256(input_hi, 0); - __m128i x4 = _mm256_extracti128_si256(input_hi, 1); - x1 = skinnycleanm128(x1, mask1); - x2 = skinnycleanm128(x2, mask2); - x3 = skinnycleanm128(x3, mask3); - x4 = skinnycleanm128(x4, mask4); - _mm_storeu_si128(reinterpret_cast<__m128i *>(out), x1); - _mm_storeu_si128(reinterpret_cast<__m128i *>(out + pop1), x2); - _mm_storeu_si128(reinterpret_cast<__m128i *>(out + pop2), x3); - _mm_storeu_si128(reinterpret_cast<__m128i *>(out + pop3), x4); - out += pop4; - } - } - // we finish off the job... copying and pasting the code is not ideal here, - // but it gets the job done. 
- if (idx < len) { - uint8_t buffer[64]; - memset(buffer, 0, 64); - memcpy(buffer, buf + idx, len - idx); - __m256i input_lo = - _mm256_loadu_si256(reinterpret_cast(buffer)); - __m256i input_hi = - _mm256_loadu_si256(reinterpret_cast(buffer + 32)); - uint64_t bs_bits = - cmp_mask_against_input_mini(input_lo, input_hi, _mm256_set1_epi8('\\')); - uint64_t start_edges = bs_bits & ~(bs_bits << 1); - uint64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash; - uint64_t even_starts = start_edges & even_start_mask; - uint64_t odd_starts = start_edges & ~even_start_mask; - uint64_t even_carries = bs_bits + even_starts; - uint64_t odd_carries; - // bool iter_ends_odd_backslash = - add_overflow(bs_bits, odd_starts, &odd_carries); - odd_carries |= prev_iter_ends_odd_backslash; - // prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL; - // // we never use it - uint64_t even_carry_ends = even_carries & ~bs_bits; - uint64_t odd_carry_ends = odd_carries & ~bs_bits; - uint64_t even_start_odd_end = even_carry_ends & odd_bits; - uint64_t odd_start_even_end = odd_carry_ends & even_bits; - uint64_t odd_ends = even_start_odd_end | odd_start_even_end; - uint64_t quote_bits = - cmp_mask_against_input_mini(input_lo, input_hi, _mm256_set1_epi8('"')); - quote_bits = quote_bits & ~odd_ends; - uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128( - _mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFF), 0)); - quote_mask ^= prev_iter_inside_quote; - // prev_iter_inside_quote = (uint64_t)((int64_t)quote_mask >> 63);// we - // don't need this anymore - - __m256i mask_20 = _mm256_set1_epi8(0x20); // c==32 - __m256i mask_70 = - _mm256_set1_epi8(0x70); // adding 0x70 does not check low 4-bits - // but moves any value >= 16 above 128 - - __m256i lut_cntrl = _mm256_setr_epi8( - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0x00, - 0x00, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0xFF, 0xFF, 0x00, 0x00, 0xFF, 
0x00, 0x00); - - __m256i tmp_ws_lo = _mm256_or_si256( - _mm256_cmpeq_epi8(mask_20, input_lo), - _mm256_shuffle_epi8(lut_cntrl, _mm256_adds_epu8(mask_70, input_lo))); - __m256i tmp_ws_hi = _mm256_or_si256( - _mm256_cmpeq_epi8(mask_20, input_hi), - _mm256_shuffle_epi8(lut_cntrl, _mm256_adds_epu8(mask_70, input_hi))); - uint64_t ws_res_0 = static_cast(_mm256_movemask_epi8(tmp_ws_lo)); - uint64_t ws_res_1 = _mm256_movemask_epi8(tmp_ws_hi); - uint64_t whitespace = (ws_res_0 | (ws_res_1 << 32)); - whitespace &= ~quote_mask; - - if (len - idx < 64) { - whitespace |= UINT64_C(0xFFFFFFFFFFFFFFFF) << (len - idx); - } - int mask1 = whitespace & 0xFFFF; - int mask2 = (whitespace >> 16) & 0xFFFF; - int mask3 = (whitespace >> 32) & 0xFFFF; - int mask4 = (whitespace >> 48) & 0xFFFF; - int pop1 = hamming((~whitespace) & 0xFFFF); - int pop2 = hamming((~whitespace) & UINT64_C(0xFFFFFFFF)); - int pop3 = hamming((~whitespace) & UINT64_C(0xFFFFFFFFFFFF)); - int pop4 = hamming((~whitespace)); - __m128i x1 = _mm256_extracti128_si256(input_lo, 0); - __m128i x2 = _mm256_extracti128_si256(input_lo, 1); - __m128i x3 = _mm256_extracti128_si256(input_hi, 0); - __m128i x4 = _mm256_extracti128_si256(input_hi, 1); - x1 = skinnycleanm128(x1, mask1); - x2 = skinnycleanm128(x2, mask2); - x3 = skinnycleanm128(x3, mask3); - x4 = skinnycleanm128(x4, mask4); - _mm_storeu_si128(reinterpret_cast<__m128i *>(buffer), x1); - _mm_storeu_si128(reinterpret_cast<__m128i *>(buffer + pop1), x2); - _mm_storeu_si128(reinterpret_cast<__m128i *>(buffer + pop2), x3); - _mm_storeu_si128(reinterpret_cast<__m128i *>(buffer + pop3), x4); - memcpy(out, buffer, pop4); - out += pop4; - } - *out = '\0'; // NULL termination - return out - initout; -} - -} // namespace simdjson -#endif diff --git a/src/simdjson.cpp b/src/simdjson.cpp index d4a685d0..c2f89a55 100644 --- a/src/simdjson.cpp +++ b/src/simdjson.cpp @@ -1,5 +1,4 @@ #include "simdjson.h" #include "implementation.cpp" -#include "jsonminifier.cpp" #include 
"stage1_find_marks.cpp" #include "stage2_build_tape.cpp" diff --git a/src/westmere/bitmanipulation.h b/src/westmere/bitmanipulation.h index e6c7e45b..da1b8256 100644 --- a/src/westmere/bitmanipulation.h +++ b/src/westmere/bitmanipulation.h @@ -46,7 +46,7 @@ really_inline int leading_zeroes(uint64_t input_num) { #endif// _MSC_VER } -really_inline int hamming(uint64_t input_num) { +really_inline int count_ones(uint64_t input_num) { #ifdef _MSC_VER // note: we do not support legacy 32-bit Windows return __popcnt64(input_num);// Visual Studio wants two underscores diff --git a/src/westmere/implementation.h b/src/westmere/implementation.h index 4675867e..2a8957f1 100644 --- a/src/westmere/implementation.h +++ b/src/westmere/implementation.h @@ -11,6 +11,7 @@ class implementation final : public simdjson::implementation { public: really_inline implementation() : simdjson::implementation("westmere", "Intel/AMD SSE4.2", instruction_set::SSE42 | instruction_set::PCLMULQDQ) {} WARN_UNUSED error_code parse(const uint8_t *buf, size_t len, document::parser &parser) const noexcept final; + WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final; WARN_UNUSED error_code stage1(const uint8_t *buf, size_t len, document::parser &parser, bool streaming) const noexcept final; WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, document::parser &parser) const noexcept final; WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, document::parser &parser, size_t &next_json) const noexcept final; diff --git a/src/westmere/simd.h b/src/westmere/simd.h index c399a7a4..8a5a1fcd 100644 --- a/src/westmere/simd.h +++ b/src/westmere/simd.h @@ -2,8 +2,12 @@ #define SIMDJSON_WESTMERE_SIMD_H #include "simdjson.h" +#include "simdprune_tables.h" +#include "westmere/bitmanipulation.h" #include "westmere/intrinsics.h" + + TARGET_WESTMERE namespace simdjson::westmere::simd { @@ -106,6 +110,42 @@ namespace simdjson::westmere::simd 
{ really_inline simd8 lookup_16(simd8 lookup_table) const { return _mm_shuffle_epi8(lookup_table, *this); } + + // Copies to 'output' all bytes corresponding to a 0 in the mask (interpreted as a bitset); bytes whose mask bit is 1 are dropped. + // Passing a 0 value for mask would be equivalent to writing out every byte to output. + // Only the first 16 - count_ones(mask) bytes of the result are significant but 16 bytes + // get written. + // Design consideration: it seems like a function with the + // signature simd8 compress(uint32_t mask) would be + // sensible, but the AVX ISA makes this kind of approach difficult. + template + really_inline void compress(uint16_t mask, L * output) const { + // this particular implementation was inspired by work done by @animetosho + // we do it in two steps, first 8 bytes and then second 8 bytes + uint8_t mask1 = static_cast(mask); // least significant 8 bits + uint8_t mask2 = static_cast(mask >> 8); // most significant 8 bits + // next line just loads the 64-bit values thintable_epi8[mask1] and + // thintable_epi8[mask2] into a 128-bit register, using only + // two instructions on most compilers. + __m128i shufmask = _mm_set_epi64x(thintable_epi8[mask2], thintable_epi8[mask1]); + // we increment by 0x08 the second half of the mask, so that its indices refer to the upper 8 bytes of the register + shufmask = + _mm_add_epi8(shufmask, _mm_set_epi32(0x08080808, 0x08080808, 0, 0)); + // this is the version "nearly pruned" + __m128i pruned = _mm_shuffle_epi8(*this, shufmask); + // we still need to put the two halves together. + // we compute the popcount of the first half (NOTE(review): the table is named BitsSetTable256mul2, so the value is presumably twice the number of set bits in mask1 -- confirm against the table definition): + int pop1 = BitsSetTable256mul2[mask1]; + // then load the corresponding mask, what it does is to write + // only the first pop1 bytes from the first 8 bytes, and then + // it fills in with the bytes from the second 8 bytes + some filling + // at the end.
+ __m128i compactmask = + _mm_loadu_si128((const __m128i *)(pshufb_combine_table + pop1 * 8)); + __m128i answer = _mm_shuffle_epi8(pruned, compactmask); + _mm_storeu_si128(( __m128i *)(output), answer); + } + template really_inline simd8 lookup_16( L replace0, L replace1, L replace2, L replace3, @@ -235,6 +275,13 @@ namespace simdjson::westmere::simd { this->chunks[3].store(ptr+sizeof(simd8)*3); } + really_inline void compress(uint64_t mask, T * output) const { /* compresses the four chunks back to back: chunk i is written at output + 16*i minus the number of bits set in the low 16*i bits of mask, i.e. right after the bytes kept from the preceding chunks */ + this->chunks[0].compress(mask, output); + this->chunks[1].compress(mask >> 16, output + 16 - count_ones(mask & 0xFFFF)); + this->chunks[2].compress(mask >> 32, output + 32 - count_ones(mask & 0xFFFFFFFF)); + this->chunks[3].compress(mask >> 48, output + 48 - count_ones(mask & 0xFFFFFFFFFFFF)); + } + template static really_inline void each_index(F const& each) { each(0); @@ -302,7 +349,6 @@ namespace simdjson::westmere::simd { const simd8 mask = simd8::splat(m); return this->map( [&](auto a) { return a <= mask; } ).to_bitmask(); } - }; // struct simd8x64 } // namespace simdjson::westmere::simd diff --git a/src/westmere/stage1_find_marks.h b/src/westmere/stage1_find_marks.h index 11a498c4..1b578d18 100644 --- a/src/westmere/stage1_find_marks.h +++ b/src/westmere/stage1_find_marks.h @@ -29,6 +29,11 @@ really_inline json_character_block json_character_block::classify(const simd::si auto whitespace_table = simd8::repeat_16(' ', 100, 100, 100, 17, 100, 113, 2, 100, '\t', '\n', 112, 100, '\r', 100, 100); auto op_table = simd8::repeat_16(',', '}', 0, 0, 0xc0u, 0, 0, 0, 0, 0, 0, 0, 0, 0, ':', '{'); + // We compute whitespace and op separately. If the code later only uses one or the + // other, given the fact that all functions are aggressively inlined, we can + // hope that useless computations will be omitted. This is notably the case when + // minifying (we only need whitespace).
+ uint64_t whitespace = in.map([&](simd8 _in) { return _in == simd8(_mm_shuffle_epi8(whitespace_table, _in)); }).to_bitmask(); @@ -53,11 +58,17 @@ really_inline simd8 must_be_continuation(simd8 prev1, simd8(is_second_byte | is_third_byte | is_fourth_byte) > int8_t(0); } -#include "generic/utf8_lookup2_algorithm.h" +#include "generic/buf_block_reader.h" #include "generic/json_string_scanner.h" #include "generic/json_scanner.h" -#include "generic/json_structural_indexer.h" +#include "generic/json_minifier.h" +WARN_UNUSED error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept { /* westmere entry point for minification: forwards buf/len/dst/dst_len unchanged to the generic json_minifier included just above; the template argument 64 is presumably the number of input bytes handled per step -- TODO confirm */ + return westmere::stage1::json_minifier::minify<64>(buf, len, dst, dst_len); +} + +#include "generic/utf8_lookup2_algorithm.h" +#include "generic/json_structural_indexer.h" WARN_UNUSED error_code implementation::stage1(const uint8_t *buf, size_t len, document::parser &parser, bool streaming) const noexcept { return westmere::stage1::json_structural_indexer::index<64>(buf, len, parser, streaming); } diff --git a/tests/basictests.cpp b/tests/basictests.cpp index f0083c52..0e219e3a 100644 --- a/tests/basictests.cpp +++ b/tests/basictests.cpp @@ -892,7 +892,8 @@ namespace dom_api { if (doc["a"].as_uint64_t().first != 1) { cerr << "Expected uint64_t(doc[\"a\"]) to be 1, was " << doc["a"].first << endl; return false; } UNUSED document::element val; - tie(val, error) = doc["d"]; + // tie(val, error) = doc["d"]; fails with "no viable overloaded '='" on Apple clang version 11.0.0 + doc["d"].tie(val, error); if (error != simdjson::NO_SUCH_FIELD) { cerr << "Expected NO_SUCH_FIELD error for uint64_t(doc[\"d\"]), got " << error << endl; return false; } return true; } @@ -906,11 +907,11 @@ namespace dom_api { if (doc["obj"]["a"].as_uint64_t().first != 1) { cerr << "Expected uint64_t(doc[\"obj\"][\"a\"]) to be 1, was " << doc["obj"]["a"].first << endl; return false; } document::object obj; - tie(obj, error) = doc.as_object(); + doc.as_object().tie(obj,
error); // tie(...) = fails with "no viable overloaded '='" on Apple clang version 11.0.0 if (error) { cerr << "Error: " << error << endl; return false; } if (obj["obj"]["a"].as_uint64_t().first != 1) { cerr << "Expected uint64_t(doc[\"obj\"][\"a\"]) to be 1, was " << doc["obj"]["a"].first << endl; return false; } - tie(obj, error) = obj["obj"].as_object(); + obj["obj"].as_object().tie(obj, error); // tie(...) = fails with "no viable overloaded '='" on Apple clang version 11.0.0 if (obj["a"].as_uint64_t().first != 1) { cerr << "Expected uint64_t(obj[\"a\"]) to be 1, was " << obj["a"].first << endl; return false; } if (obj["b"].as_uint64_t().first != 2) { cerr << "Expected uint64_t(obj[\"b\"]) to be 2, was " << obj["b"].first << endl; return false; } if (obj["c"].as_uint64_t().first != 3) { cerr << "Expected uint64_t(obj[\"c\"]) to be 3, was " << obj["c"].first << endl; return false; } @@ -920,7 +921,7 @@ namespace dom_api { if (obj["a"].as_uint64_t().first != 1) { cerr << "Expected uint64_t(obj[\"a\"]) to be 1, was " << obj["a"].first << endl; return false; } UNUSED document::element val; - tie(val, error) = doc["d"]; + doc["d"].tie(val, error); // tie(...) = fails with "no viable overloaded '='" on Apple clang version 11.0.0 if (error != simdjson::NO_SUCH_FIELD) { cerr << "Expected NO_SUCH_FIELD error for uint64_t(obj[\"d\"]), got " << error << endl; return false; } return true; } @@ -944,14 +945,14 @@ namespace dom_api { if (error) { cerr << "Error: " << error << endl; return false; } for (auto tweet : tweets) { document::object user; - tie(user, error) = tweet["user"].as_object(); + tweet["user"].as_object().tie(user, error); // tie(...) = fails with "no viable overloaded '='" on Apple clang version 11.0.0 if (error) { cerr << "Error: " << error << endl; return false; } bool default_profile; - tie(default_profile, error) = user["default_profile"].as_bool(); + user["default_profile"].as_bool().tie(default_profile, error); // tie(...) 
= fails with "no viable overloaded '='" on Apple clang version 11.0.0 if (error) { cerr << "Error: " << error << endl; return false; } if (default_profile) { std::string_view screen_name; - tie(screen_name, error) = user["screen_name"].as_string(); + user["screen_name"].as_string().tie(screen_name, error); // tie(...) = fails with "no viable overloaded '='" on Apple clang version 11.0.0 if (error) { cerr << "Error: " << error << endl; return false; } default_users.insert(screen_name); } @@ -972,13 +973,13 @@ namespace dom_api { if (!not_found) { for (auto image : media) { document::object sizes; - tie(sizes, error) = image["sizes"].as_object(); + image["sizes"].as_object().tie(sizes, error); // tie(...) = fails with "no viable overloaded '='" on Apple clang version 11.0.0 if (error) { cerr << "Error: " << error << endl; return false; } for (auto [key, size] : sizes) { uint64_t width, height; - tie(width, error) = size["w"].as_uint64_t(); + size["w"].as_uint64_t().tie(width, error); // tie(...) = fails with "no viable overloaded '='" on Apple clang version 11.0.0 if (error) { cerr << "Error: " << error << endl; return false; } - tie(height, error) = size["h"].as_uint64_t(); + size["h"].as_uint64_t().tie(height, error); // tie(...) 
= fails with "no viable overloaded '='" on Apple clang version 11.0.0 if (error) { cerr << "Error: " << error << endl; return false; } image_sizes.insert(make_pair(width, height)); } diff --git a/tools/minify.cpp b/tools/minify.cpp index 342c2ad9..2748904e 100644 --- a/tools/minify.cpp +++ b/tools/minify.cpp @@ -1,18 +1,90 @@ #include +#ifndef _MSC_VER +#include +#include +#endif #include "simdjson.h" -int main(int argc, char *argv[]) { - if (argc != 2) { - std::cerr << "Usage: " << argv[0] << " \n"; - exit(1); +// Program name (argv[0]); stashed by main() so that print_usage() can display it +char* exe_name; + +void print_usage(std::ostream& out) { + out << "Usage: " << exe_name << " [-a ARCH] " << std::endl; + out << std::endl; + out << "Runs the parser against the given json files in a loop, measuring speed and other statistics." << std::endl; // NOTE(review): this sentence describes a benchmark loop, but main() minifies a single file once; it looks copied from another tool -- confirm and reword + out << std::endl; + out << "Options:" << std::endl; + out << std::endl; + out << "-a IMPL - Use the given parser implementation. By default, detects the most advanced" << std::endl; + out << " implementation supported on the host machine." << std::endl; + for (auto impl : simdjson::available_implementations) { + out << "-a " << std::left << std::setw(9) << impl->name() << " - Use the " << impl->description() << " parser implementation."
<< std::endl; } - std::string filename = argv[argc - 1]; +} + +void exit_usage(std::string message) { + std::cerr << message << std::endl; + std::cerr << std::endl; + print_usage(std::cerr); + exit(EXIT_FAILURE); +} + + +struct option_struct { + char* filename; + + option_struct(int argc, char **argv) { + #ifndef _MSC_VER + int c; + + while ((c = getopt(argc, argv, "a:")) != -1) { + switch (c) { + case 'a': { + const simdjson::implementation *impl = simdjson::available_implementations[optarg]; + if (!impl) { + std::string exit_message = std::string("Unsupported option value -a ") + optarg + ": expected -a with one of "; + for (auto imple : simdjson::available_implementations) { + exit_message += imple->name(); + exit_message += " "; + } + exit_usage(exit_message); + } + simdjson::active_implementation = impl; + break; + } + default: + // reaching here means getopt() returned an option character that has no case label above; NOTE(review): POSIX getopt() also returns '?' for an unknown option or a missing option argument, so this branch is presumably reachable through user error too -- confirm + exit_usage("Unexpected argument - missing case for option "+ + std::string(1,static_cast(c))+ + " (programming error)"); + } + } + #else + int optind = 1; + #endif + + // Exactly one positional argument must remain after the options: the input file + if(optind + 1 == argc) { + filename = argv[optind]; + } else { + exit_usage("Please specify exactly one input file."); + } + } +}; + +int main(int argc, char *argv[]) { + exe_name = argv[0]; // keep the program name for print_usage() + option_struct options(argc, argv); // handles -a and the input file name; calls exit_usage() on bad arguments + std::string filename = options.filename; auto [p, error] = simdjson::padded_string::load(filename); if (error) { std::cerr << "Could not load the file " << filename << std::endl; return EXIT_FAILURE; } - simdjson::json_minify(p, p.data()); - printf("%s", p.data()); + simdjson::padded_string copy(p.length()); // destination buffer, created with the same length as the input + size_t copy_len; // passed to minify() by reference as dst_len + error = simdjson::active_implementation->minify((const uint8_t*)p.data(), p.length(), (uint8_t*)copy.data(), copy_len); + if (error) { std::cerr << error << std::endl; return 1; } + printf("%s", copy.data()); // NOTE(review): copy_len is not used here; printing with %s relies on minify() leaving a NUL-terminated buffer -- confirm }