faster minifier (#568)

* Make the fallback implementation use our scalar code.
* Give parse a nicer error message.
* Let "minify" use different architectures (usage sketch below).
* Change the minifier benchmark competition so that it tests all implementations.
* Document the untaken optimization opportunity.
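As a rough usage sketch of the new entry point (an editor's illustration, not part of the diff; json_buf, json_len and dst_buf are hypothetical caller-provided buffers, padded as documented in implementation.h):

    size_t dst_len;
    simdjson::error_code err =
        simdjson::active_implementation->minify(json_buf, json_len, dst_buf, dst_len);
    if (err) { /* handle the error */ }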

Co-authored-by: John Keiser <john@johnkeiser.com>
Daniel Lemire 2020-03-20 16:14:47 -04:00 committed by GitHub
parent 293ec7aec5
commit 5d1e3efce8
34 changed files with 558 additions and 620 deletions


@ -53,7 +53,7 @@ endif # ifeq ($(SANITIZE),1)
endif # ifeq ($(MEMSANITIZE),1)
# Headers and sources
SRCHEADERS_GENERIC=src/generic/atomparsing.h src/generic/numberparsing.h src/generic/json_scanner.h src/generic/json_string_scanner.h src/generic/json_structural_indexer.h src/generic/stage2_build_tape.h src/generic/stringparsing.h src/generic/stage2_streaming_build_tape.h src/generic/utf8_fastvalidate_algorithm.h src/generic/utf8_lookup_algorithm.h src/generic/utf8_lookup2_algorithm.h src/generic/utf8_range_algorithm.h src/generic/utf8_zwegner_algorithm.h
SRCHEADERS_GENERIC=src/generic/atomparsing.h src/generic/numberparsing.h src/generic/json_scanner.h src/generic/json_string_scanner.h src/generic/json_structural_indexer.h src/generic/json_minifier.h src/generic/buf_block_reader.h src/generic/stage2_build_tape.h src/generic/stringparsing.h src/generic/stage2_streaming_build_tape.h src/generic/utf8_fastvalidate_algorithm.h src/generic/utf8_lookup_algorithm.h src/generic/utf8_lookup2_algorithm.h src/generic/utf8_range_algorithm.h src/generic/utf8_zwegner_algorithm.h
SRCHEADERS_ARM64= src/arm64/bitmanipulation.h src/arm64/bitmask.h src/arm64/intrinsics.h src/arm64/numberparsing.h src/arm64/simd.h src/arm64/stage1_find_marks.h src/arm64/stage2_build_tape.h src/arm64/stringparsing.h
SRCHEADERS_HASWELL= src/haswell/bitmanipulation.h src/haswell/bitmask.h src/haswell/intrinsics.h src/haswell/numberparsing.h src/haswell/simd.h src/haswell/stage1_find_marks.h src/haswell/stage2_build_tape.h src/haswell/stringparsing.h
SRCHEADERS_FALLBACK= src/fallback/bitmanipulation.h src/fallback/implementation.h src/fallback/numberparsing.h src/fallback/stage1_find_marks.h src/fallback/stage2_build_tape.h src/fallback/stringparsing.h
@ -61,7 +61,7 @@ SRCHEADERS_WESTMERE=src/westmere/bitmanipulation.h src/westmere/bitmask.h src/we
SRCHEADERS_SRC=src/isadetection.h src/jsoncharutils.h src/simdprune_tables.h src/implementation.cpp src/stage1_find_marks.cpp src/stage2_build_tape.cpp src/document_parser_callbacks.h
SRCHEADERS=$(SRCHEADERS_SRC) $(SRCHEADERS_GENERIC) $(SRCHEADERS_ARM64) $(SRCHEADERS_HASWELL) $(SRCHEADERS_WESTMERE) $(SRCHEADERS_FALLBACK)
INCLUDEHEADERS=include/simdjson.h include/simdjson/common_defs.h include/simdjson/internal/jsonformatutils.h include/simdjson/jsonioutil.h include/simdjson/jsonminifier.h include/simdjson/jsonparser.h include/simdjson/padded_string.h include/simdjson/inline/padded_string.h include/simdjson/document.h include/simdjson/inline/document.h include/simdjson/document_iterator.h include/simdjson/inline/document_iterator.h include/simdjson/document_stream.h include/simdjson/inline/document_stream.h include/simdjson/implementation.h include/simdjson/parsedjson.h include/simdjson/jsonstream.h include/simdjson/inline/jsonstream.h include/simdjson/portability.h include/simdjson/error.h include/simdjson/inline/error.h include/simdjson/simdjson.h include/simdjson/simdjson_version.h
INCLUDEHEADERS=include/simdjson.h include/simdjson/common_defs.h include/simdjson/internal/jsonformatutils.h include/simdjson/jsonioutil.h include/simdjson/jsonparser.h include/simdjson/padded_string.h include/simdjson/inline/padded_string.h include/simdjson/document.h include/simdjson/inline/document.h include/simdjson/document_iterator.h include/simdjson/inline/document_iterator.h include/simdjson/document_stream.h include/simdjson/inline/document_stream.h include/simdjson/implementation.h include/simdjson/parsedjson.h include/simdjson/jsonstream.h include/simdjson/inline/jsonstream.h include/simdjson/portability.h include/simdjson/error.h include/simdjson/inline/error.h include/simdjson/simdjson.h include/simdjson/simdjson_version.h
ifeq ($(SIMDJSON_TEST_AMALGAMATED_HEADERS),1)
HEADERS=singleheader/simdjson.h


@ -143,7 +143,7 @@ int main(int argc, char *argv[]) {
// parse_many
const char * filename2 = argv[2];
for (auto result : parser.load_many(filename2)) {
error = result.error;
error = result.error();
}
if (error) {
std::cout << "parse_many failed" << std::endl;


@ -98,16 +98,14 @@ int main(int argc, char *argv[]) {
"despacing with RapidJSON Insitu", rapid_stringme_insitu((char *)buffer),
memcpy(buffer, p.data(), p.size()), repeat, volume, !just_data);
memcpy(buffer, p.data(), p.size());
size_t outlength = simdjson::json_minify((const uint8_t *)buffer, p.size(),
(uint8_t *)buffer);
if (verbose)
std::cout << "json_minify length is " << outlength << std::endl;
size_t outlength;
uint8_t *cbuffer = (uint8_t *)buffer;
BEST_TIME("json_minify", simdjson::json_minify(cbuffer, p.size(), cbuffer),
for (auto imple : simdjson::available_implementations) {
BEST_TIME((std::string("simdjson->minify+")+imple->name()).c_str(), (imple->minify(cbuffer, p.size(), cbuffer, outlength), outlength),
outlength, memcpy(buffer, p.data(), p.size()), repeat, volume,
!just_data);
}
printf("minisize = %zu, original size = %zu (minified down to %.2f percent "
"of original) \n",
outlength, p.size(), outlength * 100.0 / p.size());
@ -121,8 +119,9 @@ int main(int argc, char *argv[]) {
!just_data);
char *mini_buffer = simdjson::internal::allocate_padded_buffer(p.size() + 1);
size_t minisize = simdjson::json_minify((const uint8_t *)p.data(), p.size(),
(uint8_t *)mini_buffer);
size_t minisize;
simdjson::active_implementation->minify((const uint8_t *)p.data(), p.size(),
(uint8_t *)mini_buffer, minisize);
mini_buffer[minisize] = '\0';
BEST_TIME("RapidJSON Insitu despaced", d.ParseInsitu(buffer).HasParseError(),
@ -171,6 +170,7 @@ int main(int argc, char *argv[]) {
automated_reallocation),
simdjson::SUCCESS, memcpy(buffer, mini_buffer, p.size()), repeat, volume,
!just_data);
free(buffer);
free(ast_buffer);
free(mini_buffer);


@ -109,7 +109,12 @@ struct option_struct {
case 'a': {
const implementation *impl = simdjson::available_implementations[optarg];
if (!impl) {
exit_usage(string("Unsupported option value -a ") + optarg + ": expected -a haswell, westmere or arm64");
std::string exit_message = string("Unsupported option value -a ") + optarg + ": expected -a with one of ";
for (auto imple : simdjson::available_implementations) {
exit_message += imple->name();
exit_message += " ";
}
exit_usage(exit_message);
}
simdjson::active_implementation = impl;
break;


@ -16,7 +16,6 @@ set(SIMDJSON_INCLUDE
${SIMDJSON_INCLUDE_DIR}/simdjson/inline/padded_string.h
${SIMDJSON_INCLUDE_DIR}/simdjson/internal/jsonformatutils.h
${SIMDJSON_INCLUDE_DIR}/simdjson/jsonioutil.h
${SIMDJSON_INCLUDE_DIR}/simdjson/jsonminifier.h
${SIMDJSON_INCLUDE_DIR}/simdjson/jsonparser.h
${SIMDJSON_INCLUDE_DIR}/simdjson/jsonstream.h
${SIMDJSON_INCLUDE_DIR}/simdjson/padded_string.h


@ -10,7 +10,6 @@
#include "simdjson/implementation.h"
#include "simdjson/document.h"
#include "simdjson/document_stream.h"
#include "simdjson/jsonminifier.h"
// Deprecated API
#include "simdjson/parsedjsoniterator.h"


@ -77,6 +77,17 @@ private:
*/
template<typename T>
struct simdjson_result : public std::pair<T, error_code> {
/**
* Move the value and the error to the provided variables.
*/
void tie(T& t, error_code & e) {
// on the clang compiler that comes with current macOS (Apple clang version 11.0.0),
// tie(width, error) = size["w"].as_uint64_t();
// fails with "error: no viable overloaded '='""
t = std::move(this->first);
e = std::move(this->second);
}
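// Editor's illustration (not in the diff): with this member, callers can write
//   uint64_t width; simdjson::error_code error;
//   size["w"].as_uint64_t().tie(width, error);
// instead of the std::tie assignment that trips up that compiler.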
/**
* The error.
*/
@ -138,6 +149,7 @@ struct simdjson_move_result : std::pair<T, error_code> {
t = std::move(this->first);
e = std::move(this->second);
}
/**
* The error.
*/


@ -56,6 +56,19 @@ public:
*/
WARN_UNUSED virtual error_code parse(const uint8_t *buf, size_t len, document::parser &parser) const noexcept = 0;
/**
* Minify the input JSON, writing the minified copy to dst.
*
* Overridden by each implementation.
*
* @param buf the json document to minify. *MUST* be allocated up to len + SIMDJSON_PADDING bytes.
* @param len the length of the json document.
* @param dst the buffer to write the minified document to. *MUST* be allocated up to len + SIMDJSON_PADDING bytes.
* @param dst_len the number of bytes written. Output only.
* @return the error code, or SUCCESS if there was no error.
*/
WARN_UNUSED virtual error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept = 0;
/**
* Stage 1 of the document parser.
*
@ -182,6 +195,9 @@ public:
WARN_UNUSED error_code parse(const uint8_t *buf, size_t len, document::parser &parser) const noexcept final {
return set_best()->parse(buf, len, parser);
}
WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final {
return set_best()->minify(buf, len, dst, dst_len);
}
WARN_UNUSED error_code stage1(const uint8_t *buf, size_t len, document::parser &parser, bool streaming) const noexcept final {
return set_best()->stage1(buf, len, parser, streaming);
}


@ -1,32 +0,0 @@
#ifndef SIMDJSON_JSONMINIFIER_H
#define SIMDJSON_JSONMINIFIER_H
#include "simdjson/padded_string.h"
#include <cstddef>
#include <cstdint>
#include <string_view>
namespace simdjson {
// Take input from buf and remove useless whitespace, write it to out; buf and
// out can be the same pointer. Result is null terminated,
// return the string length (minus the null termination).
// The accelerated version of this function only runs on AVX2 hardware.
size_t json_minify(const uint8_t *buf, size_t len, uint8_t *out);
static inline size_t json_minify(const char *buf, size_t len, char *out) {
return json_minify(reinterpret_cast<const uint8_t *>(buf), len,
reinterpret_cast<uint8_t *>(out));
}
static inline size_t json_minify(const std::string_view &p, char *out) {
return json_minify(p.data(), p.size(), out);
}
static inline size_t json_minify(const padded_string &p, char *out) {
return json_minify(p.data(), p.size(), out);
}
} // namespace simdjson
#endif // SIMDJSON_JSONMINIFIER_H


@ -29,7 +29,6 @@ set(SIMDJSON_SRC
set(SIMDJSON_SRC_HEADERS
implementation.cpp
isadetection.h
jsonminifier.cpp
simdprune_tables.h
stage1_find_marks.cpp
stage2_build_tape.cpp


@ -48,7 +48,7 @@ really_inline int leading_zeroes(uint64_t input_num) {
}
/* result might be undefined when input_num is zero */
really_inline int hamming(uint64_t input_num) {
really_inline int count_ones(uint64_t input_num) {
return vaddv_u8(vcnt_u8((uint8x8_t)input_num));
}


@ -10,6 +10,7 @@ class implementation final : public simdjson::implementation {
public:
really_inline implementation() : simdjson::implementation("arm64", "ARM NEON", instruction_set::NEON) {}
WARN_UNUSED error_code parse(const uint8_t *buf, size_t len, document::parser &parser) const noexcept final;
WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final;
WARN_UNUSED error_code stage1(const uint8_t *buf, size_t len, document::parser &parser, bool streaming) const noexcept final;
WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, document::parser &parser) const noexcept final;
WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, document::parser &parser, size_t &next_json) const noexcept final;


@ -2,6 +2,8 @@
#define SIMDJSON_ARM64_SIMD_H
#include "simdjson.h"
#include "simdprune_tables.h"
#include "arm64/bitmanipulation.h"
#include "arm64/intrinsics.h"
namespace simdjson::arm64::simd {
@ -142,6 +144,43 @@ namespace simdjson::arm64::simd {
really_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
return lookup_table.apply_lookup_16_to(*this);
}
// Copies to "output" all bytes corresponding to a 0 in the mask (interpreted as a bitset).
// Passing a 0 value for mask would be equivalent to writing out every byte to output.
// Only the first 16 - count_ones(mask) bytes of the result are significant but 16 bytes
// get written.
// Design consideration: it seems like a function with the
// signature simd8<L> compress(uint16_t mask) would be
// sensible, but the AVX ISA makes this kind of approach difficult.
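// Worked example (editor's note): for input bytes {'a',' ','b','\n','c',...}
// and mask 0b...01010 (bits 1 and 3 set, marking the two whitespace bytes),
// the output starts {'a','b','c',...}; only the first 16 - count_ones(mask)
// = 14 bytes are meaningful, the remainder is filler.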
template<typename L>
really_inline void compress(uint16_t mask, L * output) const {
// this particular implementation was inspired by work done by @animetosho
// we do it in two steps, first 8 bytes and then second 8 bytes
uint8_t mask1 = static_cast<uint8_t>(mask); // least significant 8 bits
uint8_t mask2 = static_cast<uint8_t>(mask >> 8); // most significant 8 bits
// next line just loads the 64-bit values thintable_epi8[mask1] and
// thintable_epi8[mask2] into a 128-bit register, using only
// two instructions on most compilers.
uint64x2_t shufmask64 = {thintable_epi8[mask1], thintable_epi8[mask2]};
uint8x16_t shufmask = vreinterpretq_u8_u64(shufmask64);
// we increment by 0x08 the second half of the mask
uint8x16_t inc = {0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08};
shufmask = vaddq_u8(shufmask, inc);
// this is the version "nearly pruned"
uint8x16_t pruned = vqtbl1q_u8(*this, shufmask);
// we still need to put the two halves together.
// we compute the popcount of the first half:
int pop1 = BitsSetTable256mul2[mask1];
// then load the corresponding mask, what it does is to write
// only the first pop1 bytes from the first 8 bytes, and then
// it fills in with the bytes from the second 8 bytes + some filling
// at the end.
uint8x16_t compactmask = vld1q_u8((const uint8_t *)(pshufb_combine_table + pop1 * 8));
uint8x16_t answer = vqtbl1q_u8(pruned, compactmask);
vst1q_u8((uint8_t*) output, answer);
}
template<typename L>
really_inline simd8<L> lookup_16(
L replace0, L replace1, L replace2, L replace3,
@ -267,6 +306,13 @@ namespace simdjson::arm64::simd {
this->chunks[3].store(ptr+sizeof(simd8<T>)*3);
}
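// Editor's note: packs the kept bytes (mask bit 0) of all four chunks
// contiguously; each chunk writes where the previous ones ended, i.e. shifted
// back by count_ones of the lower mask bits, overwriting their filler bytes.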
really_inline void compress(uint64_t mask, T * output) const {
this->chunks[0].compress(mask, output);
this->chunks[1].compress(mask >> 16, output + 16 - count_ones(mask & 0xFFFF));
this->chunks[2].compress(mask >> 32, output + 32 - count_ones(mask & 0xFFFFFFFF));
this->chunks[3].compress(mask >> 48, output + 48 - count_ones(mask & 0xFFFFFFFFFFFF));
}
template <typename F>
static really_inline void each_index(F const& each) {
each(0);
@ -339,7 +385,6 @@ namespace simdjson::arm64::simd {
const simd8<T> mask = simd8<T>::splat(m);
return this->map( [&](auto a) { return a <= mask; } ).to_bitmask();
}
}; // struct simd8x64<T>
} // namespace simdjson::arm64::simd


@ -31,6 +31,23 @@ really_inline json_character_block json_character_block::classify(const simd::si
return shuf_lo & shuf_hi;
});
// We compute whitespace and op separately. If the code later only uses one or the
// other, given the fact that all functions are aggressively inlined, we can
// hope that useless computations will be omitted. This is notably the case when
// minifying (we only need whitespace). *However*, if we only need whitespace,
// it is likely that we will still compute 'v' above with two lookup_16: one
// could do it a bit cheaper. This is in contrast with the x64 implementations
// where we can, efficiently, do the white space and structural matching
// separately. One reason for this difference is that on ARM NEON, the table
// lookups either zero or leave unchanged the characters exceeding 0xF whereas
// on x64, the equivalent instruction (pshufb) automatically applies a mask,
// ignoring the 4 most significant bits. Thus the x64 implementation is
// optimized differently. This being said, if you use this code strictly
// just for minification (or just to identify the structural characters),
// there is a small untaken optimization opportunity here. We deliberately
// do not pick it up.
uint64_t op = v.map([&](simd8<uint8_t> _v) { return _v.any_bits_set(0x7); }).to_bitmask();
uint64_t whitespace = v.map([&](simd8<uint8_t> _v) { return _v.any_bits_set(0x18); }).to_bitmask();
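// (Editor's note: the nibble tables above encode the operator classes in the
// low 3 bits of 'v' and the whitespace classes in bits 3-4, which is why 0x7
// and 0x18 separate the two groups.)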
return { whitespace, op };
@ -53,11 +70,17 @@ really_inline simd8<bool> must_be_continuation(simd8<uint8_t> prev1, simd8<uint8
return is_second_byte ^ is_third_byte ^ is_fourth_byte;
}
#include "generic/utf8_lookup2_algorithm.h"
#include "generic/buf_block_reader.h"
#include "generic/json_string_scanner.h"
#include "generic/json_scanner.h"
#include "generic/json_structural_indexer.h"
#include "generic/json_minifier.h"
WARN_UNUSED error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept {
return arm64::stage1::json_minifier::minify<64>(buf, len, dst, dst_len);
}
#include "generic/utf8_lookup2_algorithm.h"
#include "generic/json_structural_indexer.h"
WARN_UNUSED error_code implementation::stage1(const uint8_t *buf, size_t len, document::parser &parser, bool streaming) const noexcept {
return arm64::stage1::json_structural_indexer::index<64>(buf, len, parser, streaming);
}


@ -14,6 +14,7 @@ public:
0
) {}
WARN_UNUSED error_code parse(const uint8_t *buf, size_t len, document::parser &parser) const noexcept final;
WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final;
WARN_UNUSED error_code stage1(const uint8_t *buf, size_t len, document::parser &parser, bool streaming) const noexcept final;
WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, document::parser &parser) const noexcept final;
WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, document::parser &parser, size_t &next_json) const noexcept final;


@ -151,6 +151,62 @@ WARN_UNUSED error_code implementation::stage1(const uint8_t *buf, size_t len, do
return scanner.scan();
}
// big table for the minifier
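// Each input byte c owns three consecutive entries (editor's annotation):
//   jump_table[3*c + 0] is 1 only for '"'  (toggles the in-string state),
//   jump_table[3*c + 1] is 0 only for '\\' (flips the escape state),
//   jump_table[3*c + 2] is 0 only for the four JSON whitespace characters
//   (tab, line feed, carriage return, space), i.e. whether the byte is kept
//   outside of strings.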
static uint8_t jump_table[256 * 3] = {
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1,
1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0,
1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
};
WARN_UNUSED error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept {
size_t i = 0, pos = 0;
uint8_t quote = 0;
uint8_t nonescape = 1;
while (i < len) {
unsigned char c = buf[i];
uint8_t *meta = jump_table + 3 * c;
quote = quote ^ (meta[0] & nonescape);
dst[pos] = c;
pos += meta[2] | quote;
i += 1;
nonescape = (~nonescape) | (meta[1]);
}
dst_len = pos; // we intentionally do not work with a reference
// for fear of aliasing
return SUCCESS;
}
} // namespace simdjson::fallback
#endif // SIMDJSON_FALLBACK_STAGE1_FIND_MARKS_H


@ -0,0 +1,48 @@
// Walks through a buffer in block-sized increments, loading the last part with spaces
template<size_t STEP_SIZE>
struct buf_block_reader {
public:
really_inline buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {}
really_inline size_t block_index() { return idx; }
really_inline bool has_full_block() const {
return idx < lenminusstep;
}
really_inline const uint8_t *full_block() const {
return &buf[idx];
}
really_inline bool has_remainder() const {
return idx < len;
}
really_inline void get_remainder(uint8_t *tmp_buf) const {
memset(tmp_buf, 0x20, STEP_SIZE);
memcpy(tmp_buf, buf + idx, len - idx);
}
really_inline void advance() {
idx += STEP_SIZE;
}
private:
const uint8_t *buf;
const size_t len;
const size_t lenminusstep;
size_t idx;
};
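// Typical use (editor's note): consume full_block() while has_full_block(),
// then, if has_remainder(), copy the space-padded tail into a local STEP_SIZE
// buffer with get_remainder() and process it as one final block; see
// json_minifier::minify in this commit for an instance of this pattern.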
// Routines to print masks and text for debugging bitmask operations
UNUSED static char * format_input_text(const simd8x64<uint8_t> in) {
static char *buf = (char*)malloc(sizeof(simd8x64<uint8_t>) + 1);
in.store((uint8_t*)buf);
for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
if (buf[i] < ' ') { buf[i] = '_'; }
}
buf[sizeof(simd8x64<uint8_t>)] = '\0';
return buf;
}
UNUSED static char * format_mask(uint64_t mask) {
static char *buf = (char*)malloc(64 + 1);
for (size_t i=0; i<64; i++) {
buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' ';
}
buf[64] = '\0';
return buf;
}


@ -0,0 +1,73 @@
// This file contains the common code every implementation uses in stage1
// It is intended to be included multiple times and compiled multiple times
// We assume the file in which it is included already includes
// "simdjson/stage1_find_marks.h" (this simplifies amalgation)
namespace stage1 {
class json_minifier {
public:
template<size_t STEP_SIZE>
static error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) noexcept;
private:
really_inline json_minifier(uint8_t *_dst) : dst{_dst} {}
template<size_t STEP_SIZE>
really_inline void step(const uint8_t *block_buf, buf_block_reader<STEP_SIZE> &reader) noexcept;
really_inline void next(simd::simd8x64<uint8_t> in, json_block block);
really_inline error_code finish(uint8_t *dst_start, size_t &dst_len);
json_scanner scanner;
uint8_t *dst;
};
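// Writes the non-whitespace bytes of a 64-byte block to the output buffer and
// advances the output pointer by the number of bytes kept.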
really_inline void json_minifier::next(simd::simd8x64<uint8_t> in, json_block block) {
uint64_t mask = block.whitespace();
in.compress(mask, dst);
dst += 64 - count_ones(mask);
}
really_inline error_code json_minifier::finish(uint8_t *dst_start, size_t &dst_len) {
*dst = '\0';
error_code error = scanner.finish(false);
if (error) { dst_len = 0; return error; }
dst_len = dst - dst_start;
return SUCCESS;
}
template<>
really_inline void json_minifier::step<128>(const uint8_t *block_buf, buf_block_reader<128> &reader) noexcept {
simd::simd8x64<uint8_t> in_1(block_buf);
simd::simd8x64<uint8_t> in_2(block_buf+64);
json_block block_1 = scanner.next(in_1);
json_block block_2 = scanner.next(in_2);
this->next(in_1, block_1);
this->next(in_2, block_2);
reader.advance();
}
template<>
really_inline void json_minifier::step<64>(const uint8_t *block_buf, buf_block_reader<64> &reader) noexcept {
simd::simd8x64<uint8_t> in_1(block_buf);
json_block block_1 = scanner.next(in_1);
this->next(in_1, block_1);
reader.advance();
}
template<size_t STEP_SIZE>
error_code json_minifier::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) noexcept {
buf_block_reader<STEP_SIZE> reader(buf, len);
json_minifier minifier(dst);
while (reader.has_full_block()) {
minifier.step<STEP_SIZE>(reader.full_block(), reader);
}
if (likely(reader.has_remainder())) {
uint8_t block[STEP_SIZE];
reader.get_remainder(block);
minifier.step<STEP_SIZE>(block, reader);
}
return minifier.finish(dst, dst_len);
}
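// Each architecture instantiates this driver with its preferred block size:
// in this commit, arm64 calls minify<64> and haswell calls minify<128>.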
} // namespace stage1


@ -5,23 +5,33 @@ namespace stage1 {
*/
struct json_block {
public:
// the start of structurals that are not inside strings
/** The start of structurals */
really_inline uint64_t structural_start() { return potential_structural_start() & ~_string.string_tail(); }
/** All JSON whitespace (i.e. not in a string) */
really_inline uint64_t whitespace() { return non_quote_outside_string(_characters.whitespace()); }
// operators plus scalar starts like 123, true and "abc"
really_inline uint64_t potential_structural_start() { return _characters.op() | potential_scalar_start(); }
// the start of non-operator runs, like 123, true and "abc"
really_inline uint64_t potential_scalar_start() { return _characters.scalar() & ~follows_potential_scalar(); }
// whether the given character is immediately after a non-operator like 123, true or "
really_inline uint64_t follows_potential_scalar() { return _follows_potential_scalar; }
// Return a mask of whether the given characters are inside a string (only works on non-quotes)
// Helpers
/** Whether the given characters are inside a string (only works on non-quotes) */
really_inline uint64_t non_quote_inside_string(uint64_t mask) { return _string.non_quote_inside_string(mask); }
/** Whether the given characters are outside a string (only works on non-quotes) */
really_inline uint64_t non_quote_outside_string(uint64_t mask) { return _string.non_quote_outside_string(mask); }
// string and escape characters
json_string_block _string;
// whitespace, operators, scalars
json_character_block _characters;
// whether the previous character was a scalar
uint64_t _follows_potential_scalar;
private:
// Potential structurals (i.e. disregarding strings)
/** operators plus scalar starts like 123, true and "abc" */
really_inline uint64_t potential_structural_start() { return _characters.op() | potential_scalar_start(); }
/** the start of non-operator runs, like 123, true and "abc" */
really_inline uint64_t potential_scalar_start() { return _characters.scalar() & ~follows_potential_scalar(); }
/** whether the given character is immediately after a non-operator like 123, true or " */
really_inline uint64_t follows_potential_scalar() { return _follows_potential_scalar; }
};
/**


@ -14,7 +14,9 @@ struct json_string_block {
// Only characters inside the string (not including the quotes)
really_inline uint64_t string_content() const { return _in_string & ~_quote; }
// Return a mask of whether the given characters are inside a string (only works on non-quotes)
really_inline uint64_t non_quote_inside_string(uint64_t mask) const { return _in_string & mask; }
really_inline uint64_t non_quote_inside_string(uint64_t mask) const { return mask & _in_string; }
// Return a mask of whether the given characters are outside a string (only works on non-quotes)
really_inline uint64_t non_quote_outside_string(uint64_t mask) const { return mask & ~_in_string; }
// Tail of string (everything except the start quote)
really_inline uint64_t string_tail() const { return _in_string ^ _quote; }


@ -22,7 +22,7 @@ public:
// it helps tremendously.
if (bits == 0)
return;
uint32_t cnt = hamming(bits);
uint32_t cnt = count_ones(bits);
// Do the first 8 all together
for (int i=0; i<8; i++) {
@ -55,55 +55,6 @@ public:
}
};
// Routines to print masks and text for debugging bitmask operations
UNUSED static char * format_input_text(const simd8x64<uint8_t> in) {
static char *buf = (char*)malloc(sizeof(simd8x64<uint8_t>) + 1);
in.store((uint8_t*)buf);
for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
if (buf[i] < ' ') { buf[i] = '_'; }
}
buf[sizeof(simd8x64<uint8_t>)] = '\0';
return buf;
}
UNUSED static char * format_mask(uint64_t mask) {
static char *buf = (char*)malloc(64 + 1);
for (size_t i=0; i<64; i++) {
buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' ';
}
buf[64] = '\0';
return buf;
}
// Walks through a buffer in block-sized increments, loading the last part with spaces
template<size_t STEP_SIZE>
struct buf_block_reader {
public:
really_inline buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {}
really_inline size_t block_index() { return idx; }
really_inline bool has_full_block() const {
return idx < lenminusstep;
}
really_inline const uint8_t *full_block() const {
return &buf[idx];
}
really_inline bool has_remainder() const {
return idx < len;
}
really_inline void get_remainder(uint8_t *tmp_buf) const {
memset(tmp_buf, 0x20, STEP_SIZE);
memcpy(tmp_buf, buf + idx, len - idx);
}
really_inline void advance() {
idx += STEP_SIZE;
}
private:
const uint8_t *buf;
const size_t len;
const size_t lenminusstep;
size_t idx;
};
class json_structural_indexer {
public:
template<size_t STEP_SIZE>
@ -112,7 +63,7 @@ public:
private:
really_inline json_structural_indexer(uint32_t *structural_indexes) : indexer{structural_indexes} {}
template<size_t STEP_SIZE>
really_inline void index_step(const uint8_t *block, buf_block_reader<STEP_SIZE> &reader) noexcept;
really_inline void step(const uint8_t *block, buf_block_reader<STEP_SIZE> &reader) noexcept;
really_inline void next(simd::simd8x64<uint8_t> in, json_block block, size_t idx);
really_inline error_code finish(document::parser &parser, size_t idx, size_t len, bool streaming);
@ -162,7 +113,7 @@ really_inline error_code json_structural_indexer::finish(document::parser &parse
}
template<>
really_inline void json_structural_indexer::index_step<128>(const uint8_t *block, buf_block_reader<128> &reader) noexcept {
really_inline void json_structural_indexer::step<128>(const uint8_t *block, buf_block_reader<128> &reader) noexcept {
simd::simd8x64<uint8_t> in_1(block);
simd::simd8x64<uint8_t> in_2(block+64);
json_block block_1 = scanner.next(in_1);
@ -173,7 +124,7 @@ really_inline void json_structural_indexer::index_step<128>(const uint8_t *block
}
template<>
really_inline void json_structural_indexer::index_step<64>(const uint8_t *block, buf_block_reader<64> &reader) noexcept {
really_inline void json_structural_indexer::step<64>(const uint8_t *block, buf_block_reader<64> &reader) noexcept {
simd::simd8x64<uint8_t> in_1(block);
json_block block_1 = scanner.next(in_1);
this->next(in_1, block_1, reader.block_index());
@ -209,13 +160,13 @@ error_code json_structural_indexer::index(const uint8_t *buf, size_t len, docume
buf_block_reader<STEP_SIZE> reader(buf, len);
json_structural_indexer indexer(parser.structural_indexes.get());
while (reader.has_full_block()) {
indexer.index_step<STEP_SIZE>(reader.full_block(), reader);
indexer.step<STEP_SIZE>(reader.full_block(), reader);
}
if (likely(reader.has_remainder())) {
uint8_t block[STEP_SIZE];
reader.get_remainder(block);
indexer.index_step<STEP_SIZE>(block, reader);
indexer.step<STEP_SIZE>(block, reader);
}
return indexer.finish(parser, reader.block_index(), len, streaming);


@ -37,7 +37,7 @@ really_inline int leading_zeroes(uint64_t input_num) {
return static_cast<int>(_lzcnt_u64(input_num));
}
really_inline int hamming(uint64_t input_num) {
really_inline int count_ones(uint64_t input_num) {
#ifdef _MSC_VER
// note: we do not support legacy 32-bit Windows
return __popcnt64(input_num);// Visual Studio wants two underscores


@ -14,6 +14,7 @@ public:
instruction_set::AVX2 | instruction_set::PCLMULQDQ | instruction_set::BMI1 | instruction_set::BMI2
) {}
WARN_UNUSED error_code parse(const uint8_t *buf, size_t len, document::parser &parser) const noexcept final;
WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final;
WARN_UNUSED error_code stage1(const uint8_t *buf, size_t len, document::parser &parser, bool streaming) const noexcept final;
WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, document::parser &parser) const noexcept final;
WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, document::parser &parser, size_t &next_json) const noexcept final;


@ -2,6 +2,8 @@
#define SIMDJSON_HASWELL_SIMD_H
#include "simdjson.h"
#include "simdprune_tables.h"
#include "haswell/bitmanipulation.h"
#include "haswell/intrinsics.h"
TARGET_HASWELL
@ -109,6 +111,57 @@ namespace simdjson::haswell::simd {
really_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
return _mm256_shuffle_epi8(lookup_table, *this);
}
// Copies to "output" all bytes corresponding to a 0 in the mask (interpreted as a bitset).
// Passing a 0 value for mask would be equivalent to writing out every byte to output.
// Only the first 32 - count_ones(mask) bytes of the result are significant but 32 bytes
// get written.
// Design consideration: it seems like a function with the
// signature simd8<L> compress(uint32_t mask) would be
// sensible, but the AVX ISA makes this kind of approach difficult.
template<typename L>
really_inline void compress(uint32_t mask, L * output) const {
// this particular implementation was inspired by work done by @animetosho
// we do it in four steps, first 8 bytes and then second 8 bytes...
uint8_t mask1 = static_cast<uint8_t>(mask); // least significant 8 bits
uint8_t mask2 = static_cast<uint8_t>(mask >> 8); // second least significant 8 bits
uint8_t mask3 = static_cast<uint8_t>(mask >> 16); // ...
uint8_t mask4 = static_cast<uint8_t>(mask >> 24); // ...
// next line just loads the 64-bit values thintable_epi8[mask1] and
// thintable_epi8[mask2] into a 128-bit register, using only
// two instructions on most compilers.
__m256i shufmask = _mm256_set_epi64x(thintable_epi8[mask4], thintable_epi8[mask3],
thintable_epi8[mask2], thintable_epi8[mask1]);
// we increment by 0x08 the second half of the mask and so forth
shufmask =
_mm256_add_epi8(shufmask, _mm256_set_epi32(0x18181818, 0x18181818,
0x10101010, 0x10101010, 0x08080808, 0x08080808, 0, 0));
// this is the version "nearly pruned"
__m256i pruned = _mm256_shuffle_epi8(*this, shufmask);
// we still need to put the pieces back together.
// we compute the popcount of the first words:
int pop1 = BitsSetTable256mul2[mask1];
int pop3 = BitsSetTable256mul2[mask3];
// then load the corresponding mask
// could be done with _mm256_loadu2_m128i but many standard libraries omit this intrinsic.
__m256i v256 = _mm256_castsi128_si256(
_mm_loadu_si128((const __m128i *)(pshufb_combine_table + pop1 * 8)));
__m256i compactmask = _mm256_insertf128_si256(v256,
_mm_loadu_si128((const __m128i *)(pshufb_combine_table + pop3 * 8)), 1);
__m256i almostthere = _mm256_shuffle_epi8(pruned, compactmask);
// We just need to write out the result.
// This is the tricky bit that is hard to do
// if we want to return a SIMD register, since there
// is no single-instruction approach to recombine
// the two 128-bit lanes with an offset.
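// Instead we do two overlapping 16-byte stores: the second starts right after
// the significant bytes of the first (at 16 - count_ones of the low 16 mask
// bits) and overwrites its insignificant filler.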
__m128i v128;
v128 = _mm256_castsi256_si128(almostthere);
_mm_storeu_si128( (__m128i *)output, v128);
v128 = _mm256_extractf128_si256(almostthere, 1);
_mm_storeu_si128( (__m128i *)(output + 16 - count_ones(mask & 0xFFFF)), v128);
}
template<typename L>
really_inline simd8<L> lookup_16(
L replace0, L replace1, L replace2, L replace3,
@ -249,6 +302,13 @@ namespace simdjson::haswell::simd {
each(1);
}
really_inline void compress(uint64_t mask, T * output) const {
uint32_t mask1 = static_cast<uint32_t>(mask);
uint32_t mask2 = static_cast<uint32_t>(mask >> 32);
this->chunks[0].compress(mask1, output);
this->chunks[1].compress(mask2, output + 32 - count_ones(mask1));
}
really_inline void store(T ptr[64]) const {
this->chunks[0].store(ptr+sizeof(simd8<T>)*0);
this->chunks[1].store(ptr+sizeof(simd8<T>)*1);
@ -269,6 +329,8 @@ namespace simdjson::haswell::simd {
);
}
template <typename R=bool, typename F>
really_inline simd8x64<R> map(const simd8x64<uint8_t> b, F const& map_chunk) const {
return simd8x64<R>(
@ -302,7 +364,6 @@ namespace simdjson::haswell::simd {
const simd8<T> mask = simd8<T>::splat(m);
return this->map( [&](auto a) { return a <= mask; } ).to_bitmask();
}
}; // struct simd8x64<T>
} // namespace simdjson::haswell::simd


@ -30,6 +30,11 @@ really_inline json_character_block json_character_block::classify(const simd::si
auto whitespace_table = simd8<uint8_t>::repeat_16(' ', 100, 100, 100, 17, 100, 113, 2, 100, '\t', '\n', 112, 100, '\r', 100, 100);
auto op_table = simd8<uint8_t>::repeat_16(',', '}', 0, 0, 0xc0u, 0, 0, 0, 0, 0, 0, 0, 0, 0, ':', '{');
// We compute whitespace and op separately. If the code later only uses one or the
// other, given the fact that all functions are aggressively inlined, we can
// hope that useless computations will be omitted. This is notably the case when
// minifying (we only need whitespace).
uint64_t whitespace = in.map([&](simd8<uint8_t> _in) {
return _in == simd8<uint8_t>(_mm256_shuffle_epi8(whitespace_table, _in));
}).to_bitmask();
@ -54,11 +59,17 @@ really_inline simd8<bool> must_be_continuation(simd8<uint8_t> prev1, simd8<uint8
return simd8<int8_t>(is_second_byte | is_third_byte | is_fourth_byte) > int8_t(0);
}
#include "generic/utf8_lookup2_algorithm.h"
#include "generic/buf_block_reader.h"
#include "generic/json_string_scanner.h"
#include "generic/json_scanner.h"
#include "generic/json_structural_indexer.h"
#include "generic/json_minifier.h"
WARN_UNUSED error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept {
return haswell::stage1::json_minifier::minify<128>(buf, len, dst, dst_len);
}
#include "generic/utf8_lookup2_algorithm.h"
#include "generic/json_structural_indexer.h"
WARN_UNUSED error_code implementation::stage1(const uint8_t *buf, size_t len, document::parser &parser, bool streaming) const noexcept {
return haswell::stage1::json_structural_indexer::index<128>(buf, len, parser, streaming);
}


@ -1,5 +1,7 @@
#include "simdjson.h"
#include "isadetection.h"
#include "simdprune_tables.h"
#include <initializer_list>
// Static array of known implementations. We're hoping these get baked into the executable
@ -48,6 +50,9 @@ public:
WARN_UNUSED error_code parse(const uint8_t *, size_t, document::parser &) const noexcept final {
return UNSUPPORTED_ARCHITECTURE;
}
WARN_UNUSED error_code minify(const uint8_t *, size_t, uint8_t *, size_t &) const noexcept final {
return UNSUPPORTED_ARCHITECTURE;
}
WARN_UNUSED error_code stage1(const uint8_t *, size_t, document::parser &, bool) const noexcept final {
return UNSUPPORTED_ARCHITECTURE;
}


@ -1,478 +0,0 @@
#include "simdjson.h"
#include <cstdint>
#ifndef SIMDJSON_ISSUE384RESOLVED // to avoid tripping users
namespace simdjson {
static uint8_t jump_table[256 * 3] = {
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1,
1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0,
1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
};
size_t json_minify(const unsigned char *bytes, size_t how_many,
unsigned char *out) {
size_t i = 0, pos = 0;
uint8_t quote = 0;
uint8_t nonescape = 1;
while (i < how_many) {
unsigned char c = bytes[i];
uint8_t *meta = jump_table + 3 * c;
quote = quote ^ (meta[0] & nonescape);
out[pos] = c;
pos += meta[2] | quote;
i += 1;
nonescape = (~nonescape) | (meta[1]);
}
return pos;
}
} // namespace simdjson
#else
//
// This fast code is disabled.
// See issue https://github.com/lemire/simdjson/issues/384
//
#include "simdprune_tables.h"
#include <cstring>
#include <x86intrin.h> // currently, there is no runtime dispatch for the minifier
namespace simdjson {
// a straightforward comparison of a mask against input.
static uint64_t cmp_mask_against_input_mini(__m256i input_lo, __m256i input_hi,
__m256i mask) {
__m256i cmp_res_0 = _mm256_cmpeq_epi8(input_lo, mask);
uint64_t res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(cmp_res_0));
__m256i cmp_res_1 = _mm256_cmpeq_epi8(input_hi, mask);
uint64_t res_1 = _mm256_movemask_epi8(cmp_res_1);
return res_0 | (res_1 << 32);
}
// Write up to 16 bytes, only the bytes corresponding to a 1-bit are written
// out. credit: Anime Tosho
static __m128i skinnycleanm128(__m128i x, int mask) {
int mask1 = mask & 0xFF;
int mask2 = (mask >> 8) & 0xFF;
__m128i shufmask = _mm_castps_si128(
_mm_loadh_pi(_mm_castsi128_ps(_mm_loadl_epi64(
(const __m128i *)(thintable_epi8 + mask1))),
(const __m64 *)(thintable_epi8 + mask2)));
shufmask =
_mm_add_epi8(shufmask, _mm_set_epi32(0x08080808, 0x08080808, 0, 0));
__m128i pruned = _mm_shuffle_epi8(x, shufmask);
intptr_t popx2 = BitsSetTable256mul2[mask1];
__m128i compactmask =
_mm_loadu_si128((const __m128i *)(pshufb_combine_table + popx2 * 8));
return _mm_shuffle_epi8(pruned, compactmask);
}
// take input from buf and remove useless whitespace, input and output can be
// the same, result is null terminated, return the string length (minus the null
// termination)
size_t json_minify(const uint8_t *buf, size_t len, uint8_t *out) {
// Useful constant masks
const uint64_t even_bits = 0x5555555555555555ULL;
const uint64_t odd_bits = ~even_bits;
uint8_t *initout(out);
uint64_t prev_iter_ends_odd_backslash =
0ULL; // either 0 or 1, but a 64-bit value
uint64_t prev_iter_inside_quote = 0ULL; // either all zeros or all ones
size_t idx = 0;
if (len >= 64) {
size_t avx_len = len - 63;
for (; idx < avx_len; idx += 64) {
__m256i input_lo =
_mm256_loadu_si256(reinterpret_cast<const __m256i *>(buf + idx + 0));
__m256i input_hi =
_mm256_loadu_si256(reinterpret_cast<const __m256i *>(buf + idx + 32));
uint64_t bs_bits = cmp_mask_against_input_mini(input_lo, input_hi,
_mm256_set1_epi8('\\'));
uint64_t start_edges = bs_bits & ~(bs_bits << 1);
uint64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash;
uint64_t even_starts = start_edges & even_start_mask;
uint64_t odd_starts = start_edges & ~even_start_mask;
uint64_t even_carries = bs_bits + even_starts;
uint64_t odd_carries;
bool iter_ends_odd_backslash =
add_overflow(bs_bits, odd_starts, &odd_carries);
odd_carries |= prev_iter_ends_odd_backslash;
prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL;
uint64_t even_carry_ends = even_carries & ~bs_bits;
uint64_t odd_carry_ends = odd_carries & ~bs_bits;
uint64_t even_start_odd_end = even_carry_ends & odd_bits;
uint64_t odd_start_even_end = odd_carry_ends & even_bits;
uint64_t odd_ends = even_start_odd_end | odd_start_even_end;
uint64_t quote_bits = cmp_mask_against_input_mini(input_lo, input_hi,
_mm256_set1_epi8('"'));
quote_bits = quote_bits & ~odd_ends;
uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128(
_mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFF), 0));
quote_mask ^= prev_iter_inside_quote;
prev_iter_inside_quote = static_cast<uint64_t>(
static_cast<int64_t>(quote_mask) >>
63); // might be undefined behavior, should be fully defined in C++20,
// ok according to John Regher from Utah University
const __m256i low_nibble_mask = _mm256_setr_epi8(
// 0 9 a b c d
16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0, 16, 0, 0, 0, 0, 0,
0, 0, 0, 8, 12, 1, 2, 9, 0, 0);
const __m256i high_nibble_mask = _mm256_setr_epi8(
// 0 2 3 5 7
8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0, 8, 0, 18, 4, 0, 1, 0,
1, 0, 0, 0, 3, 2, 1, 0, 0);
__m256i whitespace_shufti_mask = _mm256_set1_epi8(0x18);
__m256i v_lo = _mm256_and_si256(
_mm256_shuffle_epi8(low_nibble_mask, input_lo),
_mm256_shuffle_epi8(high_nibble_mask,
_mm256_and_si256(_mm256_srli_epi32(input_lo, 4),
_mm256_set1_epi8(0x7f))));
__m256i v_hi = _mm256_and_si256(
_mm256_shuffle_epi8(low_nibble_mask, input_hi),
_mm256_shuffle_epi8(high_nibble_mask,
_mm256_and_si256(_mm256_srli_epi32(input_hi, 4),
_mm256_set1_epi8(0x7f))));
__m256i tmp_ws_lo = _mm256_cmpeq_epi8(
_mm256_and_si256(v_lo, whitespace_shufti_mask), _mm256_set1_epi8(0));
__m256i tmp_ws_hi = _mm256_cmpeq_epi8(
_mm256_and_si256(v_hi, whitespace_shufti_mask), _mm256_set1_epi8(0));
uint64_t ws_res_0 =
static_cast<uint32_t>(_mm256_movemask_epi8(tmp_ws_lo));
uint64_t ws_res_1 = _mm256_movemask_epi8(tmp_ws_hi);
uint64_t whitespace = ~(ws_res_0 | (ws_res_1 << 32));
whitespace &= ~quote_mask;
uint64_t non_whitespace = ~whitespace;
__m128i x1 = _mm256_extracti128_si256(input_lo, 0);
__m128i x2 = _mm256_extracti128_si256(input_lo, 1);
__m128i x3 = _mm256_extracti128_si256(input_hi, 0);
__m128i x4 = _mm256_extracti128_si256(input_hi, 1);
int mask1 = non_whitespace & 0xFFFF;
int mask2 = (non_whitespace >> 16) & 0xFFFF;
int mask3 = (non_whitespace >> 32) & 0xFFFF;
int mask4 = (non_whitespace >> 48) & 0xFFFF;
x1 = skinnycleanm128(x1, mask1);
x2 = skinnycleanm128(x2, mask2);
x3 = skinnycleanm128(x3, mask3);
x4 = skinnycleanm128(x4, mask4);
int pop1 = hamming(non_whitespace & 0xFFFF);
int pop2 = hamming(non_whitespace & UINT64_C(0xFFFFFFFF));
int pop3 = hamming(non_whitespace & UINT64_C(0xFFFFFFFFFFFF));
int pop4 = hamming(non_whitespace);
_mm_storeu_si128(reinterpret_cast<__m128i *>(out), x1);
_mm_storeu_si128(reinterpret_cast<__m128i *>(out + pop1), x2);
_mm_storeu_si128(reinterpret_cast<__m128i *>(out + pop2), x3);
_mm_storeu_si128(reinterpret_cast<__m128i *>(out + pop3), x4);
out += pop4;
}
}
// we finish off the job... copying and pasting the code is not ideal here,
// but it gets the job done.
if (idx < len) {
uint8_t buffer[64];
memset(buffer, 0, 64);
memcpy(buffer, buf + idx, len - idx);
__m256i input_lo =
_mm256_loadu_si256(reinterpret_cast<const __m256i *>(buffer));
__m256i input_hi =
_mm256_loadu_si256(reinterpret_cast<const __m256i *>(buffer + 32));
uint64_t bs_bits =
cmp_mask_against_input_mini(input_lo, input_hi, _mm256_set1_epi8('\\'));
uint64_t start_edges = bs_bits & ~(bs_bits << 1);
uint64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash;
uint64_t even_starts = start_edges & even_start_mask;
uint64_t odd_starts = start_edges & ~even_start_mask;
uint64_t even_carries = bs_bits + even_starts;
uint64_t odd_carries;
// bool iter_ends_odd_backslash =
add_overflow(bs_bits, odd_starts, &odd_carries);
odd_carries |= prev_iter_ends_odd_backslash;
// prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL;
// // we never use it
uint64_t even_carry_ends = even_carries & ~bs_bits;
uint64_t odd_carry_ends = odd_carries & ~bs_bits;
uint64_t even_start_odd_end = even_carry_ends & odd_bits;
uint64_t odd_start_even_end = odd_carry_ends & even_bits;
uint64_t odd_ends = even_start_odd_end | odd_start_even_end;
uint64_t quote_bits =
cmp_mask_against_input_mini(input_lo, input_hi, _mm256_set1_epi8('"'));
quote_bits = quote_bits & ~odd_ends;
uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128(
_mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFF), 0));
quote_mask ^= prev_iter_inside_quote;
// prev_iter_inside_quote = (uint64_t)((int64_t)quote_mask >> 63);// we
// don't need this anymore
__m256i mask_20 = _mm256_set1_epi8(0x20); // c==32
__m256i mask_70 =
_mm256_set1_epi8(0x70); // adding 0x70 does not check low 4-bits
// but moves any value >= 16 above 128
__m256i lut_cntrl = _mm256_setr_epi8(
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0x00,
0x00, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0xFF, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00);
__m256i tmp_ws_lo = _mm256_or_si256(
_mm256_cmpeq_epi8(mask_20, input_lo),
_mm256_shuffle_epi8(lut_cntrl, _mm256_adds_epu8(mask_70, input_lo)));
__m256i tmp_ws_hi = _mm256_or_si256(
_mm256_cmpeq_epi8(mask_20, input_hi),
_mm256_shuffle_epi8(lut_cntrl, _mm256_adds_epu8(mask_70, input_hi)));
uint64_t ws_res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(tmp_ws_lo));
uint64_t ws_res_1 = _mm256_movemask_epi8(tmp_ws_hi);
uint64_t whitespace = (ws_res_0 | (ws_res_1 << 32));
whitespace &= ~quote_mask;
if (len - idx < 64) {
whitespace |= UINT64_C(0xFFFFFFFFFFFFFFFF) << (len - idx);
}
uint64_t non_whitespace = ~whitespace;
int mask1 = non_whitespace & 0xFFFF;
int mask2 = (non_whitespace >> 16) & 0xFFFF;
int mask3 = (non_whitespace >> 32) & 0xFFFF;
int mask4 = (non_whitespace >> 48) & 0xFFFF;
x1 = skinnycleanm128(x1, mask1);
x2 = skinnycleanm128(x2, mask2);
x3 = skinnycleanm128(x3, mask3);
x4 = skinnycleanm128(x4, mask4);
int pop1 = hamming(non_whitespace & 0xFFFF);
int pop2 = hamming(non_whitespace & UINT64_C(0xFFFFFFFF));
int pop3 = hamming(non_whitespace & UINT64_C(0xFFFFFFFFFFFF));
int pop4 = hamming(non_whitespace);
_mm_storeu_si128(reinterpret_cast<__m128i *>(out), x1);
_mm_storeu_si128(reinterpret_cast<__m128i *>(out + pop1), x2);
_mm_storeu_si128(reinterpret_cast<__m128i *>(out + pop2), x3);
_mm_storeu_si128(reinterpret_cast<__m128i *>(out + pop3), x4);
out += pop4;
}
*out = '\0'; // NULL termination
return out - initout;
}
size_t oldjson_minify(const uint8_t *buf, size_t len, uint8_t *out) {
// Useful constant masks
const uint64_t even_bits = 0x5555555555555555ULL;
const uint64_t odd_bits = ~even_bits;
uint8_t *initout(out);
uint64_t prev_iter_ends_odd_backslash =
0ULL; // either 0 or 1, but a 64-bit value
uint64_t prev_iter_inside_quote = 0ULL; // either all zeros or all ones
size_t idx = 0;
if (len >= 64) {
size_t avx_len = len - 63;
for (; idx < avx_len; idx += 64) {
__m256i input_lo =
_mm256_loadu_si256(reinterpret_cast<const __m256i *>(buf + idx + 0));
__m256i input_hi =
_mm256_loadu_si256(reinterpret_cast<const __m256i *>(buf + idx + 32));
uint64_t bs_bits = cmp_mask_against_input_mini(input_lo, input_hi,
_mm256_set1_epi8('\\'));
uint64_t start_edges = bs_bits & ~(bs_bits << 1);
uint64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash;
uint64_t even_starts = start_edges & even_start_mask;
uint64_t odd_starts = start_edges & ~even_start_mask;
uint64_t even_carries = bs_bits + even_starts;
uint64_t odd_carries;
bool iter_ends_odd_backslash =
add_overflow(bs_bits, odd_starts, &odd_carries);
odd_carries |= prev_iter_ends_odd_backslash;
prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL;
uint64_t even_carry_ends = even_carries & ~bs_bits;
uint64_t odd_carry_ends = odd_carries & ~bs_bits;
uint64_t even_start_odd_end = even_carry_ends & odd_bits;
uint64_t odd_start_even_end = odd_carry_ends & even_bits;
uint64_t odd_ends = even_start_odd_end | odd_start_even_end;
uint64_t quote_bits = cmp_mask_against_input_mini(input_lo, input_hi,
_mm256_set1_epi8('"'));
quote_bits = quote_bits & ~odd_ends;
uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128(
_mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFF), 0));
quote_mask ^= prev_iter_inside_quote;
prev_iter_inside_quote = static_cast<uint64_t>(
static_cast<int64_t>(quote_mask) >>
63); // might be undefined behavior, should be fully defined in C++20,
// ok according to John Regher from Utah University
const __m256i low_nibble_mask = _mm256_setr_epi8(
// 0 9 a b c d
16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0, 16, 0, 0, 0, 0, 0,
0, 0, 0, 8, 12, 1, 2, 9, 0, 0);
const __m256i high_nibble_mask = _mm256_setr_epi8(
// 0 2 3 5 7
8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0, 8, 0, 18, 4, 0, 1, 0,
1, 0, 0, 0, 3, 2, 1, 0, 0);
__m256i whitespace_shufti_mask = _mm256_set1_epi8(0x18);
__m256i v_lo = _mm256_and_si256(
_mm256_shuffle_epi8(low_nibble_mask, input_lo),
_mm256_shuffle_epi8(high_nibble_mask,
_mm256_and_si256(_mm256_srli_epi32(input_lo, 4),
_mm256_set1_epi8(0x7f))));
__m256i v_hi = _mm256_and_si256(
_mm256_shuffle_epi8(low_nibble_mask, input_hi),
_mm256_shuffle_epi8(high_nibble_mask,
_mm256_and_si256(_mm256_srli_epi32(input_hi, 4),
_mm256_set1_epi8(0x7f))));
__m256i tmp_ws_lo = _mm256_cmpeq_epi8(
_mm256_and_si256(v_lo, whitespace_shufti_mask), _mm256_set1_epi8(0));
__m256i tmp_ws_hi = _mm256_cmpeq_epi8(
_mm256_and_si256(v_hi, whitespace_shufti_mask), _mm256_set1_epi8(0));
uint64_t ws_res_0 =
static_cast<uint32_t>(_mm256_movemask_epi8(tmp_ws_lo));
uint64_t ws_res_1 = _mm256_movemask_epi8(tmp_ws_hi);
uint64_t whitespace = ~(ws_res_0 | (ws_res_1 << 32));
whitespace &= ~quote_mask;
int mask1 = whitespace & 0xFFFF;
int mask2 = (whitespace >> 16) & 0xFFFF;
int mask3 = (whitespace >> 32) & 0xFFFF;
int mask4 = (whitespace >> 48) & 0xFFFF;
int pop1 = hamming((~whitespace) & 0xFFFF);
int pop2 = hamming((~whitespace) & UINT64_C(0xFFFFFFFF));
int pop3 = hamming((~whitespace) & UINT64_C(0xFFFFFFFFFFFF));
int pop4 = hamming((~whitespace));
__m128i x1 = _mm256_extracti128_si256(input_lo, 0);
__m128i x2 = _mm256_extracti128_si256(input_lo, 1);
__m128i x3 = _mm256_extracti128_si256(input_hi, 0);
__m128i x4 = _mm256_extracti128_si256(input_hi, 1);
x1 = skinnycleanm128(x1, mask1);
x2 = skinnycleanm128(x2, mask2);
x3 = skinnycleanm128(x3, mask3);
x4 = skinnycleanm128(x4, mask4);
_mm_storeu_si128(reinterpret_cast<__m128i *>(out), x1);
_mm_storeu_si128(reinterpret_cast<__m128i *>(out + pop1), x2);
_mm_storeu_si128(reinterpret_cast<__m128i *>(out + pop2), x3);
_mm_storeu_si128(reinterpret_cast<__m128i *>(out + pop3), x4);
out += pop4;
}
}
// we finish off the job... copying and pasting the code is not ideal here,
// but it gets the job done.
if (idx < len) {
uint8_t buffer[64];
memset(buffer, 0, 64);
memcpy(buffer, buf + idx, len - idx);
__m256i input_lo =
_mm256_loadu_si256(reinterpret_cast<const __m256i *>(buffer));
__m256i input_hi =
_mm256_loadu_si256(reinterpret_cast<const __m256i *>(buffer + 32));
uint64_t bs_bits =
cmp_mask_against_input_mini(input_lo, input_hi, _mm256_set1_epi8('\\'));
uint64_t start_edges = bs_bits & ~(bs_bits << 1);
uint64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash;
uint64_t even_starts = start_edges & even_start_mask;
uint64_t odd_starts = start_edges & ~even_start_mask;
uint64_t even_carries = bs_bits + even_starts;
uint64_t odd_carries;
// bool iter_ends_odd_backslash =
add_overflow(bs_bits, odd_starts, &odd_carries);
odd_carries |= prev_iter_ends_odd_backslash;
// prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL;
// // we never use it
uint64_t even_carry_ends = even_carries & ~bs_bits;
uint64_t odd_carry_ends = odd_carries & ~bs_bits;
uint64_t even_start_odd_end = even_carry_ends & odd_bits;
uint64_t odd_start_even_end = odd_carry_ends & even_bits;
uint64_t odd_ends = even_start_odd_end | odd_start_even_end;
uint64_t quote_bits =
cmp_mask_against_input_mini(input_lo, input_hi, _mm256_set1_epi8('"'));
quote_bits = quote_bits & ~odd_ends;
// carry-less multiplication by an all-ones constant computes the prefix
// XOR of quote_bits, i.e., a mask of the bytes sitting between quotes
uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128(
    _mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFF), 0));
quote_mask ^= prev_iter_inside_quote;
// prev_iter_inside_quote = (uint64_t)((int64_t)quote_mask >> 63);// we
// don't need this anymore
__m256i mask_20 = _mm256_set1_epi8(0x20); // c==32
__m256i mask_70 =
_mm256_set1_epi8(0x70); // adding 0x70 does not check low 4-bits
// but moves any value >= 16 above 128
__m256i lut_cntrl = _mm256_setr_epi8(
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0x00,
0x00, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0xFF, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00);
__m256i tmp_ws_lo = _mm256_or_si256(
_mm256_cmpeq_epi8(mask_20, input_lo),
_mm256_shuffle_epi8(lut_cntrl, _mm256_adds_epu8(mask_70, input_lo)));
__m256i tmp_ws_hi = _mm256_or_si256(
_mm256_cmpeq_epi8(mask_20, input_hi),
_mm256_shuffle_epi8(lut_cntrl, _mm256_adds_epu8(mask_70, input_hi)));
uint64_t ws_res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(tmp_ws_lo));
uint64_t ws_res_1 = _mm256_movemask_epi8(tmp_ws_hi);
uint64_t whitespace = (ws_res_0 | (ws_res_1 << 32));
whitespace &= ~quote_mask;
if (len - idx < 64) {
  // mark the padding bytes as whitespace so they are not copied out
  whitespace |= UINT64_C(0xFFFFFFFFFFFFFFFF) << (len - idx);
}
int mask1 = whitespace & 0xFFFF;
int mask2 = (whitespace >> 16) & 0xFFFF;
int mask3 = (whitespace >> 32) & 0xFFFF;
int mask4 = (whitespace >> 48) & 0xFFFF;
int pop1 = hamming((~whitespace) & 0xFFFF);
int pop2 = hamming((~whitespace) & UINT64_C(0xFFFFFFFF));
int pop3 = hamming((~whitespace) & UINT64_C(0xFFFFFFFFFFFF));
int pop4 = hamming((~whitespace));
__m128i x1 = _mm256_extracti128_si256(input_lo, 0);
__m128i x2 = _mm256_extracti128_si256(input_lo, 1);
__m128i x3 = _mm256_extracti128_si256(input_hi, 0);
__m128i x4 = _mm256_extracti128_si256(input_hi, 1);
x1 = skinnycleanm128(x1, mask1);
x2 = skinnycleanm128(x2, mask2);
x3 = skinnycleanm128(x3, mask3);
x4 = skinnycleanm128(x4, mask4);
_mm_storeu_si128(reinterpret_cast<__m128i *>(buffer), x1);
_mm_storeu_si128(reinterpret_cast<__m128i *>(buffer + pop1), x2);
_mm_storeu_si128(reinterpret_cast<__m128i *>(buffer + pop2), x3);
_mm_storeu_si128(reinterpret_cast<__m128i *>(buffer + pop3), x4);
memcpy(out, buffer, pop4);
out += pop4;
}
*out = '\0'; // NULL termination
return out - initout;
}
} // namespace simdjson
#endif
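
The pop1/pop2/pop3 offsets above implement a store-overlap trick: each 16-byte lane is compacted on its own (via skinnycleanm128) and then stored at the running total of bytes kept so far, so each store overwrites the junk tail of the previous one. A scalar model of the same idea (hypothetical helper names; a sketch, not the shipped code):

#include <cstddef>
#include <cstdint>

// Keep the bytes whose bit in 'ws_mask' is 0; return how many were kept.
static size_t compress16(const uint8_t *in, uint16_t ws_mask, uint8_t *out) {
  size_t kept = 0;
  for (int i = 0; i < 16; i++) {
    out[kept] = in[i];                // always write; junk is overwritten later
    kept += ((ws_mask >> i) & 1) ^ 1; // advance only for non-whitespace bytes
  }
  return kept;
}

// Process one 64-byte block: lane k lands at the cumulative count of kept
// bytes, just like out, out + pop1, out + pop2, out + pop3 above.
static size_t despace64(const uint8_t *in, uint64_t whitespace, uint8_t *out) {
  size_t pop = 0;
  for (int lane = 0; lane < 4; lane++) {
    pop += compress16(in + 16 * lane,
                      uint16_t(whitespace >> (16 * lane)), out + pop);
  }
  return pop;
}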

View File

@ -1,5 +1,4 @@
#include "simdjson.h"
#include "implementation.cpp"
#include "jsonminifier.cpp"
#include "stage1_find_marks.cpp"
#include "stage2_build_tape.cpp"

View File

@ -46,7 +46,7 @@ really_inline int leading_zeroes(uint64_t input_num) {
#endif// _MSC_VER
}
really_inline int hamming(uint64_t input_num) {
really_inline int count_ones(uint64_t input_num) {
#ifdef _MSC_VER
// note: we do not support legacy 32-bit Windows
return __popcnt64(input_num);// Visual Studio wants two underscores

View File

@ -11,6 +11,7 @@ class implementation final : public simdjson::implementation {
public:
really_inline implementation() : simdjson::implementation("westmere", "Intel/AMD SSE4.2", instruction_set::SSE42 | instruction_set::PCLMULQDQ) {}
WARN_UNUSED error_code parse(const uint8_t *buf, size_t len, document::parser &parser) const noexcept final;
WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final;
WARN_UNUSED error_code stage1(const uint8_t *buf, size_t len, document::parser &parser, bool streaming) const noexcept final;
WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, document::parser &parser) const noexcept final;
WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, document::parser &parser, size_t &next_json) const noexcept final;

View File

@ -2,8 +2,12 @@
#define SIMDJSON_WESTMERE_SIMD_H
#include "simdjson.h"
#include "simdprune_tables.h"
#include "westmere/bitmanipulation.h"
#include "westmere/intrinsics.h"
TARGET_WESTMERE
namespace simdjson::westmere::simd {
@ -106,6 +110,42 @@ namespace simdjson::westmere::simd {
really_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
return _mm_shuffle_epi8(lookup_table, *this);
}
// Copies to 'output" all bytes corresponding to a 0 in the mask (interpreted as a bitset).
// Passing a 0 value for mask would be equivalent to writing out every byte to output.
// Only the first 16 - count_ones(mask) bytes of the result are significant but 16 bytes
// get written.
// Design consideration: it seems like a function with the
// signature simd8<L> compress(uint32_t mask) would be
// sensible, but the AVX ISA makes this kind of approach difficult.
template<typename L>
really_inline void compress(uint16_t mask, L * output) const {
// this particular implementation was inspired by work done by @animetosho
// we do it in two steps: first the low 8 bytes, then the high 8 bytes
uint8_t mask1 = static_cast<uint8_t>(mask); // least significant 8 bits
uint8_t mask2 = static_cast<uint8_t>(mask >> 8); // most significant 8 bits
// next line just loads the 64-bit values thintable_epi8[mask1] and
// thintable_epi8[mask2] into a 128-bit register, using only
// two instructions on most compilers.
__m128i shufmask = _mm_set_epi64x(thintable_epi8[mask2], thintable_epi8[mask1]);
// we increment by 0x08 the second half of the mask
shufmask =
_mm_add_epi8(shufmask, _mm_set_epi32(0x08080808, 0x08080808, 0, 0));
// this is the "nearly pruned" version: each 8-byte half is compacted on its own
__m128i pruned = _mm_shuffle_epi8(*this, shufmask);
// we still need to put the two halves together.
// we compute the popcount of the first half:
int pop1 = BitsSetTable256mul2[mask1];
// then load the corresponding mask: it writes out only the first pop1
// bytes from the first 8 bytes, then fills in with the bytes from the
// second 8 bytes, plus some filler at the end.
__m128i compactmask =
    _mm_loadu_si128((const __m128i *)(pshufb_combine_table + pop1 * 8));
__m128i answer = _mm_shuffle_epi8(pruned, compactmask);
_mm_storeu_si128((__m128i *)(output), answer);
}
template<typename L>
really_inline simd8<L> lookup_16(
L replace0, L replace1, L replace2, L replace3,
@ -235,6 +275,13 @@ namespace simdjson::westmere::simd {
this->chunks[3].store(ptr+sizeof(simd8<T>)*3);
}
really_inline void compress(uint64_t mask, T * output) const {
  // each chunk is written at the running total of bytes kept so far:
  // 16 - count_ones(mask & 0xFFFF) is the number of survivors of chunk 0,
  // and so on, so the compacted chunks land contiguously in 'output'
  this->chunks[0].compress(mask, output);
  this->chunks[1].compress(mask >> 16, output + 16 - count_ones(mask & 0xFFFF));
  this->chunks[2].compress(mask >> 32, output + 32 - count_ones(mask & 0xFFFFFFFF));
  this->chunks[3].compress(mask >> 48, output + 48 - count_ones(mask & 0xFFFFFFFFFFFF));
}
template <typename F>
static really_inline void each_index(F const& each) {
each(0);
@ -302,7 +349,6 @@ namespace simdjson::westmere::simd {
const simd8<T> mask = simd8<T>::splat(m);
return this->map( [&](auto a) { return a <= mask; } ).to_bitmask();
}
}; // struct simd8x64<T>
} // namespace simdjson::westmere::simd
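
A hedged usage sketch for the new compress() method above (literal values illustrative): given 16 input bytes and a bitset flagging the bytes to drop, the survivors are packed to the front of the output; only the first 16 - count_ones(mask) output bytes are meaningful, but 16 bytes get written.

void compress_demo() {
  uint8_t in[16] = {'{', ' ', '"', 'a', '"', ' ', ':', ' ', '1', '}',
                    0, 0, 0, 0, 0, 0};
  uint8_t out[16];
  simd8<uint8_t> chunk(in);  // load the 16 bytes
  uint16_t drop = 0x00A2;    // bits 1, 5 and 7 flag the three spaces
  chunk.compress(drop, out); // out now begins with {"a":1}
}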

View File

@ -29,6 +29,11 @@ really_inline json_character_block json_character_block::classify(const simd::si
auto whitespace_table = simd8<uint8_t>::repeat_16(' ', 100, 100, 100, 17, 100, 113, 2, 100, '\t', '\n', 112, 100, '\r', 100, 100);
auto op_table = simd8<uint8_t>::repeat_16(',', '}', 0, 0, 0xc0u, 0, 0, 0, 0, 0, 0, 0, 0, 0, ':', '{');
// We compute whitespace and op separately. If the code later only uses one or
// the other, given that all functions are aggressively inlined, we can hope
// that the useless computations will be omitted. This is notably the case
// when minifying (we only need whitespace).
uint64_t whitespace = in.map([&](simd8<uint8_t> _in) {
return _in == simd8<uint8_t>(_mm_shuffle_epi8(whitespace_table, _in));
}).to_bitmask();
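
The magic constants in whitespace_table make sense once you model _mm_shuffle_epi8 per byte: it indexes the table with the low nibble (and returns 0 when the index byte's high bit is set), and the entries are chosen so that a byte equals its own lookup exactly when it is one of the four JSON whitespace characters. A scalar sketch of the same test (assumed helper name):

static bool is_json_whitespace(uint8_t c) {
  static const uint8_t table[16] = {' ', 100,  100,  100, 17,  100,  113, 2,
                                    100, '\t', '\n', 112, 100, '\r', 100, 100};
  if (c & 0x80) return false; // pshufb zeroes lanes whose index high bit is set
  return table[c & 0xF] == c; // true only for ' ', '\t', '\n', '\r'
}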
@ -53,11 +58,17 @@ really_inline simd8<bool> must_be_continuation(simd8<uint8_t> prev1, simd8<uint8
return simd8<int8_t>(is_second_byte | is_third_byte | is_fourth_byte) > int8_t(0);
}
#include "generic/utf8_lookup2_algorithm.h"
#include "generic/buf_block_reader.h"
#include "generic/json_string_scanner.h"
#include "generic/json_scanner.h"
#include "generic/json_structural_indexer.h"
#include "generic/json_minifier.h"
WARN_UNUSED error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept {
return westmere::stage1::json_minifier::minify<64>(buf, len, dst, dst_len);
}
#include "generic/utf8_lookup2_algorithm.h"
#include "generic/json_structural_indexer.h"
WARN_UNUSED error_code implementation::stage1(const uint8_t *buf, size_t len, document::parser &parser, bool streaming) const noexcept {
return westmere::stage1::json_structural_indexer::index<64>(buf, len, parser, streaming);
}
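
The benchmark change at the end of this commit exercises the minify() entry point defined above; a minimal standalone caller might look like this sketch (buffer handling reduced to the essentials):

#include "simdjson.h"
#include <cstdio>

bool minify_to_stdout(const simdjson::padded_string &json) {
  simdjson::padded_string dst(json.length()); // minified JSON is never longer
  size_t dst_len;
  auto error = simdjson::active_implementation->minify(
      reinterpret_cast<const uint8_t *>(json.data()), json.length(),
      reinterpret_cast<uint8_t *>(dst.data()), dst_len);
  if (error) { return false; }
  std::fwrite(dst.data(), 1, dst_len, stdout);
  return true;
}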

View File

@ -892,7 +892,8 @@ namespace dom_api {
if (doc["a"].as_uint64_t().first != 1) { cerr << "Expected uint64_t(doc[\"a\"]) to be 1, was " << doc["a"].first << endl; return false; }
UNUSED document::element val;
tie(val, error) = doc["d"];
// tie(val, error) = doc["d"]; fails with "no viable overloaded '='" on Apple clang version 11.0.0
doc["d"].tie(val, error);
if (error != simdjson::NO_SUCH_FIELD) { cerr << "Expected NO_SUCH_FIELD error for uint64_t(doc[\"d\"]), got " << error << endl; return false; }
return true;
}
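
The member spelling sidesteps the assignment form tie(val, error) = ..., which Apple clang 11 fails to resolve; a minimal model of the shape (a sketch, not simdjson's actual result type):

template <typename T, typename E>
struct result_pair {
  T value{};
  E error{};
  // assigns through out-parameters, so no operator=() overload is involved
  void tie(T &v, E &e) const { v = value; e = error; }
};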
@ -906,11 +907,11 @@ namespace dom_api {
if (doc["obj"]["a"].as_uint64_t().first != 1) { cerr << "Expected uint64_t(doc[\"obj\"][\"a\"]) to be 1, was " << doc["obj"]["a"].first << endl; return false; }
document::object obj;
tie(obj, error) = doc.as_object();
doc.as_object().tie(obj, error); // tie(...) = fails with "no viable overloaded '='" on Apple clang version 11.0.0
if (error) { cerr << "Error: " << error << endl; return false; }
if (obj["obj"]["a"].as_uint64_t().first != 1) { cerr << "Expected uint64_t(doc[\"obj\"][\"a\"]) to be 1, was " << doc["obj"]["a"].first << endl; return false; }
tie(obj, error) = obj["obj"].as_object();
obj["obj"].as_object().tie(obj, error); // tie(...) = fails with "no viable overloaded '='" on Apple clang version 11.0.0
if (obj["a"].as_uint64_t().first != 1) { cerr << "Expected uint64_t(obj[\"a\"]) to be 1, was " << obj["a"].first << endl; return false; }
if (obj["b"].as_uint64_t().first != 2) { cerr << "Expected uint64_t(obj[\"b\"]) to be 2, was " << obj["b"].first << endl; return false; }
if (obj["c"].as_uint64_t().first != 3) { cerr << "Expected uint64_t(obj[\"c\"]) to be 3, was " << obj["c"].first << endl; return false; }
@ -920,7 +921,7 @@ namespace dom_api {
if (obj["a"].as_uint64_t().first != 1) { cerr << "Expected uint64_t(obj[\"a\"]) to be 1, was " << obj["a"].first << endl; return false; }
UNUSED document::element val;
tie(val, error) = doc["d"];
doc["d"].tie(val, error); // tie(...) = fails with "no viable overloaded '='" on Apple clang version 11.0.0
if (error != simdjson::NO_SUCH_FIELD) { cerr << "Expected NO_SUCH_FIELD error for uint64_t(obj[\"d\"]), got " << error << endl; return false; }
return true;
}
@ -944,14 +945,14 @@ namespace dom_api {
if (error) { cerr << "Error: " << error << endl; return false; }
for (auto tweet : tweets) {
document::object user;
tie(user, error) = tweet["user"].as_object();
tweet["user"].as_object().tie(user, error); // tie(...) = fails with "no viable overloaded '='" on Apple clang version 11.0.0
if (error) { cerr << "Error: " << error << endl; return false; }
bool default_profile;
tie(default_profile, error) = user["default_profile"].as_bool();
user["default_profile"].as_bool().tie(default_profile, error); // tie(...) = fails with "no viable overloaded '='" on Apple clang version 11.0.0
if (error) { cerr << "Error: " << error << endl; return false; }
if (default_profile) {
std::string_view screen_name;
tie(screen_name, error) = user["screen_name"].as_string();
user["screen_name"].as_string().tie(screen_name, error); // tie(...) = fails with "no viable overloaded '='" on Apple clang version 11.0.0
if (error) { cerr << "Error: " << error << endl; return false; }
default_users.insert(screen_name);
}
@ -972,13 +973,13 @@ namespace dom_api {
if (!not_found) {
for (auto image : media) {
document::object sizes;
tie(sizes, error) = image["sizes"].as_object();
image["sizes"].as_object().tie(sizes, error); // tie(...) = fails with "no viable overloaded '='" on Apple clang version 11.0.0
if (error) { cerr << "Error: " << error << endl; return false; }
for (auto [key, size] : sizes) {
uint64_t width, height;
tie(width, error) = size["w"].as_uint64_t();
size["w"].as_uint64_t().tie(width, error); // tie(...) = fails with "no viable overloaded '='" on Apple clang version 11.0.0
if (error) { cerr << "Error: " << error << endl; return false; }
tie(height, error) = size["h"].as_uint64_t();
size["h"].as_uint64_t().tie(height, error); // tie(...) = fails with "no viable overloaded '='" on Apple clang version 11.0.0
if (error) { cerr << "Error: " << error << endl; return false; }
image_sizes.insert(make_pair(width, height));
}

View File

@ -1,18 +1,90 @@
#include <cstdlib>  // for exit()
#include <iomanip>  // for std::setw()
#include <iostream>
#ifndef _MSC_VER
#include <dirent.h>
#include <unistd.h>
#endif
#include "simdjson.h"
int main(int argc, char *argv[]) {
if (argc != 2) {
std::cerr << "Usage: " << argv[0] << " <jsonfile>\n";
exit(1);
// Stash the exe_name in main() for functions to use
char* exe_name;
void print_usage(std::ostream& out) {
out << "Usage: " << exe_name << " [-a ARCH] <jsonfile>" << std::endl;
out << std::endl;
out << "Runs the parser against the given json files in a loop, measuring speed and other statistics." << std::endl;
out << std::endl;
out << "Options:" << std::endl;
out << std::endl;
out << "-a IMPL - Use the given parser implementation. By default, detects the most advanced" << std::endl;
out << " implementation supported on the host machine." << std::endl;
for (auto impl : simdjson::available_implementations) {
out << "-a " << std::left << std::setw(9) << impl->name() << " - Use the " << impl->description() << " parser implementation." << std::endl;
}
std::string filename = argv[argc - 1];
}
void exit_usage(std::string message) {
std::cerr << message << std::endl;
std::cerr << std::endl;
print_usage(std::cerr);
exit(EXIT_FAILURE);
}
struct option_struct {
char* filename;
option_struct(int argc, char **argv) {
#ifndef _MSC_VER
int c;
while ((c = getopt(argc, argv, "a:")) != -1) {
switch (c) {
case 'a': {
const simdjson::implementation *impl = simdjson::available_implementations[optarg];
if (!impl) {
std::string exit_message = std::string("Unsupported option value -a ") + optarg + ": expected -a with one of ";
for (auto imple : simdjson::available_implementations) {
exit_message += imple->name();
exit_message += " ";
}
exit_usage(exit_message);
}
simdjson::active_implementation = impl;
break;
}
default:
// reaching here means an argument was given to getopt() which did not have a case label
exit_usage("Unexpected argument - missing case for option "+
std::string(1,static_cast<char>(c))+
" (programming error)");
}
}
#else
int optind = 1;
#endif
// There should be exactly one remaining argument: the input file
if(optind + 1 == argc) {
filename = argv[optind];
} else {
exit_usage("Please specify exactly one input file.");
}
}
};
int main(int argc, char *argv[]) {
exe_name = argv[0];
option_struct options(argc, argv);
std::string filename = options.filename;
auto [p, error] = simdjson::padded_string::load(filename);
if (error) {
std::cerr << "Could not load the file " << filename << std::endl;
return EXIT_FAILURE;
}
simdjson::json_minify(p, p.data());
printf("%s", p.data());
simdjson::padded_string copy(p.length());
size_t copy_len;
error = simdjson::active_implementation->minify((const uint8_t*)p.data(), p.length(), (uint8_t*)copy.data(), copy_len);
if (error) { std::cerr << error << std::endl; return 1; }
printf("%s", copy.data());
}
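
With the new -a option one can pin the tool to a given implementation, e.g. minify -a westmere twitter.json (binary and input names illustrative); without -a, the most advanced implementation supported on the host machine is used, as before.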