Make simd8x64 uncopyable and other Visual Studio optimizations (#1031)

* Working on making simd8x64 immutable


* Even less invasive
This commit is contained in:
Daniel Lemire 2020-07-21 18:11:21 -04:00 committed by GitHub
parent 0ff6833e96
commit 4beb2ed507
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
22 changed files with 220 additions and 210 deletions

View File

@ -86,7 +86,7 @@ else()
endif()
if(SIMDJSON_VISUAL_STUDIO_BUILD_WITH_DEBUG_INFO_FOR_PROFILING)
target_link_options(simdjson-flags INTERFACE /DEBUG )
target_compile_options(simdjson-flags INTERFACE /Zi )
target_compile_options(simdjson-flags INTERFACE /Zi)
endif()
else()
target_compile_options(simdjson-internal-flags INTERFACE -fPIC)

View File

@ -1,4 +1,4 @@
/* auto-generated on Tue 21 Jul 2020 16:54:49 EDT. Do not edit! */
/* auto-generated on Tue 21 Jul 2020 17:54:23 EDT. Do not edit! */
#include <iostream>
#include "simdjson.h"

View File

@ -1,4 +1,4 @@
/* auto-generated on Tue 21 Jul 2020 16:54:49 EDT. Do not edit! */
/* auto-generated on Tue 21 Jul 2020 17:54:23 EDT. Do not edit! */
/* begin file src/simdjson.cpp */
#include "simdjson.h"
@ -2646,10 +2646,14 @@ really_inline int8x16_t make_int8x16_t(int8_t x1, int8_t x2, int8_t x3, int8_
template<typename T>
struct simd8x64 {
static const int NUM_CHUNKS = 64 / sizeof(simd8<T>);
static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
static_assert(NUM_CHUNKS == 4, "ARM kernel should use four registers per 64-byte block.");
const simd8<T> chunks[NUM_CHUNKS];
really_inline simd8x64() : chunks{simd8<T>(), simd8<T>(), simd8<T>(), simd8<T>()} {}
// Copying, assigning and default-constructing a simd8x64 are all forbidden.
simd8x64(const simd8x64<T>& o) = delete; // no copy allowed
// The parameter has to be simd8x64<T> (not simd8<T>) for this declaration to be
// the copy assignment operator that is meant to be forbidden.
simd8x64<T>& operator=(const simd8x64<T>& other) = delete; // no assignment allowed
simd8x64() = delete; // no default constructor allowed
really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1, const simd8<T> chunk2, const simd8<T> chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {}
really_inline simd8x64(const T ptr[64]) : chunks{simd8<T>::load(ptr), simd8<T>::load(ptr+16), simd8<T>::load(ptr+32), simd8<T>::load(ptr+48)} {}
@ -2660,6 +2664,11 @@ really_inline int8x16_t make_int8x16_t(int8_t x1, int8_t x2, int8_t x3, int8_
this->chunks[3].store(ptr+sizeof(simd8<T>)*3);
}
// Combines the four chunks pairwise with simd8's operator| and returns the
// single combined value.
really_inline simd8<T> reduce_or() const {
  const auto lower_pair = this->chunks[0] | this->chunks[1];
  const auto upper_pair = this->chunks[2] | this->chunks[3];
  return lower_pair | upper_pair;
}
really_inline void compress(uint64_t mask, T * output) const {
this->chunks[0].compress(uint16_t(mask), output);
this->chunks[1].compress(uint16_t(mask >> 16), output + 16 - count_ones(mask & 0xFFFF));
@ -2667,14 +2676,6 @@ really_inline int8x16_t make_int8x16_t(int8_t x1, int8_t x2, int8_t x3, int8_
this->chunks[3].compress(uint16_t(mask >> 48), output + 48 - count_ones(mask & 0xFFFFFFFFFFFF));
}
// Invokes `each` once for every chunk index, in ascending order (0 through 3).
// The calls are written out one by one rather than generated by a loop.
template <typename F>
static really_inline void each_index(F const& each) {
each(0);
each(1);
each(2);
each(3);
}
really_inline uint64_t to_bitmask() const {
#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
const uint8x16_t bit_mask = make_uint8x16_t(
@ -2740,7 +2741,7 @@ namespace arm64 {
using namespace simd;
struct json_character_block {
static really_inline json_character_block classify(const simd::simd8x64<uint8_t> in);
static really_inline json_character_block classify(const simd::simd8x64<uint8_t>& in);
really_inline uint64_t whitespace() const { return _whitespace; }
really_inline uint64_t op() const { return _op; }
@ -2750,7 +2751,7 @@ struct json_character_block {
uint64_t _op;
};
really_inline json_character_block json_character_block::classify(const simd::simd8x64<uint8_t> in) {
really_inline json_character_block json_character_block::classify(const simd::simd8x64<uint8_t>& in) {
// Functional programming causes trouble with Visual Studio.
// Keeping this version in comments since it is much nicer:
// auto v = in.map<uint8_t>([&](simd8<uint8_t> chunk) {
@ -2763,7 +2764,7 @@ really_inline json_character_block json_character_block::classify(const simd::si
const simd8<uint8_t> table1(16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0);
const simd8<uint8_t> table2(8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0);
auto v = simd8x64<uint8_t>(
simd8x64<uint8_t> v(
(in.chunks[0] & 0xf).lookup_16(table1) & (in.chunks[0].shr<4>()).lookup_16(table2),
(in.chunks[1] & 0xf).lookup_16(table1) & (in.chunks[1].shr<4>()).lookup_16(table2),
(in.chunks[2] & 0xf).lookup_16(table1) & (in.chunks[2].shr<4>()).lookup_16(table2),
@ -2804,12 +2805,12 @@ really_inline json_character_block json_character_block::classify(const simd::si
return { whitespace, op };
}
really_inline bool is_ascii(simd8x64<uint8_t> input) {
simd8<uint8_t> bits = (input.chunks[0] | input.chunks[1]) | (input.chunks[2] | input.chunks[3]);
really_inline bool is_ascii(const simd8x64<uint8_t>& input) {
simd8<uint8_t> bits = input.reduce_or();
return bits.max() < 0b10000000u;
}
really_inline simd8<bool> must_be_continuation(simd8<uint8_t> prev1, simd8<uint8_t> prev2, simd8<uint8_t> prev3) {
really_inline simd8<bool> must_be_continuation(const simd8<uint8_t> prev1, const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
simd8<bool> is_second_byte = prev1 >= uint8_t(0b11000000u);
simd8<bool> is_third_byte = prev2 >= uint8_t(0b11100000u);
simd8<bool> is_fourth_byte = prev3 >= uint8_t(0b11110000u);
@ -2821,7 +2822,7 @@ really_inline simd8<bool> must_be_continuation(simd8<uint8_t> prev1, simd8<uint8
return is_second_byte ^ is_third_byte ^ is_fourth_byte;
}
really_inline simd8<bool> must_be_2_3_continuation(simd8<uint8_t> prev2, simd8<uint8_t> prev3) {
really_inline simd8<bool> must_be_2_3_continuation(const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
simd8<bool> is_third_byte = prev2 >= uint8_t(0b11100000u);
simd8<bool> is_fourth_byte = prev3 >= uint8_t(0b11110000u);
return is_third_byte ^ is_fourth_byte;
@ -2865,7 +2866,7 @@ UNUSED static char * format_input_text_64(const uint8_t *text) {
}
// Routines to print masks and text for debugging bitmask operations
UNUSED static char * format_input_text(const simd8x64<uint8_t> in) {
UNUSED static char * format_input_text(const simd8x64<uint8_t>& in) {
static char *buf = (char*)malloc(sizeof(simd8x64<uint8_t>) + 1);
in.store((uint8_t*)buf);
for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
@ -2948,7 +2949,7 @@ struct json_string_block {
// Scans blocks for string characters, storing the state necessary to do so
class json_string_scanner {
public:
really_inline json_string_block next(const simd::simd8x64<uint8_t> in);
really_inline json_string_block next(const simd::simd8x64<uint8_t>& in);
really_inline error_code finish(bool streaming);
private:
@ -3014,7 +3015,7 @@ really_inline uint64_t json_string_scanner::find_escaped_branchless(uint64_t bac
//
// Backslash sequences outside of quotes will be detected in stage 2.
//
really_inline json_string_block json_string_scanner::next(const simd::simd8x64<uint8_t> in) {
really_inline json_string_block json_string_scanner::next(const simd::simd8x64<uint8_t>& in) {
const uint64_t backslash = in.eq('\\');
const uint64_t escaped = find_escaped(backslash);
const uint64_t quote = in.eq('"') & ~escaped;
@ -3105,7 +3106,7 @@ private:
class json_scanner {
public:
json_scanner() {}
really_inline json_block next(const simd::simd8x64<uint8_t> in);
really_inline json_block next(const simd::simd8x64<uint8_t>& in);
really_inline error_code finish(bool streaming);
private:
@ -3142,7 +3143,7 @@ really_inline uint64_t follows(const uint64_t match, const uint64_t filler, uint
return result;
}
really_inline json_block json_scanner::next(const simd::simd8x64<uint8_t> in) {
really_inline json_block json_scanner::next(const simd::simd8x64<uint8_t>& in) {
json_string_block strings = string_scanner.next(in);
json_character_block characters = json_character_block::classify(in);
uint64_t follows_scalar = follows(characters.scalar(), prev_scalar);
@ -3188,13 +3189,13 @@ private:
{}
template<size_t STEP_SIZE>
really_inline void step(const uint8_t *block_buf, buf_block_reader<STEP_SIZE> &reader) noexcept;
really_inline void next(simd::simd8x64<uint8_t> in, json_block block);
really_inline void next(const simd::simd8x64<uint8_t>& in, json_block block);
really_inline error_code finish(uint8_t *dst_start, size_t &dst_len);
json_scanner scanner{};
uint8_t *dst;
};
really_inline void json_minifier::next(simd::simd8x64<uint8_t> in, json_block block) {
really_inline void json_minifier::next(const simd::simd8x64<uint8_t>& in, json_block block) {
uint64_t mask = block.whitespace();
in.compress(mask, dst);
dst += 64 - count_ones(mask);
@ -3347,7 +3348,7 @@ namespace utf8_validation {
using namespace simd;
really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t>& input, const simd8<uint8_t>& prev1) {
really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
// Bit 1 = Too Long (ASCII followed by continuation)
// Bit 2 = Overlong 3-byte
@ -3437,8 +3438,8 @@ using namespace simd;
);
return (byte_1_high & byte_1_low & byte_2_high);
}
really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t>& input,
const simd8<uint8_t>& prev_input, const simd8<uint8_t>& sc) {
really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
simd8<uint8_t> prev2 = input.prev<2>(prev_input);
simd8<uint8_t> prev3 = input.prev<3>(prev_input);
simd8<uint8_t> must23 = simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
@ -3450,7 +3451,7 @@ using namespace simd;
// Return nonzero if there are incomplete multibyte characters at the end of the block:
// e.g. if there is a 4-byte character, but it's 3 bytes from the end.
//
really_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t>& input) {
really_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t> input) {
// If the previous input's last 3 bytes match this, they're too short (they ended at EOF):
// ... 1111____ 111_____ 11______
static const uint8_t max_array[32] = {
@ -3474,7 +3475,7 @@ using namespace simd;
//
// Check whether the current bytes are valid UTF-8.
//
really_inline void check_utf8_bytes(const simd8<uint8_t>& input, const simd8<uint8_t>& prev_input) {
really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
// Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
// (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
simd8<uint8_t> prev1 = input.prev<1>(prev_input);
@ -3490,7 +3491,9 @@ using namespace simd;
}
really_inline void check_next_input(const simd8x64<uint8_t>& input) {
if (unlikely(!is_ascii(input))) {
if(likely(is_ascii(input))) {
this->error |= this->prev_incomplete;
} else {
// you might think that a for-loop would work, but under Visual Studio, it is not good enough.
static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
"We support either two or four chunks per 64-byte block.");
@ -3505,10 +3508,7 @@ using namespace simd;
}
this->prev_incomplete = is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1]);
this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1];
} else {
// If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't
// possibly finish them.
this->error |= this->prev_incomplete;
}
}
@ -3595,7 +3595,7 @@ private:
really_inline json_structural_indexer(uint32_t *structural_indexes);
template<size_t STEP_SIZE>
really_inline void step(const uint8_t *block, buf_block_reader<STEP_SIZE> &reader) noexcept;
really_inline void next(simd::simd8x64<uint8_t> in, json_block block, size_t idx);
really_inline void next(const simd::simd8x64<uint8_t>& in, json_block block, size_t idx);
really_inline error_code finish(dom_parser_implementation &parser, size_t idx, size_t len, bool partial);
json_scanner scanner{};
@ -3664,7 +3664,7 @@ really_inline void json_structural_indexer::step<64>(const uint8_t *block, buf_b
reader.advance();
}
really_inline void json_structural_indexer::next(simd::simd8x64<uint8_t> in, json_block block, size_t idx) {
really_inline void json_structural_indexer::next(const simd::simd8x64<uint8_t>& in, json_block block, size_t idx) {
uint64_t unescaped = in.lteq(0x1F);
checker.check_next_input(in);
indexer.write(uint32_t(idx-64), prev_structurals); // Output *last* iteration's structurals to the parser
@ -7902,6 +7902,7 @@ namespace simd {
really_inline simd8<bool> bits_not_set(simd8<uint8_t> bits) const { return (*this & bits).bits_not_set(); }
really_inline simd8<bool> any_bits_set() const { return ~this->bits_not_set(); }
really_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const { return ~this->bits_not_set(bits); }
// True when _mm256_movemask_epi8 reports that no byte of this vector has its
// most significant bit set.
really_inline bool is_ascii() const {
  const int high_bit_mask = _mm256_movemask_epi8(*this);
  return high_bit_mask == 0;
}
really_inline bool bits_not_set_anywhere() const { return _mm256_testz_si256(*this, *this); }
really_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); }
really_inline bool bits_not_set_anywhere(simd8<uint8_t> bits) const { return _mm256_testz_si256(*this, bits); }
@ -7918,19 +7919,17 @@ namespace simd {
template<typename T>
struct simd8x64 {
static const int NUM_CHUNKS = 64 / sizeof(simd8<T>);
static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
static_assert(NUM_CHUNKS == 2, "Haswell kernel should use two registers per 64-byte block.");
const simd8<T> chunks[NUM_CHUNKS];
really_inline simd8x64() : chunks{simd8<T>(), simd8<T>()} {}
// Copying, assigning and default-constructing a simd8x64 are all forbidden.
simd8x64(const simd8x64<T>& o) = delete; // no copy allowed
// The parameter has to be simd8x64<T> (not simd8<T>) for this declaration to be
// the copy assignment operator that is meant to be forbidden.
simd8x64<T>& operator=(const simd8x64<T>& other) = delete; // no assignment allowed
simd8x64() = delete; // no default constructor allowed
really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1) : chunks{chunk0, chunk1} {}
really_inline simd8x64(const T ptr[64]) : chunks{simd8<T>::load(ptr), simd8<T>::load(ptr+32)} {}
// Invokes `each` once for every chunk index, in ascending order (0, then 1).
// The calls are written out one by one rather than generated by a loop.
template <typename F>
static really_inline void each_index(F const& each) {
each(0);
each(1);
}
really_inline void compress(uint64_t mask, T * output) const {
uint32_t mask1 = uint32_t(mask);
uint32_t mask2 = uint32_t(mask >> 32);
@ -7949,6 +7948,10 @@ namespace simd {
return r_lo | (r_hi << 32);
}
// Combines the two chunks with simd8's operator| and returns the single
// combined value.
really_inline simd8<T> reduce_or() const {
  const simd8<T>& first_chunk = this->chunks[0];
  const simd8<T>& second_chunk = this->chunks[1];
  return first_chunk | second_chunk;
}
really_inline simd8x64<T> bit_or(const T m) const {
const simd8<T> mask = simd8<T>::splat(m);
return simd8x64<T>(
@ -7991,7 +7994,7 @@ namespace haswell {
using namespace simd;
struct json_character_block {
static really_inline json_character_block classify(const simd::simd8x64<uint8_t> in);
static really_inline json_character_block classify(const simd::simd8x64<uint8_t>& in);
really_inline uint64_t whitespace() const { return _whitespace; }
really_inline uint64_t op() const { return _op; }
@ -8001,7 +8004,7 @@ struct json_character_block {
uint64_t _op;
};
really_inline json_character_block json_character_block::classify(const simd::simd8x64<uint8_t> in) {
really_inline json_character_block json_character_block::classify(const simd::simd8x64<uint8_t>& in) {
// These lookups rely on the fact that anything < 127 will match the lower 4 bits, which is why
// we can't use the generic lookup_16.
auto whitespace_table = simd8<uint8_t>::repeat_16(' ', 100, 100, 100, 17, 100, 113, 2, 100, '\t', '\n', 112, 100, '\r', 100, 100);
@ -8024,12 +8027,11 @@ really_inline json_character_block json_character_block::classify(const simd::si
return { whitespace, op };
}
really_inline bool is_ascii(simd8x64<uint8_t> input) {
simd8<uint8_t> bits = (input.chunks[0] | input.chunks[1]);
return !bits.any_bits_set_anywhere(0b10000000u);
really_inline bool is_ascii(const simd8x64<uint8_t>& input) {
return input.reduce_or().is_ascii();
}
really_inline simd8<bool> must_be_continuation(simd8<uint8_t> prev1, simd8<uint8_t> prev2, simd8<uint8_t> prev3) {
really_inline simd8<bool> must_be_continuation(const simd8<uint8_t> prev1, const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
simd8<uint8_t> is_second_byte = prev1.saturating_sub(0b11000000u-1); // Only 11______ will be > 0
simd8<uint8_t> is_third_byte = prev2.saturating_sub(0b11100000u-1); // Only 111_____ will be > 0
simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0b11110000u-1); // Only 1111____ will be > 0
@ -8037,7 +8039,7 @@ really_inline simd8<bool> must_be_continuation(simd8<uint8_t> prev1, simd8<uint8
return simd8<int8_t>(is_second_byte | is_third_byte | is_fourth_byte) > int8_t(0);
}
really_inline simd8<bool> must_be_2_3_continuation(simd8<uint8_t> prev2, simd8<uint8_t> prev3) {
really_inline simd8<bool> must_be_2_3_continuation(const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
simd8<uint8_t> is_third_byte = prev2.saturating_sub(0b11100000u-1); // Only 111_____ will be > 0
simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0b11110000u-1); // Only 1111____ will be > 0
// Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine.
@ -8083,7 +8085,7 @@ UNUSED static char * format_input_text_64(const uint8_t *text) {
}
// Routines to print masks and text for debugging bitmask operations
UNUSED static char * format_input_text(const simd8x64<uint8_t> in) {
UNUSED static char * format_input_text(const simd8x64<uint8_t>& in) {
static char *buf = (char*)malloc(sizeof(simd8x64<uint8_t>) + 1);
in.store((uint8_t*)buf);
for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
@ -8166,7 +8168,7 @@ struct json_string_block {
// Scans blocks for string characters, storing the state necessary to do so
class json_string_scanner {
public:
really_inline json_string_block next(const simd::simd8x64<uint8_t> in);
really_inline json_string_block next(const simd::simd8x64<uint8_t>& in);
really_inline error_code finish(bool streaming);
private:
@ -8232,7 +8234,7 @@ really_inline uint64_t json_string_scanner::find_escaped_branchless(uint64_t bac
//
// Backslash sequences outside of quotes will be detected in stage 2.
//
really_inline json_string_block json_string_scanner::next(const simd::simd8x64<uint8_t> in) {
really_inline json_string_block json_string_scanner::next(const simd::simd8x64<uint8_t>& in) {
const uint64_t backslash = in.eq('\\');
const uint64_t escaped = find_escaped(backslash);
const uint64_t quote = in.eq('"') & ~escaped;
@ -8323,7 +8325,7 @@ private:
class json_scanner {
public:
json_scanner() {}
really_inline json_block next(const simd::simd8x64<uint8_t> in);
really_inline json_block next(const simd::simd8x64<uint8_t>& in);
really_inline error_code finish(bool streaming);
private:
@ -8360,7 +8362,7 @@ really_inline uint64_t follows(const uint64_t match, const uint64_t filler, uint
return result;
}
really_inline json_block json_scanner::next(const simd::simd8x64<uint8_t> in) {
really_inline json_block json_scanner::next(const simd::simd8x64<uint8_t>& in) {
json_string_block strings = string_scanner.next(in);
json_character_block characters = json_character_block::classify(in);
uint64_t follows_scalar = follows(characters.scalar(), prev_scalar);
@ -8404,13 +8406,13 @@ private:
{}
template<size_t STEP_SIZE>
really_inline void step(const uint8_t *block_buf, buf_block_reader<STEP_SIZE> &reader) noexcept;
really_inline void next(simd::simd8x64<uint8_t> in, json_block block);
really_inline void next(const simd::simd8x64<uint8_t>& in, json_block block);
really_inline error_code finish(uint8_t *dst_start, size_t &dst_len);
json_scanner scanner{};
uint8_t *dst;
};
really_inline void json_minifier::next(simd::simd8x64<uint8_t> in, json_block block) {
really_inline void json_minifier::next(const simd::simd8x64<uint8_t>& in, json_block block) {
uint64_t mask = block.whitespace();
in.compress(mask, dst);
dst += 64 - count_ones(mask);
@ -8563,7 +8565,7 @@ namespace utf8_validation {
using namespace simd;
really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t>& input, const simd8<uint8_t>& prev1) {
really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
// Bit 1 = Too Long (ASCII followed by continuation)
// Bit 2 = Overlong 3-byte
@ -8653,8 +8655,8 @@ using namespace simd;
);
return (byte_1_high & byte_1_low & byte_2_high);
}
really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t>& input,
const simd8<uint8_t>& prev_input, const simd8<uint8_t>& sc) {
really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
simd8<uint8_t> prev2 = input.prev<2>(prev_input);
simd8<uint8_t> prev3 = input.prev<3>(prev_input);
simd8<uint8_t> must23 = simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
@ -8666,7 +8668,7 @@ using namespace simd;
// Return nonzero if there are incomplete multibyte characters at the end of the block:
// e.g. if there is a 4-byte character, but it's 3 bytes from the end.
//
really_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t>& input) {
really_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t> input) {
// If the previous input's last 3 bytes match this, they're too short (they ended at EOF):
// ... 1111____ 111_____ 11______
static const uint8_t max_array[32] = {
@ -8690,7 +8692,7 @@ using namespace simd;
//
// Check whether the current bytes are valid UTF-8.
//
really_inline void check_utf8_bytes(const simd8<uint8_t>& input, const simd8<uint8_t>& prev_input) {
really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
// Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
// (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
simd8<uint8_t> prev1 = input.prev<1>(prev_input);
@ -8706,7 +8708,9 @@ using namespace simd;
}
really_inline void check_next_input(const simd8x64<uint8_t>& input) {
if (unlikely(!is_ascii(input))) {
if(likely(is_ascii(input))) {
this->error |= this->prev_incomplete;
} else {
// you might think that a for-loop would work, but under Visual Studio, it is not good enough.
static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
"We support either two or four chunks per 64-byte block.");
@ -8721,10 +8725,7 @@ using namespace simd;
}
this->prev_incomplete = is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1]);
this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1];
} else {
// If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't
// possibly finish them.
this->error |= this->prev_incomplete;
}
}
@ -8811,7 +8812,7 @@ private:
really_inline json_structural_indexer(uint32_t *structural_indexes);
template<size_t STEP_SIZE>
really_inline void step(const uint8_t *block, buf_block_reader<STEP_SIZE> &reader) noexcept;
really_inline void next(simd::simd8x64<uint8_t> in, json_block block, size_t idx);
really_inline void next(const simd::simd8x64<uint8_t>& in, json_block block, size_t idx);
really_inline error_code finish(dom_parser_implementation &parser, size_t idx, size_t len, bool partial);
json_scanner scanner{};
@ -8880,7 +8881,7 @@ really_inline void json_structural_indexer::step<64>(const uint8_t *block, buf_b
reader.advance();
}
really_inline void json_structural_indexer::next(simd::simd8x64<uint8_t> in, json_block block, size_t idx) {
really_inline void json_structural_indexer::next(const simd::simd8x64<uint8_t>& in, json_block block, size_t idx) {
uint64_t unescaped = in.lteq(0x1F);
checker.check_next_input(in);
indexer.write(uint32_t(idx-64), prev_structurals); // Output *last* iteration's structurals to the parser
@ -11006,6 +11007,7 @@ namespace simd {
really_inline simd8<bool> bits_not_set(simd8<uint8_t> bits) const { return (*this & bits).bits_not_set(); }
really_inline simd8<bool> any_bits_set() const { return ~this->bits_not_set(); }
really_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const { return ~this->bits_not_set(bits); }
// True when _mm_movemask_epi8 reports that no byte of this vector has its
// most significant bit set.
really_inline bool is_ascii() const {
  const int high_bit_mask = _mm_movemask_epi8(*this);
  return high_bit_mask == 0;
}
really_inline bool bits_not_set_anywhere() const { return _mm_testz_si128(*this, *this); }
really_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); }
really_inline bool bits_not_set_anywhere(simd8<uint8_t> bits) const { return _mm_testz_si128(*this, bits); }
@ -11022,10 +11024,14 @@ namespace simd {
template<typename T>
struct simd8x64 {
static const int NUM_CHUNKS = 64 / sizeof(simd8<T>);
static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
static_assert(NUM_CHUNKS == 4, "Westmere kernel should use four registers per 64-byte block.");
const simd8<T> chunks[NUM_CHUNKS];
really_inline simd8x64() : chunks{simd8<T>(), simd8<T>(), simd8<T>(), simd8<T>()} {}
// Copying, assigning and default-constructing a simd8x64 are all forbidden.
simd8x64(const simd8x64<T>& o) = delete; // no copy allowed
// The parameter has to be simd8x64<T> (not simd8<T>) for this declaration to be
// the copy assignment operator that is meant to be forbidden.
simd8x64<T>& operator=(const simd8x64<T>& other) = delete; // no assignment allowed
simd8x64() = delete; // no default constructor allowed
really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1, const simd8<T> chunk2, const simd8<T> chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {}
really_inline simd8x64(const T ptr[64]) : chunks{simd8<T>::load(ptr), simd8<T>::load(ptr+16), simd8<T>::load(ptr+32), simd8<T>::load(ptr+48)} {}
@ -11036,6 +11042,10 @@ namespace simd {
this->chunks[3].store(ptr+sizeof(simd8<T>)*3);
}
// Combines the four chunks pairwise with simd8's operator| and returns the
// single combined value.
really_inline simd8<T> reduce_or() const {
  const auto lower_pair = this->chunks[0] | this->chunks[1];
  const auto upper_pair = this->chunks[2] | this->chunks[3];
  return lower_pair | upper_pair;
}
really_inline void compress(uint64_t mask, T * output) const {
this->chunks[0].compress(uint16_t(mask), output);
this->chunks[1].compress(uint16_t(mask >> 16), output + 16 - count_ones(mask & 0xFFFF));
@ -11043,14 +11053,6 @@ namespace simd {
this->chunks[3].compress(uint16_t(mask >> 48), output + 48 - count_ones(mask & 0xFFFFFFFFFFFF));
}
// Invokes `each` once for every chunk index, in ascending order (0 through 3).
// The calls are written out one by one rather than generated by a loop.
template <typename F>
static really_inline void each_index(F const& each) {
each(0);
each(1);
each(2);
each(3);
}
really_inline uint64_t to_bitmask() const {
uint64_t r0 = uint32_t(this->chunks[0].to_bitmask());
uint64_t r1 = this->chunks[1].to_bitmask();
@ -11108,7 +11110,7 @@ namespace westmere {
using namespace simd;
struct json_character_block {
static really_inline json_character_block classify(const simd::simd8x64<uint8_t> in);
static really_inline json_character_block classify(const simd::simd8x64<uint8_t>& in);
really_inline uint64_t whitespace() const { return _whitespace; }
really_inline uint64_t op() const { return _op; }
@ -11118,7 +11120,7 @@ struct json_character_block {
uint64_t _op;
};
really_inline json_character_block json_character_block::classify(const simd::simd8x64<uint8_t> in) {
really_inline json_character_block json_character_block::classify(const simd::simd8x64<uint8_t>& in) {
// These lookups rely on the fact that anything < 127 will match the lower 4 bits, which is why
// we can't use the generic lookup_16.
auto whitespace_table = simd8<uint8_t>::repeat_16(' ', 100, 100, 100, 17, 100, 113, 2, 100, '\t', '\n', 112, 100, '\r', 100, 100);
@ -11146,12 +11148,11 @@ really_inline json_character_block json_character_block::classify(const simd::si
return { whitespace, op };
}
really_inline bool is_ascii(simd8x64<uint8_t> input) {
simd8<uint8_t> bits = (input.chunks[0] | input.chunks[1]) | (input.chunks[2] | input.chunks[3]);
return !bits.any_bits_set_anywhere(0b10000000u);
really_inline bool is_ascii(const simd8x64<uint8_t>& input) {
return input.reduce_or().is_ascii();
}
really_inline simd8<bool> must_be_continuation(simd8<uint8_t> prev1, simd8<uint8_t> prev2, simd8<uint8_t> prev3) {
really_inline simd8<bool> must_be_continuation(const simd8<uint8_t> prev1, const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
simd8<uint8_t> is_second_byte = prev1.saturating_sub(0b11000000u-1); // Only 11______ will be > 0
simd8<uint8_t> is_third_byte = prev2.saturating_sub(0b11100000u-1); // Only 111_____ will be > 0
simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0b11110000u-1); // Only 1111____ will be > 0
@ -11159,7 +11160,7 @@ really_inline simd8<bool> must_be_continuation(simd8<uint8_t> prev1, simd8<uint8
return simd8<int8_t>(is_second_byte | is_third_byte | is_fourth_byte) > int8_t(0);
}
really_inline simd8<bool> must_be_2_3_continuation(simd8<uint8_t> prev2, simd8<uint8_t> prev3) {
really_inline simd8<bool> must_be_2_3_continuation(const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
simd8<uint8_t> is_third_byte = prev2.saturating_sub(0b11100000u-1); // Only 111_____ will be > 0
simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0b11110000u-1); // Only 1111____ will be > 0
// Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine.
@ -11205,7 +11206,7 @@ UNUSED static char * format_input_text_64(const uint8_t *text) {
}
// Routines to print masks and text for debugging bitmask operations
UNUSED static char * format_input_text(const simd8x64<uint8_t> in) {
UNUSED static char * format_input_text(const simd8x64<uint8_t>& in) {
static char *buf = (char*)malloc(sizeof(simd8x64<uint8_t>) + 1);
in.store((uint8_t*)buf);
for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
@ -11288,7 +11289,7 @@ struct json_string_block {
// Scans blocks for string characters, storing the state necessary to do so
class json_string_scanner {
public:
really_inline json_string_block next(const simd::simd8x64<uint8_t> in);
really_inline json_string_block next(const simd::simd8x64<uint8_t>& in);
really_inline error_code finish(bool streaming);
private:
@ -11354,7 +11355,7 @@ really_inline uint64_t json_string_scanner::find_escaped_branchless(uint64_t bac
//
// Backslash sequences outside of quotes will be detected in stage 2.
//
really_inline json_string_block json_string_scanner::next(const simd::simd8x64<uint8_t> in) {
really_inline json_string_block json_string_scanner::next(const simd::simd8x64<uint8_t>& in) {
const uint64_t backslash = in.eq('\\');
const uint64_t escaped = find_escaped(backslash);
const uint64_t quote = in.eq('"') & ~escaped;
@ -11445,7 +11446,7 @@ private:
class json_scanner {
public:
json_scanner() {}
really_inline json_block next(const simd::simd8x64<uint8_t> in);
really_inline json_block next(const simd::simd8x64<uint8_t>& in);
really_inline error_code finish(bool streaming);
private:
@ -11482,7 +11483,7 @@ really_inline uint64_t follows(const uint64_t match, const uint64_t filler, uint
return result;
}
really_inline json_block json_scanner::next(const simd::simd8x64<uint8_t> in) {
really_inline json_block json_scanner::next(const simd::simd8x64<uint8_t>& in) {
json_string_block strings = string_scanner.next(in);
json_character_block characters = json_character_block::classify(in);
uint64_t follows_scalar = follows(characters.scalar(), prev_scalar);
@ -11526,13 +11527,13 @@ private:
{}
template<size_t STEP_SIZE>
really_inline void step(const uint8_t *block_buf, buf_block_reader<STEP_SIZE> &reader) noexcept;
really_inline void next(simd::simd8x64<uint8_t> in, json_block block);
really_inline void next(const simd::simd8x64<uint8_t>& in, json_block block);
really_inline error_code finish(uint8_t *dst_start, size_t &dst_len);
json_scanner scanner{};
uint8_t *dst;
};
really_inline void json_minifier::next(simd::simd8x64<uint8_t> in, json_block block) {
really_inline void json_minifier::next(const simd::simd8x64<uint8_t>& in, json_block block) {
uint64_t mask = block.whitespace();
in.compress(mask, dst);
dst += 64 - count_ones(mask);
@ -11685,7 +11686,7 @@ namespace utf8_validation {
using namespace simd;
really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t>& input, const simd8<uint8_t>& prev1) {
really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
// Bit 1 = Too Long (ASCII followed by continuation)
// Bit 2 = Overlong 3-byte
@ -11775,8 +11776,8 @@ using namespace simd;
);
return (byte_1_high & byte_1_low & byte_2_high);
}
really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t>& input,
const simd8<uint8_t>& prev_input, const simd8<uint8_t>& sc) {
really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
simd8<uint8_t> prev2 = input.prev<2>(prev_input);
simd8<uint8_t> prev3 = input.prev<3>(prev_input);
simd8<uint8_t> must23 = simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
@ -11788,7 +11789,7 @@ using namespace simd;
// Return nonzero if there are incomplete multibyte characters at the end of the block:
// e.g. if there is a 4-byte character, but it's 3 bytes from the end.
//
really_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t>& input) {
really_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t> input) {
// If the previous input's last 3 bytes match this, they're too short (they ended at EOF):
// ... 1111____ 111_____ 11______
static const uint8_t max_array[32] = {
@ -11812,7 +11813,7 @@ using namespace simd;
//
// Check whether the current bytes are valid UTF-8.
//
really_inline void check_utf8_bytes(const simd8<uint8_t>& input, const simd8<uint8_t>& prev_input) {
really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
// Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
// (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
simd8<uint8_t> prev1 = input.prev<1>(prev_input);
@ -11828,7 +11829,9 @@ using namespace simd;
}
really_inline void check_next_input(const simd8x64<uint8_t>& input) {
if (unlikely(!is_ascii(input))) {
if(likely(is_ascii(input))) {
this->error |= this->prev_incomplete;
} else {
// you might think that a for-loop would work, but under Visual Studio, it is not good enough.
static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
"We support either two or four chunks per 64-byte block.");
@ -11843,10 +11846,7 @@ using namespace simd;
}
this->prev_incomplete = is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1]);
this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1];
} else {
// If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't
// possibly finish them.
this->error |= this->prev_incomplete;
}
}
@ -11933,7 +11933,7 @@ private:
really_inline json_structural_indexer(uint32_t *structural_indexes);
template<size_t STEP_SIZE>
really_inline void step(const uint8_t *block, buf_block_reader<STEP_SIZE> &reader) noexcept;
really_inline void next(simd::simd8x64<uint8_t> in, json_block block, size_t idx);
really_inline void next(const simd::simd8x64<uint8_t>& in, json_block block, size_t idx);
really_inline error_code finish(dom_parser_implementation &parser, size_t idx, size_t len, bool partial);
json_scanner scanner{};
@ -12002,7 +12002,7 @@ really_inline void json_structural_indexer::step<64>(const uint8_t *block, buf_b
reader.advance();
}
really_inline void json_structural_indexer::next(simd::simd8x64<uint8_t> in, json_block block, size_t idx) {
really_inline void json_structural_indexer::next(const simd::simd8x64<uint8_t>& in, json_block block, size_t idx) {
uint64_t unescaped = in.lteq(0x1F);
checker.check_next_input(in);
indexer.write(uint32_t(idx-64), prev_structurals); // Output *last* iteration's structurals to the parser

View File

@ -1,4 +1,4 @@
/* auto-generated on Tue 21 Jul 2020 16:54:49 EDT. Do not edit! */
/* auto-generated on Tue 21 Jul 2020 17:54:23 EDT. Do not edit! */
/* begin file include/simdjson.h */
#ifndef SIMDJSON_H
#define SIMDJSON_H

View File

@ -15,7 +15,7 @@ namespace arm64 {
using namespace simd;
struct json_character_block {
static really_inline json_character_block classify(const simd::simd8x64<uint8_t> in);
static really_inline json_character_block classify(const simd::simd8x64<uint8_t>& in);
really_inline uint64_t whitespace() const { return _whitespace; }
really_inline uint64_t op() const { return _op; }
@ -25,7 +25,7 @@ struct json_character_block {
uint64_t _op;
};
really_inline json_character_block json_character_block::classify(const simd::simd8x64<uint8_t> in) {
really_inline json_character_block json_character_block::classify(const simd::simd8x64<uint8_t>& in) {
// Functional programming causes trouble with Visual Studio.
// Keeping this version in comments since it is much nicer:
// auto v = in.map<uint8_t>([&](simd8<uint8_t> chunk) {
@ -38,7 +38,7 @@ really_inline json_character_block json_character_block::classify(const simd::si
const simd8<uint8_t> table1(16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0);
const simd8<uint8_t> table2(8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0);
auto v = simd8x64<uint8_t>(
simd8x64<uint8_t> v(
(in.chunks[0] & 0xf).lookup_16(table1) & (in.chunks[0].shr<4>()).lookup_16(table2),
(in.chunks[1] & 0xf).lookup_16(table1) & (in.chunks[1].shr<4>()).lookup_16(table2),
(in.chunks[2] & 0xf).lookup_16(table1) & (in.chunks[2].shr<4>()).lookup_16(table2),
@ -79,12 +79,12 @@ really_inline json_character_block json_character_block::classify(const simd::si
return { whitespace, op };
}
really_inline bool is_ascii(simd8x64<uint8_t> input) {
simd8<uint8_t> bits = (input.chunks[0] | input.chunks[1]) | (input.chunks[2] | input.chunks[3]);
// Reports whether a 64-byte block is ASCII: OR every chunk together, then require
// that max() of the combined register stays below 0b10000000.
really_inline bool is_ascii(const simd8x64<uint8_t>& input) {
  return input.reduce_or().max() < 0b10000000u;
}
really_inline simd8<bool> must_be_continuation(simd8<uint8_t> prev1, simd8<uint8_t> prev2, simd8<uint8_t> prev3) {
really_inline simd8<bool> must_be_continuation(const simd8<uint8_t> prev1, const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
simd8<bool> is_second_byte = prev1 >= uint8_t(0b11000000u);
simd8<bool> is_third_byte = prev2 >= uint8_t(0b11100000u);
simd8<bool> is_fourth_byte = prev3 >= uint8_t(0b11110000u);
@ -96,7 +96,7 @@ really_inline simd8<bool> must_be_continuation(simd8<uint8_t> prev1, simd8<uint8
return is_second_byte ^ is_third_byte ^ is_fourth_byte;
}
really_inline simd8<bool> must_be_2_3_continuation(simd8<uint8_t> prev2, simd8<uint8_t> prev3) {
really_inline simd8<bool> must_be_2_3_continuation(const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
simd8<bool> is_third_byte = prev2 >= uint8_t(0b11100000u);
simd8<bool> is_fourth_byte = prev3 >= uint8_t(0b11110000u);
return is_third_byte ^ is_fourth_byte;

View File

@ -413,10 +413,14 @@ really_inline int8x16_t make_int8x16_t(int8_t x1, int8_t x2, int8_t x3, int8_
template<typename T>
struct simd8x64 {
static const int NUM_CHUNKS = 64 / sizeof(simd8<T>);
static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
static_assert(NUM_CHUNKS == 4, "ARM kernel should use four registers per 64-byte block.");
const simd8<T> chunks[NUM_CHUNKS];
really_inline simd8x64() : chunks{simd8<T>(), simd8<T>(), simd8<T>(), simd8<T>()} {}
simd8x64(const simd8x64<T>& o) = delete; // no copy allowed
// The copy-assignment operator must take the class's own type; a simd8<T> parameter
// would only forbid assigning from a single chunk, not from another simd8x64<T>.
simd8x64<T>& operator=(const simd8x64<T>& other) = delete; // no assignment allowed
simd8x64() = delete; // no default constructor allowed
really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1, const simd8<T> chunk2, const simd8<T> chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {}
really_inline simd8x64(const T ptr[64]) : chunks{simd8<T>::load(ptr), simd8<T>::load(ptr+16), simd8<T>::load(ptr+32), simd8<T>::load(ptr+48)} {}
@ -427,6 +431,11 @@ really_inline int8x16_t make_int8x16_t(int8_t x1, int8_t x2, int8_t x3, int8_
this->chunks[3].store(ptr+sizeof(simd8<T>)*3);
}
// Folds the four chunks into one register with bitwise OR, pairing them as
// (0|1) and (2|3) before combining the two halves.
really_inline simd8<T> reduce_or() const {
  const simd8<T> first_pair = chunks[0] | chunks[1];
  const simd8<T> second_pair = chunks[2] | chunks[3];
  return first_pair | second_pair;
}
really_inline void compress(uint64_t mask, T * output) const {
this->chunks[0].compress(uint16_t(mask), output);
this->chunks[1].compress(uint16_t(mask >> 16), output + 16 - count_ones(mask & 0xFFFF));
@ -434,14 +443,6 @@ really_inline int8x16_t make_int8x16_t(int8_t x1, int8_t x2, int8_t x3, int8_
this->chunks[3].compress(uint16_t(mask >> 48), output + 48 - count_ones(mask & 0xFFFFFFFFFFFF));
}
template <typename F>
static really_inline void each_index(F const& each) {
each(0);
each(1);
each(2);
each(3);
}
really_inline uint64_t to_bitmask() const {
#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
const uint8x16_t bit_mask = make_uint8x16_t(

View File

@ -35,7 +35,7 @@ UNUSED static char * format_input_text_64(const uint8_t *text) {
}
// Routines to print masks and text for debugging bitmask operations
UNUSED static char * format_input_text(const simd8x64<uint8_t> in) {
UNUSED static char * format_input_text(const simd8x64<uint8_t>& in) {
static char *buf = (char*)malloc(sizeof(simd8x64<uint8_t>) + 1);
in.store((uint8_t*)buf);
for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {

View File

@ -16,13 +16,13 @@ private:
{}
template<size_t STEP_SIZE>
really_inline void step(const uint8_t *block_buf, buf_block_reader<STEP_SIZE> &reader) noexcept;
really_inline void next(simd::simd8x64<uint8_t> in, json_block block);
really_inline void next(const simd::simd8x64<uint8_t>& in, json_block block);
really_inline error_code finish(uint8_t *dst_start, size_t &dst_len);
json_scanner scanner{};
uint8_t *dst;
};
really_inline void json_minifier::next(simd::simd8x64<uint8_t> in, json_block block) {
really_inline void json_minifier::next(const simd::simd8x64<uint8_t>& in, json_block block) {
uint64_t mask = block.whitespace();
in.compress(mask, dst);
dst += 64 - count_ones(mask);

View File

@ -49,7 +49,7 @@ private:
class json_scanner {
public:
json_scanner() {}
really_inline json_block next(const simd::simd8x64<uint8_t> in);
really_inline json_block next(const simd::simd8x64<uint8_t>& in);
really_inline error_code finish(bool streaming);
private:
@ -86,7 +86,7 @@ really_inline uint64_t follows(const uint64_t match, const uint64_t filler, uint
return result;
}
really_inline json_block json_scanner::next(const simd::simd8x64<uint8_t> in) {
really_inline json_block json_scanner::next(const simd::simd8x64<uint8_t>& in) {
json_string_block strings = string_scanner.next(in);
json_character_block characters = json_character_block::classify(in);
uint64_t follows_scalar = follows(characters.scalar(), prev_scalar);

View File

@ -33,7 +33,7 @@ struct json_string_block {
// Scans blocks for string characters, storing the state necessary to do so
class json_string_scanner {
public:
really_inline json_string_block next(const simd::simd8x64<uint8_t> in);
really_inline json_string_block next(const simd::simd8x64<uint8_t>& in);
really_inline error_code finish(bool streaming);
private:
@ -99,7 +99,7 @@ really_inline uint64_t json_string_scanner::find_escaped_branchless(uint64_t bac
//
// Backslash sequences outside of quotes will be detected in stage 2.
//
really_inline json_string_block json_string_scanner::next(const simd::simd8x64<uint8_t> in) {
really_inline json_string_block json_string_scanner::next(const simd::simd8x64<uint8_t>& in) {
const uint64_t backslash = in.eq('\\');
const uint64_t escaped = find_escaped(backslash);
const uint64_t quote = in.eq('"') & ~escaped;

View File

@ -71,7 +71,7 @@ private:
really_inline json_structural_indexer(uint32_t *structural_indexes);
template<size_t STEP_SIZE>
really_inline void step(const uint8_t *block, buf_block_reader<STEP_SIZE> &reader) noexcept;
really_inline void next(simd::simd8x64<uint8_t> in, json_block block, size_t idx);
really_inline void next(const simd::simd8x64<uint8_t>& in, json_block block, size_t idx);
really_inline error_code finish(dom_parser_implementation &parser, size_t idx, size_t len, bool partial);
json_scanner scanner{};
@ -140,7 +140,7 @@ really_inline void json_structural_indexer::step<64>(const uint8_t *block, buf_b
reader.advance();
}
really_inline void json_structural_indexer::next(simd::simd8x64<uint8_t> in, json_block block, size_t idx) {
really_inline void json_structural_indexer::next(const simd::simd8x64<uint8_t>& in, json_block block, size_t idx) {
uint64_t unescaped = in.lteq(0x1F);
checker.check_next_input(in);
indexer.write(uint32_t(idx-64), prev_structurals); // Output *last* iteration's structurals to the parser

View File

@ -30,12 +30,12 @@ struct utf8_checker {
processed_utf_bytes previous;
// all byte values must be no larger than 0xF4
really_inline void check_smaller_than_0xF4(simd8<uint8_t> current_bytes) {
really_inline void check_smaller_than_0xF4(const simd8<uint8_t> current_bytes) {
// unsigned, saturates to 0 below max
this->has_error |= current_bytes.saturating_sub(0xF4u);
}
really_inline simd8<int8_t> continuation_lengths(simd8<int8_t> high_nibbles) {
really_inline simd8<int8_t> continuation_lengths(const simd8<int8_t> high_nibbles) {
return high_nibbles.lookup_16<int8_t>(
1, 1, 1, 1, 1, 1, 1, 1, // 0xxx (ASCII)
0, 0, 0, 0, // 10xx (continuation)
@ -44,7 +44,7 @@ struct utf8_checker {
4); // 1111, next should be 0 (not checked here)
}
really_inline simd8<int8_t> carry_continuations(simd8<int8_t> initial_lengths) {
really_inline simd8<int8_t> carry_continuations(const simd8<int8_t>& initial_lengths) {
simd8<int8_t> prev_carried_continuations = initial_lengths.prev(this->previous.carried_continuations);
simd8<int8_t> right1 = simd8<int8_t>(simd8<uint8_t>(prev_carried_continuations).saturating_sub(1));
simd8<int8_t> sum = initial_lengths + right1;
@ -54,7 +54,7 @@ struct utf8_checker {
return sum + right2;
}
really_inline void check_continuations(simd8<int8_t> initial_lengths, simd8<int8_t> carries) {
really_inline void check_continuations(const simd8<int8_t>& initial_lengths, const simd8<int8_t>& carries) {
// overlap || underlap
// carry > length && length > 0 || !(carry > length) && !(length > 0)
// (carries > length) == (lengths > 0)
@ -76,8 +76,8 @@ struct utf8_checker {
// when 0xED is found, next byte must be no larger than 0x9F
// when 0xF4 is found, next byte must be no larger than 0x8F
// next byte must be continuation, ie sign bit is set, so signed < is ok
really_inline void check_first_continuation_max(simd8<uint8_t> current_bytes,
simd8<uint8_t> off1_current_bytes) {
really_inline void check_first_continuation_max(const simd8<uint8_t> current_bytes,
const simd8<uint8_t> off1_current_bytes) {
simd8<bool> prev_ED = off1_current_bytes == 0xEDu;
simd8<bool> prev_F4 = off1_current_bytes == 0xF4u;
// Check if ED is followed by A0 or greater
@ -94,9 +94,9 @@ struct utf8_checker {
// E => < E1 && < A0
// F => < F1 && < 90
// else false && false
really_inline void check_overlong(simd8<uint8_t> current_bytes,
simd8<uint8_t> off1_current_bytes,
simd8<int8_t> high_nibbles) {
really_inline void check_overlong(const simd8<uint8_t> current_bytes,
const simd8<uint8_t> off1_current_bytes,
const simd8<int8_t>& high_nibbles) {
simd8<int8_t> off1_high_nibbles = high_nibbles.prev(this->previous.high_nibbles);
// Two-byte characters must start with at least C2
@ -132,7 +132,7 @@ struct utf8_checker {
// check whether the current bytes are valid UTF-8
// at the end of the function, previous gets updated
really_inline void check_utf8_bytes(simd8<uint8_t> current_bytes) {
really_inline void check_utf8_bytes(const simd8<uint8_t> current_bytes) {
struct processed_utf_bytes pb {};
this->count_nibbles(current_bytes, &pb);
@ -151,7 +151,7 @@ struct utf8_checker {
this->previous = pb;
}
really_inline void check_next_input(simd8<uint8_t> in) {
really_inline void check_next_input(const simd8<uint8_t> in) {
if (likely(!in.any_bits_set_anywhere(0x80u))) {
this->check_carried_continuations();
} else {
@ -159,8 +159,8 @@ struct utf8_checker {
}
}
really_inline void check_next_input(simd8x64<uint8_t> in) {
simd8<uint8_t> bits = in.reduce([&](auto a, auto b) { return a | b; });
really_inline void check_next_input(const simd8x64<uint8_t>& in) {
simd8<uint8_t> bits = in.reduce_or();
if (likely(!bits.any_bits_set_anywhere(0x80u))) {
// it is ascii, we just check carried continuations.
this->check_carried_continuations();

View File

@ -140,7 +140,7 @@ namespace utf8_validation {
return byte_1_high & byte_1_low & byte_2_high;
}
really_inline simd8<uint8_t> check_multibyte_lengths(simd8<uint8_t> input, simd8<uint8_t> prev_input, simd8<uint8_t> prev1) {
really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input, const simd8<uint8_t> prev_input, const simd8<uint8_t> prev1) {
simd8<uint8_t> prev2 = input.prev<2>(prev_input);
simd8<uint8_t> prev3 = input.prev<3>(prev_input);
@ -154,7 +154,7 @@ namespace utf8_validation {
// Return nonzero if there are incomplete multibyte characters at the end of the block:
// e.g. if there is a 4-byte character, but it's 3 bytes from the end.
//
really_inline simd8<uint8_t> is_incomplete(simd8<uint8_t> input) {
really_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t> input) {
// If the previous input's last 3 bytes match this, they're too short (they ended at EOF):
// ... 1111____ 111_____ 11______
static const uint8_t max_array[32] = {
@ -193,7 +193,7 @@ namespace utf8_validation {
this->error |= this->prev_incomplete;
}
really_inline void check_next_input(simd8x64<uint8_t> input) {
really_inline void check_next_input(const simd8x64<uint8_t>& input) {
if (likely(is_ascii(input))) {
// If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't
// possibly finish them.

View File

@ -151,7 +151,7 @@ namespace utf8_validation {
return byte_1_high & byte_1_low & byte_2_high;
}
really_inline simd8<uint8_t> check_multibyte_lengths(simd8<uint8_t> input, simd8<uint8_t> prev_input,
really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input, const simd8<uint8_t> prev_input,
simd8<uint8_t> prev1) {
simd8<uint8_t> prev2 = input.prev<2>(prev_input);
simd8<uint8_t> prev3 = input.prev<3>(prev_input);
@ -166,7 +166,7 @@ namespace utf8_validation {
// Return nonzero if there are incomplete multibyte characters at the end of the block:
// e.g. if there is a 4-byte character, but it's 3 bytes from the end.
//
really_inline simd8<uint8_t> is_incomplete(simd8<uint8_t> input) {
really_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t> input) {
// If the previous input's last 3 bytes match this, they're too short (they ended at EOF):
// ... 1111____ 111_____ 11______
static const uint8_t max_array[32] = {
@ -205,16 +205,24 @@ namespace utf8_validation {
this->error |= this->prev_incomplete;
}
really_inline void check_next_input(simd8x64<uint8_t> input) {
if (likely(is_ascii(input))) {
really_inline void check_next_input(const simd8x64<uint8_t>& input) {
if(likely(is_ascii(input))) {
// If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't
// possibly finish them.
this->error |= this->prev_incomplete;
} else {
this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
for (int i=1; i<simd8x64<uint8_t>::NUM_CHUNKS; i++) {
this->check_utf8_bytes(input.chunks[i], input.chunks[i-1]);
}
// you might think that a for-loop would work, but under Visual Studio, it is not good enough.
static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
"We support either two or four chunks per 64-byte block.");
if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
} else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
}
this->prev_incomplete = is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1]);
this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1];
}

View File

@ -4,7 +4,7 @@ namespace utf8_validation {
using namespace simd;
really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t>& input, const simd8<uint8_t>& prev1) {
really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
// Bit 1 = Too Long (ASCII followed by continuation)
// Bit 2 = Overlong 3-byte
@ -94,8 +94,8 @@ using namespace simd;
);
return (byte_1_high & byte_1_low & byte_2_high);
}
really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t>& input,
const simd8<uint8_t>& prev_input, const simd8<uint8_t>& sc) {
really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
simd8<uint8_t> prev2 = input.prev<2>(prev_input);
simd8<uint8_t> prev3 = input.prev<3>(prev_input);
simd8<uint8_t> must23 = simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
@ -107,7 +107,7 @@ using namespace simd;
// Return nonzero if there are incomplete multibyte characters at the end of the block:
// e.g. if there is a 4-byte character, but it's 3 bytes from the end.
//
really_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t>& input) {
really_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t> input) {
// If the previous input's last 3 bytes match this, they're too short (they ended at EOF):
// ... 1111____ 111_____ 11______
static const uint8_t max_array[32] = {
@ -131,7 +131,7 @@ using namespace simd;
//
// Check whether the current bytes are valid UTF-8.
//
really_inline void check_utf8_bytes(const simd8<uint8_t>& input, const simd8<uint8_t>& prev_input) {
really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
// Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
// (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
simd8<uint8_t> prev1 = input.prev<1>(prev_input);
@ -147,7 +147,9 @@ using namespace simd;
}
really_inline void check_next_input(const simd8x64<uint8_t>& input) {
if (unlikely(!is_ascii(input))) {
if(likely(is_ascii(input))) {
this->error |= this->prev_incomplete;
} else {
// you might think that a for-loop would work, but under Visual Studio, it is not good enough.
static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
"We support either two or four chunks per 64-byte block.");
@ -162,10 +164,7 @@ using namespace simd;
}
this->prev_incomplete = is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1]);
this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1];
} else {
// If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't
// possibly finish them.
this->error |= this->prev_incomplete;
}
}

View File

@ -276,8 +276,8 @@ struct utf8_checker {
this->error |= this->prev_incomplete;
}
really_inline void check_next_input(simd8x64<uint8_t> input) {
simd8<uint8_t> bits = input.reduce([&](auto a,auto b) { return a|b; });
really_inline void check_next_input(const simd8x64<uint8_t>& input) {
simd8<uint8_t> bits = input.reduce_or();
if (likely(!bits.any_bits_set_anywhere(0b10000000u))) {
// If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't
// possibly finish them.

View File

@ -40,7 +40,7 @@ struct utf8_checker {
// check whether the current bytes are valid UTF-8
// at the end of the function, previous gets updated
really_inline void check_utf8_bytes(simd8<uint8_t> current_bytes) {
really_inline void check_utf8_bytes(const simd8<uint8_t> current_bytes) {
/* high_nibbles = input >> 4 */
const simd8<uint8_t> high_nibbles = current_bytes.shr<4>();
@ -153,7 +153,7 @@ struct utf8_checker {
this->previous.first_len = first_len;
}
really_inline void check_next_input(simd8<uint8_t> in) {
really_inline void check_next_input(const simd8<uint8_t> in) {
if (likely(!in.any_bits_set_anywhere(0x80u))) {
this->check_carried_continuations();
} else {
@ -161,8 +161,8 @@ struct utf8_checker {
}
}
really_inline void check_next_input(simd8x64<uint8_t> in) {
simd8<uint8_t> bits = in.reduce([&](auto a, auto b) { return a | b; });
really_inline void check_next_input(const simd8x64<uint8_t>& in) {
simd8<uint8_t> bits = in.reduce_or();
if (likely(!bits.any_bits_set_anywhere(0x80u))) {
// it is ascii, we just check carried continuations.
this->check_carried_continuations();

View File

@ -335,7 +335,7 @@ struct utf8_checker {
this->check_special_cases(bytes);
}
really_inline void check_next_input(simd8<uint8_t> bytes) {
really_inline void check_next_input(const simd8<uint8_t> bytes) {
vmask_t bit_7 = bytes.get_bit<7>();
if (unlikely(bit_7)) {
// TODO (@jkeiser): To work with simdjson's caller model, I moved the calculation of
@ -348,7 +348,7 @@ struct utf8_checker {
}
}
really_inline void check_next_input(simd8x64<uint8_t> in) {
really_inline void check_next_input(const simd8x64<uint8_t>& in) {
for (int i=0; i<simd8x64<uint8_t>::NUM_CHUNKS; i++) {
this->check_next_input(in.chunks[i]);
}

View File

@ -16,7 +16,7 @@ namespace haswell {
using namespace simd;
struct json_character_block {
static really_inline json_character_block classify(const simd::simd8x64<uint8_t> in);
static really_inline json_character_block classify(const simd::simd8x64<uint8_t>& in);
really_inline uint64_t whitespace() const { return _whitespace; }
really_inline uint64_t op() const { return _op; }
@ -26,7 +26,7 @@ struct json_character_block {
uint64_t _op;
};
really_inline json_character_block json_character_block::classify(const simd::simd8x64<uint8_t> in) {
really_inline json_character_block json_character_block::classify(const simd::simd8x64<uint8_t>& in) {
// These lookups rely on the fact that anything < 127 will match the lower 4 bits, which is why
// we can't use the generic lookup_16.
auto whitespace_table = simd8<uint8_t>::repeat_16(' ', 100, 100, 100, 17, 100, 113, 2, 100, '\t', '\n', 112, 100, '\r', 100, 100);
@ -49,12 +49,11 @@ really_inline json_character_block json_character_block::classify(const simd::si
return { whitespace, op };
}
really_inline bool is_ascii(simd8x64<uint8_t> input) {
simd8<uint8_t> bits = (input.chunks[0] | input.chunks[1]);
return !bits.any_bits_set_anywhere(0b10000000u);
// Reports whether a 64-byte block is ASCII: OR the chunks into a single register
// and ask that register whether it is ASCII.
really_inline bool is_ascii(const simd8x64<uint8_t>& input) {
  const simd8<uint8_t> combined = input.reduce_or();
  return combined.is_ascii();
}
really_inline simd8<bool> must_be_continuation(simd8<uint8_t> prev1, simd8<uint8_t> prev2, simd8<uint8_t> prev3) {
really_inline simd8<bool> must_be_continuation(const simd8<uint8_t> prev1, const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
simd8<uint8_t> is_second_byte = prev1.saturating_sub(0b11000000u-1); // Only 11______ will be > 0
simd8<uint8_t> is_third_byte = prev2.saturating_sub(0b11100000u-1); // Only 111_____ will be > 0
simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0b11110000u-1); // Only 1111____ will be > 0
@ -62,7 +61,7 @@ really_inline simd8<bool> must_be_continuation(simd8<uint8_t> prev1, simd8<uint8
return simd8<int8_t>(is_second_byte | is_third_byte | is_fourth_byte) > int8_t(0);
}
really_inline simd8<bool> must_be_2_3_continuation(simd8<uint8_t> prev2, simd8<uint8_t> prev3) {
really_inline simd8<bool> must_be_2_3_continuation(const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
simd8<uint8_t> is_third_byte = prev2.saturating_sub(0b11100000u-1); // Only 111_____ will be > 0
simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0b11110000u-1); // Only 1111____ will be > 0
// Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine.

View File

@ -275,6 +275,7 @@ namespace simd {
really_inline simd8<bool> bits_not_set(simd8<uint8_t> bits) const { return (*this & bits).bits_not_set(); }
really_inline simd8<bool> any_bits_set() const { return ~this->bits_not_set(); }
really_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const { return ~this->bits_not_set(bits); }
// True when no byte of this register has its most significant bit set
// (the movemask of the register is zero).
really_inline bool is_ascii() const {
  const int high_bits = _mm256_movemask_epi8(*this);
  return high_bits == 0;
}
really_inline bool bits_not_set_anywhere() const { return _mm256_testz_si256(*this, *this); }
really_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); }
really_inline bool bits_not_set_anywhere(simd8<uint8_t> bits) const { return _mm256_testz_si256(*this, bits); }
@ -291,19 +292,17 @@ namespace simd {
template<typename T>
struct simd8x64 {
static const int NUM_CHUNKS = 64 / sizeof(simd8<T>);
static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
static_assert(NUM_CHUNKS == 2, "Haswell kernel should use two registers per 64-byte block.");
const simd8<T> chunks[NUM_CHUNKS];
really_inline simd8x64() : chunks{simd8<T>(), simd8<T>()} {}
simd8x64(const simd8x64<T>& o) = delete; // no copy allowed
// The copy-assignment operator must take the class's own type; a simd8<T> parameter
// would only forbid assigning from a single chunk, not from another simd8x64<T>.
simd8x64<T>& operator=(const simd8x64<T>& other) = delete; // no assignment allowed
simd8x64() = delete; // no default constructor allowed
really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1) : chunks{chunk0, chunk1} {}
really_inline simd8x64(const T ptr[64]) : chunks{simd8<T>::load(ptr), simd8<T>::load(ptr+32)} {}
template <typename F>
static really_inline void each_index(F const& each) {
each(0);
each(1);
}
really_inline void compress(uint64_t mask, T * output) const {
uint32_t mask1 = uint32_t(mask);
uint32_t mask2 = uint32_t(mask >> 32);
@ -322,6 +321,10 @@ namespace simd {
return r_lo | (r_hi << 32);
}
// Folds the two chunks into one register with bitwise OR.
really_inline simd8<T> reduce_or() const {
  const simd8<T> combined = chunks[0] | chunks[1];
  return combined;
}
really_inline simd8x64<T> bit_or(const T m) const {
const simd8<T> mask = simd8<T>::splat(m);
return simd8x64<T>(

View File

@ -17,7 +17,7 @@ namespace westmere {
using namespace simd;
struct json_character_block {
static really_inline json_character_block classify(const simd::simd8x64<uint8_t> in);
static really_inline json_character_block classify(const simd::simd8x64<uint8_t>& in);
really_inline uint64_t whitespace() const { return _whitespace; }
really_inline uint64_t op() const { return _op; }
@ -27,7 +27,7 @@ struct json_character_block {
uint64_t _op;
};
really_inline json_character_block json_character_block::classify(const simd::simd8x64<uint8_t> in) {
really_inline json_character_block json_character_block::classify(const simd::simd8x64<uint8_t>& in) {
// These lookups rely on the fact that anything < 127 will match the lower 4 bits, which is why
// we can't use the generic lookup_16.
auto whitespace_table = simd8<uint8_t>::repeat_16(' ', 100, 100, 100, 17, 100, 113, 2, 100, '\t', '\n', 112, 100, '\r', 100, 100);
@ -55,12 +55,11 @@ really_inline json_character_block json_character_block::classify(const simd::si
return { whitespace, op };
}
really_inline bool is_ascii(simd8x64<uint8_t> input) {
simd8<uint8_t> bits = (input.chunks[0] | input.chunks[1]) | (input.chunks[2] | input.chunks[3]);
return !bits.any_bits_set_anywhere(0b10000000u);
// Reports whether a 64-byte block is ASCII: OR the chunks into a single register
// and ask that register whether it is ASCII.
really_inline bool is_ascii(const simd8x64<uint8_t>& input) {
  const simd8<uint8_t> combined = input.reduce_or();
  return combined.is_ascii();
}
really_inline simd8<bool> must_be_continuation(simd8<uint8_t> prev1, simd8<uint8_t> prev2, simd8<uint8_t> prev3) {
really_inline simd8<bool> must_be_continuation(const simd8<uint8_t> prev1, const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
simd8<uint8_t> is_second_byte = prev1.saturating_sub(0b11000000u-1); // Only 11______ will be > 0
simd8<uint8_t> is_third_byte = prev2.saturating_sub(0b11100000u-1); // Only 111_____ will be > 0
simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0b11110000u-1); // Only 1111____ will be > 0
@ -68,7 +67,7 @@ really_inline simd8<bool> must_be_continuation(simd8<uint8_t> prev1, simd8<uint8
return simd8<int8_t>(is_second_byte | is_third_byte | is_fourth_byte) > int8_t(0);
}
really_inline simd8<bool> must_be_2_3_continuation(simd8<uint8_t> prev2, simd8<uint8_t> prev3) {
really_inline simd8<bool> must_be_2_3_continuation(const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
simd8<uint8_t> is_third_byte = prev2.saturating_sub(0b11100000u-1); // Only 111_____ will be > 0
simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0b11110000u-1); // Only 1111____ will be > 0
// Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine.

View File

@ -247,6 +247,7 @@ namespace simd {
// Lane-wise test: ANDs this vector with `bits`, then delegates to the no-argument bits_not_set().
really_inline simd8<bool> bits_not_set(simd8<uint8_t> bits) const { return (*this & bits).bits_not_set(); }
// Lane-wise complement of bits_not_set().
really_inline simd8<bool> any_bits_set() const { return ~this->bits_not_set(); }
// Lane-wise complement of bits_not_set(bits).
really_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const { return ~this->bits_not_set(bits); }
// True when no byte has its top bit set: _mm_movemask_epi8 packs the most
// significant bit of each byte into an integer, so 0 means every byte is < 0x80.
really_inline bool is_ascii() const { return _mm_movemask_epi8(*this) == 0; }
// Whole-register test: _mm_testz_si128(a, b) yields 1 when (a & b) is all zeros,
// so testing the register against itself is true only when every bit is clear.
really_inline bool bits_not_set_anywhere() const { return _mm_testz_si128(*this, *this); }
// Whole-register complement of bits_not_set_anywhere().
really_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); }
// Whole-register test: true when none of the bits selected by `bits` is set in any byte.
really_inline bool bits_not_set_anywhere(simd8<uint8_t> bits) const { return _mm_testz_si128(*this, bits); }
@ -263,10 +264,14 @@ namespace simd {
template<typename T>
struct simd8x64 {
static const int NUM_CHUNKS = 64 / sizeof(simd8<T>);
static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
static_assert(NUM_CHUNKS == 4, "Westmere kernel should use four registers per 64-byte block.");
const simd8<T> chunks[NUM_CHUNKS];
// Value-initializes all four chunks.
// NOTE(review): this conflicts with `simd8x64() = delete;` a few lines down; the
// listing is a diff and shows both the old and the new default-constructor declaration.
really_inline simd8x64() : chunks{simd8<T>(), simd8<T>(), simd8<T>(), simd8<T>()} {}
simd8x64(const simd8x64<T>& o) = delete; // no copy allowed
// Delete the copy-assignment operator itself. The parameter has to be the whole
// simd8x64<T> block; taking a single simd8<T> chunk would only delete an
// unrelated "assign from one chunk" overload and leave the stated intent unexpressed.
simd8x64<T>& operator=(const simd8x64<T>& other) = delete; // no assignment allowed
simd8x64() = delete; // no default constructor allowed
// Builds the block from four already-loaded chunks, stored in argument order.
really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1, const simd8<T> chunk2, const simd8<T> chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {}
// Loads the four chunks from `ptr` with simd8<T>::load at element offsets 0, 16, 32 and 48.
really_inline simd8x64(const T ptr[64]) : chunks{simd8<T>::load(ptr), simd8<T>::load(ptr+16), simd8<T>::load(ptr+32), simd8<T>::load(ptr+48)} {}
@ -277,6 +282,10 @@ namespace simd {
this->chunks[3].store(ptr+sizeof(simd8<T>)*3);
}
// Folds the four chunks into a single vector with bitwise OR, pairing
// chunks 0|1 and 2|3 before combining the two halves.
really_inline simd8<T> reduce_or() const {
  const auto first_pair = this->chunks[0] | this->chunks[1];
  const auto second_pair = this->chunks[2] | this->chunks[3];
  return first_pair | second_pair;
}
// Compresses the chunks into `output`, handing each chunk its own 16-bit slice of `mask`.
// Each later chunk's destination is its natural offset (16, 48, ...) pulled back by
// count_ones() of the mask bits that belong to the chunks before it.
// NOTE(review): presumably set mask bits mark bytes that simd8::compress drops, which
// is why the offsets shrink by the popcount — confirm against simd8::compress.
// NOTE(review): the hunk marker below presumably hides the chunks[2] line of this function.
really_inline void compress(uint64_t mask, T * output) const {
this->chunks[0].compress(uint16_t(mask), output);
this->chunks[1].compress(uint16_t(mask >> 16), output + 16 - count_ones(mask & 0xFFFF));
@ -284,14 +293,6 @@ namespace simd {
this->chunks[3].compress(uint16_t(mask >> 48), output + 48 - count_ones(mask & 0xFFFFFFFFFFFF));
}
// Invokes the supplied callable once per chunk index, in ascending order: 0, 1, 2, 3.
template <typename Callback>
static really_inline void each_index(Callback const& visit) {
  visit(0);
  visit(1);
  visit(2);
  visit(3);
}
really_inline uint64_t to_bitmask() const {
uint64_t r0 = uint32_t(this->chunks[0].to_bitmask());
uint64_t r1 = this->chunks[1].to_bitmask();