Use simd8 helpers for find_bs_bits_and_quote_bits

John Keiser 2019-10-27 10:51:54 -07:00 committed by John Keiser
parent 4bc128f07e
commit d89046d515
8 changed files with 100 additions and 94 deletions

View File

@ -60,7 +60,7 @@ namespace simdjson::arm64::simd {
really_inline simd8<bool>::bitmask_t to_bitmask() const {
const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
uint8x16_t minput = vandq_u8(*this, bit_mask);
auto minput = *this & bit_mask;
uint8x16_t tmp = vpaddq_u8(minput, minput);
tmp = vpaddq_u8(tmp, tmp);
tmp = vpaddq_u8(tmp, tmp);
@ -80,7 +80,7 @@ namespace simdjson::arm64::simd {
// Zero constructor
really_inline simd8() : simd8(zero()) {}
// Array constructor
really_inline simd8(const uint8_t* values) : simd8(load(values)) {}
really_inline simd8(const uint8_t values[16]) : simd8(load(values)) {}
// Splat constructor
really_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
// Member-by-member initialization
@ -92,6 +92,9 @@ namespace simdjson::arm64::simd {
v8, v9, v10,v11,v12,v13,v14,v15
}) {}
// Store to array
really_inline void store(uint8_t dst[16]) { return vst1q_u8(dst, *this); }
// Saturated math
really_inline simd8<uint8_t> saturating_add(const simd8<uint8_t> other) const { return vqaddq_u8(*this, other); }
really_inline simd8<uint8_t> saturating_sub(const simd8<uint8_t> other) const { return vqsubq_u8(*this, other); }
@ -159,7 +162,7 @@ namespace simdjson::arm64::simd {
static really_inline simd8<int8_t> splat(int8_t _value) { return vmovq_n_s8(_value); }
static really_inline simd8<int8_t> zero() { return vdupq_n_s8(0); }
static really_inline simd8<int8_t> load(const int8_t* values) { return vld1q_s8(values); }
static really_inline simd8<int8_t> load(const int8_t values[16]) { return vld1q_s8(values); }
// Conversion from/to SIMD register
really_inline simd8(const int8x16_t _value) : value{_value} {}
@ -181,6 +184,9 @@ namespace simdjson::arm64::simd {
v8, v9, v10,v11,v12,v13,v14,v15
}) {}
// Store to array
really_inline void store(int8_t dst[16]) { return vst1q_s8(dst, *this); }
// Explicit conversion to/from unsigned
really_inline explicit simd8(const uint8x16_t other): simd8(vreinterpretq_s8_u8(other)) {}
really_inline explicit operator simd8<uint8_t>() const { return vreinterpretq_u8_s8(*this); }
@ -227,8 +233,15 @@ namespace simdjson::arm64::simd {
const simd8<T> chunks[4];
really_inline simd8x64() : chunks{simd8<T>(), simd8<T>(), simd8<T>(), simd8<T>()} {}
really_inline simd8x64(const uint8x16_t chunk0, const uint8x16_t chunk1, const uint8x16_t chunk2, const uint8x16_t chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {}
really_inline simd8x64(const T *ptr) : chunks{simd8<T>::load(ptr), simd8<T>::load(ptr+16), simd8<T>::load(ptr+32), simd8<T>::load(ptr+48)} {}
really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1, const simd8<T> chunk2, const simd8<T> chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {}
really_inline simd8x64(const T ptr[64]) : chunks{simd8<T>::load(ptr), simd8<T>::load(ptr+16), simd8<T>::load(ptr+32), simd8<T>::load(ptr+48)} {}
really_inline void store(T ptr[64]) {
this->chunks[0].store(ptr);
this->chunks[1].store(ptr+16);
this->chunks[2].store(ptr+32);
this->chunks[3].store(ptr+48);
}
template <typename F>
really_inline void each(F const& each_chunk) const
@ -268,14 +281,13 @@ namespace simdjson::arm64::simd {
}
really_inline uint64_t to_bitmask() const {
const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
uint8x16_t t0 = vandq_u8(this->chunks[0], bit_mask);
uint8x16_t t1 = vandq_u8(this->chunks[1], bit_mask);
uint8x16_t t2 = vandq_u8(this->chunks[2], bit_mask);
uint8x16_t t3 = vandq_u8(this->chunks[3], bit_mask);
uint8x16_t sum0 = vpaddq_u8(t0, t1);
uint8x16_t sum1 = vpaddq_u8(t2, t3);
const uint8x16_t bit_mask = {
0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
};
// Add neighbouring bytes together, repeatedly, to pack the four 16-byte comparison masks into one 64-bit value.
uint8x16_t sum0 = vpaddq_u8(this->chunks[0] & bit_mask, this->chunks[1] & bit_mask);
uint8x16_t sum1 = vpaddq_u8(this->chunks[2] & bit_mask, this->chunks[3] & bit_mask);
sum0 = vpaddq_u8(sum0, sum1);
sum0 = vpaddq_u8(sum0, sum0);
return vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0);
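
For orientation, a hedged scalar model of what this to_bitmask() produces (the function below is illustrative, not simdjson API): after the AND with bit_mask and the rounds of pairwise adds, bit i of the 64-bit result is set exactly when byte i of the 64-byte block was "true", with chunks[0] supplying bits 0-15, chunks[1] bits 16-31, chunks[2] bits 32-47 and chunks[3] bits 48-63.

#include <cstdint>

// Hypothetical scalar equivalent of the NEON to_bitmask() above:
// bit i of the result mirrors byte i of the 64-byte input block.
static inline uint64_t to_bitmask64_scalar(const uint8_t block[64]) {
  uint64_t mask = 0;
  for (int i = 0; i < 64; i++) {
    mask |= uint64_t(block[i] != 0) << i;
  }
  return mask;
}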

View File

@ -5,57 +5,38 @@
#ifdef IS_ARM64
#include "arm64/simd.h"
#include "simdjson/common_defs.h"
#include "simdjson/parsedjson.h"
#include "jsoncharutils.h"
#ifdef JSON_TEST_STRINGS
void found_string(const uint8_t *buf, const uint8_t *parsed_begin,
const uint8_t *parsed_end);
void found_bad_string(const uint8_t *buf);
#endif
namespace simdjson::arm64 {
using namespace simd;
// Holds bitmasks marking the locations of backslashes and quotes.
struct parse_string_helper {
uint32_t bs_bits;
uint32_t quote_bits;
really_inline uint32_t bytes_processed() const { return sizeof(uint8x16_t)*2; }
static const uint32_t BYTES_PROCESSED = 32;
};
really_inline parse_string_helper find_bs_bits_and_quote_bits(const uint8_t *src, uint8_t *dst) {
// this can read up to 31 bytes beyond the buffer size, but we require
// SIMDJSON_PADDING of padding
static_assert(2 * sizeof(uint8x16_t) - 1 <= SIMDJSON_PADDING);
uint8x16_t v0 = vld1q_u8(src);
uint8x16_t v1 = vld1q_u8(src + 16);
vst1q_u8(dst, v0);
vst1q_u8(dst + 16, v1);
static_assert(SIMDJSON_PADDING >= (parse_string_helper::BYTES_PROCESSED - 1));
simd8<uint8_t> v0(src);
simd8<uint8_t> v1(src + sizeof(v0));
v0.store(dst);
v1.store(dst + sizeof(v0));
uint8x16_t bs_mask = vmovq_n_u8('\\');
uint8x16_t qt_mask = vmovq_n_u8('"');
const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
uint8x16_t cmp_bs_0 = vceqq_u8(v0, bs_mask);
uint8x16_t cmp_bs_1 = vceqq_u8(v1, bs_mask);
uint8x16_t cmp_qt_0 = vceqq_u8(v0, qt_mask);
uint8x16_t cmp_qt_1 = vceqq_u8(v1, qt_mask);
cmp_bs_0 = vandq_u8(cmp_bs_0, bit_mask);
cmp_bs_1 = vandq_u8(cmp_bs_1, bit_mask);
cmp_qt_0 = vandq_u8(cmp_qt_0, bit_mask);
cmp_qt_1 = vandq_u8(cmp_qt_1, bit_mask);
uint8x16_t sum0 = vpaddq_u8(cmp_bs_0, cmp_bs_1);
uint8x16_t sum1 = vpaddq_u8(cmp_qt_0, cmp_qt_1);
sum0 = vpaddq_u8(sum0, sum1);
sum0 = vpaddq_u8(sum0, sum0);
// Getting a 64-bit bitmask is much cheaper than multiple 16-bit bitmasks on ARM; therefore, we
// smash them together into a 64-byte mask and get the bitmask from there.
uint64_t bs_and_quote = simd8x64<bool>(v0 == '\\', v1 == '\\', v0 == '"', v1 == '"').to_bitmask();
return {
vgetq_lane_u32(vreinterpretq_u32_u8(sum0), 0), // bs_bits
vgetq_lane_u32(vreinterpretq_u32_u8(sum0), 1) // quote_bits
static_cast<uint32_t>(bs_and_quote), // bs_bits
static_cast<uint32_t>(bs_and_quote >> 32) // quote_bits
};
}
#include "generic/stringparsing.h"

View File

@ -84,7 +84,7 @@ WARN_UNUSED really_inline bool parse_string(UNUSED const uint8_t *buf,
*/
/* find out where the quote is... */
uint32_t quote_dist = trailing_zeroes(helper.quote_bits);
auto quote_dist = trailing_zeroes(helper.quote_bits);
/* NULL termination is still handy if you expect all your strings to
* be NULL terminated? */
@ -92,7 +92,7 @@ WARN_UNUSED really_inline bool parse_string(UNUSED const uint8_t *buf,
dst[quote_dist] = 0;
uint32_t str_length = (dst - start_of_string) + quote_dist;
memcpy(pj.current_string_buf_loc, &str_length, sizeof(uint32_t));
memcpy(pj.current_string_buf_loc, &str_length, sizeof(str_length));
/*****************************
* Above, check for overflow in case someone has a crazy string
* (>=4GB?) _
@ -109,7 +109,7 @@ WARN_UNUSED really_inline bool parse_string(UNUSED const uint8_t *buf,
}
if (((helper.quote_bits - 1) & helper.bs_bits) != 0) {
/* find out where the backslash is */
uint32_t bs_dist = trailing_zeroes(helper.bs_bits);
auto bs_dist = trailing_zeroes(helper.bs_bits);
uint8_t escape_char = src[bs_dist + 1];
/* we encountered backslash first. Handle backslash */
if (escape_char == 'u') {
@ -136,8 +136,8 @@ WARN_UNUSED really_inline bool parse_string(UNUSED const uint8_t *buf,
} else {
/* they are the same. Since they can't co-occur, it means we
* encountered neither. */
src += helper.bytes_processed();
dst += helper.bytes_processed();
src += parse_string_helper::BYTES_PROCESSED;
dst += parse_string_helper::BYTES_PROCESSED;
}
}
/* can't be reached */
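
One step in the branch above is worth spelling out with a hedged, made-up example: quote_bits - 1 turns on every bit strictly below the first quote (or every bit when there is no quote at all), so ANDing it with bs_bits is non-zero exactly when a backslash appears before the closing quote, which is what sends parse_string into the escape-handling branch.

#include <cassert>
#include <cstdint>

// Illustrative only: the quote-vs-backslash ordering test used in parse_string.
// Bit i corresponds to byte i of the current window.
static inline bool backslash_before_quote(uint32_t bs_bits, uint32_t quote_bits) {
  return ((quote_bits - 1) & bs_bits) != 0;
}

int main() {
  assert(backslash_before_quote(0x08, 0x10));   // backslash at byte 3, quote at byte 4
  assert(!backslash_before_quote(0x20, 0x10));  // quote at byte 4 comes first
  assert(backslash_before_quote(0x08, 0x00));   // backslash present, no quote in this window
  return 0;
}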

View File

@ -74,13 +74,16 @@ namespace simdjson::haswell::simd {
struct base8_numeric: base8<T> {
static really_inline simd8<T> splat(T _value) { return _mm256_set1_epi8(_value); }
static really_inline simd8<T> zero() { return _mm256_setzero_si256(); }
static really_inline simd8<T> load(const T* values) {
static really_inline simd8<T> load(const T values[32]) {
return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(values));
}
really_inline base8_numeric() : base8<T>() {}
really_inline base8_numeric(const __m256i _value) : base8<T>(_value) {}
// Store to array
really_inline void store(T dst[32]) { return _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), *this); }
// Addition/subtraction are the same for signed and unsigned
really_inline simd8<T> operator+(const simd8<T> other) const { return _mm256_add_epi8(*this, other); }
really_inline simd8<T> operator-(const simd8<T> other) const { return _mm256_sub_epi8(*this, other); }
@ -131,7 +134,7 @@ namespace simdjson::haswell::simd {
// Splat constructor
really_inline simd8(int8_t _value) : simd8(splat(_value)) {}
// Array constructor
really_inline simd8(const int8_t* values) : simd8(load(values)) {}
really_inline simd8(const int8_t values[32]) : simd8(load(values)) {}
// Member-by-member initialization
really_inline simd8(
int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
@ -159,7 +162,7 @@ namespace simdjson::haswell::simd {
// Splat constructor
really_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
// Array constructor
really_inline simd8(const uint8_t* values) : simd8(load(values)) {}
really_inline simd8(const uint8_t values[32]) : simd8(load(values)) {}
// Member-by-member initialization
really_inline simd8(
uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
@ -184,7 +187,7 @@ namespace simdjson::haswell::simd {
// Bit-specific operations
really_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const { return (*this & bits).any_bits_set(); }
really_inline simd8<bool> any_bits_set() const { return ~(*this == u8'\0'); }
really_inline simd8<bool> any_bits_set() const { return ~(*this == uint8_t(0)); }
really_inline bool any_bits_set_anywhere(simd8<uint8_t> bits) const { return !_mm256_testz_si256(*this, bits); }
really_inline bool any_bits_set_anywhere() const { return !_mm256_testz_si256(*this, *this); }
template<int N>
@ -198,10 +201,13 @@ namespace simdjson::haswell::simd {
const simd8<T> chunks[2];
really_inline simd8x64() : chunks{simd8<T>(), simd8<T>()} {}
really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1) : chunks{chunk0, chunk1} {}
really_inline simd8x64(const T ptr[64]) : chunks{simd8<T>::load(ptr), simd8<T>::load(ptr+32)} {}
really_inline simd8x64(const __m256i chunk0, const __m256i chunk1) : chunks{chunk0, chunk1} {}
really_inline simd8x64(const T *ptr) : chunks{simd8<T>::load(ptr), simd8<T>::load(ptr+32)} {}
really_inline void store(T *ptr) {
this->chunks[0].store(ptr);
this->chunks[1].store(ptr+sizeof(simd8<T>));
}
template <typename F>
really_inline void each(F const& each_chunk) const

View File

@ -5,39 +5,33 @@
#ifdef IS_X86_64
#include "haswell/simd.h"
#include "simdjson/common_defs.h"
#include "simdjson/parsedjson.h"
#include "jsoncharutils.h"
#ifdef JSON_TEST_STRINGS
void found_string(const uint8_t *buf, const uint8_t *parsed_begin,
const uint8_t *parsed_end);
void found_bad_string(const uint8_t *buf);
#endif
TARGET_HASWELL
namespace simdjson::haswell {
using namespace simd;
// Holds bitmasks marking the locations of backslashes and quotes.
struct parse_string_helper {
uint32_t bs_bits;
uint32_t quote_bits;
really_inline uint32_t bytes_processed() const { return sizeof(__m256i); }
static const uint32_t BYTES_PROCESSED = 32;
};
really_inline parse_string_helper find_bs_bits_and_quote_bits(const uint8_t *src, uint8_t *dst) {
// this can read up to 31 bytes beyond the buffer size, but we require
// this can read up to 15 bytes beyond the buffer size, but we require
// SIMDJSON_PADDING of padding
static_assert(sizeof(__m256i) - 1 <= SIMDJSON_PADDING);
__m256i v = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src));
// store to dest unconditionally - we can overwrite the bits we don't like
// later
_mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), v);
auto quote_mask = _mm256_cmpeq_epi8(v, _mm256_set1_epi8('"'));
static_assert(SIMDJSON_PADDING >= (parse_string_helper::BYTES_PROCESSED - 1));
simd8<uint8_t> v(src);
// store to dest unconditionally - we can overwrite the bits we don't like later
v.store(dst);
return {
static_cast<uint32_t>(_mm256_movemask_epi8(
_mm256_cmpeq_epi8(v, _mm256_set1_epi8('\\')))), // bs_bits
static_cast<uint32_t>(_mm256_movemask_epi8(quote_mask)) // quote_bits
(v == '\\').to_bitmask(), // bs_bits
(v == '"').to_bitmask(), // quote_bits
};
}
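
For reference, a hedged scalar model of what to_bitmask() means on this target (the helper below is illustrative): _mm256_movemask_epi8 gathers the top bit of each of the 32 bytes, so a byte that compared equal (0xFF) contributes a 1 and a byte that did not (0x00) contributes a 0, giving 32-bit masks without any of the pairwise-add packing the ARM version needs.

#include <cstdint>

// Hypothetical scalar equivalent of simd8<bool>::to_bitmask() on Haswell:
// bit i of the result is the most significant bit of byte i.
static inline uint32_t movemask32_scalar(const uint8_t bytes[32]) {
  uint32_t mask = 0;
  for (int i = 0; i < 32; i++) {
    mask |= uint32_t(bytes[i] >> 7) << i;
  }
  return mask;
}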

View File

@ -65,6 +65,12 @@ really_inline bool is_valid_null_atom(const uint8_t *loc) {
return error == 0;
}
#ifdef JSON_TEST_STRINGS
void found_string(const uint8_t *buf, const uint8_t *parsed_begin,
const uint8_t *parsed_end);
void found_bad_string(const uint8_t *buf);
#endif
#include "arm64/stage2_build_tape.h"
#include "haswell/stage2_build_tape.h"
#include "westmere/stage2_build_tape.h"

View File

@ -75,13 +75,16 @@ namespace simdjson::westmere::simd {
struct base8_numeric: base8<T> {
static really_inline simd8<T> splat(T _value) { return _mm_set1_epi8(_value); }
static really_inline simd8<T> zero() { return _mm_setzero_si128(); }
static really_inline simd8<T> load(const T* values) {
static really_inline simd8<T> load(const T values[16]) {
return _mm_loadu_si128(reinterpret_cast<const __m128i *>(values));
}
really_inline base8_numeric() : base8<T>() {}
really_inline base8_numeric(const __m128i _value) : base8<T>(_value) {}
// Store to array
really_inline void store(T dst[16]) { return _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), *this); }
// Addition/subtraction are the same for signed and unsigned
really_inline simd8<T> operator+(const simd8<T> other) const { return _mm_add_epi8(*this, other); }
really_inline simd8<T> operator-(const simd8<T> other) const { return _mm_sub_epi8(*this, other); }
@ -174,7 +177,7 @@ namespace simdjson::westmere::simd {
// Bit-specific operations
really_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const { return (*this & bits).any_bits_set(); }
really_inline simd8<bool> any_bits_set() const { return ~(*this == u8'\0'); }
really_inline simd8<bool> any_bits_set() const { return ~(*this == uint8_t(0)); }
really_inline bool any_bits_set_anywhere(simd8<uint8_t> bits) const { return !_mm_testz_si128(*this, bits); }
really_inline bool any_bits_set_anywhere() const { return !_mm_testz_si128(*this, *this); }
template<int N>
@ -188,8 +191,15 @@ namespace simdjson::westmere::simd {
const simd8<T> chunks[4];
really_inline simd8x64() : chunks{simd8<T>(), simd8<T>(), simd8<T>(), simd8<T>()} {}
really_inline simd8x64(const __m128i chunk0, const __m128i chunk1, const __m128i chunk2, const __m128i chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {}
really_inline simd8x64(const T *ptr) : chunks{simd8<T>::load(ptr), simd8<T>::load(ptr+16), simd8<T>::load(ptr+32), simd8<T>::load(ptr+48)} {}
really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1, const simd8<T> chunk2, const simd8<T> chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {}
really_inline simd8x64(const T ptr[64]) : chunks{simd8<T>::load(ptr), simd8<T>::load(ptr+16), simd8<T>::load(ptr+32), simd8<T>::load(ptr+48)} {}
really_inline void store(T ptr[64]) {
this->chunks[0].store(ptr);
this->chunks[1].store(ptr+16);
this->chunks[2].store(ptr+32);
this->chunks[3].store(ptr+48);
}
template <typename F>
really_inline void each(F const& each_chunk) const

View File

@ -5,38 +5,35 @@
#ifdef IS_X86_64
#include "westmere/simd.h"
#include "simdjson/common_defs.h"
#include "simdjson/parsedjson.h"
#include "jsoncharutils.h"
#ifdef JSON_TEST_STRINGS
void found_string(const uint8_t *buf, const uint8_t *parsed_begin,
const uint8_t *parsed_end);
void found_bad_string(const uint8_t *buf);
#endif
TARGET_WESTMERE
namespace simdjson::westmere {
using namespace simd;
// Holds bitmasks marking the locations of backslashes and quotes.
struct parse_string_helper {
uint32_t bs_bits;
uint32_t quote_bits;
really_inline uint32_t bytes_processed() const { return sizeof(__m128i); }
static const uint32_t BYTES_PROCESSED = 32;
};
really_inline parse_string_helper find_bs_bits_and_quote_bits(const uint8_t *src, uint8_t *dst) {
// this can read up to 31 bytes beyond the buffer size, but we require
// SIMDJSON_PADDING of padding
__m128i v = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src));
// store to dest unconditionally - we can overwrite the bits we don't like
// later
_mm_storeu_si128(reinterpret_cast<__m128i *>(dst), v);
auto quote_mask = _mm_cmpeq_epi8(v, _mm_set1_epi8('"'));
static_assert(SIMDJSON_PADDING >= (parse_string_helper::BYTES_PROCESSED - 1));
simd8<uint8_t> v0(src);
simd8<uint8_t> v1(src + 16);
v0.store(dst);
v1.store(dst + 16);
uint64_t bs_and_quote = simd8x64<bool>(v0 == '\\', v1 == '\\', v0 == '"', v1 == '"').to_bitmask();
return {
static_cast<uint32_t>(
_mm_movemask_epi8(_mm_cmpeq_epi8(v, _mm_set1_epi8('\\')))), // bs_bits
static_cast<uint32_t>(_mm_movemask_epi8(quote_mask)) // quote_bits
static_cast<uint32_t>(bs_and_quote), // bs_bits
static_cast<uint32_t>(bs_and_quote >> 32) // quote_bits
};
}
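
To close the loop, a hedged sketch of how a caller consumes the (bs_bits, quote_bits) pair returned by any of these targets (scalar and illustrative; the names and the use of std::countr_zero are assumptions, and simdjson's real loop is the one in generic/stringparsing.h):

#include <bit>
#include <cstdint>

// Illustrative only: given the two masks for the current window, report whether the
// closing quote appears before any backslash, and where it is.
static inline bool closes_string(uint32_t bs_bits, uint32_t quote_bits, uint32_t *quote_dist) {
  if (((bs_bits - 1) & quote_bits) != 0) {       // a quote occurs before the first backslash
    *quote_dist = std::countr_zero(quote_bits);  // plays the role of trailing_zeroes() above
    return true;
  }
  return false;
}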