From d89046d515f1076fa2d5ce4584797c2533ad526c Mon Sep 17 00:00:00 2001 From: John Keiser Date: Sun, 27 Oct 2019 10:51:54 -0700 Subject: [PATCH] Use simd8 helpers for find_bs_bits_and_quote_bits --- src/arm64/simd.h | 38 +++++++++++++++++++---------- src/arm64/stringparsing.h | 47 +++++++++++------------------------- src/generic/stringparsing.h | 10 ++++---- src/haswell/simd.h | 20 +++++++++------ src/haswell/stringparsing.h | 28 +++++++++------------ src/stage2_build_tape.cpp | 6 +++++ src/westmere/simd.h | 18 +++++++++++--- src/westmere/stringparsing.h | 27 +++++++++------------ 8 files changed, 100 insertions(+), 94 deletions(-) diff --git a/src/arm64/simd.h b/src/arm64/simd.h index 525959bc..e14da8e1 100644 --- a/src/arm64/simd.h +++ b/src/arm64/simd.h @@ -60,7 +60,7 @@ namespace simdjson::arm64::simd { really_inline simd8::bitmask_t to_bitmask() const { const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80}; - uint8x16_t minput = vandq_u8(*this, bit_mask); + auto minput = *this & bit_mask; uint8x16_t tmp = vpaddq_u8(minput, minput); tmp = vpaddq_u8(tmp, tmp); tmp = vpaddq_u8(tmp, tmp); @@ -80,7 +80,7 @@ namespace simdjson::arm64::simd { // Zero constructor really_inline simd8() : simd8(zero()) {} // Array constructor - really_inline simd8(const uint8_t* values) : simd8(load(values)) {} + really_inline simd8(const uint8_t values[16]) : simd8(load(values)) {} // Splat constructor really_inline simd8(uint8_t _value) : simd8(splat(_value)) {} // Member-by-member initialization @@ -92,6 +92,9 @@ namespace simdjson::arm64::simd { v8, v9, v10,v11,v12,v13,v14,v15 }) {} + // Store to array + really_inline void store(uint8_t dst[16]) { return vst1q_u8(dst, *this); } + // Saturated math really_inline simd8 saturating_add(const simd8 other) const { return vqaddq_u8(*this, other); } really_inline simd8 saturating_sub(const simd8 other) const { return vqsubq_u8(*this, other); } @@ -159,7 +162,7 @@ namespace 
simdjson::arm64::simd { static really_inline simd8 splat(int8_t _value) { return vmovq_n_s8(_value); } static really_inline simd8 zero() { return vdupq_n_s8(0); } - static really_inline simd8 load(const int8_t* values) { return vld1q_s8(values); } + static really_inline simd8 load(const int8_t values[16]) { return vld1q_s8(values); } // Conversion from/to SIMD register really_inline simd8(const int8x16_t _value) : value{_value} {} @@ -181,6 +184,9 @@ namespace simdjson::arm64::simd { v8, v9, v10,v11,v12,v13,v14,v15 }) {} + // Store to array + really_inline void store(int8_t dst[16]) { return vst1q_s8(dst, *this); } + // Explicit conversion to/from unsigned really_inline explicit simd8(const uint8x16_t other): simd8(vreinterpretq_s8_u8(other)) {} really_inline explicit operator simd8() const { return vreinterpretq_u8_s8(*this); } @@ -227,8 +233,15 @@ namespace simdjson::arm64::simd { const simd8 chunks[4]; really_inline simd8x64() : chunks{simd8(), simd8(), simd8(), simd8()} {} - really_inline simd8x64(const uint8x16_t chunk0, const uint8x16_t chunk1, const uint8x16_t chunk2, const uint8x16_t chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {} - really_inline simd8x64(const T *ptr) : chunks{simd8::load(ptr), simd8::load(ptr+16), simd8::load(ptr+32), simd8::load(ptr+48)} {} + really_inline simd8x64(const simd8 chunk0, const simd8 chunk1, const simd8 chunk2, const simd8 chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {} + really_inline simd8x64(const T ptr[64]) : chunks{simd8::load(ptr), simd8::load(ptr+16), simd8::load(ptr+32), simd8::load(ptr+48)} {} + + really_inline void store(T ptr[64]) { + this->chunks[0].store(ptr); + this->chunks[1].store(ptr+16); + this->chunks[2].store(ptr+32); + this->chunks[3].store(ptr+48); + } template really_inline void each(F const& each_chunk) const @@ -268,14 +281,13 @@ namespace simdjson::arm64::simd { } really_inline uint64_t to_bitmask() const { - const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, - 0x01,
0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80}; - uint8x16_t t0 = vandq_u8(this->chunks[0], bit_mask); - uint8x16_t t1 = vandq_u8(this->chunks[1], bit_mask); - uint8x16_t t2 = vandq_u8(this->chunks[2], bit_mask); - uint8x16_t t3 = vandq_u8(this->chunks[3], bit_mask); - uint8x16_t sum0 = vpaddq_u8(t0, t1); - uint8x16_t sum1 = vpaddq_u8(t2, t3); + const uint8x16_t bit_mask = { + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80 + }; + // Add each of the elements next to each other, successively, to stuff each 8 byte mask into one. + uint8x16_t sum0 = vpaddq_u8(this->chunks[0] & bit_mask, this->chunks[1] & bit_mask); + uint8x16_t sum1 = vpaddq_u8(this->chunks[2] & bit_mask, this->chunks[3] & bit_mask); sum0 = vpaddq_u8(sum0, sum1); sum0 = vpaddq_u8(sum0, sum0); return vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0); diff --git a/src/arm64/stringparsing.h b/src/arm64/stringparsing.h index 319c5a29..cddb4c56 100644 --- a/src/arm64/stringparsing.h +++ b/src/arm64/stringparsing.h @@ -5,57 +5,38 @@ #ifdef IS_ARM64 +#include "arm64/simd.h" #include "simdjson/common_defs.h" #include "simdjson/parsedjson.h" #include "jsoncharutils.h" -#ifdef JSON_TEST_STRINGS -void found_string(const uint8_t *buf, const uint8_t *parsed_begin, - const uint8_t *parsed_end); -void found_bad_string(const uint8_t *buf); -#endif - namespace simdjson::arm64 { +using namespace simd; + // Holds backslashes and quotes locations. 
struct parse_string_helper { uint32_t bs_bits; uint32_t quote_bits; - really_inline uint32_t bytes_processed() const { return sizeof(uint8x16_t)*2; } + static const uint32_t BYTES_PROCESSED = 32; }; really_inline parse_string_helper find_bs_bits_and_quote_bits(const uint8_t *src, uint8_t *dst) { // this can read up to 31 bytes beyond the buffer size, but we require // SIMDJSON_PADDING of padding - static_assert(2 * sizeof(uint8x16_t) - 1 <= SIMDJSON_PADDING); - uint8x16_t v0 = vld1q_u8(src); - uint8x16_t v1 = vld1q_u8(src + 16); - vst1q_u8(dst, v0); - vst1q_u8(dst + 16, v1); + static_assert(SIMDJSON_PADDING >= (parse_string_helper::BYTES_PROCESSED - 1)); + simd8 v0(src); + simd8 v1(src + sizeof(v0)); + v0.store(dst); + v1.store(dst + sizeof(v0)); - uint8x16_t bs_mask = vmovq_n_u8('\\'); - uint8x16_t qt_mask = vmovq_n_u8('"'); - const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, - 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80}; - uint8x16_t cmp_bs_0 = vceqq_u8(v0, bs_mask); - uint8x16_t cmp_bs_1 = vceqq_u8(v1, bs_mask); - uint8x16_t cmp_qt_0 = vceqq_u8(v0, qt_mask); - uint8x16_t cmp_qt_1 = vceqq_u8(v1, qt_mask); - - cmp_bs_0 = vandq_u8(cmp_bs_0, bit_mask); - cmp_bs_1 = vandq_u8(cmp_bs_1, bit_mask); - cmp_qt_0 = vandq_u8(cmp_qt_0, bit_mask); - cmp_qt_1 = vandq_u8(cmp_qt_1, bit_mask); - - uint8x16_t sum0 = vpaddq_u8(cmp_bs_0, cmp_bs_1); - uint8x16_t sum1 = vpaddq_u8(cmp_qt_0, cmp_qt_1); - sum0 = vpaddq_u8(sum0, sum1); - sum0 = vpaddq_u8(sum0, sum0); + // Getting a 64-bit bitmask is much cheaper than multiple 16-bit bitmasks on ARM; therefore, we + // smash them together into a 64-byte mask and get the bitmask from there. 
+ uint64_t bs_and_quote = simd8x64(v0 == '\\', v1 == '\\', v0 == '"', v1 == '"').to_bitmask(); return { - vgetq_lane_u32(vreinterpretq_u32_u8(sum0), 0), // bs_bits - vgetq_lane_u32(vreinterpretq_u32_u8(sum0), 1) // quote_bits + static_cast(bs_and_quote), // bs_bits + static_cast(bs_and_quote >> 32) // quote_bits }; - } #include "generic/stringparsing.h" diff --git a/src/generic/stringparsing.h b/src/generic/stringparsing.h index 230d30b5..b71584d8 100644 --- a/src/generic/stringparsing.h +++ b/src/generic/stringparsing.h @@ -84,7 +84,7 @@ WARN_UNUSED really_inline bool parse_string(UNUSED const uint8_t *buf, */ /* find out where the quote is... */ - uint32_t quote_dist = trailing_zeroes(helper.quote_bits); + auto quote_dist = trailing_zeroes(helper.quote_bits); /* NULL termination is still handy if you expect all your strings to * be NULL terminated? */ @@ -92,7 +92,7 @@ WARN_UNUSED really_inline bool parse_string(UNUSED const uint8_t *buf, dst[quote_dist] = 0; uint32_t str_length = (dst - start_of_string) + quote_dist; - memcpy(pj.current_string_buf_loc, &str_length, sizeof(uint32_t)); + memcpy(pj.current_string_buf_loc, &str_length, sizeof(str_length)); /***************************** * Above, check for overflow in case someone has a crazy string * (>=4GB?) _ @@ -109,7 +109,7 @@ WARN_UNUSED really_inline bool parse_string(UNUSED const uint8_t *buf, } if (((helper.quote_bits - 1) & helper.bs_bits) != 0) { /* find out where the backspace is */ - uint32_t bs_dist = trailing_zeroes(helper.bs_bits); + auto bs_dist = trailing_zeroes(helper.bs_bits); uint8_t escape_char = src[bs_dist + 1]; /* we encountered backslash first. Handle backslash */ if (escape_char == 'u') { @@ -136,8 +136,8 @@ WARN_UNUSED really_inline bool parse_string(UNUSED const uint8_t *buf, } else { /* they are the same. Since they can't co-occur, it means we * encountered neither. 
*/ - src += helper.bytes_processed(); - dst += helper.bytes_processed(); + src += parse_string_helper::BYTES_PROCESSED; + dst += parse_string_helper::BYTES_PROCESSED; } } /* can't be reached */ diff --git a/src/haswell/simd.h b/src/haswell/simd.h index ad6d0589..0f862956 100644 --- a/src/haswell/simd.h +++ b/src/haswell/simd.h @@ -74,13 +74,16 @@ namespace simdjson::haswell::simd { struct base8_numeric: base8 { static really_inline simd8 splat(T _value) { return _mm256_set1_epi8(_value); } static really_inline simd8 zero() { return _mm256_setzero_si256(); } - static really_inline simd8 load(const T* values) { + static really_inline simd8 load(const T values[32]) { return _mm256_loadu_si256(reinterpret_cast(values)); } really_inline base8_numeric() : base8() {} really_inline base8_numeric(const __m256i _value) : base8(_value) {} + // Store to array + really_inline void store(T dst[32]) { return _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), *this); } + // Addition/subtraction are the same for signed and unsigned really_inline simd8 operator+(const simd8 other) const { return _mm256_add_epi8(*this, other); } really_inline simd8 operator-(const simd8 other) const { return _mm256_sub_epi8(*this, other); } @@ -131,7 +134,7 @@ namespace simdjson::haswell::simd { // Splat constructor really_inline simd8(int8_t _value) : simd8(splat(_value)) {} // Array constructor - really_inline simd8(const int8_t* values) : simd8(load(values)) {} + really_inline simd8(const int8_t values[32]) : simd8(load(values)) {} // Member-by-member initialization really_inline simd8( int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7, @@ -159,7 +162,7 @@ namespace simdjson::haswell::simd { // Splat constructor really_inline simd8(uint8_t _value) : simd8(splat(_value)) {} // Array constructor - really_inline simd8(const uint8_t* values) : simd8(load(values)) {} + really_inline simd8(const uint8_t values[32]) : simd8(load(values)) {} // Member-by-member 
initialization really_inline simd8( uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7, @@ -184,7 +187,7 @@ namespace simdjson::haswell::simd { // Bit-specific operations really_inline simd8 any_bits_set(simd8 bits) const { return (*this & bits).any_bits_set(); } - really_inline simd8 any_bits_set() const { return ~(*this == u8'\0'); } + really_inline simd8 any_bits_set() const { return ~(*this == uint8_t(0)); } really_inline bool any_bits_set_anywhere(simd8 bits) const { return !_mm256_testz_si256(*this, bits); } really_inline bool any_bits_set_anywhere() const { return !_mm256_testz_si256(*this, *this); } template @@ -198,10 +201,13 @@ namespace simdjson::haswell::simd { const simd8 chunks[2]; really_inline simd8x64() : chunks{simd8(), simd8()} {} + really_inline simd8x64(const simd8 chunk0, const simd8 chunk1) : chunks{chunk0, chunk1} {} + really_inline simd8x64(const T ptr[64]) : chunks{simd8::load(ptr), simd8::load(ptr+32)} {} - really_inline simd8x64(const __m256i chunk0, const __m256i chunk1) : chunks{chunk0, chunk1} {} - - really_inline simd8x64(const T *ptr) : chunks{simd8::load(ptr), simd8::load(ptr+32)} {} + really_inline void store(T *ptr) { + this->chunks[0].store(ptr); + this->chunks[1].store(ptr+sizeof(simd8)); + } template really_inline void each(F const& each_chunk) const diff --git a/src/haswell/stringparsing.h b/src/haswell/stringparsing.h index 97c68361..47b3d70e 100644 --- a/src/haswell/stringparsing.h +++ b/src/haswell/stringparsing.h @@ -5,39 +5,33 @@ #ifdef IS_X86_64 +#include "haswell/simd.h" #include "simdjson/common_defs.h" #include "simdjson/parsedjson.h" #include "jsoncharutils.h" -#ifdef JSON_TEST_STRINGS -void found_string(const uint8_t *buf, const uint8_t *parsed_begin, - const uint8_t *parsed_end); -void found_bad_string(const uint8_t *buf); -#endif - TARGET_HASWELL namespace simdjson::haswell { +using namespace simd; + // Holds backslashes and quotes locations. 
struct parse_string_helper { uint32_t bs_bits; uint32_t quote_bits; - really_inline uint32_t bytes_processed() const { return sizeof(__m256i); } + static const uint32_t BYTES_PROCESSED = 32; }; really_inline parse_string_helper find_bs_bits_and_quote_bits(const uint8_t *src, uint8_t *dst) { - // this can read up to 31 bytes beyond the buffer size, but we require + // this can read up to 31 bytes beyond the buffer size, but we require // SIMDJSON_PADDING of padding - static_assert(sizeof(__m256i) - 1 <= SIMDJSON_PADDING); - __m256i v = _mm256_loadu_si256(reinterpret_cast(src)); - // store to dest unconditionally - we can overwrite the bits we don't like - // later - _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), v); - auto quote_mask = _mm256_cmpeq_epi8(v, _mm256_set1_epi8('"')); + static_assert(SIMDJSON_PADDING >= (parse_string_helper::BYTES_PROCESSED - 1)); + simd8 v(src); + // store to dest unconditionally - we can overwrite the bits we don't like later + v.store(dst); return { - static_cast(_mm256_movemask_epi8( - _mm256_cmpeq_epi8(v, _mm256_set1_epi8('\\')))), // bs_bits - static_cast(_mm256_movemask_epi8(quote_mask)) // quote_bits + (v == '\\').to_bitmask(), // bs_bits + (v == '"').to_bitmask(), // quote_bits }; } diff --git a/src/stage2_build_tape.cpp b/src/stage2_build_tape.cpp index 08f95638..e1372779 100644 --- a/src/stage2_build_tape.cpp +++ b/src/stage2_build_tape.cpp @@ -65,6 +65,12 @@ really_inline bool is_valid_null_atom(const uint8_t *loc) { return error == 0; } +#ifdef JSON_TEST_STRINGS +void found_string(const uint8_t *buf, const uint8_t *parsed_begin, + const uint8_t *parsed_end); +void found_bad_string(const uint8_t *buf); +#endif + #include "arm64/stage2_build_tape.h" #include "haswell/stage2_build_tape.h" #include "westmere/stage2_build_tape.h" diff --git a/src/westmere/simd.h b/src/westmere/simd.h index 808427ca..6494bc51 100644 --- a/src/westmere/simd.h +++ b/src/westmere/simd.h @@ -75,13 +75,16 @@ namespace simdjson::westmere::simd { 
struct base8_numeric: base8 { static really_inline simd8 splat(T _value) { return _mm_set1_epi8(_value); } static really_inline simd8 zero() { return _mm_setzero_si128(); } - static really_inline simd8 load(const T* values) { + static really_inline simd8 load(const T values[16]) { return _mm_loadu_si128(reinterpret_cast(values)); } really_inline base8_numeric() : base8() {} really_inline base8_numeric(const __m128i _value) : base8(_value) {} + // Store to array + really_inline void store(T dst[16]) { return _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), *this); } + // Addition/subtraction are the same for signed and unsigned really_inline simd8 operator+(const simd8 other) const { return _mm_add_epi8(*this, other); } really_inline simd8 operator-(const simd8 other) const { return _mm_sub_epi8(*this, other); } @@ -174,7 +177,7 @@ namespace simdjson::westmere::simd { // Bit-specific operations really_inline simd8 any_bits_set(simd8 bits) const { return (*this & bits).any_bits_set(); } - really_inline simd8 any_bits_set() const { return ~(*this == u8'\0'); } + really_inline simd8 any_bits_set() const { return ~(*this == uint8_t(0)); } really_inline bool any_bits_set_anywhere(simd8 bits) const { return !_mm_testz_si128(*this, bits); } really_inline bool any_bits_set_anywhere() const { return !_mm_testz_si128(*this, *this); } template @@ -188,8 +191,15 @@ namespace simdjson::westmere::simd { const simd8 chunks[4]; really_inline simd8x64() : chunks{simd8(), simd8(), simd8(), simd8()} {} - really_inline simd8x64(const __m128i chunk0, const __m128i chunk1, const __m128i chunk2, const __m128i chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {} - really_inline simd8x64(const T *ptr) : chunks{simd8::load(ptr), simd8::load(ptr+16), simd8::load(ptr+32), simd8::load(ptr+48)} {} + really_inline simd8x64(const simd8 chunk0, const simd8 chunk1, const simd8 chunk2, const simd8 chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {} + really_inline simd8x64(const T ptr[64]) : 
chunks{simd8::load(ptr), simd8::load(ptr+16), simd8::load(ptr+32), simd8::load(ptr+48)} {} + + really_inline void store(T ptr[64]) { + this->chunks[0].store(ptr); + this->chunks[1].store(ptr+16); + this->chunks[2].store(ptr+32); + this->chunks[3].store(ptr+48); + } template really_inline void each(F const& each_chunk) const diff --git a/src/westmere/stringparsing.h b/src/westmere/stringparsing.h index 0c128ae7..dcc8deb5 100644 --- a/src/westmere/stringparsing.h +++ b/src/westmere/stringparsing.h @@ -5,38 +5,35 @@ #ifdef IS_X86_64 +#include "westmere/simd.h" #include "simdjson/common_defs.h" #include "simdjson/parsedjson.h" #include "jsoncharutils.h" -#ifdef JSON_TEST_STRINGS -void found_string(const uint8_t *buf, const uint8_t *parsed_begin, - const uint8_t *parsed_end); -void found_bad_string(const uint8_t *buf); -#endif - TARGET_WESTMERE namespace simdjson::westmere { +using namespace simd; + // Holds backslashes and quotes locations. struct parse_string_helper { uint32_t bs_bits; uint32_t quote_bits; - really_inline uint32_t bytes_processed() const { return sizeof(__m128i); } + static const uint32_t BYTES_PROCESSED = 32; }; really_inline parse_string_helper find_bs_bits_and_quote_bits(const uint8_t *src, uint8_t *dst) { // this can read up to 31 bytes beyond the buffer size, but we require // SIMDJSON_PADDING of padding - __m128i v = _mm_loadu_si128(reinterpret_cast(src)); - // store to dest unconditionally - we can overwrite the bits we don't like - // later - _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), v); - auto quote_mask = _mm_cmpeq_epi8(v, _mm_set1_epi8('"')); + static_assert(SIMDJSON_PADDING >= (parse_string_helper::BYTES_PROCESSED - 1)); + simd8 v0(src); + simd8 v1(src + 16); + v0.store(dst); + v1.store(dst + 16); + uint64_t bs_and_quote = simd8x64(v0 == '\\', v1 == '\\', v0 == '"', v1 == '"').to_bitmask(); return { - static_cast( - _mm_movemask_epi8(_mm_cmpeq_epi8(v, _mm_set1_epi8('\\')))), // bs_bits - static_cast(_mm_movemask_epi8(quote_mask)) 
// quote_bits + static_cast(bs_and_quote), // bs_bits + static_cast(bs_and_quote >> 32) // quote_bits }; }