From f7e893667d8f8c9893b1d6b075034c7c34a0d79c Mon Sep 17 00:00:00 2001 From: John Keiser Date: Mon, 2 Sep 2019 09:46:05 -0700 Subject: [PATCH] Use simd_input generic methods for utf8 checking (#301) * Use generic each/reduce in simdutf8check * Remove macros from generic simd_input uses * Use array instead of members to store simd registers * Default local checkperf to clone from . --- .drone.yml | 4 ++ scripts/checkperf.sh | 4 +- src/arm64/simd_input.h | 65 +++++++++++++++++++++----------- src/arm64/simdutf8check.h | 21 ++++------- src/arm64/stage1_find_marks.h | 8 +++- src/haswell/simd_input.h | 41 +++++++++++++------- src/haswell/simdutf8check.h | 12 +++--- src/haswell/stage1_find_marks.h | 4 +- src/simd_input.h | 12 +++--- src/westmere/simd_input.h | 61 ++++++++++++++++++------------ src/westmere/simdutf8check.h | 29 ++++---------- src/westmere/stage1_find_marks.h | 4 +- 12 files changed, 154 insertions(+), 111 deletions(-) diff --git a/.drone.yml b/.drone.yml index ba19f79d..7680785c 100644 --- a/.drone.yml +++ b/.drone.yml @@ -14,6 +14,8 @@ steps: - make amalgamate - name: checkperf image: gcc:8 + environment: + CHECKPERF_REPOSITORY: https://github.com/lemire/simdjson commands: - make checkperf --- @@ -33,6 +35,8 @@ steps: - make amalgamate - name: checkperf image: gcc:8 + environment: + CHECKPERF_REPOSITORY: https://github.com/lemire/simdjson commands: - make checkperf --- diff --git a/scripts/checkperf.sh b/scripts/checkperf.sh index 3817a990..1e90da3a 100644 --- a/scripts/checkperf.sh +++ b/scripts/checkperf.sh @@ -3,6 +3,8 @@ set -e SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )" +if [ -z "$CHECKPERF_REPOSITORY"]; then CHECKPERF_REPOSITORY=.; fi + # Arguments: perfdiff.sh if [ -z "$1" ]; then reference_branch="master"; else reference_branch=$1; shift; fi if [ -z "$*" ]; then perftests="jsonexamples/twitter.json"; else perftests=$*; fi @@ -13,7 +15,7 @@ current=$SCRIPTPATH/.. reference=$current/benchbranch/$reference_branch rm -rf $reference mkdir -p $reference -git clone --depth 1 -b $reference_branch https://github.com/lemire/simdjson $reference +git clone --depth 1 -b $reference_branch $CHECKPERF_REPOSITORY $reference cd $reference make parse diff --git a/src/arm64/simd_input.h b/src/arm64/simd_input.h index 86b5793a..ff126356 100644 --- a/src/arm64/simd_input.h +++ b/src/arm64/simd_input.h @@ -5,7 +5,7 @@ #ifdef IS_ARM64 -namespace simdjson { +namespace simdjson::arm64 { really_inline uint16_t neon_movemask(uint8x16_t input) { const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, @@ -32,49 +32,68 @@ really_inline uint64_t neon_movemask_bulk(uint8x16_t p0, uint8x16_t p1, return vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0); } +} // namespace simdjson::arm64 + +namespace simdjson { + +using namespace simdjson::arm64; + template <> struct simd_input { - uint8x16_t i0; - uint8x16_t i1; - uint8x16_t i2; - uint8x16_t i3; + uint8x16_t chunks[4]; really_inline simd_input(const uint8_t *ptr) { - this->i0 = vld1q_u8(ptr + 0); - this->i1 = vld1q_u8(ptr + 16); - this->i2 = vld1q_u8(ptr + 32); - this->i3 = vld1q_u8(ptr + 48); + this->chunks[0] = vld1q_u8(ptr + 0*16); + this->chunks[1] = vld1q_u8(ptr + 1*16); + this->chunks[2] = vld1q_u8(ptr + 2*16); + this->chunks[3] = vld1q_u8(ptr + 3*16); } - really_inline simd_input(uint8x16_t a0, uint8x16_t a1, uint8x16_t a2, uint8x16_t a3) { - this->i0 = a0; - this->i1 = a1; - this->i2 = a2; - this->i3 = a3; + really_inline simd_input(uint8x16_t chunk0, uint8x16_t chunk1, uint8x16_t chunk2, uint8x16_t chunk3) { + this->chunks[0] = chunk0; + this->chunks[1] = chunk1; + this->chunks[2] = chunk2; + this->chunks[3] = chunk3; + } + + template + really_inline void each(F const& each_chunk) + { + each_chunk(this->chunks[0]); + each_chunk(this->chunks[1]); + each_chunk(this->chunks[2]); + each_chunk(this->chunks[3]); } template really_inline simd_input map(F const& map_chunk) { return simd_input( - map_chunk(this->i0), - map_chunk(this->i1), - map_chunk(this->i2), - map_chunk(this->i3) + map_chunk(this->chunks[0]), + map_chunk(this->chunks[1]), + map_chunk(this->chunks[2]), + map_chunk(this->chunks[3]) ); } template really_inline simd_input map(simd_input b, F const& map_chunk) { return simd_input( - map_chunk(this->i0, b.i0), - map_chunk(this->i1, b.i1), - map_chunk(this->i2, b.i2), - map_chunk(this->i3, b.i3) + map_chunk(this->chunks[0], b.chunks[0]), + map_chunk(this->chunks[1], b.chunks[1]), + map_chunk(this->chunks[2], b.chunks[2]), + map_chunk(this->chunks[3], b.chunks[3]) ); } + template + really_inline uint8x16_t reduce(F const& reduce_pair) { + uint8x16_t r01 = reduce_pair(this->chunks[0], this->chunks[1]); + uint8x16_t r23 = reduce_pair(this->chunks[2], this->chunks[3]); + return reduce_pair(r01, r23); + } + really_inline uint64_t to_bitmask() { - return neon_movemask_bulk(this->i0, this->i1, this->i2, this->i3); + return neon_movemask_bulk(this->chunks[0], this->chunks[1], this->chunks[2], this->chunks[3]); } really_inline uint64_t eq(uint8_t m) { diff --git a/src/arm64/simdutf8check.h b/src/arm64/simdutf8check.h index b777c177..f49f2cfb 100644 --- a/src/arm64/simdutf8check.h +++ b/src/arm64/simdutf8check.h @@ -181,11 +181,11 @@ check_utf8_bytes(int8x16_t current_bytes, struct processed_utf_bytes *previous, really_inline bool check_ascii_neon(simd_input in) { // checking if the most significant bit is always equal to 0. uint8x16_t high_bit = vdupq_n_u8(0x80); - uint8x16_t t0 = vorrq_u8(in.i0, in.i1); - uint8x16_t t1 = vorrq_u8(in.i2, in.i3); - uint8x16_t t3 = vorrq_u8(t0, t1); - uint8x16_t t4 = vandq_u8(t3, high_bit); - uint64x2_t v64 = vreinterpretq_u64_u8(t4); + uint8x16_t any_bits_on = in.reduce([&](auto a, auto b) { + return vorrq_u8(a, b); + }); + uint8x16_t high_bit_on = vandq_u8(any_bits_on, high_bit); + uint64x2_t v64 = vreinterpretq_u64_u8(high_bit_on); uint32x2_t v32 = vqmovn_u64(v64); uint64x1_t result = vreinterpret_u64_u32(v32); return vget_lane_u64(result, 0) == 0; @@ -215,14 +215,9 @@ struct utf8_checker { this->has_error); } else { // it is not ascii so we have to do heavy work - this->previous = check_utf8_bytes(vreinterpretq_s8_u8(in.i0), - &(this->previous), &(this->has_error)); - this->previous = check_utf8_bytes(vreinterpretq_s8_u8(in.i1), - &(this->previous), &(this->has_error)); - this->previous = check_utf8_bytes(vreinterpretq_s8_u8(in.i2), - &(this->previous), &(this->has_error)); - this->previous = check_utf8_bytes(vreinterpretq_s8_u8(in.i3), - &(this->previous), &(this->has_error)); + in.each([&](auto _in) { + this->previous = check_utf8_bytes(vreinterpretq_s8_u8(_in), &(this->previous), &(this->has_error)); + }); } } diff --git a/src/arm64/stage1_find_marks.h b/src/arm64/stage1_find_marks.h index 1cfde5fd..91978b51 100644 --- a/src/arm64/stage1_find_marks.h +++ b/src/arm64/stage1_find_marks.h @@ -39,10 +39,14 @@ really_inline void find_whitespace_and_structurals( }); const uint8x16_t structural_shufti_mask = vmovq_n_u8(0x7); - structurals = MAP_BITMASK( v, vtstq_u8(_v, structural_shufti_mask) ); + structurals = v.map([&](auto _v) { + return vtstq_u8(_v, structural_shufti_mask); + }).to_bitmask(); const uint8x16_t whitespace_shufti_mask = vmovq_n_u8(0x18); - whitespace = MAP_BITMASK( v, vtstq_u8(_v, whitespace_shufti_mask) ); + whitespace = v.map([&](auto _v) { + return vtstq_u8(_v, whitespace_shufti_mask); + }).to_bitmask(); } #include "generic/stage1_find_marks_flatten.h" diff --git a/src/haswell/simd_input.h b/src/haswell/simd_input.h index 3a9f40d6..c9b8a5f7 100644 --- a/src/haswell/simd_input.h +++ b/src/haswell/simd_input.h @@ -10,38 +10,51 @@ namespace simdjson { template <> struct simd_input { - __m256i lo; - __m256i hi; + __m256i chunks[2]; - really_inline simd_input(const uint8_t *ptr) { - this->lo = _mm256_loadu_si256(reinterpret_cast(ptr + 0)); - this->hi = _mm256_loadu_si256(reinterpret_cast(ptr + 32)); + really_inline simd_input(const uint8_t *ptr) + { + this->chunks[0] = _mm256_loadu_si256(reinterpret_cast(ptr + 0*32)); + this->chunks[1] = _mm256_loadu_si256(reinterpret_cast(ptr + 1*32)); } - really_inline simd_input(__m256i a_lo, __m256i a_hi) { - this->lo = a_lo; - this->hi = a_hi; + really_inline simd_input(__m256i chunk0, __m256i chunk1) + { + this->chunks[0] = chunk0; + this->chunks[1] = chunk1; + } + + template + really_inline void each(F const& each_chunk) + { + each_chunk(this->chunks[0]); + each_chunk(this->chunks[1]); } template really_inline simd_input map(F const& map_chunk) { return simd_input( - map_chunk(this->lo), - map_chunk(this->hi) + map_chunk(this->chunks[0]), + map_chunk(this->chunks[1]) ); } template really_inline simd_input map(simd_input b, F const& map_chunk) { return simd_input( - map_chunk(this->lo, b.lo), - map_chunk(this->hi, b.hi) + map_chunk(this->chunks[0], b.chunks[0]), + map_chunk(this->chunks[1], b.chunks[1]) ); } + template + really_inline __m256i reduce(F const& reduce_pair) { + return reduce_pair(this->chunks[0], this->chunks[1]); + } + really_inline uint64_t to_bitmask() { - uint64_t r_lo = static_cast(_mm256_movemask_epi8(this->lo)); - uint64_t r_hi = _mm256_movemask_epi8(this->hi); + uint64_t r_lo = static_cast(_mm256_movemask_epi8(this->chunks[0])); + uint64_t r_hi = _mm256_movemask_epi8(this->chunks[1]); return r_lo | (r_hi << 32); } diff --git a/src/haswell/simdutf8check.h b/src/haswell/simdutf8check.h index e0c7b70b..e0b993d4 100644 --- a/src/haswell/simdutf8check.h +++ b/src/haswell/simdutf8check.h @@ -215,7 +215,10 @@ struct utf8_checker { really_inline void check_next_input(simd_input in) { __m256i high_bit = _mm256_set1_epi8(0x80u); - if ((_mm256_testz_si256(_mm256_or_si256(in.lo, in.hi), high_bit)) == 1) { + __m256i any_bits_on = in.reduce([&](auto a, auto b) { + return _mm256_or_si256(a, b); + }); + if ((_mm256_testz_si256(any_bits_on, high_bit)) == 1) { // it is ascii, we just check continuation this->has_error = _mm256_or_si256( _mm256_cmpgt_epi8(this->previous.carried_continuations, @@ -225,10 +228,9 @@ struct utf8_checker { this->has_error); } else { // it is not ascii so we have to do heavy work - this->previous = - avx_check_utf8_bytes(in.lo, &(this->previous), &(this->has_error)); - this->previous = - avx_check_utf8_bytes(in.hi, &(this->previous), &(this->has_error)); + in.each([&](auto _in) { + this->previous = avx_check_utf8_bytes(_in, &(this->previous), &(this->has_error)); + }); } } diff --git a/src/haswell/stage1_find_marks.h b/src/haswell/stage1_find_marks.h index a054b2d7..88579cb2 100644 --- a/src/haswell/stage1_find_marks.h +++ b/src/haswell/stage1_find_marks.h @@ -70,7 +70,9 @@ really_inline void find_whitespace_and_structurals(simd_input in, const __m256i struct_offset = _mm256_set1_epi8(0xd4u); const __m256i struct_mask = _mm256_set1_epi8(32); - whitespace = MAP_BITMASK( in, _mm256_cmpeq_epi8(_in, _mm256_shuffle_epi8(white_table, _in)) ); + whitespace = in.map([&](auto _in) { + return _mm256_cmpeq_epi8(_in, _mm256_shuffle_epi8(white_table, _in)); + }).to_bitmask(); structurals = in.map([&](auto _in) { const __m256i r1 = _mm256_add_epi8(struct_offset, _in); diff --git a/src/simd_input.h b/src/simd_input.h index 37bd5863..e370bc94 100644 --- a/src/simd_input.h +++ b/src/simd_input.h @@ -4,19 +4,24 @@ #include "simdjson/common_defs.h" #include "simdjson/portability.h" #include "simdjson/simdjson.h" -#include namespace simdjson { template struct simd_input { simd_input(const uint8_t *ptr); + // Run an operation on each chunk. + template + really_inline void each(F const& each_chunk); // Map through each simd register in this input, producing another simd_input. template really_inline simd_input map(F const& map_chunk); // Map through each simd register across two inputs, producing a single simd_input. template really_inline simd_input map(simd_input b, F const& map_chunk); + // Run a horizontal operation like "sum" across the whole input + // template + // really_inline simd reduce(F const& map_chunk); // turn this bytemask (usually the result of a simd comparison operation) into a bitmask. uint64_t to_bitmask(); // a straightforward comparison of a mask against input. @@ -25,11 +30,6 @@ struct simd_input { uint64_t lteq(uint8_t m); }; // struct simd_input -#define MAP_CHUNKS(A, EXPR) A.map([&](auto _##A) { return (EXPR); }) -#define MAP_BITMASK(A, EXPR) MAP_CHUNKS(A, EXPR).to_bitmask() -#define MAP_CHUNKS2(A, B, EXPR) A.map((B), [&](auto _##A, auto _##B) { return (EXPR); }) -#define MAP_BITMASK2(A, B, EXPR) MAP_CHUNKS2(A, B, EXPR).to_bitmask() - } // namespace simdjson #endif diff --git a/src/westmere/simd_input.h b/src/westmere/simd_input.h index 40f356ee..7c9a1b6d 100644 --- a/src/westmere/simd_input.h +++ b/src/westmere/simd_input.h @@ -10,51 +10,64 @@ namespace simdjson { template <> struct simd_input { - __m128i v0; - __m128i v1; - __m128i v2; - __m128i v3; + __m128i chunks[4]; really_inline simd_input(const uint8_t *ptr) { - this->v0 = _mm_loadu_si128(reinterpret_cast(ptr + 0)); - this->v1 = _mm_loadu_si128(reinterpret_cast(ptr + 16)); - this->v2 = _mm_loadu_si128(reinterpret_cast(ptr + 32)); - this->v3 = _mm_loadu_si128(reinterpret_cast(ptr + 48)); + this->chunks[0] = _mm_loadu_si128(reinterpret_cast(ptr + 0)); + this->chunks[1] = _mm_loadu_si128(reinterpret_cast(ptr + 16)); + this->chunks[2] = _mm_loadu_si128(reinterpret_cast(ptr + 32)); + this->chunks[3] = _mm_loadu_si128(reinterpret_cast(ptr + 48)); } really_inline simd_input(__m128i i0, __m128i i1, __m128i i2, __m128i i3) { - this->v0 = i0; - this->v1 = i1; - this->v2 = i2; - this->v3 = i3; + this->chunks[0] = i0; + this->chunks[1] = i1; + this->chunks[2] = i2; + this->chunks[3] = i3; + } + + template + really_inline void each(F const& each_chunk) + { + each_chunk(this->chunks[0]); + each_chunk(this->chunks[1]); + each_chunk(this->chunks[2]); + each_chunk(this->chunks[3]); } template really_inline simd_input map(F const& map_chunk) { return simd_input( - map_chunk(this->v0), - map_chunk(this->v1), - map_chunk(this->v2), - map_chunk(this->v3) + map_chunk(this->chunks[0]), + map_chunk(this->chunks[1]), + map_chunk(this->chunks[2]), + map_chunk(this->chunks[3]) ); } template really_inline simd_input map(simd_input b, F const& map_chunk) { return simd_input( - map_chunk(this->v0, b.v0), - map_chunk(this->v1, b.v1), - map_chunk(this->v2, b.v2), - map_chunk(this->v3, b.v3) + map_chunk(this->chunks[0], b.chunks[0]), + map_chunk(this->chunks[1], b.chunks[1]), + map_chunk(this->chunks[2], b.chunks[2]), + map_chunk(this->chunks[3], b.chunks[3]) ); } + template + really_inline __m128i reduce(F const& reduce_pair) { + __m128i r01 = reduce_pair(this->chunks[0], this->chunks[1]); + __m128i r23 = reduce_pair(this->chunks[2], this->chunks[3]); + return reduce_pair(r01, r23); + } + really_inline uint64_t to_bitmask() { - uint64_t r0 = static_cast(_mm_movemask_epi8(this->v0)); - uint64_t r1 = _mm_movemask_epi8(this->v1); - uint64_t r2 = _mm_movemask_epi8(this->v2); - uint64_t r3 = _mm_movemask_epi8(this->v3); + uint64_t r0 = static_cast(_mm_movemask_epi8(this->chunks[0])); + uint64_t r1 = _mm_movemask_epi8(this->chunks[1]); + uint64_t r2 = _mm_movemask_epi8(this->chunks[2]); + uint64_t r3 = _mm_movemask_epi8(this->chunks[3]); return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48); } diff --git a/src/westmere/simdutf8check.h b/src/westmere/simdutf8check.h index c64f1c2b..627b4ef0 100644 --- a/src/westmere/simdutf8check.h +++ b/src/westmere/simdutf8check.h @@ -164,7 +164,7 @@ check_utf8_bytes(__m128i current_bytes, struct processed_utf_bytes *previous, } } // namespace simdjson::westmere -UNTARGET_REGION // westmere +UNTARGET_REGION TARGET_WESTMERE namespace simdjson { @@ -182,7 +182,10 @@ struct utf8_checker { really_inline void check_next_input(simd_input in) { __m128i high_bit = _mm_set1_epi8(0x80u); - if ((_mm_testz_si128(_mm_or_si128(in.v0, in.v1), high_bit)) == 1) { + __m128i any_bits_on = in.reduce([&](auto a, auto b) { + return _mm_or_si128(a, b); + }); + if ((_mm_testz_si128( any_bits_on, high_bit)) == 1) { // it is ascii, we just check continuation this->has_error = _mm_or_si128(_mm_cmpgt_epi8(this->previous.carried_continuations, @@ -191,25 +194,9 @@ struct utf8_checker { this->has_error); } else { // it is not ascii so we have to do heavy work - this->previous = - check_utf8_bytes(in.v0, &(this->previous), &(this->has_error)); - this->previous = - check_utf8_bytes(in.v1, &(this->previous), &(this->has_error)); - } - - if ((_mm_testz_si128(_mm_or_si128(in.v2, in.v3), high_bit)) == 1) { - // it is ascii, we just check continuation - this->has_error = - _mm_or_si128(_mm_cmpgt_epi8(this->previous.carried_continuations, - _mm_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 1)), - this->has_error); - } else { - // it is not ascii so we have to do heavy work - this->previous = - check_utf8_bytes(in.v2, &(this->previous), &(this->has_error)); - this->previous = - check_utf8_bytes(in.v3, &(this->previous), &(this->has_error)); + in.each([&](auto _in) { + this->previous = check_utf8_bytes(_in, &(this->previous), &(this->has_error)); + }); } } diff --git a/src/westmere/stage1_find_marks.h b/src/westmere/stage1_find_marks.h index 27aea6b7..71e5a440 100644 --- a/src/westmere/stage1_find_marks.h +++ b/src/westmere/stage1_find_marks.h @@ -28,7 +28,9 @@ really_inline void find_whitespace_and_structurals(simd_input in, const __m128i struct_offset = _mm_set1_epi8(0xd4u); const __m128i struct_mask = _mm_set1_epi8(32); - whitespace = MAP_BITMASK( in, _mm_cmpeq_epi8(_in, _mm_shuffle_epi8(white_table, _in)) ); + whitespace = in.map([&](auto _in) { + return _mm_cmpeq_epi8(_in, _mm_shuffle_epi8(white_table, _in)); + }).to_bitmask(); structurals = in.map([&](auto _in) { const __m128i r1 = _mm_add_epi8(struct_offset, _in);