diff --git a/src/arm64/simd.h b/src/arm64/simd.h
index 19491c3e..525e302c 100644
--- a/src/arm64/simd.h
+++ b/src/arm64/simd.h
@@ -91,6 +91,16 @@ namespace simdjson::arm64::simd {
       v0, v1, v2, v3, v4, v5, v6, v7,
       v8, v9, v10,v11,v12,v13,v14,v15
     }) {}
+    // Repeat 16 values as many times as necessary (usually for lookup tables)
+    really_inline static simd8<uint8_t> repeat_16(
+      uint8_t v0,  uint8_t v1,  uint8_t v2,  uint8_t v3,  uint8_t v4,  uint8_t v5,  uint8_t v6,  uint8_t v7,
+      uint8_t v8,  uint8_t v9,  uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
+    ) {
+      return simd8<uint8_t>(
+        v0, v1, v2, v3, v4, v5, v6, v7,
+        v8, v9, v10,v11,v12,v13,v14,v15
+      );
+    }

     // Store to array
     really_inline void store(uint8_t dst[16]) { return vst1q_u8(dst, *this); }
@@ -119,7 +129,7 @@
     template<int N>
     really_inline simd8<uint8_t> shl() const { return vshlq_n_u8(*this, N); }

-    // Perform a lookup assuming no value is larger than 16
+    // Perform a lookup assuming the value is between 0 and 16 (undefined behavior for out of range values)
     template<typename L>
     really_inline simd8<L> lookup_16(
         L replace0,  L replace1,  L replace2,  L replace3,
@@ -135,23 +145,9 @@
       return lookup_table.apply_lookup_16_to(*this);
     }

-    // Perform a lookup of the lower 4 bits
-    template<typename L>
-    really_inline simd8<L> lookup_lower_4_bits(
-        L replace0,  L replace1,  L replace2,  L replace3,
-        L replace4,  L replace5,  L replace6,  L replace7,
-        L replace8,  L replace9,  L replace10, L replace11,
-        L replace12, L replace13, L replace14, L replace15) const {
-      return (*this & 0xF).lookup_16(
-        replace0, replace1, replace2, replace3,
-        replace4, replace5, replace6, replace7,
-        replace8, replace9, replace10, replace11,
-        replace12, replace13, replace14, replace15
-      );
-    }
-
-    really_inline simd8<uint8_t> apply_lookup_16_to(const simd8<uint8_t> original) {
-      return vqtbl1q_u8(*this, original);
+    template<typename T>
+    really_inline simd8<uint8_t> apply_lookup_16_to(const simd8<T> original) {
+      return vqtbl1q_u8(*this, simd8<uint8_t>(original));
     }
   };

@@ -183,6 +179,16 @@
       v0, v1, v2, v3, v4, v5, v6, v7,
       v8, v9, v10,v11,v12,v13,v14,v15
     }) {}
+    // Repeat 16 values as many times as necessary (usually for lookup tables)
+    really_inline static simd8<int8_t> repeat_16(
+      int8_t v0,  int8_t v1,  int8_t v2,  int8_t v3,  int8_t v4,  int8_t v5,  int8_t v6,  int8_t v7,
+      int8_t v8,  int8_t v9,  int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
+    ) {
+      return simd8<int8_t>(
+        v0, v1, v2, v3, v4, v5, v6, v7,
+        v8, v9, v10,v11,v12,v13,v14,v15
+      );
+    }

     // Store to array
     really_inline void store(int8_t dst[16]) { return vst1q_s8(dst, *this); }
@@ -223,7 +229,8 @@
       );
     }

-    really_inline simd8<int8_t> apply_lookup_16_to(const simd8<int8_t> original) {
+    template<typename T>
+    really_inline simd8<int8_t> apply_lookup_16_to(const simd8<T> original) {
       return vqtbl1q_s8(*this, original);
     }
   };
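For context on the arm64 half of this change: apply_lookup_16_to compiles down to the NEON TBL instruction via vqtbl1q_u8. A standalone sketch of the semantics this code relies on (illustrative only, not part of the patch; build for AArch64). Note that where the generic lookup_16 contract says "undefined behavior", TBL itself is documented to produce 0 for any out-of-range index lane:

#include <arm_neon.h>
#include <cstdio>

int main() {
  const uint8_t table[16] = {0, 10, 20, 30, 40, 50, 60, 70,
                             80, 90, 100, 110, 120, 130, 140, 150};
  const uint8_t idx[16]   = {3, 0, 15, 16, 200, 1, 1, 1,
                             1, 1, 1, 1, 1, 1, 1, 1};
  uint8_t out[16];
  // Index lanes 0..15 select table entries; lanes >= 16 (here 16 and 200) yield 0.
  vst1q_u8(out, vqtbl1q_u8(vld1q_u8(table), vld1q_u8(idx)));
  printf("%u %u %u %u %u\n", out[0], out[1], out[2], out[3], out[4]); // 30 0 150 0 0
  return 0;
}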
diff --git a/src/haswell/simd.h b/src/haswell/simd.h
index 27552bf2..17192b1b 100644
--- a/src/haswell/simd.h
+++ b/src/haswell/simd.h
@@ -90,39 +90,23 @@ namespace simdjson::haswell::simd {
     really_inline simd8<T>& operator+=(const simd8<T> other) { *this = *this + other; return *this; }
     really_inline simd8<T>& operator-=(const simd8<T> other) { *this = *this - other; return *this; }

-    // Perform a lookup of the lower 4 bits
+    // Perform a lookup assuming the value is between 0 and 16 (undefined behavior for out of range values)
     template<typename L>
-    really_inline simd8<L> lookup_lower_4_bits(
-        L replace0,  L replace1,  L replace2,  L replace3,
-        L replace4,  L replace5,  L replace6,  L replace7,
-        L replace8,  L replace9,  L replace10, L replace11,
-        L replace12, L replace13, L replace14, L replace15) const {
-      simd8<L> lookup_table(
-        replace0, replace1, replace2, replace3,
-        replace4, replace5, replace6, replace7,
-        replace8, replace9, replace10, replace11,
-        replace12, replace13, replace14, replace15,
-        replace0, replace1, replace2, replace3,
-        replace4, replace5, replace6, replace7,
-        replace8, replace9, replace10, replace11,
-        replace12, replace13, replace14, replace15
-      );
+    really_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
       return _mm256_shuffle_epi8(lookup_table, *this);
     }
-
-    // Perform a lookup assuming the value is between 0 and 16
     template<typename L>
     really_inline simd8<L> lookup_16(
         L replace0,  L replace1,  L replace2,  L replace3,
         L replace4,  L replace5,  L replace6,  L replace7,
         L replace8,  L replace9,  L replace10, L replace11,
         L replace12, L replace13, L replace14, L replace15) const {
-      return lookup_lower_4_bits(
+      return lookup_16(simd8<L>::repeat_16(
         replace0, replace1, replace2, replace3,
         replace4, replace5, replace6, replace7,
         replace8, replace9, replace10, replace11,
         replace12, replace13, replace14, replace15
-      );
+      ));
     }
   };

@@ -147,6 +131,18 @@
       v16,v17,v18,v19,v20,v21,v22,v23,
       v24,v25,v26,v27,v28,v29,v30,v31
     )) {}
+    // Repeat 16 values as many times as necessary (usually for lookup tables)
+    really_inline static simd8<int8_t> repeat_16(
+      int8_t v0,  int8_t v1,  int8_t v2,  int8_t v3,  int8_t v4,  int8_t v5,  int8_t v6,  int8_t v7,
+      int8_t v8,  int8_t v9,  int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
+    ) {
+      return simd8<int8_t>(
+        v0, v1, v2, v3, v4, v5, v6, v7,
+        v8, v9, v10,v11,v12,v13,v14,v15,
+        v0, v1, v2, v3, v4, v5, v6, v7,
+        v8, v9, v10,v11,v12,v13,v14,v15
+      );
+    }

     // Order-sensitive comparisons
     really_inline simd8<int8_t> max(const simd8<int8_t> other) const { return _mm256_max_epi8(*this, other); }
@@ -175,6 +171,18 @@
       v16,v17,v18,v19,v20,v21,v22,v23,
       v24,v25,v26,v27,v28,v29,v30,v31
     )) {}
+    // Repeat 16 values as many times as necessary (usually for lookup tables)
+    really_inline static simd8<uint8_t> repeat_16(
+      uint8_t v0,  uint8_t v1,  uint8_t v2,  uint8_t v3,  uint8_t v4,  uint8_t v5,  uint8_t v6,  uint8_t v7,
+      uint8_t v8,  uint8_t v9,  uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
+    ) {
+      return simd8<uint8_t>(
+        v0, v1, v2, v3, v4, v5, v6, v7,
+        v8, v9, v10,v11,v12,v13,v14,v15,
+        v0, v1, v2, v3, v4, v5, v6, v7,
+        v8, v9, v10,v11,v12,v13,v14,v15
+      );
+    }

     // Saturated math
     really_inline simd8<uint8_t> saturating_add(const simd8<uint8_t> other) const { return _mm256_adds_epu8(*this, other); }
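Why the 256-bit repeat_16 above writes its 16 arguments twice: _mm256_shuffle_epi8 (VPSHUFB) shuffles each 128-bit lane independently, so a 16-entry table must be present in both halves of the ymm register. A minimal sketch of that behavior (illustrative only, not part of the patch; compile with -mavx2):

#include <immintrin.h>
#include <cstdint>
#include <cstdio>

int main() {
  // The same 16 entries in both 128-bit lanes, as simd8<uint8_t>::repeat_16 produces.
  __m256i table = _mm256_setr_epi8(
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
  __m256i idx = _mm256_set1_epi8(5);
  uint8_t out[32];
  _mm256_storeu_si256((__m256i *)out, _mm256_shuffle_epi8(table, idx));
  // Both lanes find entry 5; without the duplication, indices in the upper
  // lane would select from whatever happened to sit in the upper 16 bytes.
  printf("%u %u\n", out[0], out[16]); // 5 5
  return 0;
}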
diff --git a/src/haswell/stage1_find_marks.h b/src/haswell/stage1_find_marks.h
index c4d5e65e..a6aca2d4 100644
--- a/src/haswell/stage1_find_marks.h
+++ b/src/haswell/stage1_find_marks.h
@@ -18,12 +18,18 @@ really_inline void find_whitespace_and_operators(
   const simd::simd8x64<uint8_t> in,
   uint64_t &whitespace, uint64_t &op) {

+  // These lookups rely on the fact that anything < 127 will match the lower 4 bits, which is why
+  // we can't use the generic lookup_16.
+  auto whitespace_table = simd8<uint8_t>::repeat_16(' ', 100, 100, 100, 17, 100, 113, 2, 100, '\t', '\n', 112, 100, '\r', 100, 100);
+  auto op_table = simd8<uint8_t>::repeat_16(',', '}', 0, 0, 0xc0u, 0, 0, 0, 0, 0, 0, 0, 0, 0, ':', '{');
+
   whitespace = in.map([&](simd8<uint8_t> _in) {
-    return _in == _in.lookup_lower_4_bits<uint8_t>(' ', 100, 100, 100, 17, 100, 113, 2, 100, '\t', '\n', 112, 100, '\r', 100, 100);
+    return _in == simd8<uint8_t>(_mm256_shuffle_epi8(whitespace_table, _in));
   }).to_bitmask();

   op = in.map([&](simd8<uint8_t> _in) {
-    return (_in | 32) == (_in+0xd4u).lookup_lower_4_bits<uint8_t>(',', '}', 0, 0, 0xc0u, 0, 0, 0, 0, 0, 0, 0, 0, 0, ':', '{');
+    // | 32 handles the fact that { } and [ ] are exactly 32 bytes apart
+    return (_in | 32) == simd8<uint8_t>(_mm256_shuffle_epi8(op_table, _in-','));
   }).to_bitmask();
 }
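The trick these tables implement is easiest to verify byte-by-byte. Below is a scalar model (illustrative only; shuffle_byte, is_whitespace, and is_operator are made-up names, not simdjson API). pshufb yields 0 when the index byte's high bit is set and otherwise selects by the low 4 bits, so every input byte below 128 is classified purely by its low nibble. The table entries are picked so that only the intended characters can match, which is also why the generic lookup_16 (valid indices 0-15 only) can't be used here:

#include <cstdint>

// Model of pshufb on one byte: high bit set -> 0, else the low 4 bits index the table.
static uint8_t shuffle_byte(const uint8_t table[16], uint8_t idx) {
  return (idx & 0x80) ? 0 : table[idx & 0xF];
}

static const uint8_t whitespace_table[16] = {
    ' ', 100, 100, 100, 17, 100, 113, 2, 100, '\t', '\n', 112, 100, '\r', 100, 100};
static const uint8_t op_table[16] = {
    ',', '}', 0, 0, 0xc0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ':', '{'};

static bool is_whitespace(uint8_t c) {
  // ' ' (0x20), '\t' (0x09), '\n' (0x0a), and '\r' (0x0d) sit in their own
  // nibble slots; the filler values never equal any byte with that low nibble.
  return c == shuffle_byte(whitespace_table, c);
}

static bool is_operator(uint8_t c) {
  // | 32 folds '{'/'[' and '}'/']' together: each pair differs by exactly 32.
  return (uint8_t)(c | 32) == shuffle_byte(op_table, (uint8_t)(c - ','));
}

Looping over all 256 byte values against a model like this is a cheap way to cross-check the SIMD classifier.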
diff --git a/src/westmere/simd.h b/src/westmere/simd.h
index 39b93450..25a20387 100644
--- a/src/westmere/simd.h
+++ b/src/westmere/simd.h
@@ -91,36 +91,23 @@ namespace simdjson::westmere::simd {
     really_inline simd8<T>& operator+=(const simd8<T> other) { *this = *this + other; return *this; }
     really_inline simd8<T>& operator-=(const simd8<T> other) { *this = *this - other; return *this; }

-    // Perform a lookup of the lower 4 bits
+    // Perform a lookup assuming the value is between 0 and 16 (undefined behavior for out of range values)
     template<typename L>
-    really_inline simd8<L> lookup_lower_4_bits(
-        L replace0,  L replace1,  L replace2,  L replace3,
-        L replace4,  L replace5,  L replace6,  L replace7,
-        L replace8,  L replace9,  L replace10, L replace11,
-        L replace12, L replace13, L replace14, L replace15) const {
-
-      simd8<L> lookup_table(
-        replace0, replace1, replace2, replace3,
-        replace4, replace5, replace6, replace7,
-        replace8, replace9, replace10, replace11,
-        replace12, replace13, replace14, replace15
-      );
+    really_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
       return _mm_shuffle_epi8(lookup_table, *this);
     }
-
-    // Perform a lookup assuming the value is between 0 and 16
     template<typename L>
     really_inline simd8<L> lookup_16(
         L replace0,  L replace1,  L replace2,  L replace3,
         L replace4,  L replace5,  L replace6,  L replace7,
         L replace8,  L replace9,  L replace10, L replace11,
         L replace12, L replace13, L replace14, L replace15) const {
-      return lookup_lower_4_bits(
+      return lookup_16(simd8<L>::repeat_16(
         replace0, replace1, replace2, replace3,
         replace4, replace5, replace6, replace7,
         replace8, replace9, replace10, replace11,
         replace12, replace13, replace14, replace15
-      );
+      ));
     }
   };

@@ -141,6 +128,16 @@
       v0, v1, v2, v3, v4, v5, v6, v7,
       v8, v9, v10,v11,v12,v13,v14,v15
     )) {}
+    // Repeat 16 values as many times as necessary (usually for lookup tables)
+    really_inline static simd8<int8_t> repeat_16(
+      int8_t v0,  int8_t v1,  int8_t v2,  int8_t v3,  int8_t v4,  int8_t v5,  int8_t v6,  int8_t v7,
+      int8_t v8,  int8_t v9,  int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
+    ) {
+      return simd8<int8_t>(
+        v0, v1, v2, v3, v4, v5, v6, v7,
+        v8, v9, v10,v11,v12,v13,v14,v15
+      );
+    }

     // Order-sensitive comparisons
     really_inline simd8<int8_t> max(const simd8<int8_t> other) const { return _mm_max_epi8(*this, other); }
@@ -165,6 +162,16 @@
       v0, v1, v2, v3, v4, v5, v6, v7,
       v8, v9, v10,v11,v12,v13,v14,v15
     )) {}
+    // Repeat 16 values as many times as necessary (usually for lookup tables)
+    really_inline static simd8<uint8_t> repeat_16(
+      uint8_t v0,  uint8_t v1,  uint8_t v2,  uint8_t v3,  uint8_t v4,  uint8_t v5,  uint8_t v6,  uint8_t v7,
+      uint8_t v8,  uint8_t v9,  uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
+    ) {
+      return simd8<uint8_t>(
+        v0, v1, v2, v3, v4, v5, v6, v7,
+        v8, v9, v10,v11,v12,v13,v14,v15
+      );
+    }

     // Saturated math
     really_inline simd8<uint8_t> saturating_add(const simd8<uint8_t> other) const { return _mm_adds_epu8(*this, other); }
diff --git a/src/westmere/stage1_find_marks.h b/src/westmere/stage1_find_marks.h
index d84e952b..58e3a91c 100644
--- a/src/westmere/stage1_find_marks.h
+++ b/src/westmere/stage1_find_marks.h
@@ -18,12 +18,18 @@ really_inline void find_whitespace_and_operators(
   const simd8x64<uint8_t> in,
   uint64_t &whitespace, uint64_t &op) {

+  // These lookups rely on the fact that anything < 127 will match the lower 4 bits, which is why
+  // we can't use the generic lookup_16.
+  auto whitespace_table = simd8<uint8_t>::repeat_16(' ', 100, 100, 100, 17, 100, 113, 2, 100, '\t', '\n', 112, 100, '\r', 100, 100);
+  auto op_table = simd8<uint8_t>::repeat_16(',', '}', 0, 0, 0xc0u, 0, 0, 0, 0, 0, 0, 0, 0, 0, ':', '{');
+
   whitespace = in.map([&](simd8<uint8_t> _in) {
-    return _in == _in.lookup_lower_4_bits<uint8_t>(' ', 100, 100, 100, 17, 100, 113, 2, 100, '\t', '\n', 112, 100, '\r', 100, 100);
+    return _in == simd8<uint8_t>(_mm_shuffle_epi8(whitespace_table, _in));
   }).to_bitmask();

   op = in.map([&](simd8<uint8_t> _in) {
-    return (_in | 32) == (_in+0xd4u).lookup_lower_4_bits<uint8_t>(',', '}', 0, 0, 0xc0u, 0, 0, 0, 0, 0, 0, 0, 0, 0, ':', '{');
+    // | 32 handles the fact that { } and [ ] are exactly 32 bytes apart
+    return (_in | 32) == simd8<uint8_t>(_mm_shuffle_epi8(op_table, _in-','));
   }).to_bitmask();
 }
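One last property both stage1 changes depend on: subtracting ',' is safe because any byte below ',' wraps around past 0x80, and pshufb zeroes every lane whose index has its high bit set, so such bytes can never equal the looked-up value. (The old code expressed the same shift as adding 0xd4, which is -44 modulo 256.) A sketch of the SSE path (illustrative only, not part of the patch; requires SSSE3):

#include <tmmintrin.h>
#include <cstdio>

int main() {
  __m128i op_table = _mm_setr_epi8(',', '}', 0, 0, (char)0xc0, 0, 0, 0,
                                   0, 0, 0, 0, 0, 0, ':', '{');
  __m128i input = _mm_setr_epi8('[', ']', ':', ',', '{', '}', 'a', ' ',
                                0, 0, 0, 0, 0, 0, 0, 0);
  // idx = input - ','; bytes below ',' (like ' ' and 0) wrap to >= 0x80 and look up 0.
  __m128i idx = _mm_sub_epi8(input, _mm_set1_epi8(','));
  __m128i folded = _mm_or_si128(input, _mm_set1_epi8(32)); // fold {/[ and }/]
  int mask = _mm_movemask_epi8(_mm_cmpeq_epi8(folded, _mm_shuffle_epi8(op_table, idx)));
  printf("0x%04x\n", mask); // 0x003f: only the first six bytes classify as operators
  return 0;
}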