simd8x64::compress optimisation for Apple M1 (#1708)

* Optimized the arm64 implementation of simd8x64::compress

This is ~35% faster on the fast_minify benchmarks on Apple M1

* Return byte-count from simd8x64::compress

This avoids a redundant popcount on ARM, which makes minify ~3% faster
on Apple M1.
This commit is contained in:
Dougall Johnson 2021-09-02 05:25:11 +10:00 committed by GitHub
parent cebe3fb299
commit 64b62fd3b3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 50 additions and 10 deletions

View File

@ -57,6 +57,19 @@ simdjson_really_inline uint8x16_t make_uint8x16_t(uint8_t x1, uint8_t x2, uint
return x;
}
// Builds a 64-bit NEON byte vector from eight scalar bytes: x1 lands in lane 0,
// x2 in lane 1, and so on up to x8 in lane 7. The vector starts out
// value-initialized and every lane is then written explicitly with vset_lane_u8.
simdjson_really_inline uint8x8_t make_uint8x8_t(uint8_t x1, uint8_t x2, uint8_t x3, uint8_t x4,
                                                uint8_t x5, uint8_t x6, uint8_t x7, uint8_t x8) {
  uint8x8_t lanes{};
  lanes = vset_lane_u8(x1, lanes, 0);
  lanes = vset_lane_u8(x2, lanes, 1);
  lanes = vset_lane_u8(x3, lanes, 2);
  lanes = vset_lane_u8(x4, lanes, 3);
  lanes = vset_lane_u8(x5, lanes, 4);
  lanes = vset_lane_u8(x6, lanes, 5);
  lanes = vset_lane_u8(x7, lanes, 6);
  lanes = vset_lane_u8(x8, lanes, 7);
  return lanes;
}
// We have to do the same work for make_int8x16_t
simdjson_really_inline int8x16_t make_int8x16_t(int8_t x1, int8_t x2, int8_t x3, int8_t x4,
@ -289,6 +302,27 @@ simdjson_really_inline int8x16_t make_int8x16_t(int8_t x1, int8_t x2, int8_t x
vst1q_u8(reinterpret_cast<uint8_t*>(output), answer);
}
// Copies all bytes corresponding to a 0 in the low half of the mask (interpreted as a
// bitset) to output1, then those corresponding to a 0 in the high half to output2.
//
// Each half is processed with one 8-lane table lookup followed by one 8-byte store, so a
// full 8 bytes are written at output1 and at output2 no matter how many bytes are kept.
// The store to output2 is issued last; wherever the two destinations overlap, the bytes
// coming from the high half are the ones that remain.
template<typename L>
simdjson_really_inline void compress_halves(uint16_t mask, L *output1, L *output2) const {
using internal::thintable_epi8;
uint8_t mask1 = uint8_t(mask); // least significant 8 bits
uint8_t mask2 = uint8_t(mask >> 8); // most significant 8 bits
// Each table entry is a 64-bit value that is reinterpreted as 8 byte lanes of lookup indices.
// NOTE(review): thintable_epi8 is defined elsewhere; presumably entry m lists the positions
// of the 0 bits of m first, in increasing order — confirm against the table.
uint8x8_t compactmask1 = vcreate_u8(thintable_epi8[mask1]);
uint8x8_t compactmask2 = vcreate_u8(thintable_epi8[mask2]);
// we increment by 0x08 the second half of the mask
// (so that its indices select from bytes 8..15 of this 16-byte vector instead of bytes 0..7)
#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
// NOTE(review): presumably Visual Studio does not accept the brace initializer below for
// NEON vector types, hence the helper — confirm.
uint8x8_t inc = make_uint8x8_t(0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08);
#else
uint8x8_t inc = {0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08};
#endif
compactmask2 = vadd_u8(compactmask2, inc);
// store each result (with the second store possibly overlapping the first)
// The order of the two stores must be kept: the second one is meant to overwrite the
// unused tail of the first.
vst1_u8((uint8_t*)output1, vqtbl1_u8(*this, compactmask1));
vst1_u8((uint8_t*)output2, vqtbl1_u8(*this, compactmask2));
}
template<typename L>
simdjson_really_inline simd8<L> lookup_16(
L replace0, L replace1, L replace2, L replace3,
@ -439,11 +473,15 @@ simdjson_really_inline int8x16_t make_int8x16_t(int8_t x1, int8_t x2, int8_t x
}
simdjson_really_inline void compress(uint64_t mask, T * output) const {
this->chunks[0].compress(uint16_t(mask), output);
this->chunks[1].compress(uint16_t(mask >> 16), output + 16 - count_ones(mask & 0xFFFF));
this->chunks[2].compress(uint16_t(mask >> 32), output + 32 - count_ones(mask & 0xFFFFFFFF));
this->chunks[3].compress(uint16_t(mask >> 48), output + 48 - count_ones(mask & 0xFFFFFFFFFFFF));
// Writes the bytes of this 64-byte block whose mask bit is 0 to output, packed one group
// after another, and returns how many bytes were kept (the number of 0 bits in mask).
//
// The block is handled as eight groups of 8 bytes (two per 16-byte chunk). Every group is
// stored with a full 8-byte write at its own start offset, in increasing order, so each
// later store overwrites the unused tail of the previous one.
simdjson_really_inline uint64_t compress(uint64_t mask, T * output) const {
// vcnt_u8 counts the set bits in each byte of ~mask, i.e. the 0 bits of each mask byte:
// byte k of popcounts is the number of bytes kept from group k.
uint64_t popcounts = vget_lane_u64(vreinterpret_u64_u8(vcnt_u8(vcreate_u8(~mask))), 0);
// compute the prefix sum of the popcounts of each byte
// (after the multiply, byte k of offsets holds popcount[0] + ... + popcount[k]; the total is
// at most 64, so no byte carries into the next one)
uint64_t offsets = popcounts * 0x0101010101010101;
// Group 0 starts at output; group k (k >= 1) starts at byte k-1 of offsets.
this->chunks[0].compress_halves(uint16_t(mask), output, &output[popcounts & 0xFF]);
this->chunks[1].compress_halves(uint16_t(mask >> 16), &output[(offsets >> 8) & 0xFF], &output[(offsets >> 16) & 0xFF]);
this->chunks[2].compress_halves(uint16_t(mask >> 32), &output[(offsets >> 24) & 0xFF], &output[(offsets >> 32) & 0xFF]);
this->chunks[3].compress_halves(uint16_t(mask >> 48), &output[(offsets >> 40) & 0xFF], &output[(offsets >> 48) & 0xFF]);
// The top byte of offsets is the sum of all eight per-group counts.
return offsets >> 56;
}
simdjson_really_inline uint64_t to_bitmask() const {

View File

@ -303,11 +303,12 @@ namespace simd {
// Builds the 64-element block from two chunks that are already in registers.
simdjson_really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1) : chunks{chunk0, chunk1} {}
// Loads the block from memory: chunk 0 from ptr, chunk 1 from ptr+32.
simdjson_really_inline simd8x64(const T ptr[64]) : chunks{simd8<T>::load(ptr), simd8<T>::load(ptr+32)} {}
simdjson_really_inline void compress(uint64_t mask, T * output) const {
// Compresses both 32-element chunks into output, the second one placed directly after
// the elements kept from the first. Returns 64 minus the number of set bits in mask,
// i.e. the number of elements kept.
simdjson_really_inline uint64_t compress(uint64_t mask, T * output) const {
  uint32_t low_bits = uint32_t(mask);
  uint32_t high_bits = uint32_t(mask >> 32);
  // Chunk 1 starts 32 slots in, pulled back by one slot per bit set in the low half.
  T * const high_dst = output + 32 - count_ones(low_bits);
  this->chunks[0].compress(low_bits, output);
  this->chunks[1].compress(high_bits, high_dst);
  return 64 - count_ones(mask);
}
simdjson_really_inline void store(T ptr[64]) const {

View File

@ -422,7 +422,7 @@ template <typename T> struct simd8x64 {
(this->chunks[2] | this->chunks[3]);
}
simdjson_really_inline void compress(uint64_t mask, T *output) const {
simdjson_really_inline uint64_t compress(uint64_t mask, T *output) const {
this->chunks[0].compress(uint16_t(mask), output);
this->chunks[1].compress(uint16_t(mask >> 16),
output + 16 - count_ones(mask & 0xFFFF));
@ -430,6 +430,7 @@ template <typename T> struct simd8x64 {
output + 32 - count_ones(mask & 0xFFFFFFFF));
this->chunks[3].compress(uint16_t(mask >> 48),
output + 48 - count_ones(mask & 0xFFFFFFFFFFFF));
return 64 - count_ones(mask);
}
simdjson_really_inline uint64_t to_bitmask() const {

View File

@ -284,11 +284,12 @@ namespace simd {
return (this->chunks[0] | this->chunks[1]) | (this->chunks[2] | this->chunks[3]);
}
simdjson_really_inline void compress(uint64_t mask, T * output) const {
// Compresses the four 16-element chunks into output one after another. Each chunk is
// written where the elements kept from the preceding chunks end. Returns 64 minus the
// number of set bits in mask, i.e. the number of elements kept.
simdjson_really_inline uint64_t compress(uint64_t mask, T * output) const {
  // Start of each later chunk: 16 slots per preceding chunk, pulled back by one slot per
  // bit set in the part of the mask that covers those preceding chunks.
  T * const dst1 = output + 16 - count_ones(mask & 0xFFFF);
  T * const dst2 = output + 32 - count_ones(mask & 0xFFFFFFFF);
  T * const dst3 = output + 48 - count_ones(mask & 0xFFFFFFFFFFFF);
  // Chunks are written in increasing order, exactly as before.
  this->chunks[0].compress(uint16_t(mask), output);
  this->chunks[1].compress(uint16_t(mask >> 16), dst1);
  this->chunks[2].compress(uint16_t(mask >> 32), dst2);
  this->chunks[3].compress(uint16_t(mask >> 48), dst3);
  return 64 - count_ones(mask);
}
simdjson_really_inline uint64_t to_bitmask() const {

View File

@ -27,8 +27,7 @@ private:
// Processes one 64-byte block: the whitespace bitmask from `block` is handed to
// simd8x64::compress together with the current write position `dst`, and `dst` is then
// advanced.
// NOTE(review): this text comes from a diff view with the +/- markers stripped. The pair
// `in.compress(mask, dst);` / `dst += 64 - count_ones(mask);` and the single statement
// `dst += in.compress(mask, dst);` look like the before and after forms of the same step,
// so only one of the two forms should be live — confirm against the real file.
simdjson_really_inline void json_minifier::next(const simd::simd8x64<uint8_t>& in, const json_block& block) {
uint64_t mask = block.whitespace();
in.compress(mask, dst);
dst += 64 - count_ones(mask);
// Uses the count returned by compress rather than a separate count_ones call.
dst += in.compress(mask, dst);
}
simdjson_really_inline error_code json_minifier::finish(uint8_t *dst_start, size_t &dst_len) {