simd8x64::compress optimisation for Apple M1 (#1708)
* Optimized the arm64 implementation of simd8x64::compress; this is ~35% faster on the fast_minify benchmarks on Apple M1. * Return the byte count from simd8x64::compress; this avoids a redundant popcount on ARM, for ~3% faster minify on Apple M1.
This commit is contained in:
parent
cebe3fb299
commit
64b62fd3b3
|
@ -57,6 +57,19 @@ simdjson_really_inline uint8x16_t make_uint8x16_t(uint8_t x1, uint8_t x2, uint
|
|||
return x;
|
||||
}
|
||||
|
||||
// Portable construction of a uint8x8_t from eight bytes, needed because some
// compilers (notably regular Visual Studio) do not accept brace-initialization
// of NEON vector types.  Byte x1 lands in lane 0, ..., x8 in lane 7 — the same
// result a lane-by-lane vset_lane_u8 sequence would produce.
simdjson_really_inline uint8x8_t make_uint8x8_t(uint8_t x1, uint8_t x2, uint8_t x3, uint8_t x4,
                                                uint8_t x5, uint8_t x6, uint8_t x7, uint8_t x8) {
  // Stage the bytes in memory in lane order, then do a single 8-byte load.
  const uint8_t bytes[8] = {x1, x2, x3, x4, x5, x6, x7, x8};
  return vld1_u8(bytes);
}
|
||||
|
||||
// We have to do the same work for make_int8x16_t
|
||||
simdjson_really_inline int8x16_t make_int8x16_t(int8_t x1, int8_t x2, int8_t x3, int8_t x4,
|
||||
|
@ -289,6 +302,27 @@ simdjson_really_inline int8x16_t make_int8x16_t(int8_t x1, int8_t x2, int8_t x
|
|||
vst1q_u8(reinterpret_cast<uint8_t*>(output), answer);
|
||||
}
|
||||
|
||||
// Copies all bytes corresponding to a 0 in the low half of the mask (interpreted as a
|
||||
// bitset) to output1, then those corresponding to a 0 in the high half to output2.
|
||||
// Copies the bytes selected by the ZERO bits of `mask` (viewed as a bitset):
// those matching the low eight mask bits go to output1, those matching the
// high eight bits to output2.  The two 8-byte stores may overlap; callers
// position output2 just past the bytes output1 keeps, so the second store
// overwrites the first store's garbage tail.
template<typename L>
simdjson_really_inline void compress_halves(uint16_t mask, L *output1, L *output2) const {
  using internal::thintable_epi8;
  // Split the 16-bit mask into its two byte halves.
  uint8_t low_mask  = uint8_t(mask);      // least significant 8 bits
  uint8_t high_mask = uint8_t(mask >> 8); // most significant 8 bits
  // Each table entry is an 8-byte shuffle that packs the kept bytes forward.
  uint8x8_t shuffle_low  = vcreate_u8(thintable_epi8[low_mask]);
  uint8x8_t shuffle_high = vcreate_u8(thintable_epi8[high_mask]);
  // The table indexes within a single 8-byte half; bias the high shuffle by
  // 0x08 so it selects from the upper half of the 16-byte register.
#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
  uint8x8_t inc = make_uint8x8_t(0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08);
#else
  uint8x8_t inc = {0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08};
#endif
  shuffle_high = vadd_u8(shuffle_high, inc);
  // Gather and store each compressed half (second store may overlap the first).
  vst1_u8((uint8_t*)output1, vqtbl1_u8(*this, shuffle_low));
  vst1_u8((uint8_t*)output2, vqtbl1_u8(*this, shuffle_high));
}
|
||||
|
||||
template<typename L>
|
||||
simdjson_really_inline simd8<L> lookup_16(
|
||||
L replace0, L replace1, L replace2, L replace3,
|
||||
|
@ -439,11 +473,15 @@ simdjson_really_inline int8x16_t make_int8x16_t(int8_t x1, int8_t x2, int8_t x
|
|||
}
|
||||
|
||||
|
||||
// Compacts the 64 input bytes whose mask bits are 0 into `output` and returns
// the number of bytes written.  Each 16-byte chunk is compressed as two 8-byte
// halves.  vcnt_u8 over the inverted mask yields, per mask byte, how many
// bytes that 8-byte half keeps; multiplying those eight counts by
// 0x0101010101010101 turns them into a byte-wise prefix sum, giving every
// store offset — and, in the top byte, the grand total — with no scalar
// popcount at all.
simdjson_really_inline uint64_t compress(uint64_t mask, T * output) const {
  // Byte k of `kept_counts` = number of zero bits in byte k of the mask.
  uint64_t kept_counts = vget_lane_u64(vreinterpret_u64_u8(vcnt_u8(vcreate_u8(~mask))), 0);
  // Byte k of `prefix` = kept_counts[0] + ... + kept_counts[k].
  uint64_t prefix = kept_counts * 0x0101010101010101;
  this->chunks[0].compress_halves(uint16_t(mask),       output,                         &output[kept_counts & 0xFF]);
  this->chunks[1].compress_halves(uint16_t(mask >> 16), &output[(prefix >>  8) & 0xFF], &output[(prefix >> 16) & 0xFF]);
  this->chunks[2].compress_halves(uint16_t(mask >> 32), &output[(prefix >> 24) & 0xFF], &output[(prefix >> 32) & 0xFF]);
  this->chunks[3].compress_halves(uint16_t(mask >> 48), &output[(prefix >> 40) & 0xFF], &output[(prefix >> 48) & 0xFF]);
  // The top byte of the prefix sum is the total number of bytes kept.
  return prefix >> 56;
}
|
||||
|
||||
simdjson_really_inline uint64_t to_bitmask() const {
|
||||
|
|
|
@ -303,11 +303,12 @@ namespace simd {
|
|||
simdjson_really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1) : chunks{chunk0, chunk1} {}
|
||||
simdjson_really_inline simd8x64(const T ptr[64]) : chunks{simd8<T>::load(ptr), simd8<T>::load(ptr+32)} {}
|
||||
|
||||
// Compacts the bytes whose mask bits are 0 into `output`, one 32-byte chunk
// at a time, and returns how many bytes were written.
simdjson_really_inline uint64_t compress(uint64_t mask, T * output) const {
  uint32_t low_mask  = uint32_t(mask);
  uint32_t high_mask = uint32_t(mask >> 32);
  this->chunks[0].compress(low_mask, output);
  // The second chunk's output starts right after the bytes the first kept.
  this->chunks[1].compress(high_mask, output + 32 - count_ones(low_mask));
  return 64 - count_ones(mask);
}
|
||||
|
||||
simdjson_really_inline void store(T ptr[64]) const {
|
||||
|
|
|
@ -422,7 +422,7 @@ template <typename T> struct simd8x64 {
|
|||
(this->chunks[2] | this->chunks[3]);
|
||||
}
|
||||
|
||||
// Compacts the bytes whose mask bits are 0 into `output`, 16 bytes per chunk,
// returning the number of bytes written.  Each chunk is stored at the offset
// left after the previous chunks' surviving bytes; trailing garbage from one
// chunk's full-width store is overwritten by the next chunk.
simdjson_really_inline uint64_t compress(uint64_t mask, T *output) const {
  this->chunks[0].compress(uint16_t(mask), output);
  this->chunks[1].compress(uint16_t(mask >> 16),
                           output + 16 - count_ones(mask & 0xFFFF));
  this->chunks[2].compress(uint16_t(mask >> 32),
                           output + 32 - count_ones(mask & 0xFFFFFFFF));
  this->chunks[3].compress(uint16_t(mask >> 48),
                           output + 48 - count_ones(mask & 0xFFFFFFFFFFFF));
  return 64 - count_ones(mask);
}
|
||||
|
||||
simdjson_really_inline uint64_t to_bitmask() const {
|
||||
|
|
|
@ -284,11 +284,12 @@ namespace simd {
|
|||
return (this->chunks[0] | this->chunks[1]) | (this->chunks[2] | this->chunks[3]);
|
||||
}
|
||||
|
||||
// Compacts the bytes whose mask bits are 0 into `output`, 16 bytes per chunk,
// and returns the number of bytes written.
simdjson_really_inline uint64_t compress(uint64_t mask, T * output) const {
  // Offset of each later chunk = bytes kept by all preceding chunks.
  uint64_t after_first  = 16 - count_ones(mask & 0xFFFF);
  uint64_t after_second = 32 - count_ones(mask & 0xFFFFFFFF);
  uint64_t after_third  = 48 - count_ones(mask & 0xFFFFFFFFFFFF);
  this->chunks[0].compress(uint16_t(mask), output);
  this->chunks[1].compress(uint16_t(mask >> 16), output + after_first);
  this->chunks[2].compress(uint16_t(mask >> 32), output + after_second);
  this->chunks[3].compress(uint16_t(mask >> 48), output + after_third);
  return 64 - count_ones(mask);
}
|
||||
|
||||
simdjson_really_inline uint64_t to_bitmask() const {
|
||||
|
|
|
@ -27,8 +27,7 @@ private:
|
|||
|
||||
// Removes the whitespace identified by `block` from the 64 input bytes and
// appends the surviving characters at the output cursor `dst`.
simdjson_really_inline void json_minifier::next(const simd::simd8x64<uint8_t>& in, const json_block& block) {
  const uint64_t whitespace_mask = block.whitespace();
  // compress() reports how many bytes it wrote, so no separate popcount is needed.
  dst += in.compress(whitespace_mask, dst);
}
|
||||
|
||||
simdjson_really_inline error_code json_minifier::finish(uint8_t *dst_start, size_t &dst_len) {
|
||||
|
|
Loading…
Reference in New Issue