From 85b910814eff8c21ccd055e4136b2931d9211d41 Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Fri, 30 Apr 2021 18:34:21 -0400 Subject: [PATCH] Under ARM, it is slightly better to reverse the word once and then extract the bits. (#1545) * Under ARM, it is slightly better to reverse the word once and then extract the bits. * Guarding the zero_leading_bit call to avoid sanitizer warnings. --- include/simdjson/arm64/bitmanipulation.h | 34 ++++++++++++ src/generic/stage1/json_structural_indexer.h | 57 +++++++++++++++++++- 2 files changed, 90 insertions(+), 1 deletion(-) diff --git a/include/simdjson/arm64/bitmanipulation.h b/include/simdjson/arm64/bitmanipulation.h index b56e1a0e..694aea85 100644 --- a/include/simdjson/arm64/bitmanipulation.h +++ b/include/simdjson/arm64/bitmanipulation.h @@ -46,6 +46,40 @@ simdjson_really_inline int count_ones(uint64_t input_num) { return vaddv_u8(vcnt_u8(vcreate_u8(input_num))); } + +#if defined(__GNUC__) // catches clang and gcc +/** + * ARM has a fast 64-bit "bit reversal function" that is handy. However, + * it is not generally available as an intrinsic function under Visual + * Studio (though this might be changing). Even under clang/gcc, we + * apparently need to invoke inline assembly. + */ +/* + * We use SIMDJSON_PREFER_REVERSE_BITS as a hint that algorithms that + * work well with bit reversal may use it. + */ +#define SIMDJSON_PREFER_REVERSE_BITS 1 + +/* reverse the bits */ +simdjson_really_inline uint64_t reverse_bits(uint64_t input_num) { + uint64_t rev_bits; + __asm("rbit %0, %1" : "=r"(rev_bits) : "r"(input_num)); + return rev_bits; +} + +/** + * Flips bit at index 63 - lz. Thus if you have 'leading_zeroes' leading zeroes, + * then this will set to zero the leading bit. It is possible for leading_zeroes to be + * greating or equal to 63 in which case we trigger undefined behavior, but the ouput + * of such undefined behavior is never used. + **/ +NO_SANITIZE_UNDEFINED +simdjson_really_inline uint64_t zero_leading_bit(uint64_t rev_bits, int leading_zeroes) { + return rev_bits ^ (uint64_t(0x8000000000000000) >> leading_zeroes); +} + +#endif + simdjson_really_inline bool add_overflow(uint64_t value1, uint64_t value2, uint64_t *result) { #ifdef SIMDJSON_REGULAR_VISUAL_STUDIO *result = value1 + value2; diff --git a/src/generic/stage1/json_structural_indexer.h b/src/generic/stage1/json_structural_indexer.h index 78178101..915b33e7 100644 --- a/src/generic/stage1/json_structural_indexer.h +++ b/src/generic/stage1/json_structural_indexer.h @@ -31,8 +31,62 @@ public: // it helps tremendously. if (bits == 0) return; - int cnt = static_cast(count_ones(bits)); +#if defined(SIMDJSON_PREFER_REVERSE_BITS) + /** + * ARM lacks a fast trailing zero instruction, but it has a fast + * bit reversal instruction and a fast leading zero instruction. + * Thus it may be profitable to reverse the bits (once) and then + * to rely on a sequence of instructions that call the leading + * zero instruction. + * + * Performance notes: + * The chosen routine is not optimal in terms of data dependency + * since zero_leading_bit might require two instructions. However, + * it tends to minimize the total number of instructions which is + * beneficial. + */ + uint64_t rev_bits = reverse_bits(bits); + int cnt = static_cast(count_ones(bits)); + int i = 0; + // Do the first 8 all together + for (; i<8; i++) { + int lz = leading_zeroes(rev_bits); + this->tail[i] = static_cast(idx) + lz; + rev_bits = zero_leading_bit(rev_bits, lz); + } + // Do the next 8 all together (we hope in most cases it won't happen at all + // and the branch is easily predicted). + if (simdjson_unlikely(cnt > 8)) { + i = 8; + for (; i<16; i++) { + int lz = leading_zeroes(rev_bits); + this->tail[i] = static_cast(idx) + lz; + rev_bits = zero_leading_bit(rev_bits, lz); + } + + + // Most files don't have 16+ structurals per block, so we take several basically guaranteed + // branch mispredictions here. 16+ structurals per block means either punctuation ({} [] , :) + // or the start of a value ("abc" true 123) every four characters. + if (simdjson_unlikely(cnt > 16)) { + i = 16; + while (rev_bits != 0) { + int lz = leading_zeroes(rev_bits); + this->tail[i++] = static_cast(idx) + lz; + rev_bits = zero_leading_bit(rev_bits, lz); + } + } + } + this->tail += cnt; +#else // SIMDJSON_PREFER_REVERSE_BITS + /** + * Under recent x64 systems, we often have both a fast trailing zero + * instruction and a fast 'clear-lower-bit' instruction so the following + * algorithm can be competitive. + */ + + int cnt = static_cast(count_ones(bits)); // Do the first 8 all together for (int i=0; i<8; i++) { this->tail[i] = idx + trailing_zeroes(bits); @@ -61,6 +115,7 @@ public: } this->tail += cnt; +#endif } };