Under ARM, it is slightly better to reverse the word once and then extract the bits. (#1545)

* Under ARM, it is slightly better to reverse the word once and then extract the bits.

* Guarding the zero_leading_bit call to avoid sanitizer warnings.
This commit is contained in:
Daniel Lemire 2021-04-30 18:34:21 -04:00 committed by GitHub
parent c1dffac28c
commit 85b910814e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 90 additions and 1 deletions

View File

@ -46,6 +46,40 @@ simdjson_really_inline int count_ones(uint64_t input_num) {
return vaddv_u8(vcnt_u8(vcreate_u8(input_num)));
}
#if defined(__GNUC__) // catches clang and gcc
/**
* ARM has a fast 64-bit "bit reversal function" that is handy. However,
* it is not generally available as an intrinsic function under Visual
* Studio (though this might be changing). Even under clang/gcc, we
* apparently need to invoke inline assembly.
*/
/*
* We use SIMDJSON_PREFER_REVERSE_BITS as a hint that algorithms that
* work well with bit reversal may use it.
*/
#define SIMDJSON_PREFER_REVERSE_BITS 1
/**
 * Reverses the order of the 64 bits of input_num (bit 0 trades places with
 * bit 63, bit 1 with bit 62, and so on) and returns the reversed word.
 *
 * Implemented with the ARM `rbit` instruction through GCC-style inline
 * assembly; this definition is only compiled when __GNUC__ is defined.
 */
simdjson_really_inline uint64_t reverse_bits(uint64_t input_num) {
uint64_t rev_bits;
// "=r": rev_bits is written through a general-purpose register;
// "r": input_num is read from a general-purpose register.
__asm("rbit %0, %1" : "=r"(rev_bits) : "r"(input_num));
return rev_bits;
}
/**
 * Flips the bit at index 63 - leading_zeroes of rev_bits and returns the result.
 * When leading_zeroes is the number of leading zero bits of rev_bits, the
 * flipped bit is the most significant set bit, so the call clears it.
 *
 * Callers may pass a count of 64 or more (for instance once rev_bits has no
 * set bit left). Shifting a 64-bit value by 64 or more is undefined behavior
 * in C++, so the count is reduced modulo 64 before shifting. The value
 * returned for such an out-of-range count carries no meaning and must not be
 * used; for counts in [0, 63] the result is unchanged by the reduction.
 *
 * @param rev_bits       word in which one bit is flipped
 * @param leading_zeroes distance of the bit to flip from the most significant bit
 * @return rev_bits with the bit at index 63 - (leading_zeroes mod 64) flipped
 **/
NO_SANITIZE_UNDEFINED
simdjson_really_inline uint64_t zero_leading_bit(uint64_t rev_bits, int leading_zeroes) {
  // `& 63` keeps the shift count in [0, 63] for every int value.
  // NOTE(review): AArch64 variable shifts already take the count modulo the
  // register width, so the mask is expected to cost no extra instruction --
  // confirm on the generated code.
  return rev_bits ^ (uint64_t(0x8000000000000000) >> (leading_zeroes & 63));
}
#endif
simdjson_really_inline bool add_overflow(uint64_t value1, uint64_t value2, uint64_t *result) {
#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
*result = value1 + value2;

View File

@ -31,8 +31,62 @@ public:
// it helps tremendously.
if (bits == 0)
return;
int cnt = static_cast<int>(count_ones(bits));
#if defined(SIMDJSON_PREFER_REVERSE_BITS)
/**
* ARM lacks a fast trailing zero instruction, but it has a fast
* bit reversal instruction and a fast leading zero instruction.
* Thus it may be profitable to reverse the bits (once) and then
* to rely on a sequence of instructions that call the leading
* zero instruction.
*
* Performance notes:
* The chosen routine is not optimal in terms of data dependency
* since zero_leading_bit might require two instructions. However,
* it tends to minimize the total number of instructions which is
* beneficial.
*/
uint64_t rev_bits = reverse_bits(bits);
int cnt = static_cast<int>(count_ones(bits));
int i = 0;
// Do the first 8 all together
for (; i<8; i++) {
int lz = leading_zeroes(rev_bits);
this->tail[i] = static_cast<uint32_t>(idx) + lz;
rev_bits = zero_leading_bit(rev_bits, lz);
}
// Do the next 8 all together (we hope in most cases it won't happen at all
// and the branch is easily predicted).
if (simdjson_unlikely(cnt > 8)) {
i = 8;
for (; i<16; i++) {
int lz = leading_zeroes(rev_bits);
this->tail[i] = static_cast<uint32_t>(idx) + lz;
rev_bits = zero_leading_bit(rev_bits, lz);
}
// Most files don't have 16+ structurals per block, so we take several basically guaranteed
// branch mispredictions here. 16+ structurals per block means either punctuation ({} [] , :)
// or the start of a value ("abc" true 123) every four characters.
if (simdjson_unlikely(cnt > 16)) {
i = 16;
while (rev_bits != 0) {
int lz = leading_zeroes(rev_bits);
this->tail[i++] = static_cast<uint32_t>(idx) + lz;
rev_bits = zero_leading_bit(rev_bits, lz);
}
}
}
this->tail += cnt;
#else // SIMDJSON_PREFER_REVERSE_BITS
/**
* Under recent x64 systems, we often have both a fast trailing zero
* instruction and a fast 'clear-lower-bit' instruction so the following
* algorithm can be competitive.
*/
int cnt = static_cast<int>(count_ones(bits));
// Do the first 8 all together
for (int i=0; i<8; i++) {
this->tail[i] = idx + trailing_zeroes(bits);
@ -61,6 +115,7 @@ public:
}
this->tail += cnt;
#endif
}
};