Under ARM, it is slightly better to reverse the word once and then extract the bits. (#1545)
* Under ARM, it is slightly better to reverse the word once and then extract the bits. * Guarding the zero_leading_bit call to avoid sanitizer warnings.
This commit is contained in:
parent
c1dffac28c
commit
85b910814e
|
@ -46,6 +46,40 @@ simdjson_really_inline int count_ones(uint64_t input_num) {
|
|||
return vaddv_u8(vcnt_u8(vcreate_u8(input_num)));
|
||||
}
|
||||
|
||||
|
||||
#if defined(__GNUC__) // catches clang and gcc
|
||||
/**
|
||||
* ARM has a fast 64-bit "bit reversal function" that is handy. However,
|
||||
* it is not generally available as an intrinsic function under Visual
|
||||
* Studio (though this might be changing). Even under clang/gcc, we
|
||||
* apparently need to invoke inline assembly.
|
||||
*/
|
||||
/*
|
||||
* We use SIMDJSON_PREFER_REVERSE_BITS as a hint that algorithms that
|
||||
* work well with bit reversal may use it.
|
||||
*/
|
||||
#define SIMDJSON_PREFER_REVERSE_BITS 1
|
||||
|
||||
/* reverse the bits */
|
||||
simdjson_really_inline uint64_t reverse_bits(uint64_t input_num) {
|
||||
uint64_t rev_bits;
|
||||
__asm("rbit %0, %1" : "=r"(rev_bits) : "r"(input_num));
|
||||
return rev_bits;
|
||||
}
|
||||
|
||||
/**
|
||||
* Flips bit at index 63 - lz. Thus if you have 'leading_zeroes' leading zeroes,
|
||||
* then this will set to zero the leading bit. It is possible for leading_zeroes to be
|
||||
* greating or equal to 63 in which case we trigger undefined behavior, but the ouput
|
||||
* of such undefined behavior is never used.
|
||||
**/
|
||||
NO_SANITIZE_UNDEFINED
|
||||
simdjson_really_inline uint64_t zero_leading_bit(uint64_t rev_bits, int leading_zeroes) {
|
||||
return rev_bits ^ (uint64_t(0x8000000000000000) >> leading_zeroes);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
simdjson_really_inline bool add_overflow(uint64_t value1, uint64_t value2, uint64_t *result) {
|
||||
#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
|
||||
*result = value1 + value2;
|
||||
|
|
|
@ -31,8 +31,62 @@ public:
|
|||
// it helps tremendously.
|
||||
if (bits == 0)
|
||||
return;
|
||||
int cnt = static_cast<int>(count_ones(bits));
|
||||
#if defined(SIMDJSON_PREFER_REVERSE_BITS)
|
||||
/**
|
||||
* ARM lacks a fast trailing zero instruction, but it has a fast
|
||||
* bit reversal instruction and a fast leading zero instruction.
|
||||
* Thus it may be profitable to reverse the bits (once) and then
|
||||
* to rely on a sequence of instructions that call the leading
|
||||
* zero instruction.
|
||||
*
|
||||
* Performance notes:
|
||||
* The chosen routine is not optimal in terms of data dependency
|
||||
* since zero_leading_bit might require two instructions. However,
|
||||
* it tends to minimize the total number of instructions which is
|
||||
* beneficial.
|
||||
*/
|
||||
|
||||
uint64_t rev_bits = reverse_bits(bits);
|
||||
int cnt = static_cast<int>(count_ones(bits));
|
||||
int i = 0;
|
||||
// Do the first 8 all together
|
||||
for (; i<8; i++) {
|
||||
int lz = leading_zeroes(rev_bits);
|
||||
this->tail[i] = static_cast<uint32_t>(idx) + lz;
|
||||
rev_bits = zero_leading_bit(rev_bits, lz);
|
||||
}
|
||||
// Do the next 8 all together (we hope in most cases it won't happen at all
|
||||
// and the branch is easily predicted).
|
||||
if (simdjson_unlikely(cnt > 8)) {
|
||||
i = 8;
|
||||
for (; i<16; i++) {
|
||||
int lz = leading_zeroes(rev_bits);
|
||||
this->tail[i] = static_cast<uint32_t>(idx) + lz;
|
||||
rev_bits = zero_leading_bit(rev_bits, lz);
|
||||
}
|
||||
|
||||
|
||||
// Most files don't have 16+ structurals per block, so we take several basically guaranteed
|
||||
// branch mispredictions here. 16+ structurals per block means either punctuation ({} [] , :)
|
||||
// or the start of a value ("abc" true 123) every four characters.
|
||||
if (simdjson_unlikely(cnt > 16)) {
|
||||
i = 16;
|
||||
while (rev_bits != 0) {
|
||||
int lz = leading_zeroes(rev_bits);
|
||||
this->tail[i++] = static_cast<uint32_t>(idx) + lz;
|
||||
rev_bits = zero_leading_bit(rev_bits, lz);
|
||||
}
|
||||
}
|
||||
}
|
||||
this->tail += cnt;
|
||||
#else // SIMDJSON_PREFER_REVERSE_BITS
|
||||
/**
|
||||
* Under recent x64 systems, we often have both a fast trailing zero
|
||||
* instruction and a fast 'clear-lower-bit' instruction so the following
|
||||
* algorithm can be competitive.
|
||||
*/
|
||||
|
||||
int cnt = static_cast<int>(count_ones(bits));
|
||||
// Do the first 8 all together
|
||||
for (int i=0; i<8; i++) {
|
||||
this->tail[i] = idx + trailing_zeroes(bits);
|
||||
|
@ -61,6 +115,7 @@ public:
|
|||
}
|
||||
|
||||
this->tail += cnt;
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
|
|
Loading…
Reference in New Issue