This commit is contained in:
Daniel Lemire 2019-12-13 13:39:15 -05:00
parent fc6133b58f
commit f02babe427
1 changed file with 8 additions and 1 deletion

View File

@ -16,12 +16,19 @@ namespace simdjson::arm64 {
// For example, prefix_xor(00100100) == 00011100 // For example, prefix_xor(00100100) == 00011100
// //
really_inline uint64_t prefix_xor(uint64_t bitmask) { really_inline uint64_t prefix_xor(uint64_t bitmask) {
// /////////////
// We could do this with PMULL, but it is apparently slow. // We could do this with PMULL, but it is apparently slow.
// //
//#ifdef __ARM_FEATURE_CRYPTO // some ARM processors lack this extension //#ifdef __ARM_FEATURE_CRYPTO // some ARM processors lack this extension
//return vmull_p64(-1ULL, bitmask); //return vmull_p64(-1ULL, bitmask);
//#else //#else
// Analysis by @sebpop:
// When diffing the assembly for src/stage1_find_marks.cpp I see that the eors are all spread out
// in between other vector code, so effectively the extra cycles of the sequence do not matter
// because the GPR units are idle otherwise and the critical path is on the FP side.
// Also, the PMULL requires two extra fmovs: GPR->FP (3 cycles on N1, 5 cycles on A72)
// and FP->GPR (2 cycles on N1, 5 cycles on A72).
///////////
bitmask ^= bitmask << 1; bitmask ^= bitmask << 1;
bitmask ^= bitmask << 2; bitmask ^= bitmask << 2;
bitmask ^= bitmask << 4; bitmask ^= bitmask << 4;