Adding analysis by @sebpop from https://github.com/lemire/simdjson/pull/391#issuecomment-565551462
This commit is contained in:
parent
fc6133b58f
commit
f02babe427
|
@ -16,12 +16,19 @@ namespace simdjson::arm64 {
|
|||
// For example, prefix_xor(00100100) == 00011100
|
||||
//
|
||||
really_inline uint64_t prefix_xor(uint64_t bitmask) {
|
||||
//
|
||||
/////////////
|
||||
// We could do this with PMULL, but it is apparently slow.
|
||||
//
|
||||
//#ifdef __ARM_FEATURE_CRYPTO // some ARM processors lack this extension
|
||||
//return vmull_p64(-1ULL, bitmask);
|
||||
//#else
|
||||
// Analysis by @sebpop:
|
||||
// When diffing the assembly for src/stage1_find_marks.cpp I see that the eors are all spread out
|
||||
// in between other vector code, so effectively the extra cycles of the sequence do not matter
|
||||
// because the GPR units are idle otherwise and the critical path is on the FP side.
|
||||
// Also the PMULL requires two extra fmovs: GPR->FP (3 cycles in N1, 5 cycles in A72 )
|
||||
// and FP->GPR (2 cycles on N1 and 5 cycles on A72.)
|
||||
///////////
|
||||
bitmask ^= bitmask << 1;
|
||||
bitmask ^= bitmask << 2;
|
||||
bitmask ^= bitmask << 4;
|
||||
|
|
Loading…
Reference in New Issue