nightingale/vendor/github.com/m3db/stackmurmur3/v2/murmur128_amd64.s

247 lines
3.5 KiB
ArmAsm

// +build go1.5,amd64
// SeedSum128(seed1, seed2 uint64, data []byte) (h1 uint64, h2 uint64)
//
// Seeded entry point for byte slices. It does no hashing itself: it moves the
// stack arguments into the registers sum128internal expects and tail-jumps
// there, so the RET in sum128internal returns directly to our caller.
TEXT ·SeedSum128(SB), $0-56
	MOVQ data_base+16(FP), SI // SI = address of the first data byte
	MOVQ data_len+24(FP), R9  // R9 = len(data)
	MOVQ seed1+0(FP), R12     // R12 = starting value of h1
	MOVQ seed2+8(FP), R13     // R13 = starting value of h2
	LEAQ h1+40(FP), BX        // BX = &h1 result slot; h2 lives 8 bytes above it
	JMP  sum128internal<>(SB)
// Sum128(data []byte) (h1 uint64, h2 uint64)
//
// Unseeded entry point for byte slices: both hash halves start at zero. The
// arguments are placed in the registers sum128internal expects and control is
// transferred with a tail-jump.
TEXT ·Sum128(SB), $0-40
	MOVQ data_base+0(FP), SI // SI = address of the first data byte
	MOVQ data_len+8(FP), R9  // R9 = len(data)
	LEAQ h1+24(FP), BX       // BX = &h1 result slot; h2 lives 8 bytes above it
	XORL R12, R12            // h1 seed = 0 (a 32-bit write clears the whole register)
	XORL R13, R13            // h2 seed = 0
	JMP  sum128internal<>(SB)
// SeedStringSum128(seed1, seed2 uint64, data string) (h1 uint64, h2 uint64)
//
// Seeded entry point for strings. A string header is pointer+length (no
// capacity word), so the result slots start at offset 32 rather than 40 as in
// the []byte variant. Everything else is the same register setup followed by
// a tail-jump into sum128internal.
TEXT ·SeedStringSum128(SB), $0-48
	MOVQ data_base+16(FP), SI // SI = address of the first string byte
	MOVQ data_len+24(FP), R9  // R9 = len(data)
	MOVQ seed1+0(FP), R12     // R12 = starting value of h1
	MOVQ seed2+8(FP), R13     // R13 = starting value of h2
	LEAQ h1+32(FP), BX        // BX = &h1 result slot; h2 lives 8 bytes above it
	JMP  sum128internal<>(SB)
// StringSum128(data string) (h1 uint64, h2 uint64)
//
// Unseeded entry point for strings: both hash halves start at zero. Loads the
// string pointer and length, points BX at the result slots and tail-jumps
// into sum128internal.
TEXT ·StringSum128(SB), $0-32
	MOVQ data_base+0(FP), SI // SI = address of the first string byte
	MOVQ data_len+8(FP), R9  // R9 = len(data)
	LEAQ h1+16(FP), BX       // BX = &h1 result slot; h2 lives 8 bytes above it
	XORL R12, R12            // h1 seed = 0 (a 32-bit write clears the whole register)
	XORL R13, R13            // h2 seed = 0
	JMP  sum128internal<>(SB)
// sum128internal is the shared hashing body that the exported entry points
// above tail-jump into. It consumes the input in 16-byte blocks, folds in the
// 1..15 leftover bytes (if any), runs the finalizer and stores the two 64-bit
// result halves through BX.
//
// Expects:
// R12 == h1 uint64 seed
// R13 == h2 uint64 seed
// SI == address of the first byte of data
// R9 == len(data)
// BX == &[2]uint64 return (h1 is stored at 0(BX), h2 at 8(BX))
//
// Overwrites AX, CX, DX, R10, R12, R13, R14, R15 and the flags.
// SI, R9 and BX are only read.
TEXT sum128internal<>(SB), $0
MOVQ $0x87c37b91114253d5, R14 // c1
MOVQ $0x4cf5ad432745937f, R15 // c2
MOVQ R9, CX
ANDQ $-16, CX // cx == data_len - (data_len % 16), i.e. bytes covered by whole blocks
// Block loop: for r10 = 0; r10 < cx; r10 += 16 {...
// Each pass loads k1/k2 (AX/DX) and mixes them into h1/h2 (R12/R13). The k1
// and k2 instruction streams are interleaved; they do not depend on each other.
XORQ R10, R10
loop:
CMPQ R10, CX
JE tail // both are multiples of 16, so equality is the exit condition
MOVQ (SI)(R10*1), AX // k1 = 8 bytes at data[r10]
MOVQ 8(SI)(R10*1), DX // k2 = 8 bytes at data[r10+8]
ADDQ $16, R10
IMULQ R14, AX // k1 *= c1
IMULQ R15, DX // k2 *= c2
ROLQ $31, AX // k1 = rotl(k1, 31)
ROLQ $33, DX // k2 = rotl(k2, 33)
IMULQ R15, AX // k1 *= c2
IMULQ R14, DX // k2 *= c1
XORQ AX, R12 // h1 ^= k1
ROLQ $27, R12 // h1 = rotl(h1, 27)
ADDQ R13, R12 // h1 += h2 (h2 still holds its value from before this block)
XORQ DX, R13 // h2 ^= k2
ROLQ $31, R13 // h2 = rotl(h2, 31)
LEAQ 0x52dce729(R12)(R12*4), R12 // h1 = h1*5 + 0x52dce729 (LEA: no flags touched)
ADDQ R12, R13 // h2 += h1 (the updated h1)
LEAQ 0x38495ab5(R13)(R13*4), R13 // h2 = h2*5 + 0x38495ab5
JMP loop
// Tail: R10 now equals the number of bytes consumed by the block loop, so
// (SI)(R10*1) addresses the first leftover byte.
tail:
MOVQ R9, CX
ANDQ $0xf, CX // cx = len % 16 = number of leftover bytes
JZ finalize // if len % 16 == 0
XORQ AX, AX // the tailN code below builds the partial word in a zeroed AX
// poor man's binary tree jump table
// Dispatches on cx (1..15) to the matching tailN label using at most four
// add/sub steps. Consecutive conditional jumps reuse the flags of the
// preceding ADDQ/SUBQ, so the order of these instructions matters.
SUBQ $8, CX
JZ tail8
JG over8 // cx was 9..15
ADDQ $4, CX // cx was 1..7; CX now holds cx-4
JZ tail4
JG over4 // cx was 5..7
ADDQ $2, CX // cx was 1..3; CX now holds cx-2
JL tail1
JZ tail2
JMP tail3
over4:
SUBQ $2, CX // CX now holds cx-6
JL tail5
JZ tail6
JMP tail7
over8:
SUBQ $4, CX // CX now holds cx-12
JZ tail12
JG over12 // cx was 13..15
ADDQ $2, CX // cx was 9..11; CX now holds cx-10
JL tail9
JZ tail10
JMP tail11
over12:
SUBQ $2, CX // CX now holds cx-14
JL tail13
JZ tail14
// cx == 15 falls through into tail15.
//
// High half of the tail (leftover bytes 8..14) is assembled in AX and mixed
// into h2. Labels fall through or jump to the next smaller piece. MOVW and
// MOVB write only the low 16/8 bits of AX, so bits already placed above them
// are kept; MOVL into DX zero-extends, so the ORQ only fills the low 32 bits.
tail15:
MOVBQZX 14(SI)(R10*1), AX // AX = byte 14
SALQ $16, AX // make room for bytes 12..13 below it
tail14:
MOVW 12(SI)(R10*1), AX // low 16 bits = bytes 12..13
SALQ $32, AX // move bytes 12..14 above the low 32 bits
JMP tail12
tail13:
MOVBQZX 12(SI)(R10*1), AX // AX = byte 12
SALQ $32, AX
tail12:
MOVL 8(SI)(R10*1), DX // DX = bytes 8..11, zero-extended
ORQ DX, AX
JMP fintailhigh
tail11:
MOVBQZX 10(SI)(R10*1), AX // AX = byte 10
SALQ $16, AX
tail10:
MOVW 8(SI)(R10*1), AX // low 16 bits = bytes 8..9
JMP fintailhigh
tail9:
MOVB 8(SI)(R10*1), AL // AX = byte 8 (upper bits are zero from the XORQ above)
fintailhigh:
IMULQ R15, AX // k2 *= c2
ROLQ $33, AX // k2 = rotl(k2, 33)
IMULQ R14, AX // k2 *= c1
XORQ AX, R13 // h2 ^= k2
// With more than 8 leftover bytes, execution falls through into tail8: the
// low half is then a complete 8-byte word and the MOVQ replaces all of AX.
tail8:
MOVQ (SI)(R10*1), AX // k1 = leftover bytes 0..7
JMP fintaillow
// Low half of the tail for 1..7 leftover bytes, built the same way as the
// high half above. These labels are only reached from the dispatch tree, so
// AX is still zero on entry.
tail7:
MOVBQZX 6(SI)(R10*1), AX // AX = byte 6
SALQ $16, AX
tail6:
MOVW 4(SI)(R10*1), AX // low 16 bits = bytes 4..5
SALQ $32, AX
JMP tail4
tail5:
MOVBQZX 4(SI)(R10*1), AX // AX = byte 4
SALQ $32, AX
tail4:
MOVL (SI)(R10*1), DX // DX = bytes 0..3, zero-extended
ORQ DX, AX
JMP fintaillow
tail3:
MOVBQZX 2(SI)(R10*1), AX // AX = byte 2
SALQ $16, AX
tail2:
MOVW (SI)(R10*1), AX // low 16 bits = bytes 0..1
JMP fintaillow
tail1:
MOVB (SI)(R10*1), AL // AX = byte 0
fintaillow:
IMULQ R14, AX // k1 *= c1
ROLQ $31, AX // k1 = rotl(k1, 31)
IMULQ R15, AX // k1 *= c2
XORQ AX, R12 // h1 ^= k1
// Finalization: fold the total length into both halves and cross-mix them.
finalize:
XORQ R9, R12 // h1 ^= len
XORQ R9, R13 // h2 ^= len
ADDQ R13, R12 // h1 += h2
ADDQ R12, R13 // h2 += h1
// fmix128 (both interleaved)
// The same steps are applied to h1 (scratch DX) and h2 (scratch AX):
// x ^= x >> 33; x *= 0xff51afd7ed558ccd;
// x ^= x >> 33; x *= 0xc4ceb9fe1a85ec53;
// x ^= x >> 33
MOVQ R12, DX
MOVQ R13, AX
SHRQ $33, DX
SHRQ $33, AX
XORQ DX, R12
XORQ AX, R13
MOVQ $0xff51afd7ed558ccd, CX // 64-bit constant goes through a register for IMULQ
IMULQ CX, R12
IMULQ CX, R13
MOVQ R12, DX
MOVQ R13, AX
SHRQ $33, DX
SHRQ $33, AX
XORQ DX, R12
XORQ AX, R13
MOVQ $0xc4ceb9fe1a85ec53, CX
IMULQ CX, R12
IMULQ CX, R13
MOVQ R12, DX
MOVQ R13, AX
SHRQ $33, DX
SHRQ $33, AX
XORQ DX, R12
XORQ AX, R13
// Final cross-mix, then store the result pair through BX.
ADDQ R13, R12 // h1 += h2
ADDQ R12, R13 // h2 += h1
MOVQ R12, (BX) // *h1 = R12
MOVQ R13, 8(BX) // *h2 = R13
RET