buildtools/gcc/gmp/mpn/x86_64/popham.asm
2018-03-19 15:46:45 -05:00

158 lines
3.3 KiB
NASM

dnl AMD64 mpn_popcount, mpn_hamdist -- population count and hamming distance.
dnl Copyright 2004, 2005, 2007 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
dnl The GNU MP Library is free software; you can redistribute it and/or modify
dnl it under the terms of the GNU Lesser General Public License as published
dnl by the Free Software Foundation; either version 3 of the License, or (at
dnl your option) any later version.
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
dnl License for more details.
dnl You should have received a copy of the GNU Lesser General Public License
dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C popcount hamdist
C cycles/limb cycles/limb
C K8,K9: 6 7
C K10: 6 7
C P4: 12 14.3
C P6-15: 7 8
C TODO
C * Tune. It should be possible to reach 5 c/l for popcount and 6 c/l for
C hamdist for K8/K9.
ifdef(`OPERATION_popcount',`
define(`func',`mpn_popcount')
define(`up', `%rdi')
define(`n', `%rsi')
define(`h55555555', `%r10')
define(`h33333333', `%r11')
define(`h0f0f0f0f', `%rcx')
define(`h01010101', `%rdx')
define(`HAM', `dnl')
')
ifdef(`OPERATION_hamdist',`
define(`func',`mpn_hamdist')
define(`up', `%rdi')
define(`vp', `%rsi')
define(`n', `%rdx')
define(`h55555555', `%r10')
define(`h33333333', `%r11')
define(`h0f0f0f0f', `%rcx')
define(`h01010101', `%r14')
define(`HAM', `$1')
')
MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
ASM_START()
TEXT
ALIGN(32)
PROLOGUE(func)
pushq %r12
pushq %r13
HAM(` pushq %r14 ')
movq $0x5555555555555555, h55555555
movq $0x3333333333333333, h33333333
movq $0x0f0f0f0f0f0f0f0f, h0f0f0f0f
movq $0x0101010101010101, h01010101
leaq (up,n,8), up
HAM(` leaq (vp,n,8), vp ')
negq n
xorl %eax, %eax
btq $0, n
jnc L(oop)
movq (up,n,8), %r8
HAM(` xorq (vp,n,8), %r8 ')
movq %r8, %r9
shrq %r8
andq h55555555, %r8
subq %r8, %r9
movq %r9, %r8
shrq $2, %r9
andq h33333333, %r8
andq h33333333, %r9
addq %r8, %r9 C 16 4-bit fields (0..4)
movq %r9, %r8
shrq $4, %r9
andq h0f0f0f0f, %r8
andq h0f0f0f0f, %r9
addq %r8, %r9 C 8 8-bit fields (0..16)
imulq h01010101, %r9 C sum the 8 fields in high 8 bits
shrq $56, %r9
addq %r9, %rax C add to total
addq $1, n
jz L(done)
ALIGN(16)
L(oop): movq (up,n,8), %r8
movq 8(up,n,8), %r12
HAM(` xorq (vp,n,8), %r8 ')
HAM(` xorq 8(vp,n,8), %r12 ')
movq %r8, %r9
movq %r12, %r13
shrq %r8
shrq %r12
andq h55555555, %r8
andq h55555555, %r12
subq %r8, %r9
subq %r12, %r13
movq %r9, %r8
movq %r13, %r12
shrq $2, %r9
shrq $2, %r13
andq h33333333, %r8
andq h33333333, %r9
andq h33333333, %r12
andq h33333333, %r13
addq %r8, %r9 C 16 4-bit fields (0..4)
addq %r12, %r13 C 16 4-bit fields (0..4)
addq %r13, %r9 C 16 4-bit fields (0..8)
movq %r9, %r8
shrq $4, %r9
andq h0f0f0f0f, %r8
andq h0f0f0f0f, %r9
addq %r8, %r9 C 8 8-bit fields (0..16)
imulq h01010101, %r9 C sum the 8 fields in high 8 bits
shrq $56, %r9
addq %r9, %rax C add to total
addq $2, n
jnc L(oop)
L(done):
HAM(` popq %r14 ')
popq %r13
popq %r12
ret
EPILOGUE()