buildtools/gcc/gmp/mpn/ia64/submul_1.asm
Jérôme Duval b58ddff026 * modified gcc Makefile.in to copy gmp-impl.h and longlong.h headers to build gmp directory (see http://gcc.gnu.org/bugzilla/show_bug.cgi?id=44455 ).
* merged mpfr 3.0.0 and gmp 5.0.1 in buildtools trunk


git-svn-id: file:///srv/svn/repos/haiku/buildtools/trunk@37378 a95241bf-73f2-0310-859d-f6bbb57e9c96
2010-07-03 15:21:01 +00:00

635 lines
12 KiB
NASM

dnl IA-64 mpn_submul_1 -- Multiply a limb vector with a limb and subtract the
dnl result from a second limb vector.
dnl Copyright 2000, 2001, 2002, 2003, 2004 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
dnl The GNU MP Library is free software; you can redistribute it and/or modify
dnl it under the terms of the GNU Lesser General Public License as published
dnl by the Free Software Foundation; either version 3 of the License, or (at
dnl your option) any later version.
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
dnl License for more details.
dnl You should have received a copy of the GNU Lesser General Public License
dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/limb
C Itanium: 4.0
C Itanium 2: 2.25 (alignment dependent, sometimes it seems to need 3 c/l)
C TODO
C * Optimize feed-in and wind-down code, both for speed and code size.
C * Handle low limb input and results specially, using a common stf8 in the
C epilogue.
C * Delay r8, r10 initialization, put cmp-p6 in 1st bundle and br .Ldone in
C 2nd bundle. This will allow the bbb bundle to be one cycle earlier and
C save a cycle.
C INPUT PARAMETERS
C rp: pointer to the limb vector that is read and overwritten with the result
define(`rp', `r32')
C up: pointer to the limb vector whose limbs are multiplied by vl
define(`up', `r33')
C n: number of limbs to process (the code dispatches on n mod 4 and sets the
C loop counter to (n-1)/4)
define(`n', `r34')
C vl: the single limb multiplier
define(`vl', `r35')
ASM_START()
C mpn_submul_1: for each of the n limbs, compute rp[i] - up[i]*vl, store the
C low 64 bits back to rp[i] and propagate the borrow into the next limb.  The
C final carry limb is left in r8 when the routine returns.
C
C Method: vl is negated, so for vl != 0 the multiplier register f6 holds
C 2^64 - vl.  Since
C   up[i]*(2^64 - vl) + rp[i] = 2^64*up[i] + (rp[i] - up[i]*vl)
C xma.l gives the low limb of the difference and xma.hu gives up[i] minus the
C borrow out of that limb.  The borrow is recovered later as up[i] - hi, using
C a second, integer load of up[i] through r9.
C
C Register roles set up here:
C   rp, up   post-incremented by the ldf8 loads that feed the multiplier
C   r9       copy of up, used by ld8 to fetch up[i] into a general register
C   r10      copy of rp, used by st8 to write the result limbs
C   r8       running carry limb (cylimb), starts at 0
C   r2       saved ar.lc, restored before returning
C   f6       negated vl;  f8, f7 = first limb of rp and of up
C   r14      n mod 4, selects the feed-in path
C   r19      (n-1)/4, becomes the br.cloop counter in ar.lc
PROLOGUE(mpn_submul_1)
.prologue
.save ar.lc, r2
.body
C 32-bit ABI only: widen the two pointer arguments with addp4 and zero-extend n.
ifdef(`HAVE_ABI_32',
` addp4 rp = 0, rp C M I
addp4 up = 0, up C M I
zxt4 n = n C I
;;
')
C Take the integer-side copies of both pointers before the ldf8 loads below
C start post-incrementing rp and up.
{.mmi
mov r10 = rp C M I
mov r9 = up C M I
sub vl = r0, vl C M I negate vl
}
C Load the first limb of each vector into FP registers.
{.mmi
ldf8 f8 = [rp], 8 C M
ldf8 f7 = [up], 8 C M
add r19 = -1, n C M I n - 1
;;
}
C p6 is set when the negated multiplier is zero, i.e. when vl was zero.
{.mmi
cmp.eq p6, p0 = 0, vl C M I
mov r8 = 0 C M I zero cylimb
mov r2 = ar.lc C I0
}
C Move the negated multiplier to the FP side; split n into n mod 4 and (n-1)/4.
{.mmi
setf.sig f6 = vl C M2 M3
and r14 = 3, n C M I
shr.u r19 = r19, 2 C I0
;;
}
C vl == 0: branch to .Ldone, which stores nothing and returns with r8 = 0.
C This case must not reach the multiply code, because the negation of 0 is 0
C and not 2^64, so the up[i] - hi borrow recovery would not hold.
{.mmb
nop 0
cmp.eq p10, p0 = 0, r14 C M I
(p6) br.spnt .Ldone C B vl == 0
}
{.mmi
cmp.eq p11, p0 = 2, r14 C M I
cmp.eq p12, p0 = 3, r14 C M I
mov ar.lc = r19 C I0
}
C Dispatch on n mod 4: 0 -> .Lb00, 2 -> .Lb10, 3 -> .Lb11.  When n mod 4 == 1
C no branch is taken and execution falls through to .Lb01.
{.bbb
(p10) br.dptk .Lb00 C B
(p11) br.dptk .Lb10 C B
(p12) br.dptk .Lb11 C B
;;
}
C Feed-in for n mod 4 == 1.  ar.lc holds (n-1)/4; each br.cloop branches and
C decrements ar.lc while it is nonzero, and falls through once it is zero.
C Register pairing used throughout: the low product limbs live in r24-r27
C (from f36-f39), the high limbs in r28-r31 (from f40-f43), and the integer
C copies of the up limbs in r20-r23.
.Lb01: br.cloop.dptk .grt1
C n == 1: form the single product from the limbs already in f7 and f8, and
C finish with the last wind-down step.
xma.l f39 = f7, f6, f8
xma.hu f43 = f7, f6, f8
;;
getf.sig r27 = f39 C lo
getf.sig r31 = f43 C hi
ld8 r20 = [r9], 8
br .Lcj1
C n >= 5: load four more limb pairs and start the first two multiplies.
.grt1: ldf8 f44 = [rp], 8
ldf8 f32 = [up], 8
;;
ldf8 f45 = [rp], 8
ldf8 f33 = [up], 8
;;
ldf8 f46 = [rp], 8
xma.l f39 = f7, f6, f8
ldf8 f34 = [up], 8
xma.hu f43 = f7, f6, f8
;;
ldf8 f47 = [rp], 8
xma.l f36 = f32, f6, f44
ldf8 f35 = [up], 8
xma.hu f40 = f32, f6, f44
br.cloop.dptk .grt5
;;
C n == 5: no further limbs to load.  Issue the remaining multiplies, move the
C products to general registers and wind down from .Lcj5.
getf.sig r27 = f39 C lo
xma.l f37 = f33, f6, f45
ld8 r20 = [r9], 8
xma.hu f41 = f33, f6, f45
;;
getf.sig r31 = f43 C hi
getf.sig r24 = f36 C lo
xma.l f38 = f34, f6, f46
ld8 r21 = [r9], 8
xma.hu f42 = f34, f6, f46
;;
getf.sig r28 = f40 C hi
getf.sig r25 = f37 C lo
xma.l f39 = f35, f6, f47
ld8 r22 = [r9], 8
xma.hu f43 = f35, f6, f47
;;
getf.sig r29 = f41 C hi
getf.sig r26 = f38 C lo
ld8 r23 = [r9], 8
br .Lcj5
C n >= 9: keep loading four more limb pairs while draining the first products,
C so that the pipeline state matches what the top of .Loop expects.
.grt5: ldf8 f44 = [rp], 8
ldf8 f32 = [up], 8
;;
getf.sig r27 = f39 C lo
xma.l f37 = f33, f6, f45
ld8 r20 = [r9], 8
xma.hu f41 = f33, f6, f45
;;
ldf8 f45 = [rp], 8
getf.sig r31 = f43 C hi
ldf8 f33 = [up], 8
;;
getf.sig r24 = f36 C lo
xma.l f38 = f34, f6, f46
ld8 r21 = [r9], 8
xma.hu f42 = f34, f6, f46
;;
ldf8 f46 = [rp], 8
getf.sig r28 = f40 C hi
ldf8 f34 = [up], 8
;;
getf.sig r25 = f37 C lo
xma.l f39 = f35, f6, f47
ld8 r22 = [r9], 8
xma.hu f43 = f35, f6, f47
;;
ldf8 f47 = [rp], 8
getf.sig r29 = f41 C hi
ldf8 f35 = [up], 8
;;
getf.sig r26 = f38 C lo
xma.l f36 = f32, f6, f44
ld8 r23 = [r9], 8
xma.hu f40 = f32, f6, f44
C Enter the main loop if the counter is still nonzero (n >= 13); with n == 9
C go straight to the wind-down code.
br.cloop.dptk .Loop
br .Lend
C Feed-in for n mod 4 == 2.  The first limb pair is already in f8/f7; load the
C second pair, then branch on the loop counter: br.cloop falls through when
C ar.lc is zero, which here means n == 2.
.Lb10: ldf8 f47 = [rp], 8
ldf8 f35 = [up], 8
br.cloop.dptk .grt2
C n == 2: two products, finished by the last two wind-down steps.
xma.l f38 = f7, f6, f8
xma.hu f42 = f7, f6, f8
;;
xma.l f39 = f35, f6, f47
xma.hu f43 = f35, f6, f47
;;
getf.sig r26 = f38 C lo
getf.sig r30 = f42 C hi
ld8 r23 = [r9], 8
;;
getf.sig r27 = f39 C lo
getf.sig r31 = f43 C hi
ld8 r20 = [r9], 8
br .Lcj2
C n >= 6: load four more limb pairs while the first two products are formed.
.grt2: ldf8 f44 = [rp], 8
ldf8 f32 = [up], 8
;;
ldf8 f45 = [rp], 8
ldf8 f33 = [up], 8
xma.l f38 = f7, f6, f8
xma.hu f42 = f7, f6, f8
;;
ldf8 f46 = [rp], 8
ldf8 f34 = [up], 8
xma.l f39 = f35, f6, f47
xma.hu f43 = f35, f6, f47
;;
ldf8 f47 = [rp], 8
ldf8 f35 = [up], 8
;;
getf.sig r26 = f38 C lo
xma.l f36 = f32, f6, f44
ld8 r23 = [r9], 8
xma.hu f40 = f32, f6, f44
br.cloop.dptk .grt6
C n == 6: no further limbs to load.  Issue the remaining multiplies and wind
C down from .Lcj6.
getf.sig r30 = f42 C hi
;;
getf.sig r27 = f39 C lo
xma.l f37 = f33, f6, f45
ld8 r20 = [r9], 8
xma.hu f41 = f33, f6, f45
;;
getf.sig r31 = f43 C hi
getf.sig r24 = f36 C lo
xma.l f38 = f34, f6, f46
ld8 r21 = [r9], 8
xma.hu f42 = f34, f6, f46
;;
getf.sig r28 = f40 C hi
getf.sig r25 = f37 C lo
xma.l f39 = f35, f6, f47
ld8 r22 = [r9], 8
xma.hu f43 = f35, f6, f47
br .Lcj6
C n >= 10: keep loading while draining products, then join the main loop at
C its last quarter, .LL10.
.grt6: ldf8 f44 = [rp], 8
getf.sig r30 = f42 C hi
ldf8 f32 = [up], 8
;;
getf.sig r27 = f39 C lo
xma.l f37 = f33, f6, f45
ld8 r20 = [r9], 8
xma.hu f41 = f33, f6, f45
;;
ldf8 f45 = [rp], 8
getf.sig r31 = f43 C hi
ldf8 f33 = [up], 8
;;
getf.sig r24 = f36 C lo
xma.l f38 = f34, f6, f46
ld8 r21 = [r9], 8
xma.hu f42 = f34, f6, f46
;;
ldf8 f46 = [rp], 8
getf.sig r28 = f40 C hi
ldf8 f34 = [up], 8
;;
getf.sig r25 = f37 C lo
xma.l f39 = f35, f6, f47
ld8 r22 = [r9], 8
xma.hu f43 = f35, f6, f47
br .LL10
C Feed-in for n mod 4 == 3.  The first limb pair is already in f8/f7; load the
C second and third pairs, then branch on the loop counter: br.cloop falls
C through when ar.lc is zero, which here means n == 3.
.Lb11: ldf8 f46 = [rp], 8
ldf8 f34 = [up], 8
;;
ldf8 f47 = [rp], 8
ldf8 f35 = [up], 8
br.cloop.dptk .grt3
C n == 3: three products, finished by the last three wind-down steps.
xma.l f37 = f7, f6, f8
xma.hu f41 = f7, f6, f8
;;
xma.l f38 = f34, f6, f46
xma.hu f42 = f34, f6, f46
;;
getf.sig r25 = f37 C lo
xma.l f39 = f35, f6, f47
xma.hu f43 = f35, f6, f47
;;
getf.sig r29 = f41 C hi
ld8 r22 = [r9], 8
;;
getf.sig r26 = f38 C lo
getf.sig r30 = f42 C hi
ld8 r23 = [r9], 8
;;
getf.sig r27 = f39 C lo
getf.sig r31 = f43 C hi
ld8 r20 = [r9], 8
br .Lcj3
C n >= 7: load four more limb pairs while the first three products are formed.
.grt3: ldf8 f44 = [rp], 8
xma.l f37 = f7, f6, f8
ldf8 f32 = [up], 8
xma.hu f41 = f7, f6, f8
;;
ldf8 f45 = [rp], 8
xma.l f38 = f34, f6, f46
ldf8 f33 = [up], 8
xma.hu f42 = f34, f6, f46
;;
ldf8 f46 = [rp], 8
ldf8 f34 = [up], 8
;;
getf.sig r25 = f37 C lo
xma.l f39 = f35, f6, f47
ld8 r22 = [r9], 8
xma.hu f43 = f35, f6, f47
;;
ldf8 f47 = [rp], 8
getf.sig r29 = f41 C hi
ldf8 f35 = [up], 8
;;
getf.sig r26 = f38 C lo
xma.l f36 = f32, f6, f44
ld8 r23 = [r9], 8
xma.hu f40 = f32, f6, f44
br.cloop.dptk .grt7
;;
C n == 7: no further limbs to load.  Issue the next multiplies and wind down
C from .Lcj7.
getf.sig r30 = f42 C hi
getf.sig r27 = f39 C lo
xma.l f37 = f33, f6, f45
ld8 r20 = [r9], 8
xma.hu f41 = f33, f6, f45
;;
getf.sig r31 = f43 C hi
getf.sig r24 = f36 C lo
xma.l f38 = f34, f6, f46
ld8 r21 = [r9], 8
xma.hu f42 = f34, f6, f46
br .Lcj7
C n >= 11: keep loading while draining products, then join the main loop at
C its third quarter, .LL11.
.grt7: ldf8 f44 = [rp], 8
getf.sig r30 = f42 C hi
ldf8 f32 = [up], 8
;;
getf.sig r27 = f39 C lo
xma.l f37 = f33, f6, f45
ld8 r20 = [r9], 8
xma.hu f41 = f33, f6, f45
;;
ldf8 f45 = [rp], 8
getf.sig r31 = f43 C hi
ldf8 f33 = [up], 8
;;
getf.sig r24 = f36 C lo
xma.l f38 = f34, f6, f46
ld8 r21 = [r9], 8
xma.hu f42 = f34, f6, f46
br .LL11
C Feed-in for n mod 4 == 0.  The first limb pair is already in f8/f7; load
C three more pairs and start the first multiply, then branch on the loop
C counter: br.cloop falls through when ar.lc is zero, which here means n == 4.
.Lb00: ldf8 f45 = [rp], 8
ldf8 f33 = [up], 8
;;
ldf8 f46 = [rp], 8
ldf8 f34 = [up], 8
;;
ldf8 f47 = [rp], 8
xma.l f36 = f7, f6, f8
ldf8 f35 = [up], 8
xma.hu f40 = f7, f6, f8
br.cloop.dptk .grt4
C n == 4: four products, finished by the last four wind-down steps.
xma.l f37 = f33, f6, f45
xma.hu f41 = f33, f6, f45
;;
getf.sig r24 = f36 C lo
xma.l f38 = f34, f6, f46
ld8 r21 = [r9], 8
xma.hu f42 = f34, f6, f46
;;
getf.sig r28 = f40 C hi
xma.l f39 = f35, f6, f47
getf.sig r25 = f37 C lo
ld8 r22 = [r9], 8
xma.hu f43 = f35, f6, f47
;;
getf.sig r29 = f41 C hi
getf.sig r26 = f38 C lo
ld8 r23 = [r9], 8
;;
getf.sig r30 = f42 C hi
getf.sig r27 = f39 C lo
ld8 r20 = [r9], 8
br .Lcj4
C n >= 8: load four more limb pairs while the first four products are formed.
.grt4: ldf8 f44 = [rp], 8
xma.l f37 = f33, f6, f45
ldf8 f32 = [up], 8
xma.hu f41 = f33, f6, f45
;;
ldf8 f45 = [rp], 8
ldf8 f33 = [up], 8
xma.l f38 = f34, f6, f46
getf.sig r24 = f36 C lo
ld8 r21 = [r9], 8
xma.hu f42 = f34, f6, f46
;;
ldf8 f46 = [rp], 8
getf.sig r28 = f40 C hi
ldf8 f34 = [up], 8
xma.l f39 = f35, f6, f47
getf.sig r25 = f37 C lo
ld8 r22 = [r9], 8
xma.hu f43 = f35, f6, f47
;;
ldf8 f47 = [rp], 8
getf.sig r29 = f41 C hi
ldf8 f35 = [up], 8
;;
getf.sig r26 = f38 C lo
xma.l f36 = f32, f6, f44
ld8 r23 = [r9], 8
xma.hu f40 = f32, f6, f44
br.cloop.dptk .grt8
;;
C n == 8: no further limbs to load.  Issue the next multiply and wind down
C from .Lcj8.
getf.sig r30 = f42 C hi
getf.sig r27 = f39 C lo
xma.l f37 = f33, f6, f45
ld8 r20 = [r9], 8
xma.hu f41 = f33, f6, f45
br .Lcj8
C n >= 12: keep loading while draining products, then join the main loop at
C its second quarter, .LL00.
.grt8: ldf8 f44 = [rp], 8
getf.sig r30 = f42 C hi
ldf8 f32 = [up], 8
;;
getf.sig r27 = f39 C lo
xma.l f37 = f33, f6, f45
ld8 r20 = [r9], 8
xma.hu f41 = f33, f6, f45
br .LL00
ALIGN(32)
C Main loop: four limbs per iteration.  The four quarters start at .Loop,
C .LL00, .LL11 and .LL10; the last three are also entry points used by the
C feed-in code.  Each quarter retires one limb:
C   p6  = lo < r8            the carry subtraction borrows
C   r14 = lo - r8            result limb, stored through r10
C   r8  = up limb - hi       borrow out of this limb from the multiply step
C   r8 += 1 if p6            new carry for the next limb
C where lo is one of r24-r27, hi one of r28-r31 and the up limb one of
C r20-r23.  Interleaved with that, each quarter loads one new limb pair with
C ldf8, issues one xma.l/xma.hu pair, moves two earlier results to general
C registers with getf.sig and reloads one up limb with ld8.
.Loop:
{.mmi
ldf8 f44 = [rp], 8
cmp.ltu p6, p0 = r27, r8 C lo cmp
sub r14 = r27, r8 C lo sub
}
{.mmi
getf.sig r30 = f42 C hi
ldf8 f32 = [up], 8
sub r8 = r20, r31 C hi sub
;; C 01
}
{.mmf
getf.sig r27 = f39 C lo
st8 [r10] = r14, 8
xma.l f37 = f33, f6, f45
}
{.mfi
ld8 r20 = [r9], 8
xma.hu f41 = f33, f6, f45
(p6) add r8 = 1, r8
;; C 02
}
C Second quarter: retires lo r24, hi r28, up limb r21.
{.mmi
.LL00: ldf8 f45 = [rp], 8
cmp.ltu p6, p0 = r24, r8
sub r14 = r24, r8
}
{.mmi
getf.sig r31 = f43 C hi
ldf8 f33 = [up], 8
sub r8 = r21, r28
;; C 03
}
{.mmf
getf.sig r24 = f36 C lo
st8 [r10] = r14, 8
xma.l f38 = f34, f6, f46
}
{.mfi
ld8 r21 = [r9], 8
xma.hu f42 = f34, f6, f46
(p6) add r8 = 1, r8
;; C 04
}
C Third quarter: retires lo r25, hi r29, up limb r22.
{.mmi
.LL11: ldf8 f46 = [rp], 8
cmp.ltu p6, p0 = r25, r8
sub r14 = r25, r8
}
{.mmi
getf.sig r28 = f40 C hi
ldf8 f34 = [up], 8
sub r8 = r22, r29
;; C 05
}
{.mmf
getf.sig r25 = f37 C lo
st8 [r10] = r14, 8
xma.l f39 = f35, f6, f47
}
{.mfi
ld8 r22 = [r9], 8
xma.hu f43 = f35, f6, f47
(p6) add r8 = 1, r8
;; C 06
}
C Fourth quarter: retires lo r26, hi r30, up limb r23.
{.mmi
.LL10: ldf8 f47 = [rp], 8
cmp.ltu p6, p0 = r26, r8
sub r14 = r26, r8
}
{.mmi
getf.sig r29 = f41 C hi
ldf8 f35 = [up], 8
sub r8 = r23, r30
;; C 07
}
{.mmf
getf.sig r26 = f38 C lo
st8 [r10] = r14, 8
xma.l f36 = f32, f6, f44
}
{.mfi
ld8 r23 = [r9], 8
xma.hu f40 = f32, f6, f44
(p6) add r8 = 1, r8
}
C Repeat while ar.lc is nonzero; otherwise fall through to the wind-down code
C at .Lend.
br.cloop.dptk .Loop
;;
C Wind-down.  Each step retires one limb exactly as a loop quarter does
C (compare lo with r8, store lo - r8 through r10, r8 = up limb - hi, plus 1 if
C the compare borrowed), but no further ldf8 loads are issued.  On entry to
C .LcjN exactly N limbs remain to be stored; .Lend retires one limb and falls
C into .Lcj8.  The steps fall through one into the next, and the feed-in code
C enters the chain at the step matching its limb count.
.Lend:
cmp.ltu p6, p0 = r27, r8
sub r14 = r27, r8
getf.sig r30 = f42
sub r8 = r20, r31
;;
getf.sig r27 = f39
st8 [r10] = r14, 8
xma.l f37 = f33, f6, f45
ld8 r20 = [r9], 8
xma.hu f41 = f33, f6, f45
(p6) add r8 = 1, r8
;;
.Lcj8:
cmp.ltu p6, p0 = r24, r8
sub r14 = r24, r8
getf.sig r31 = f43
sub r8 = r21, r28
;;
getf.sig r24 = f36
st8 [r10] = r14, 8
xma.l f38 = f34, f6, f46
ld8 r21 = [r9], 8
xma.hu f42 = f34, f6, f46
(p6) add r8 = 1, r8
;;
.Lcj7:
cmp.ltu p6, p0 = r25, r8
sub r14 = r25, r8
getf.sig r28 = f40
sub r8 = r22, r29
;;
getf.sig r25 = f37
st8 [r10] = r14, 8
xma.l f39 = f35, f6, f47
ld8 r22 = [r9], 8
xma.hu f43 = f35, f6, f47
(p6) add r8 = 1, r8
;;
C From here on no more multiplies are issued; only pending results are moved
C out of the FP registers and the remaining up limbs are reloaded.
.Lcj6:
cmp.ltu p6, p0 = r26, r8
sub r14 = r26, r8
getf.sig r29 = f41
sub r8 = r23, r30
;;
getf.sig r26 = f38
st8 [r10] = r14, 8
ld8 r23 = [r9], 8
(p6) add r8 = 1, r8
;;
.Lcj5:
cmp.ltu p6, p0 = r27, r8
sub r14 = r27, r8
getf.sig r30 = f42
sub r8 = r20, r31
;;
getf.sig r27 = f39
st8 [r10] = r14, 8
ld8 r20 = [r9], 8
(p6) add r8 = 1, r8
;;
.Lcj4:
cmp.ltu p6, p0 = r24, r8
sub r14 = r24, r8
getf.sig r31 = f43
sub r8 = r21, r28
;;
st8 [r10] = r14, 8
(p6) add r8 = 1, r8
;;
C The last three steps work purely on values already in general registers.
.Lcj3:
cmp.ltu p6, p0 = r25, r8
sub r14 = r25, r8
sub r8 = r22, r29
;;
st8 [r10] = r14, 8
(p6) add r8 = 1, r8
;;
.Lcj2:
cmp.ltu p6, p0 = r26, r8
sub r14 = r26, r8
sub r8 = r23, r30
;;
st8 [r10] = r14, 8
(p6) add r8 = 1, r8
;;
C Final limb: store it, restore the saved ar.lc and return with the final
C carry limb in r8.
.Lcj1:
cmp.ltu p6, p0 = r27, r8
sub r14 = r27, r8
sub r8 = r20, r31
;;
st8 [r10] = r14, 8
mov ar.lc = r2
(p6) add r8 = 1, r8
br.ret.sptk.many b0
C Early exit taken when vl == 0: nothing is stored, r8 is still the zero set
C at entry, and only ar.lc needs restoring.
.Ldone: mov ar.lc = r2
br.ret.sptk.many b0
EPILOGUE()
ASM_END()