mirror of https://review.haiku-os.org/buildtools
synced 2025-02-12 08:47:41 +01:00
commit 4c74403188
Change-Id: Iae65e95cfa0d92091b8b0a424ae36d88efa76aa9
Reviewed-on: https://review.haiku-os.org/c/buildtools/+/3020
Reviewed-by: Adrien Destugues <pulkomandy@gmail.com>
/* mpn_div_qr_2 -- Divide natural numbers, producing both remainder and
   quotient.  The divisor is two limbs.

   Contributed to the GNU project by Torbjorn Granlund and Niels Möller

   THIS FILE CONTAINS INTERNAL FUNCTIONS WITH MUTABLE INTERFACES.  IT IS ONLY
   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.


Copyright 1993-1996, 1999-2002, 2011, 2017 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */

#include "gmp-impl.h"
#include "longlong.h"

#ifndef DIV_QR_2_PI2_THRESHOLD
/* Disabled unless explicitly tuned. */
#define DIV_QR_2_PI2_THRESHOLD MP_LIMB_T_MAX
#endif

#ifndef SANITY_CHECK
#define SANITY_CHECK 0
#endif

/* Define some longlong.h-style macros, but for wider operations.
 * add_sssaaaa is like longlong.h's add_ssaaaa but propagating carry-out into
   an additional sum operand.
 * add_csaac accepts two addends and a carry in, and generates a sum and a
   carry out.  A little like a "full adder".
 */
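/* (add_csaac is described above for completeness, but it is neither defined
   nor used anywhere in this file.) */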
#if defined (__GNUC__) && ! defined (__INTEL_COMPILER) && ! defined (NO_ASM)

#if HAVE_HOST_CPU_FAMILY_x86 && W_TYPE_SIZE == 32
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)                         \
  __asm__ ("add\t%7, %k2\n\tadc\t%5, %k1\n\tadc\t$0, %k0"               \
           : "=r" (s2), "=&r" (s1), "=&r" (s0)                          \
           : "0"  ((USItype)(s2)),                                      \
             "1"  ((USItype)(a1)), "g" ((USItype)(b1)),                 \
             "%2" ((USItype)(a0)), "g" ((USItype)(b0)))
#endif

#if defined (__amd64__) && W_TYPE_SIZE == 64
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)                         \
  __asm__ ("add\t%7, %q2\n\tadc\t%5, %q1\n\tadc\t$0, %q0"               \
           : "=r" (s2), "=&r" (s1), "=&r" (s0)                          \
           : "0"  ((UDItype)(s2)),                                      \
             "1"  ((UDItype)(a1)), "rme" ((UDItype)(b1)),               \
             "%2" ((UDItype)(a0)), "rme" ((UDItype)(b0)))
#endif

#if defined (__aarch64__) && W_TYPE_SIZE == 64
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)                         \
  __asm__ ("adds\t%2, %x6, %7\n\tadcs\t%1, %x4, %x5\n\tadc\t%0, %3, xzr"\
           : "=r" (s2), "=&r" (s1), "=&r" (s0)                          \
           : "rZ" (s2), "%rZ" (a1), "rZ" (b1), "%rZ" (a0), "rI" (b0)    \
             __CLOBBER_CC)
#endif

#if HAVE_HOST_CPU_FAMILY_powerpc && !defined (_LONG_LONG_LIMB)
/* This works fine for 32-bit and 64-bit limbs, except for 64-bit limbs with a
   processor running in 32-bit mode, since the carry flag then gets the 32-bit
   carry.  */
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)                         \
  __asm__ ("add%I7c\t%2,%6,%7\n\tadde\t%1,%4,%5\n\taddze\t%0,%3"        \
           : "=r" (s2), "=&r" (s1), "=&r" (s0)                          \
           : "r" (s2), "r" (a1), "r" (b1), "%r" (a0), "rI" (b0)         \
             __CLOBBER_CC)
#endif

#endif /* __GNUC__ */

#ifndef add_sssaaaa
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)                         \
  do {                                                                  \
    UWtype __s0, __s1, __c0, __c1;                                      \
    __s0 = (a0) + (b0);                                                 \
    __s1 = (a1) + (b1);                                                 \
    __c0 = __s0 < (a0);                                                 \
    __c1 = __s1 < (a1);                                                 \
    (s0) = __s0;                                                        \
    __s1 = __s1 + __c0;                                                 \
    (s1) = __s1;                                                        \
    (s2) += __c1 + (__s1 < __c0);                                       \
  } while (0)
#endif
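#if SANITY_CHECK
/* A minimal self-check sketch (not from the GMP sources; the function name
   check_add_sssaaaa is made up): add_sssaaaa computes
   <s2,s1,s0> = <s2,a1,a0> + <b1,b0>, so adding 1 to <0, B-1, B-1> must
   ripple the carry all the way into s2.  */
static void
check_add_sssaaaa (void)
{
  mp_limb_t s2, s1, s0, a1, a0;
  s2 = 0;
  a1 = a0 = MP_LIMB_T_MAX;	/* B-1 in both low limbs */
  add_sssaaaa (s2, s1, s0, a1, a0, CNST_LIMB(0), CNST_LIMB(1));
  ASSERT_ALWAYS (s2 == 1 && s1 == 0 && s0 == 0);
}
#endif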

/* Typically used with r1, r0 same as n3, n2. Other types of overlap
   between inputs and outputs are not supported. */
#define udiv_qr_4by2(q1,q0, r1,r0, n3,n2,n1,n0, d1,d0, di1,di0)           \
  do {                                                                    \
    mp_limb_t _q3, _q2a, _q2, _q1, _q2c, _q1c, _q1d, _q0;                 \
    mp_limb_t _t1, _t0;                                                   \
    mp_limb_t _mask;                                                      \
                                                                          \
    /* [q3,q2,q1,q0] = [n3,n2]*[di1,di0] + [n3,n2,n1,n0] + [0,1,0,0] */   \
    umul_ppmm (_q2,_q1, n2, di1);                                         \
    umul_ppmm (_q3,_q2a, n3, di1);                                        \
    ++_q2;	/* _q2 cannot overflow */                                 \
    add_ssaaaa (_q3,_q2, _q3,_q2, n3,_q2a);                               \
    umul_ppmm (_q2c,_q1c, n3, di0);                                       \
    add_sssaaaa (_q3,_q2,_q1, _q2,_q1, n2,_q1c);                          \
    umul_ppmm (_q1d,_q0, n2, di0);                                        \
    add_sssaaaa (_q2c,_q1,_q0, _q1,_q0, n1,n0); /* _q2c cannot overflow */ \
    add_sssaaaa (_q3,_q2,_q1, _q2,_q1, _q2c,_q1d);                        \
                                                                          \
    umul_ppmm (_t1,_t0, _q2, d0);                                         \
    _t1 += _q2 * d1 + _q3 * d0;                                           \
                                                                          \
    sub_ddmmss (r1, r0, n1, n0, _t1, _t0);                                \
                                                                          \
    _mask = -(mp_limb_t) ((r1 >= _q1) & ((r1 > _q1) | (r0 >= _q0)));  /* (r1,r0) >= (q1,q0) */ \
    add_ssaaaa (r1, r0, r1, r0, d1 & _mask, d0 & _mask);                  \
    sub_ddmmss (_q3, _q2, _q3, _q2, CNST_LIMB(0), -_mask);                \
                                                                          \
    if (UNLIKELY (r1 >= d1))                                              \
      {                                                                   \
	if (r1 > d1 || r0 >= d0)                                          \
	  {                                                               \
	    sub_ddmmss (r1, r0, r1, r0, d1, d0);                          \
	    add_ssaaaa (_q3, _q2, _q3, _q2, CNST_LIMB(0), CNST_LIMB(1));  \
	  }                                                               \
      }                                                                   \
    (q1) = _q3;                                                           \
    (q0) = _q2;                                                           \
  } while (0)
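/* Informally: udiv_qr_4by2 divides <n3,n2,n1,n0> by the normalized divisor
   <d1,d0>, given the precomputed inverse <1,di1,di0> from invert_4by2
   below.  The high product half <_q3,_q2>, bumped by the +1 rounding term,
   is a quotient candidate off by at most one in either direction: the
   masked step adds <d1,d0> back into the remainder (and decrements the
   candidate) when it overshot, and the final compare subtracts <d1,d0>
   once more when it still undershot.  */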

static void
invert_4by2 (mp_ptr di, mp_limb_t d1, mp_limb_t d0)
{
  mp_limb_t v1, v0, p1, t1, t0, p0, mask;
  invert_limb (v1, d1);
  p1 = d1 * v1;
  /* <1, v1> * d1 = <B-1, p1> */
  p1 += d0;
  if (p1 < d0)
    {
      v1--;
      mask = -(mp_limb_t) (p1 >= d1);
      p1 -= d1;
      v1 += mask;
      p1 -= mask & d1;
    }
  /* <1, v1> * d1 + d0 = <B-1, p1> */
  umul_ppmm (t1, p0, d0, v1);
  p1 += t1;
  if (p1 < t1)
    {
      if (UNLIKELY (p1 >= d1))
	{
	  if (p1 > d1 || p0 >= d0)
	    {
	      sub_ddmmss (p1, p0, p1, p0, d1, d0);
	      v1--;
	    }
	}
      sub_ddmmss (p1, p0, p1, p0, d1, d0);
      v1--;
    }
  /* Now v1 is the 3/2 inverse, <1, v1> * <d1, d0> = <B-1, p1, p0>,
   * with <p1, p0> + <d1, d0> >= B^2.
   *
   * The 4/2 inverse is (B^4 - 1) / <d1, d0> = <1, v1, v0>.  The
   * partial remainder after <1, v1> is
   *
   * B^4 - 1 - B <1, v1> <d1, d0> = <B-1, B-1, B-1, B-1> - <B-1, p1, p0, 0>
   *                              = <~p1, ~p0, B-1>
   */
  udiv_qr_3by2 (v0, t1, t0, ~p1, ~p0, MP_LIMB_T_MAX, d1, d0, v1);
  di[0] = v0;
  di[1] = v1;

#if SANITY_CHECK
  {
    mp_limb_t tp[4];
    mp_limb_t dp[2];
    dp[0] = d0;
    dp[1] = d1;
    mpn_mul_n (tp, dp, di, 2);
    ASSERT_ALWAYS (mpn_add_n (tp+2, tp+2, dp, 2) == 0);
    ASSERT_ALWAYS (tp[2] == MP_LIMB_T_MAX);
    ASSERT_ALWAYS (tp[3] == MP_LIMB_T_MAX);
    ASSERT_ALWAYS (mpn_add_n (tp, tp, dp, 2) == 1);
  }
#endif
}
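/* Worked example (with GMP_NAIL_BITS == 0 and B = 2^GMP_LIMB_BITS): the
   result satisfies <1, di[1], di[0]> = floor ((B^4 - 1) / <d1, d0>), which
   is the identity the SANITY_CHECK block above verifies.  For the smallest
   normalized divisor <d1, d0> = <B/2, 0>, i.e. B^2/2, this gives
   floor ((B^4 - 1) / (B^2/2)) = 2*B^2 - 1, so di[1] = di[0] = B-1.  */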

static mp_limb_t
mpn_div_qr_2n_pi2 (mp_ptr qp, mp_ptr rp, mp_srcptr np, mp_size_t nn,
		   mp_limb_t d1, mp_limb_t d0, mp_limb_t di1, mp_limb_t di0)
{
  mp_limb_t qh;
  mp_size_t i;
  mp_limb_t r1, r0;

  ASSERT (nn >= 2);
  ASSERT (d1 & GMP_NUMB_HIGHBIT);

  r1 = np[nn-1];
  r0 = np[nn-2];

  qh = 0;
  if (r1 >= d1 && (r1 > d1 || r0 >= d0))
    {
#if GMP_NAIL_BITS == 0
      sub_ddmmss (r1, r0, r1, r0, d1, d0);
#else
      r0 = r0 - d0;
      r1 = r1 - d1 - (r0 >> GMP_LIMB_BITS - 1);
      r0 &= GMP_NUMB_MASK;
#endif
      qh = 1;
    }

  for (i = nn - 2; i >= 2; i -= 2)
    {
      mp_limb_t n1, n0, q1, q0;
      n1 = np[i-1];
      n0 = np[i-2];
      udiv_qr_4by2 (q1, q0, r1, r0, r1, r0, n1, n0, d1, d0, di1, di0);
      qp[i-1] = q1;
      qp[i-2] = q0;
    }

  if (i > 0)
    {
      mp_limb_t q;
      udiv_qr_3by2 (q, r1, r0, r1, r0, np[0], d1, d0, di1);
      qp[0] = q;
    }
  rp[1] = r1;
  rp[0] = r0;

  return qh;
}
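/* mpn_div_qr_2n_pi2 consumes two numerator limbs per iteration of its main
   loop via udiv_qr_4by2; when nn is odd, one limb remains (i == 1 on loop
   exit) and a single udiv_qr_3by2 step finishes it, needing only the high
   inverse limb di1.  */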

/* Divide num {np,nn} by den {dp,2} and write the nn-2 least
   significant quotient limbs at qp and the 2-limb remainder at {rp,2}.
   Return the most significant limb of the quotient.

   Preconditions:
   1. qp must either not overlap with the other operands at all, or
      qp >= np + 2 must hold true.  (This means that it's possible to put
      the quotient in the high part of {np,nn}, right above the remainder.)
   2. nn >= 2.  */

mp_limb_t
mpn_div_qr_2 (mp_ptr qp, mp_ptr rp, mp_srcptr np, mp_size_t nn,
	      mp_srcptr dp)
{
  mp_limb_t d1;
  mp_limb_t d0;
  gmp_pi1_t dinv;

  ASSERT (nn >= 2);
  ASSERT (! MPN_OVERLAP_P (qp, nn-2, np, nn) || qp >= np + 2);
  ASSERT_MPN (np, nn);
  ASSERT_MPN (dp, 2);

  d1 = dp[1]; d0 = dp[0];

  ASSERT (d1 > 0);

  if (UNLIKELY (d1 & GMP_NUMB_HIGHBIT))
    {
      if (BELOW_THRESHOLD (nn, DIV_QR_2_PI2_THRESHOLD))
	{
	  gmp_pi1_t dinv;
	  invert_pi1 (dinv, d1, d0);
	  return mpn_div_qr_2n_pi1 (qp, rp, np, nn, d1, d0, dinv.inv32);
	}
      else
	{
	  mp_limb_t di[2];
	  invert_4by2 (di, d1, d0);
	  return mpn_div_qr_2n_pi2 (qp, rp, np, nn, d1, d0, di[1], di[0]);
	}
    }
  else
    {
      int shift;
      count_leading_zeros (shift, d1);
      d1 = (d1 << shift) | (d0 >> (GMP_LIMB_BITS - shift));
      d0 <<= shift;
      invert_pi1 (dinv, d1, d0);
      return mpn_div_qr_2u_pi1 (qp, rp, np, nn, d1, d0, shift, dinv.inv32);
    }
}
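/* Illustrative caller sketch (not from the GMP sources): mpn_div_qr_2 is an
   internal function, reachable only when compiling against gmp-impl.h, and
   the helper name example_div_qr_2 is made up for this sketch.  */
static void
example_div_qr_2 (void)
{
  mp_limb_t np[4] = { 5, 6, 7, 8 };	/* numerator, least significant limb first */
  mp_limb_t dp[2] = { 3, 4 };		/* divisor; dp[1] != 0 is required */
  mp_limb_t qp[2];			/* nn - 2 = 2 quotient limbs */
  mp_limb_t rp[2];			/* 2-limb remainder */
  mp_limb_t qh = mpn_div_qr_2 (qp, rp, np, 4, dp);
  /* The full quotient is <qh, qp[1], qp[0]> and the remainder <rp[1], rp[0]>,
     so that {np,4} = quotient * {dp,2} + remainder.  */
  (void) qh;
}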