651 lines
15 KiB
C
651 lines
15 KiB
C
/*
|
|
* Copyright (c) 2008-2012 Stefan Krah. All rights reserved.
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions
|
|
* are met:
|
|
*
|
|
* 1. Redistributions of source code must retain the above copyright
|
|
* notice, this list of conditions and the following disclaimer.
|
|
*
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
* documentation and/or other materials provided with the distribution.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND
|
|
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
|
|
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
|
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
|
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
|
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
|
* SUCH DAMAGE.
|
|
*/
|
|
|
|
|
|
#ifndef UMODARITH_H
|
|
#define UMODARITH_H
|
|
|
|
|
|
#include "constants.h"
|
|
#include "mpdecimal.h"
|
|
#include "typearith.h"
|
|
|
|
|
|
/* Bignum: Low level routines for unsigned modular arithmetic. These are
|
|
used in the fast convolution functions for very large coefficients. */
|
|
|
|
|
|
/**************************************************************************/
|
|
/* ANSI modular arithmetic */
|
|
/**************************************************************************/
|
|
|
|
|
|
/*
|
|
* Restrictions: a < m and b < m
|
|
* ACL2 proof: umodarith.lisp: addmod-correct
|
|
*/
|
|
static inline mpd_uint_t
|
|
addmod(mpd_uint_t a, mpd_uint_t b, mpd_uint_t m)
|
|
{
|
|
mpd_uint_t s;
|
|
|
|
s = a + b;
|
|
s = (s < a) ? s - m : s;
|
|
s = (s >= m) ? s - m : s;
|
|
|
|
return s;
|
|
}
|
|
|
|
/*
|
|
* Restrictions: a < m and b < m
|
|
* ACL2 proof: umodarith.lisp: submod-2-correct
|
|
*/
|
|
static inline mpd_uint_t
|
|
submod(mpd_uint_t a, mpd_uint_t b, mpd_uint_t m)
|
|
{
|
|
mpd_uint_t d;
|
|
|
|
d = a - b;
|
|
d = (a < b) ? d + m : d;
|
|
|
|
return d;
|
|
}
|
|
|
|
/*
|
|
* Restrictions: a < 2m and b < 2m
|
|
* ACL2 proof: umodarith.lisp: section ext-submod
|
|
*/
|
|
static inline mpd_uint_t
|
|
ext_submod(mpd_uint_t a, mpd_uint_t b, mpd_uint_t m)
|
|
{
|
|
mpd_uint_t d;
|
|
|
|
a = (a >= m) ? a - m : a;
|
|
b = (b >= m) ? b - m : b;
|
|
|
|
d = a - b;
|
|
d = (a < b) ? d + m : d;
|
|
|
|
return d;
|
|
}
|
|
|
|
/*
|
|
* Reduce double word modulo m.
|
|
* Restrictions: m != 0
|
|
* ACL2 proof: umodarith.lisp: section dw-reduce
|
|
*/
|
|
static inline mpd_uint_t
|
|
dw_reduce(mpd_uint_t hi, mpd_uint_t lo, mpd_uint_t m)
|
|
{
|
|
mpd_uint_t r1, r2, w;
|
|
|
|
_mpd_div_word(&w, &r1, hi, m);
|
|
_mpd_div_words(&w, &r2, r1, lo, m);
|
|
|
|
return r2;
|
|
}
|
|
|
|
/*
|
|
* Subtract double word from a.
|
|
* Restrictions: a < m
|
|
* ACL2 proof: umodarith.lisp: section dw-submod
|
|
*/
|
|
static inline mpd_uint_t
|
|
dw_submod(mpd_uint_t a, mpd_uint_t hi, mpd_uint_t lo, mpd_uint_t m)
|
|
{
|
|
mpd_uint_t d, r;
|
|
|
|
r = dw_reduce(hi, lo, m);
|
|
d = a - r;
|
|
d = (a < r) ? d + m : d;
|
|
|
|
return d;
|
|
}
|
|
|
|
#ifdef CONFIG_64
|
|
|
|
/**************************************************************************/
|
|
/* 64-bit modular arithmetic */
|
|
/**************************************************************************/
|
|
|
|
/*
|
|
* A proof of the algorithm is in literature/mulmod-64.txt. An ACL2
|
|
* proof is in umodarith.lisp: section "Fast modular reduction".
|
|
*
|
|
* Algorithm: calculate (a * b) % p:
|
|
*
|
|
* a) hi, lo <- a * b # Calculate a * b.
|
|
*
|
|
* b) hi, lo <- R(hi, lo) # Reduce modulo p.
|
|
*
|
|
* c) Repeat step b) until 0 <= hi * 2**64 + lo < 2*p.
|
|
*
|
|
* d) If the result is less than p, return lo. Otherwise return lo - p.
|
|
*/
|
|
|
|
static inline mpd_uint_t
|
|
x64_mulmod(mpd_uint_t a, mpd_uint_t b, mpd_uint_t m)
|
|
{
|
|
mpd_uint_t hi, lo, x, y;
|
|
|
|
|
|
_mpd_mul_words(&hi, &lo, a, b);
|
|
|
|
if (m & (1ULL<<32)) { /* P1 */
|
|
|
|
/* first reduction */
|
|
x = y = hi;
|
|
hi >>= 32;
|
|
|
|
x = lo - x;
|
|
if (x > lo) hi--;
|
|
|
|
y <<= 32;
|
|
lo = y + x;
|
|
if (lo < y) hi++;
|
|
|
|
/* second reduction */
|
|
x = y = hi;
|
|
hi >>= 32;
|
|
|
|
x = lo - x;
|
|
if (x > lo) hi--;
|
|
|
|
y <<= 32;
|
|
lo = y + x;
|
|
if (lo < y) hi++;
|
|
|
|
return (hi || lo >= m ? lo - m : lo);
|
|
}
|
|
else if (m & (1ULL<<34)) { /* P2 */
|
|
|
|
/* first reduction */
|
|
x = y = hi;
|
|
hi >>= 30;
|
|
|
|
x = lo - x;
|
|
if (x > lo) hi--;
|
|
|
|
y <<= 34;
|
|
lo = y + x;
|
|
if (lo < y) hi++;
|
|
|
|
/* second reduction */
|
|
x = y = hi;
|
|
hi >>= 30;
|
|
|
|
x = lo - x;
|
|
if (x > lo) hi--;
|
|
|
|
y <<= 34;
|
|
lo = y + x;
|
|
if (lo < y) hi++;
|
|
|
|
/* third reduction */
|
|
x = y = hi;
|
|
hi >>= 30;
|
|
|
|
x = lo - x;
|
|
if (x > lo) hi--;
|
|
|
|
y <<= 34;
|
|
lo = y + x;
|
|
if (lo < y) hi++;
|
|
|
|
return (hi || lo >= m ? lo - m : lo);
|
|
}
|
|
else { /* P3 */
|
|
|
|
/* first reduction */
|
|
x = y = hi;
|
|
hi >>= 24;
|
|
|
|
x = lo - x;
|
|
if (x > lo) hi--;
|
|
|
|
y <<= 40;
|
|
lo = y + x;
|
|
if (lo < y) hi++;
|
|
|
|
/* second reduction */
|
|
x = y = hi;
|
|
hi >>= 24;
|
|
|
|
x = lo - x;
|
|
if (x > lo) hi--;
|
|
|
|
y <<= 40;
|
|
lo = y + x;
|
|
if (lo < y) hi++;
|
|
|
|
/* third reduction */
|
|
x = y = hi;
|
|
hi >>= 24;
|
|
|
|
x = lo - x;
|
|
if (x > lo) hi--;
|
|
|
|
y <<= 40;
|
|
lo = y + x;
|
|
if (lo < y) hi++;
|
|
|
|
return (hi || lo >= m ? lo - m : lo);
|
|
}
|
|
}
|
|
|
|
static inline void
|
|
x64_mulmod2c(mpd_uint_t *a, mpd_uint_t *b, mpd_uint_t w, mpd_uint_t m)
|
|
{
|
|
*a = x64_mulmod(*a, w, m);
|
|
*b = x64_mulmod(*b, w, m);
|
|
}
|
|
|
|
static inline void
|
|
x64_mulmod2(mpd_uint_t *a0, mpd_uint_t b0, mpd_uint_t *a1, mpd_uint_t b1,
|
|
mpd_uint_t m)
|
|
{
|
|
*a0 = x64_mulmod(*a0, b0, m);
|
|
*a1 = x64_mulmod(*a1, b1, m);
|
|
}
|
|
|
|
static inline mpd_uint_t
|
|
x64_powmod(mpd_uint_t base, mpd_uint_t exp, mpd_uint_t umod)
|
|
{
|
|
mpd_uint_t r = 1;
|
|
|
|
while (exp > 0) {
|
|
if (exp & 1)
|
|
r = x64_mulmod(r, base, umod);
|
|
base = x64_mulmod(base, base, umod);
|
|
exp >>= 1;
|
|
}
|
|
|
|
return r;
|
|
}
|
|
|
|
/* END CONFIG_64 */
|
|
#else /* CONFIG_32 */
|
|
|
|
|
|
/**************************************************************************/
|
|
/* 32-bit modular arithmetic */
|
|
/**************************************************************************/
|
|
|
|
#if defined(ANSI)
|
|
#if !defined(LEGACY_COMPILER)
|
|
/* HAVE_UINT64_T */
|
|
static inline mpd_uint_t
|
|
std_mulmod(mpd_uint_t a, mpd_uint_t b, mpd_uint_t m)
|
|
{
|
|
return ((mpd_uuint_t) a * b) % m;
|
|
}
|
|
|
|
static inline void
|
|
std_mulmod2c(mpd_uint_t *a, mpd_uint_t *b, mpd_uint_t w, mpd_uint_t m)
|
|
{
|
|
*a = ((mpd_uuint_t) *a * w) % m;
|
|
*b = ((mpd_uuint_t) *b * w) % m;
|
|
}
|
|
|
|
static inline void
|
|
std_mulmod2(mpd_uint_t *a0, mpd_uint_t b0, mpd_uint_t *a1, mpd_uint_t b1,
|
|
mpd_uint_t m)
|
|
{
|
|
*a0 = ((mpd_uuint_t) *a0 * b0) % m;
|
|
*a1 = ((mpd_uuint_t) *a1 * b1) % m;
|
|
}
|
|
/* END HAVE_UINT64_T */
|
|
#else
|
|
/* LEGACY_COMPILER */
|
|
static inline mpd_uint_t
|
|
std_mulmod(mpd_uint_t a, mpd_uint_t b, mpd_uint_t m)
|
|
{
|
|
mpd_uint_t hi, lo, q, r;
|
|
_mpd_mul_words(&hi, &lo, a, b);
|
|
_mpd_div_words(&q, &r, hi, lo, m);
|
|
return r;
|
|
}
|
|
|
|
static inline void
|
|
std_mulmod2c(mpd_uint_t *a, mpd_uint_t *b, mpd_uint_t w, mpd_uint_t m)
|
|
{
|
|
*a = std_mulmod(*a, w, m);
|
|
*b = std_mulmod(*b, w, m);
|
|
}
|
|
|
|
static inline void
|
|
std_mulmod2(mpd_uint_t *a0, mpd_uint_t b0, mpd_uint_t *a1, mpd_uint_t b1,
|
|
mpd_uint_t m)
|
|
{
|
|
*a0 = std_mulmod(*a0, b0, m);
|
|
*a1 = std_mulmod(*a1, b1, m);
|
|
}
|
|
/* END LEGACY_COMPILER */
|
|
#endif
|
|
|
|
static inline mpd_uint_t
|
|
std_powmod(mpd_uint_t base, mpd_uint_t exp, mpd_uint_t umod)
|
|
{
|
|
mpd_uint_t r = 1;
|
|
|
|
while (exp > 0) {
|
|
if (exp & 1)
|
|
r = std_mulmod(r, base, umod);
|
|
base = std_mulmod(base, base, umod);
|
|
exp >>= 1;
|
|
}
|
|
|
|
return r;
|
|
}
|
|
#endif /* ANSI CONFIG_32 */
|
|
|
|
|
|
/**************************************************************************/
|
|
/* Pentium Pro modular arithmetic */
|
|
/**************************************************************************/
|
|
|
|
/*
|
|
* A proof of the algorithm is in literature/mulmod-ppro.txt. The FPU
|
|
* control word must be set to 64-bit precision and truncation mode
|
|
* prior to using these functions.
|
|
*
|
|
* Algorithm: calculate (a * b) % p:
|
|
*
|
|
* p := prime < 2**31
|
|
* pinv := (long double)1.0 / p (precalculated)
|
|
*
|
|
* a) n = a * b # Calculate exact product.
|
|
* b) qest = n * pinv # Calculate estimate for q = n / p.
|
|
* c) q = (qest+2**63)-2**63 # Truncate qest to the exact quotient.
|
|
* d) r = n - q * p # Calculate remainder.
|
|
*
|
|
* Remarks:
|
|
*
|
|
* - p = dmod and pinv = dinvmod.
|
|
* - dinvmod points to an array of three uint32_t, which is interpreted
|
|
* as an 80 bit long double by fldt.
|
|
* - Intel compilers prior to version 11 do not seem to handle the
|
|
* __GNUC__ inline assembly correctly.
|
|
* - random tests are provided in tests/extended/ppro_mulmod.c
|
|
*/
|
|
|
|
#if defined(PPRO)
|
|
#if defined(ASM)
|
|
|
|
/* Return (a * b) % dmod */
|
|
static inline mpd_uint_t
|
|
ppro_mulmod(mpd_uint_t a, mpd_uint_t b, double *dmod, uint32_t *dinvmod)
|
|
{
|
|
mpd_uint_t retval;
|
|
|
|
asm (
|
|
"fildl %2\n\t"
|
|
"fildl %1\n\t"
|
|
"fmulp %%st, %%st(1)\n\t"
|
|
"fldt (%4)\n\t"
|
|
"fmul %%st(1), %%st\n\t"
|
|
"flds %5\n\t"
|
|
"fadd %%st, %%st(1)\n\t"
|
|
"fsubrp %%st, %%st(1)\n\t"
|
|
"fldl (%3)\n\t"
|
|
"fmulp %%st, %%st(1)\n\t"
|
|
"fsubrp %%st, %%st(1)\n\t"
|
|
"fistpl %0\n\t"
|
|
: "=m" (retval)
|
|
: "m" (a), "m" (b), "r" (dmod), "r" (dinvmod), "m" (MPD_TWO63)
|
|
: "st", "memory"
|
|
);
|
|
|
|
return retval;
|
|
}
|
|
|
|
/*
|
|
* Two modular multiplications in parallel:
|
|
* *a0 = (*a0 * w) % dmod
|
|
* *a1 = (*a1 * w) % dmod
|
|
*/
|
|
static inline void
|
|
ppro_mulmod2c(mpd_uint_t *a0, mpd_uint_t *a1, mpd_uint_t w,
|
|
double *dmod, uint32_t *dinvmod)
|
|
{
|
|
asm (
|
|
"fildl %2\n\t"
|
|
"fildl (%1)\n\t"
|
|
"fmul %%st(1), %%st\n\t"
|
|
"fxch %%st(1)\n\t"
|
|
"fildl (%0)\n\t"
|
|
"fmulp %%st, %%st(1) \n\t"
|
|
"fldt (%4)\n\t"
|
|
"flds %5\n\t"
|
|
"fld %%st(2)\n\t"
|
|
"fmul %%st(2)\n\t"
|
|
"fadd %%st(1)\n\t"
|
|
"fsub %%st(1)\n\t"
|
|
"fmull (%3)\n\t"
|
|
"fsubrp %%st, %%st(3)\n\t"
|
|
"fxch %%st(2)\n\t"
|
|
"fistpl (%0)\n\t"
|
|
"fmul %%st(2)\n\t"
|
|
"fadd %%st(1)\n\t"
|
|
"fsubp %%st, %%st(1)\n\t"
|
|
"fmull (%3)\n\t"
|
|
"fsubrp %%st, %%st(1)\n\t"
|
|
"fistpl (%1)\n\t"
|
|
: : "r" (a0), "r" (a1), "m" (w),
|
|
"r" (dmod), "r" (dinvmod),
|
|
"m" (MPD_TWO63)
|
|
: "st", "memory"
|
|
);
|
|
}
|
|
|
|
/*
|
|
* Two modular multiplications in parallel:
|
|
* *a0 = (*a0 * b0) % dmod
|
|
* *a1 = (*a1 * b1) % dmod
|
|
*/
|
|
static inline void
|
|
ppro_mulmod2(mpd_uint_t *a0, mpd_uint_t b0, mpd_uint_t *a1, mpd_uint_t b1,
|
|
double *dmod, uint32_t *dinvmod)
|
|
{
|
|
asm (
|
|
"fildl %3\n\t"
|
|
"fildl (%2)\n\t"
|
|
"fmulp %%st, %%st(1)\n\t"
|
|
"fildl %1\n\t"
|
|
"fildl (%0)\n\t"
|
|
"fmulp %%st, %%st(1)\n\t"
|
|
"fldt (%5)\n\t"
|
|
"fld %%st(2)\n\t"
|
|
"fmul %%st(1), %%st\n\t"
|
|
"fxch %%st(1)\n\t"
|
|
"fmul %%st(2), %%st\n\t"
|
|
"flds %6\n\t"
|
|
"fldl (%4)\n\t"
|
|
"fxch %%st(3)\n\t"
|
|
"fadd %%st(1), %%st\n\t"
|
|
"fxch %%st(2)\n\t"
|
|
"fadd %%st(1), %%st\n\t"
|
|
"fxch %%st(2)\n\t"
|
|
"fsub %%st(1), %%st\n\t"
|
|
"fxch %%st(2)\n\t"
|
|
"fsubp %%st, %%st(1)\n\t"
|
|
"fxch %%st(1)\n\t"
|
|
"fmul %%st(2), %%st\n\t"
|
|
"fxch %%st(1)\n\t"
|
|
"fmulp %%st, %%st(2)\n\t"
|
|
"fsubrp %%st, %%st(3)\n\t"
|
|
"fsubrp %%st, %%st(1)\n\t"
|
|
"fxch %%st(1)\n\t"
|
|
"fistpl (%2)\n\t"
|
|
"fistpl (%0)\n\t"
|
|
: : "r" (a0), "m" (b0), "r" (a1), "m" (b1),
|
|
"r" (dmod), "r" (dinvmod),
|
|
"m" (MPD_TWO63)
|
|
: "st", "memory"
|
|
);
|
|
}
|
|
/* END PPRO GCC ASM */
|
|
#elif defined(MASM)
|
|
|
|
/* Return (a * b) % dmod */
|
|
static inline mpd_uint_t __cdecl
|
|
ppro_mulmod(mpd_uint_t a, mpd_uint_t b, double *dmod, uint32_t *dinvmod)
|
|
{
|
|
mpd_uint_t retval;
|
|
|
|
__asm {
|
|
mov eax, dinvmod
|
|
mov edx, dmod
|
|
fild b
|
|
fild a
|
|
fmulp st(1), st
|
|
fld TBYTE PTR [eax]
|
|
fmul st, st(1)
|
|
fld MPD_TWO63
|
|
fadd st(1), st
|
|
fsubp st(1), st
|
|
fld QWORD PTR [edx]
|
|
fmulp st(1), st
|
|
fsubp st(1), st
|
|
fistp retval
|
|
}
|
|
|
|
return retval;
|
|
}
|
|
|
|
/*
|
|
* Two modular multiplications in parallel:
|
|
* *a0 = (*a0 * w) % dmod
|
|
* *a1 = (*a1 * w) % dmod
|
|
*/
|
|
static inline mpd_uint_t __cdecl
|
|
ppro_mulmod2c(mpd_uint_t *a0, mpd_uint_t *a1, mpd_uint_t w,
|
|
double *dmod, uint32_t *dinvmod)
|
|
{
|
|
__asm {
|
|
mov ecx, dmod
|
|
mov edx, a1
|
|
mov ebx, dinvmod
|
|
mov eax, a0
|
|
fild w
|
|
fild DWORD PTR [edx]
|
|
fmul st, st(1)
|
|
fxch st(1)
|
|
fild DWORD PTR [eax]
|
|
fmulp st(1), st
|
|
fld TBYTE PTR [ebx]
|
|
fld MPD_TWO63
|
|
fld st(2)
|
|
fmul st, st(2)
|
|
fadd st, st(1)
|
|
fsub st, st(1)
|
|
fmul QWORD PTR [ecx]
|
|
fsubp st(3), st
|
|
fxch st(2)
|
|
fistp DWORD PTR [eax]
|
|
fmul st, st(2)
|
|
fadd st, st(1)
|
|
fsubrp st(1), st
|
|
fmul QWORD PTR [ecx]
|
|
fsubp st(1), st
|
|
fistp DWORD PTR [edx]
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Two modular multiplications in parallel:
|
|
* *a0 = (*a0 * b0) % dmod
|
|
* *a1 = (*a1 * b1) % dmod
|
|
*/
|
|
static inline void __cdecl
|
|
ppro_mulmod2(mpd_uint_t *a0, mpd_uint_t b0, mpd_uint_t *a1, mpd_uint_t b1,
|
|
double *dmod, uint32_t *dinvmod)
|
|
{
|
|
__asm {
|
|
mov ecx, dmod
|
|
mov edx, a1
|
|
mov ebx, dinvmod
|
|
mov eax, a0
|
|
fild b1
|
|
fild DWORD PTR [edx]
|
|
fmulp st(1), st
|
|
fild b0
|
|
fild DWORD PTR [eax]
|
|
fmulp st(1), st
|
|
fld TBYTE PTR [ebx]
|
|
fld st(2)
|
|
fmul st, st(1)
|
|
fxch st(1)
|
|
fmul st, st(2)
|
|
fld DWORD PTR MPD_TWO63
|
|
fld QWORD PTR [ecx]
|
|
fxch st(3)
|
|
fadd st, st(1)
|
|
fxch st(2)
|
|
fadd st, st(1)
|
|
fxch st(2)
|
|
fsub st, st(1)
|
|
fxch st(2)
|
|
fsubrp st(1), st
|
|
fxch st(1)
|
|
fmul st, st(2)
|
|
fxch st(1)
|
|
fmulp st(2), st
|
|
fsubp st(3), st
|
|
fsubp st(1), st
|
|
fxch st(1)
|
|
fistp DWORD PTR [edx]
|
|
fistp DWORD PTR [eax]
|
|
}
|
|
}
|
|
#endif /* PPRO MASM (_MSC_VER) */
|
|
|
|
|
|
/* Return (base ** exp) % dmod */
|
|
static inline mpd_uint_t
|
|
ppro_powmod(mpd_uint_t base, mpd_uint_t exp, double *dmod, uint32_t *dinvmod)
|
|
{
|
|
mpd_uint_t r = 1;
|
|
|
|
while (exp > 0) {
|
|
if (exp & 1)
|
|
r = ppro_mulmod(r, base, dmod, dinvmod);
|
|
base = ppro_mulmod(base, base, dmod, dinvmod);
|
|
exp >>= 1;
|
|
}
|
|
|
|
return r;
|
|
}
|
|
#endif /* PPRO */
|
|
#endif /* CONFIG_32 */
|
|
|
|
|
|
#endif /* UMODARITH_H */
|
|
|
|
|
|
|