bpo-46407: Optimizing some modulo operations (GH-30653)

Added new internal functions to compute mod without also computing the quotient.

The loops can be leaner then, which leads to modestly but reliably faster execution in contexts that know they don't need the quotient.

Code by Jeremiah Vivian (Pascual).
This commit is contained in:
Crowthebird 2022-01-28 08:46:45 +08:00 committed by GitHub
parent e7a6285f1b
commit f10dafc430
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 108 additions and 9 deletions

View File

@ -1860,6 +1860,7 @@ Kurt Vile
Norman Vine Norman Vine
Pauli Virtanen Pauli Virtanen
Frank Visser Frank Visser
Jeremiah Vivian (Pascual)
Johannes Vogel Johannes Vogel
Michael Vogt Michael Vogt
Radu Voicilas Radu Voicilas

View File

@ -0,0 +1 @@
Optimize some modulo operations in ``Objects/longobject.c``. Patch by Jeremiah Vivian.

View File

@ -1670,6 +1670,35 @@ divrem1(PyLongObject *a, digit n, digit *prem)
return long_normalize(z); return long_normalize(z);
} }
/* Remainder of long pin, w/ size digits, by non-zero digit n,
returning the remainder. pin points at the LSD. */
static digit
inplace_rem1(digit *pin, Py_ssize_t size, digit n)
{
twodigits rem = 0;
assert(n > 0 && n <= PyLong_MASK);
while (--size >= 0)
rem = ((rem << PyLong_SHIFT) | pin[size]) % n;
return (digit)rem;
}
/* Get the remainder of an integer divided by a digit, returning
the remainder as the result of the function. The sign of a is
ignored; n should not be zero. */
static PyLongObject *
rem1(PyLongObject *a, digit n)
{
const Py_ssize_t size = Py_ABS(Py_SIZE(a));
assert(n > 0 && n <= PyLong_MASK);
return (PyLongObject *)PyLong_FromLong(
(long)inplace_rem1(a->ob_digit, size, n)
);
}
/* Convert an integer to a base 10 string. Returns a new non-shared /* Convert an integer to a base 10 string. Returns a new non-shared
string. (Return value is non-shared so that callers can modify the string. (Return value is non-shared so that callers can modify the
returned value if necessary.) */ returned value if necessary.) */
@ -2689,6 +2718,47 @@ long_divrem(PyLongObject *a, PyLongObject *b,
return 0; return 0;
} }
/* Int remainder, top-level routine */
static int
long_rem(PyLongObject *a, PyLongObject *b, PyLongObject **prem)
{
Py_ssize_t size_a = Py_ABS(Py_SIZE(a)), size_b = Py_ABS(Py_SIZE(b));
if (size_b == 0) {
PyErr_SetString(PyExc_ZeroDivisionError,
"integer modulo by zero");
return -1;
}
if (size_a < size_b ||
(size_a == size_b &&
a->ob_digit[size_a-1] < b->ob_digit[size_b-1])) {
/* |a| < |b|. */
*prem = (PyLongObject *)long_long((PyObject *)a);
return -(*prem == NULL);
}
if (size_b == 1) {
*prem = rem1(a, b->ob_digit[0]);
if (*prem == NULL)
return -1;
}
else {
/* Slow path using divrem. */
x_divrem(a, b, prem);
if (*prem == NULL)
return -1;
}
/* Set the sign. */
if (Py_SIZE(a) < 0 && Py_SIZE(*prem) != 0) {
_PyLong_Negate(prem);
if (*prem == NULL) {
Py_CLEAR(*prem);
return -1;
}
}
return 0;
}
/* Unsigned int division with remainder -- the algorithm. The arguments v1 /* Unsigned int division with remainder -- the algorithm. The arguments v1
and w1 should satisfy 2 <= Py_ABS(Py_SIZE(w1)) <= Py_ABS(Py_SIZE(v1)). */ and w1 should satisfy 2 <= Py_ABS(Py_SIZE(w1)) <= Py_ABS(Py_SIZE(v1)). */
@ -3814,6 +3884,37 @@ l_divmod(PyLongObject *v, PyLongObject *w,
return 0; return 0;
} }
/* Compute
* *pmod = v % w
* pmod cannot be NULL. The caller owns a reference to pmod.
*/
static int
l_mod(PyLongObject *v, PyLongObject *w, PyLongObject **pmod)
{
PyLongObject *mod;
assert(pmod);
if (Py_ABS(Py_SIZE(v)) == 1 && Py_ABS(Py_SIZE(w)) == 1) {
/* Fast path for single-digit longs */
*pmod = (PyLongObject *)fast_mod(v, w);
return -(*pmod == NULL);
}
if (long_rem(v, w, &mod) < 0)
return -1;
if ((Py_SIZE(mod) < 0 && Py_SIZE(w) > 0) ||
(Py_SIZE(mod) > 0 && Py_SIZE(w) < 0)) {
PyLongObject *temp;
temp = (PyLongObject *) long_add(mod, w);
Py_DECREF(mod);
mod = temp;
if (mod == NULL)
return -1;
}
*pmod = mod;
return 0;
}
static PyObject * static PyObject *
long_div(PyObject *a, PyObject *b) long_div(PyObject *a, PyObject *b)
{ {
@ -4100,11 +4201,7 @@ long_mod(PyObject *a, PyObject *b)
CHECK_BINOP(a, b); CHECK_BINOP(a, b);
if (Py_ABS(Py_SIZE(a)) == 1 && Py_ABS(Py_SIZE(b)) == 1) { if (l_mod((PyLongObject*)a, (PyLongObject*)b, &mod) < 0)
return fast_mod((PyLongObject*)a, (PyLongObject*)b);
}
if (l_divmod((PyLongObject*)a, (PyLongObject*)b, NULL, &mod) < 0)
mod = NULL; mod = NULL;
return (PyObject *)mod; return (PyObject *)mod;
} }
@ -4333,10 +4430,10 @@ long_pow(PyObject *v, PyObject *w, PyObject *x)
while the "large exponent" case multiplies directly by base 31 while the "large exponent" case multiplies directly by base 31
times. It can be unboundedly faster to multiply by times. It can be unboundedly faster to multiply by
base % modulus instead. base % modulus instead.
We could _always_ do this reduction, but l_divmod() isn't cheap, We could _always_ do this reduction, but l_mod() isn't cheap,
so we only do it when it buys something. */ so we only do it when it buys something. */
if (Py_SIZE(a) < 0 || Py_SIZE(a) > Py_SIZE(c)) { if (Py_SIZE(a) < 0 || Py_SIZE(a) > Py_SIZE(c)) {
if (l_divmod(a, c, NULL, &temp) < 0) if (l_mod(a, c, &temp) < 0)
goto Error; goto Error;
Py_DECREF(a); Py_DECREF(a);
a = temp; a = temp;
@ -4357,7 +4454,7 @@ long_pow(PyObject *v, PyObject *w, PyObject *x)
#define REDUCE(X) \ #define REDUCE(X) \
do { \ do { \
if (c != NULL) { \ if (c != NULL) { \
if (l_divmod(X, c, NULL, &temp) < 0) \ if (l_mod(X, c, &temp) < 0) \
goto Error; \ goto Error; \
Py_XDECREF(X); \ Py_XDECREF(X); \
X = temp; \ X = temp; \
@ -5022,7 +5119,7 @@ _PyLong_GCD(PyObject *aarg, PyObject *barg)
if (k == 0) { if (k == 0) {
/* no progress; do a Euclidean step */ /* no progress; do a Euclidean step */
if (l_divmod(a, b, NULL, &r) < 0) if (l_mod(a, b, &r) < 0)
goto error; goto error;
Py_DECREF(a); Py_DECREF(a);
a = b; a = b;