diff --git a/Include/longintrepr.h b/Include/longintrepr.h index 9ed1fe737b7..254076e4d4d 100644 --- a/Include/longintrepr.h +++ b/Include/longintrepr.h @@ -15,7 +15,8 @@ extern "C" { (at most (BASE-1)*(2*BASE+1) == MASK*(2*MASK+3)). Also, x_sub assumes that 'digit' is an unsigned type, and overflow is handled by taking the result mod 2**N for some N > SHIFT. - And, at some places it is assumed that MASK fits in an int, as well. */ + And, at some places it is assumed that MASK fits in an int, as well. + long_pow() requires that SHIFT be divisible by 5. */ typedef unsigned short digit; typedef unsigned int wdigit; /* digit widened to parameter size */ @@ -27,6 +28,10 @@ typedef BASE_TWODIGITS_TYPE stwodigits; /* signed variant of twodigits */ #define BASE ((digit)1 << SHIFT) #define MASK ((int)(BASE - 1)) +#if SHIFT % 5 != 0 +#error "longobject.c requires that SHIFT be divisible by 5" +#endif + /* Long integer representation. The absolute value of a number is equal to SUM(for i=0 through abs(ob_size)-1) ob_digit[i] * 2**(SHIFT*i) diff --git a/Misc/NEWS b/Misc/NEWS index 431b343aa06..660c49fa755 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -20,7 +20,11 @@ Core and builtins to compute 17**1000000 dropped from about 14 seconds to 9 on my box due to this much. The cutoff for Karatsuba multiplication was raised, since gradeschool multiplication got quicker, and the cutoff was - aggressively small regardless. + aggressively small regardless. The exponentiation algorithm was switched + from right-to-left to left-to-right, which is more efficient for small + bases. In addition, if the exponent is large, the algorithm now does + 5 bits (instead of 1 bit) at a time. That cut the time to compute + 17**1000000 on my box in half again, down to about 4.5 seconds. - OverflowWarning is no longer generated. PEP 237 scheduled this to occur in Python 2.3, but since OverflowWarning was disabled by default, @@ -156,6 +160,14 @@ Tools/Demos Build ----- +- Backward incompatibility: longintrepr.h now triggers a compile-time + error if SHIFT (the number of bits in a Python long "digit") isn't + divisible by 5. This new requirement allows simple code for the new + 5-bits-at-a-time long_pow() implementation. If necessary, the + restriction could be removed (by complicating long_pow(), or by + falling back to the 1-bit-at-a-time algorithm), but there are no + plans to do so. + - bug #991962: When building with --disable-toolbox-glue on Darwin no attempt to build Mac-specific modules occurs. diff --git a/Objects/longobject.c b/Objects/longobject.c index 2f6d103bfec..05c42ad47d7 100644 --- a/Objects/longobject.c +++ b/Objects/longobject.c @@ -15,6 +15,13 @@ #define KARATSUBA_CUTOFF 70 #define KARATSUBA_SQUARE_CUTOFF (2 * KARATSUBA_CUTOFF) +/* For exponentiation, use the binary left-to-right algorithm + * unless the exponent contains more than FIVEARY_CUTOFF digits. + * In that case, do 5 bits at a time. The potential drawback is that + * a table of 2**5 intermediate results is computed. + */ +#define FIVEARY_CUTOFF 8 + #define ABS(x) ((x) < 0 ? -(x) : (x)) #undef MIN @@ -2136,6 +2143,12 @@ long_mul(PyLongObject *v, PyLongObject *w) have different signs. We then subtract one from the 'div' part of the outcome to keep the invariant intact. */ +/* Compute + * *pdiv, *pmod = divmod(v, w) + * NULL can be passed for pdiv or pmod, in which case that part of + * the result is simply thrown away. The caller owns a reference to + * each of these it requests (does not pass NULL for). + */ static int l_divmod(PyLongObject *v, PyLongObject *w, PyLongObject **pdiv, PyLongObject **pmod) @@ -2167,44 +2180,43 @@ l_divmod(PyLongObject *v, PyLongObject *w, Py_DECREF(div); div = temp; } - *pdiv = div; - *pmod = mod; + if (pdiv != NULL) + *pdiv = div; + else + Py_DECREF(div); + + if (pmod != NULL) + *pmod = mod; + else + Py_DECREF(mod); + return 0; } static PyObject * long_div(PyObject *v, PyObject *w) { - PyLongObject *a, *b, *div, *mod; + PyLongObject *a, *b, *div; CONVERT_BINOP(v, w, &a, &b); - - if (l_divmod(a, b, &div, &mod) < 0) { - Py_DECREF(a); - Py_DECREF(b); - return NULL; - } + if (l_divmod(a, b, &div, NULL) < 0) + div = NULL; Py_DECREF(a); Py_DECREF(b); - Py_DECREF(mod); return (PyObject *)div; } static PyObject * long_classic_div(PyObject *v, PyObject *w) { - PyLongObject *a, *b, *div, *mod; + PyLongObject *a, *b, *div; CONVERT_BINOP(v, w, &a, &b); - if (Py_DivisionWarningFlag && PyErr_Warn(PyExc_DeprecationWarning, "classic long division") < 0) div = NULL; - else if (l_divmod(a, b, &div, &mod) < 0) + else if (l_divmod(a, b, &div, NULL) < 0) div = NULL; - else - Py_DECREF(mod); - Py_DECREF(a); Py_DECREF(b); return (PyObject *)div; @@ -2255,18 +2267,14 @@ overflow: static PyObject * long_mod(PyObject *v, PyObject *w) { - PyLongObject *a, *b, *div, *mod; + PyLongObject *a, *b, *mod; CONVERT_BINOP(v, w, &a, &b); - if (l_divmod(a, b, &div, &mod) < 0) { - Py_DECREF(a); - Py_DECREF(b); - return NULL; - } + if (l_divmod(a, b, NULL, &mod) < 0) + mod = NULL; Py_DECREF(a); Py_DECREF(b); - Py_DECREF(div); return (PyObject *)mod; } @@ -2297,22 +2305,33 @@ long_divmod(PyObject *v, PyObject *w) return z; } +/* pow(v, w, x) */ static PyObject * long_pow(PyObject *v, PyObject *w, PyObject *x) { - PyLongObject *a, *b; - PyObject *c; - PyLongObject *z, *div, *mod; - int size_b, i; + PyLongObject *a, *b, *c; /* a,b,c = v,w,x */ + int negativeOutput = 0; /* if x<0 return negative output */ + PyLongObject *z = NULL; /* accumulated result */ + int i, j, k; /* counters */ + PyLongObject *temp = NULL; + + /* 5-ary values. If the exponent is large enough, table is + * precomputed so that table[i] == a**i % c for i in range(32). + */ + PyLongObject *table[32] = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; + + /* a, b, c = v, w, x */ CONVERT_BINOP(v, w, &a, &b); - if (PyLong_Check(x) || Py_None == x) { - c = x; + if (PyLong_Check(x)) { + c = (PyLongObject *)x; Py_INCREF(x); } - else if (PyInt_Check(x)) { - c = PyLong_FromLong(PyInt_AS_LONG(x)); - } + else if (PyInt_Check(x)) + c = (PyLongObject *)PyLong_FromLong(PyInt_AS_LONG(x)); + else if (x == Py_None) + c = NULL; else { Py_DECREF(a); Py_DECREF(b); @@ -2320,95 +2339,154 @@ long_pow(PyObject *v, PyObject *w, PyObject *x) return Py_NotImplemented; } - if (c != Py_None && ((PyLongObject *)c)->ob_size == 0) { - PyErr_SetString(PyExc_ValueError, - "pow() 3rd argument cannot be 0"); - z = NULL; - goto error; - } - - size_b = b->ob_size; - if (size_b < 0) { - Py_DECREF(a); - Py_DECREF(b); - Py_DECREF(c); - if (x != Py_None) { + if (b->ob_size < 0) { /* if exponent is negative */ + if (c) { PyErr_SetString(PyExc_TypeError, "pow() 2nd argument " - "cannot be negative when 3rd argument specified"); + "cannot be negative when 3rd argument specified"); return NULL; } - /* Return a float. This works because we know that - this calls float_pow() which converts its - arguments to double. */ - return PyFloat_Type.tp_as_number->nb_power(v, w, x); - } - z = (PyLongObject *)PyLong_FromLong(1L); - for (i = 0; i < size_b; ++i) { - digit bi = b->ob_digit[i]; - int j; - - for (j = 0; j < SHIFT; ++j) { - PyLongObject *temp; - - if (bi & 1) { - temp = (PyLongObject *)long_mul(z, a); - Py_DECREF(z); - if (c!=Py_None && temp!=NULL) { - if (l_divmod(temp,(PyLongObject *)c, - &div,&mod) < 0) { - Py_DECREF(temp); - z = NULL; - goto error; - } - Py_XDECREF(div); - Py_DECREF(temp); - temp = mod; - } - z = temp; - if (z == NULL) - break; - } - bi >>= 1; - if (bi == 0 && i+1 == size_b) - break; - temp = (PyLongObject *)long_mul(a, a); - Py_DECREF(a); - if (c!=Py_None && temp!=NULL) { - if (l_divmod(temp, (PyLongObject *)c, &div, - &mod) < 0) { - Py_DECREF(temp); - z = NULL; - goto error; - } - Py_XDECREF(div); - Py_DECREF(temp); - temp = mod; - } - a = temp; - if (a == NULL) { - Py_DECREF(z); - z = NULL; - break; - } - } - if (a == NULL || z == NULL) - break; - } - if (c!=Py_None && z!=NULL) { - if (l_divmod(z, (PyLongObject *)c, &div, &mod) < 0) { - Py_DECREF(z); - z = NULL; - } else { - Py_XDECREF(div); - Py_DECREF(z); - z = mod; + /* else return a float. This works because we know + that this calls float_pow() which converts its + arguments to double. */ + Py_DECREF(a); + Py_DECREF(b); + return PyFloat_Type.tp_as_number->nb_power(v, w, x); } } - error: + + if (c) { + /* if modulus == 0: + raise ValueError() */ + if (c->ob_size == 0) { + PyErr_SetString(PyExc_ValueError, + "pow() 3rd argument cannot be 0"); + goto Done; + } + + /* if modulus < 0: + negativeOutput = True + modulus = -modulus */ + if (c->ob_size < 0) { + negativeOutput = 1; + temp = (PyLongObject *)_PyLong_Copy(c); + if (temp == NULL) + goto Error; + Py_DECREF(c); + c = temp; + temp = NULL; + c->ob_size = - c->ob_size; + } + + /* if modulus == 1: + return 0 */ + if ((c->ob_size == 1) && (c->ob_digit[0] == 1)) { + z = (PyLongObject *)PyLong_FromLong(0L); + goto Done; + } + + /* if base < 0: + base = base % modulus + Having the base positive just makes things easier. */ + if (a->ob_size < 0) { + if (l_divmod(a, c, NULL, &temp) < 0) + goto Done; + Py_DECREF(a); + a = temp; + temp = NULL; + } + } + + /* At this point a, b, and c are guaranteed non-negative UNLESS + c is NULL, in which case a may be negative. */ + + z = (PyLongObject *)PyLong_FromLong(1L); + if (z == NULL) + goto Error; + + /* Perform a modular reduction, X = X % c, but leave X alone if c + * is NULL. + */ +#define REDUCE(X) \ + if (c != NULL) { \ + if (l_divmod(X, c, NULL, &temp) < 0) \ + goto Error; \ + Py_XDECREF(X); \ + X = temp; \ + temp = NULL; \ + } + + /* Multiply two values, then reduce the result: + result = X*Y % c. If c is NULL, skip the mod. */ +#define MULT(X, Y, result) \ +{ \ + temp = (PyLongObject *)long_mul(X, Y); \ + if (temp == NULL) \ + goto Error; \ + Py_XDECREF(result); \ + result = temp; \ + temp = NULL; \ + REDUCE(result) \ +} + + if (b->ob_size <= FIVEARY_CUTOFF) { + /* Left-to-right binary exponentiation (HAC Algorithm 14.79) */ + /* http://www.cacr.math.uwaterloo.ca/hac/about/chap14.pdf */ + for (i = b->ob_size - 1; i >= 0; --i) { + digit bi = b->ob_digit[i]; + + for (j = 1 << (SHIFT-1); j != 0; j >>= 1) { + MULT(z, z, z) + if (bi & j) + MULT(z, a, z) + } + } + } + else { + /* Left-to-right 5-ary exponentiation (HAC Algorithm 14.82) */ + Py_INCREF(z); /* still holds 1L */ + table[0] = z; + for (i = 1; i < 32; ++i) + MULT(table[i-1], a, table[i]) + + for (i = b->ob_size - 1; i >= 0; --i) { + const digit bi = b->ob_digit[i]; + + for (j = SHIFT - 5; j >= 0; j -= 5) { + const int index = (bi >> j) & 0x1f; + for (k = 0; k < 5; ++k) + MULT(z, z, z) + if (index) + MULT(z, table[index], z) + } + } + } + + if (negativeOutput && (z->ob_size != 0)) { + temp = (PyLongObject *)long_sub(z, c); + if (temp == NULL) + goto Error; + Py_DECREF(z); + z = temp; + temp = NULL; + } + goto Done; + + Error: + if (z != NULL) { + Py_DECREF(z); + z = NULL; + } + /* fall through */ + Done: Py_XDECREF(a); - Py_DECREF(b); - Py_DECREF(c); + Py_XDECREF(b); + Py_XDECREF(c); + Py_XDECREF(temp); + if (b->ob_size > FIVEARY_CUTOFF) { + for (i = 0; i < 32; ++i) + Py_XDECREF(table[i]); + } return (PyObject *)z; }