k_mul() and long_mul(): I'm confident that the Karatsuba algorithm is
correct now, so added some final comments, did some cleanup, and enabled
it for all long-int multiplies.  The KARAT envar no longer matters,
although I left some #if 0'ed code in there for my own use (temporary).
k_mul() is still much slower than x_mul() if the inputs have very
different sizes, and that still needs to be addressed.
Tim Peters 2002-08-12 17:36:03 +00:00
parent a6fa0e6f2e
commit d64c1def7c
2 changed files with 40 additions and 12 deletions


@@ -57,9 +57,16 @@ Type/class unification and new-style classes

 Core and builtins

-- XXX Karatsuba multiplication.  This is currently used if and only
-  if envar KARAT exists.  It needs more correctness and speed testing,
-  the latter especially with unbalanced bit lengths.
+- When multiplying very large integers, a version of the so-called
+  Karatsuba algorithm is now used.  This is most effective if the
+  inputs have roughly the same size.  If they both have about N digits,
+  Karatsuba multiplication has O(N**1.58) runtime (the exponent is
+  log_base_2(3)) instead of the previous O(N**2).  Measured results may
+  be better or worse than that, depending on platform quirks.  Note that
+  this is a simple implementation, and there's no intent here to compete
+  with, e.g., gmp.  It simply gives a very nice speedup when it applies.
+  XXX Karatsuba multiplication can be slower when the inputs have very
+  XXX different sizes.

 - u'%c' will now raise a ValueError in case the argument is an
   integer outside the valid range of Unicode code point ordinals.
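
To make the NEWS entry concrete, here is a minimal standalone sketch of the
idea (illustration only, not CPython code; the function name karatsuba_once
is invented for this example).  Splitting each operand in half turns one
full-width multiply into three half-width multiplies plus some additions and
shifts; applied recursively that gives the recurrence T(N) = 3*T(N/2) + O(N),
whose solution is the O(N**log_base_2(3)) ~ O(N**1.58) bound quoted above.

/* Illustration only: the identity behind Karatsuba, shown once (not
 * recursively) on a 32-bit operand split into 16-bit halves. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t karatsuba_once(uint32_t a, uint32_t b)
{
    uint32_t ah = a >> 16, al = a & 0xFFFF;     /* split a into halves */
    uint32_t bh = b >> 16, bl = b & 0xFFFF;     /* split b into halves */

    uint64_t t1 = (uint64_t)ah * bh;                  /* high product  */
    uint64_t t2 = (uint64_t)al * bl;                  /* low product   */
    uint64_t t3 = (uint64_t)(ah + al) * (bh + bl);    /* combined term */

    /* a*b == t1*2**32 + (t3 - t1 - t2)*2**16 + t2: three multiplies
     * instead of the four a naive split-in-half scheme would need. */
    return (t1 << 32) + ((t3 - t1 - t2) << 16) + t2;
}

int main(void)
{
    uint32_t a = 0xDEADBEEFu, b = 0x12345678u;
    assert(karatsuba_once(a, b) == (uint64_t)a * b);
    printf("%llu\n", (unsigned long long)karatsuba_once(a, b));
    return 0;
}

CPython's k_mul() does the same thing, but recursively and on arrays of
15-bit digits rather than on machine words.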


@@ -1645,7 +1645,23 @@ k_mul(PyLongObject *a, PyLongObject *b)
         if (kmul_split(a, shift, &ah, &al) < 0) goto fail;
         if (kmul_split(b, shift, &bh, &bl) < 0) goto fail;

-        /* Allocate result space. */
+        /* The plan:
+         * 1. Allocate result space (asize + bsize digits: that's always
+         *    enough).
+         * 2. Compute ah*bh, and copy into result at 2*shift.
+         * 3. Compute al*bl, and copy into result at 0.  Note that this
+         *    can't overlap with #2.
+         * 4. Subtract al*bl from the result, starting at shift.  This may
+         *    underflow (borrow out of the high digit), but we don't care:
+         *    we're effectively doing unsigned arithmetic mod
+         *    BASE**(sizea + sizeb), and so long as the *final* result fits,
+         *    borrows and carries out of the high digit can be ignored.
+         * 5. Subtract ah*bh from the result, starting at shift.
+         * 6. Compute (ah+al)*(bh+bl), and add it into the result starting
+         *    at shift.
+         */
+
+        /* 1. Allocate result space. */
         ret = _PyLong_New(asize + bsize);
         if (ret == NULL) goto fail;
 #ifdef Py_DEBUG
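
A worked example of the plan, in base 10 so the buffer offsets are easy to
see (illustration only, not from the patch): take a = 1234, b = 5678,
shift = 2, so ah = 12, al = 34, bh = 56, bl = 78.

  step 2:    ah*bh = 672    -> placed at 2*shift, contributes 672 * 10**4 = 6720000
  step 3:    al*bl = 2652   -> placed at 0,       contributes               2652
  steps 4-6: (ah+al)*(bh+bl) - ah*bh - al*bl = 46*134 - 672 - 2652 = 2840
                            -> added at shift,    contributes 2840 * 10**2 = 284000

  total:     6720000 + 284000 + 2652 = 7006652 = 1234 * 5678
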
@@ -1653,7 +1669,7 @@ k_mul(PyLongObject *a, PyLongObject *b)
         memset(ret->ob_digit, 0xDF, ret->ob_size * sizeof(digit));
 #endif

-        /* t1 <- ah*bh, and copy into high digits of result. */
+        /* 2. t1 <- ah*bh, and copy into high digits of result. */
         if ((t1 = k_mul(ah, bh)) == NULL) goto fail;
         assert(t1->ob_size >= 0);
         assert(2*shift + t1->ob_size <= ret->ob_size);
@@ -1666,7 +1682,7 @@ k_mul(PyLongObject *a, PyLongObject *b)
         memset(ret->ob_digit + 2*shift + t1->ob_size, 0,
                i * sizeof(digit));

-        /* t2 <- al*bl, and copy into the low digits. */
+        /* 3. t2 <- al*bl, and copy into the low digits. */
         if ((t2 = k_mul(al, bl)) == NULL) {
                 Py_DECREF(t1);
                 goto fail;
@@ -1680,15 +1696,17 @@ k_mul(PyLongObject *a, PyLongObject *b)
         if (i)
                 memset(ret->ob_digit + t2->ob_size, 0, i * sizeof(digit));

-        /* Subtract ah*bh (t1) and al*bl (t2) from "the middle" digits. */
+        /* 4 & 5. Subtract ah*bh (t1) and al*bl (t2).  We do al*bl first
+         * because it's fresher in cache.
+         */
         i = ret->ob_size - shift;  /* # digits after shift */
-        v_isub(ret->ob_digit + shift, i, t2->ob_digit, t2->ob_size);
+        (void)v_isub(ret->ob_digit + shift, i, t2->ob_digit, t2->ob_size);
         Py_DECREF(t2);

-        v_isub(ret->ob_digit + shift, i, t1->ob_digit, t1->ob_size);
+        (void)v_isub(ret->ob_digit + shift, i, t1->ob_digit, t1->ob_size);
         Py_DECREF(t1);

-        /* t3 <- (ah+al)(bh+bl) */
+        /* 6. t3 <- (ah+al)(bh+bl), and add into result. */
         if ((t1 = x_add(ah, al)) == NULL) goto fail;
         Py_DECREF(ah);
         Py_DECREF(al);
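
A tiny standalone demonstration (not from the patch) of the claim in steps
4 and 5 that borrows out of the high digit are harmless: the result buffer
is effectively an unsigned number mod BASE**(sizea + sizeb), so an
intermediate underflow just wraps, and the later addition of
(ah+al)*(bh+bl) wraps it back to the exact in-range value.  The same effect,
scaled down to unsigned arithmetic mod 2**8:

/* Illustration only: interleaved subtract-then-add can ignore wraparound. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint8_t acc = 17;     /* some partial result that fits in range        */
    acc -= 200;           /* "borrows": wraps around to 17 - 200 + 256     */
    acc += 240;           /* the later addition brings it back in range    */
    assert(acc == 17 - 200 + 240);   /* == 57, the mathematically exact value */
    printf("%u\n", acc);
    return 0;
}

This is also why the patch can ignore (cast to void) the borrow/carry that
v_isub() and v_iadd() report.
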
@@ -1709,8 +1727,7 @@ k_mul(PyLongObject *a, PyLongObject *b)
         if (t3 == NULL) goto fail;

         /* Add t3. */
-        v_iadd(ret->ob_digit + shift, ret->ob_size - shift,
-               t3->ob_digit, t3->ob_size);
+        (void)v_iadd(ret->ob_digit + shift, i, t3->ob_digit, t3->ob_size);
         Py_DECREF(t3);

         return long_normalize(ret);
@@ -1743,10 +1760,14 @@ long_mul(PyLongObject *v, PyLongObject *w)
                 return Py_NotImplemented;
         }

+#if 0
         if (Py_GETENV("KARAT") != NULL)
                 z = k_mul(a, b);
         else
                 z = x_mul(a, b);
+#else
+        z = k_mul(a, b);
+#endif
         if(z == NULL) {
                 Py_DECREF(a);
                 Py_DECREF(b);