From 6000464d08b94114baeef0a464896654fb0faa62 Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Mon, 12 Aug 2002 22:01:34 +0000 Subject: [PATCH] Added new function k_lopsided_mul(), which is much more efficient than k_mul() when inputs have vastly different sizes, and a little more efficient when they're close to a factor of 2 out of whack. I consider this done now, although I'll set up some more correctness tests to run overnight. --- Misc/NEWS | 6 ++-- Objects/longobject.c | 82 ++++++++++++++++++++++++++++++++++++++------ 2 files changed, 75 insertions(+), 13 deletions(-) diff --git a/Misc/NEWS b/Misc/NEWS index efeb3acc66d..ba9bf3c89b3 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -64,9 +64,9 @@ Core and builtins log_base_2(3)) instead of the previous O(N**2). Measured results may be better or worse than that, depending on platform quirks. Note that this is a simple implementation, and there's no intent here to compete - with, e.g., gmp. It simply gives a very nice speedup when it applies. - XXX Karatsuba multiplication can be slower when the inputs have very - XXX different sizes. + with, e.g., GMP. It gives a very nice speedup when it applies, but + a package devoted to fast large-integer arithmetic should run circles + around it. - u'%c' will now raise a ValueError in case the argument is an integer outside the valid range of Unicode code point ordinals. diff --git a/Objects/longobject.c b/Objects/longobject.c index 8c9f69a5671..3cc6f138e18 100644 --- a/Objects/longobject.c +++ b/Objects/longobject.c @@ -1592,6 +1592,8 @@ kmul_split(PyLongObject *n, int size, PyLongObject **high, PyLongObject **low) return 0; } +static PyLongObject *k_lopsided_mul(PyLongObject *a, PyLongObject *b); + /* Karatsuba multiplication. Ignores the input signs, and returns the * absolute value of the product (or NULL if error). * See Knuth Vol. 2 Chapter 4.3.3 (Pp. 294-295). 
@@ -1633,15 +1635,21 @@ k_mul(PyLongObject *a, PyLongObject *b) /* Use gradeschool math when either number is too small. */ if (asize <= KARATSUBA_CUTOFF) { - /* 0 is inevitable if one kmul arg has more than twice - * the digits of another, so it's worth special-casing. - */ if (asize == 0) return _PyLong_New(0); else return x_mul(a, b); } + /* If a is small compared to b, splitting on b gives a degenerate + * case with ah==0, and Karatsuba may be (even much) less efficient + * than "grade school" then. However, we can still win, by viewing + * b as a string of "big digits", each of width a->ob_size. That + * leads to a sequence of balanced calls to k_mul. + */ + if (2 * asize <= bsize) + return k_lopsided_mul(a, b); + shift = bsize >> 1; if (kmul_split(a, shift, &ah, &al) < 0) goto fail; if (kmul_split(b, shift, &bh, &bl) < 0) goto fail; @@ -1750,6 +1758,67 @@ k_mul(PyLongObject *a, PyLongObject *b) return NULL; } +/* b has at least twice the digits of a, and a is big enough that Karatsuba + * would pay off *if* the inputs had balanced sizes. View b as a sequence + * of slices, each with a->ob_size digits, and multiply the slices by a, + * one at a time. This gives k_mul balanced inputs to work with, and is + * also cache-friendly (we compute one double-width slice of the result + * at a time, then move on, never backtracking except for the helpful + * single-width slice overlap between successive partial sums). + */ +static PyLongObject * +k_lopsided_mul(PyLongObject *a, PyLongObject *b) +{ + const int asize = ABS(a->ob_size); + int bsize = ABS(b->ob_size); + int nbdone; /* # of b digits already multiplied */ + PyLongObject *ret; + PyLongObject *bslice = NULL; + + assert(asize > KARATSUBA_CUTOFF); + assert(2 * asize <= bsize); + + /* Allocate result space, and zero it out. */ + ret = _PyLong_New(asize + bsize); + if (ret == NULL) + return NULL; + memset(ret->ob_digit, 0, ret->ob_size * sizeof(digit)); + + /* Successive slices of b (each at most asize digits) are copied into bslice. 
*/ + bslice = _PyLong_New(asize); + if (bslice == NULL) + goto fail; + + nbdone = 0; + while (bsize > 0) { + PyLongObject *product; + const int nbtouse = MIN(bsize, asize); + + /* Multiply the next slice of b by a. */ + memcpy(bslice->ob_digit, b->ob_digit + nbdone, + nbtouse * sizeof(digit)); + bslice->ob_size = nbtouse; + product = k_mul(a, bslice); + if (product == NULL) + goto fail; + + /* Add into result. */ + (void)v_iadd(ret->ob_digit + nbdone, ret->ob_size - nbdone, + product->ob_digit, product->ob_size); + Py_DECREF(product); + + bsize -= nbtouse; + nbdone += nbtouse; + } + + Py_DECREF(bslice); + return long_normalize(ret); + + fail: + Py_DECREF(ret); + Py_XDECREF(bslice); + return NULL; +} static PyObject * long_mul(PyLongObject *v, PyLongObject *w) @@ -1769,14 +1838,7 @@ long_mul(PyLongObject *v, PyLongObject *w) return Py_NotImplemented; } -#if 0 - if (Py_GETENV("KARAT") != NULL) - z = k_mul(a, b); - else - z = x_mul(a, b); -#else z = k_mul(a, b); -#endif if(z == NULL) { Py_DECREF(a); Py_DECREF(b);