Cautious introduction of a patch that started from

SF 560379: Karatsuba multiplication. Lots of things were changed from that. This needs a lot more testing, for correctness and speed, the latter especially when bit lengths are unbalanced. For now, the Karatsuba code gets invoked if and only if envar KARAT exists.
2002-08-12 02:31:19 +00:00 · 2002-08-12 02:31:19 +00:00 · 5af4e6c739
parent 5f7617b5f6
commit 5af4e6c739
3 changed files with 272 additions and 88 deletions
--- a/Misc/ACKS
+++ b/Misc/ACKS
@ -106,6 +106,7 @@ Greg Couch
 Steve Cousins
 Alex Coventry
 Matthew Dixon Cowles
+Christopher A. Craig
 Drew Csillag
 Tom Culliton
 John Cugini
--- a/Misc/NEWS
+++ b/Misc/NEWS
@ -6,6 +6,10 @@ Type/class unification and new-style classes

 Core and builtins

+- XXX Karatsuba multiplication.  This is currently used if and only
+  if envar KARAT exists.  It needs more correctness and speed testing,
+  the latter especially with unbalanced bit lengths.
+
 - u'%c' will now raise a ValueError in case the argument is an
  integer outside the valid range of Unicode code point ordinals.

@ -83,7 +87,7 @@ Core and builtins
  as directory names.

 - The built-ins slice() and buffer() are now callable types.  The
-  types classobj (formerly class), code, function, instance, and
+0  types classobj (formerly class), code, function, instance, and
  instancemethod (formerly instance-method), which have no built-in
  names but are accessible through the types module, are now also
  callable.  The type dict-proxy is renamed to dictproxy.
--- a/Objects/longobject.c
+++ b/Objects/longobject.c
@ -8,8 +8,19 @@

 #include <ctype.h>

+/* For long multiplication, use the O(N**2) school algorithm unless
+ * both operands contain more than KARATSUBA_CUTOFF digits (this
+ * being an internal Python long digit, in base BASE).
+ */
+#define KARATSUBA_CUTOFF 35
+
 #define ABS(x) ((x) < 0 ? -(x) : (x))

+#undef MIN
+#undef MAX
+#define MAX(x, y) ((x) < (y) ? (y) : (x))
+#define MIN(x, y) ((x) > (y) ? (y) : (x))
+
 /* Forward */
 static PyLongObject *long_normalize(PyLongObject *);
 static PyLongObject *mul1(PyLongObject *, wdigit);
@ -1457,54 +1468,28 @@ long_repeat(PyObject *v, PyLongObject *w)
 		return (*v->ob_type->tp_as_sequence->sq_repeat)(v, n);
 }

-static PyObject *
-long_mul(PyLongObject *v, PyLongObject *w)
+/* Grade school multiplication, ignoring the signs.
+ * Returns the absolute value of the product, or NULL if error.
+ */
+static PyLongObject *
+x_mul(PyLongObject *a, PyLongObject *b)
 {
-	PyLongObject *a, *b, *z;
-	int size_a;
-	int size_b;
+	PyLongObject *z;
+	int size_a = ABS(a->ob_size);
+	int size_b = ABS(b->ob_size);
 	int i;

-	if (!convert_binop((PyObject *)v, (PyObject *)w, &a, &b)) {
-		if (!PyLong_Check(v) &&
-		    v->ob_type->tp_as_sequence &&
-		    v->ob_type->tp_as_sequence->sq_repeat)
-			return long_repeat((PyObject *)v, w);
-		if (!PyLong_Check(w) &&
-			 w->ob_type->tp_as_sequence &&
-			 w->ob_type->tp_as_sequence->sq_repeat)
-			return long_repeat((PyObject *)w, v);
-		Py_INCREF(Py_NotImplemented);
-		return Py_NotImplemented;
-	}
-
-	size_a = ABS(a->ob_size);
-	size_b = ABS(b->ob_size);
-	if (size_a > size_b) {
-		/* we are faster with the small object on the left */
-		int hold_sa = size_a;
-		PyLongObject *hold_a = a;
-		size_a = size_b;
-		size_b = hold_sa;
-		a = b;
-		b = hold_a;
-	}
-	z = _PyLong_New(size_a + size_b);
-	if (z == NULL) {
-		Py_DECREF(a);
-		Py_DECREF(b);
+     	z = _PyLong_New(size_a + size_b);
+	if (z == NULL)
 		return NULL;
-	}
-	for (i = 0; i < z->ob_size; ++i)
-		z->ob_digit[i] = 0;
+
+	memset(z->ob_digit, 0, z->ob_size * sizeof(digit));
 	for (i = 0; i < size_a; ++i) {
 		twodigits carry = 0;
 		twodigits f = a->ob_digit[i];
 		int j;

 		SIGCHECK({
-			Py_DECREF(a);
-			Py_DECREF(b);
 			Py_DECREF(z);
 			return NULL;
 		})
@ -1520,6 +1505,205 @@ long_mul(PyLongObject *v, PyLongObject *w)
 			carry >>= SHIFT;
 		}
 	}
+	return z;
+}
+
+/* A helper for Karatsuba multiplication (k_mul).
+   Takes a long "n" and an integer "size" representing the place to
+   split, and sets low and high such that abs(n) == (high << size) + low,
+   viewing the shift as being by digits.  The sign bit is ignored, and
+   the return values are >= 0.
+   Returns 0 on success, -1 on failure.
+*/
+static int
+kmul_split(PyLongObject *n, int size, PyLongObject **high, PyLongObject **low)
+{
+	PyLongObject *hi, *lo;
+	int size_lo, size_hi;
+	const int size_n = ABS(n->ob_size);
+
+	size_lo = MIN(size_n, size);
+	size_hi = size_n - size_lo;
+
+	if ((hi = _PyLong_New(size_hi)) == NULL)
+		return -1;
+	if ((lo = _PyLong_New(size_lo)) == NULL) {
+		Py_DECREF(hi);
+		return -1;
+	}
+
+	memcpy(lo->ob_digit, n->ob_digit, size_lo * sizeof(digit));
+	memcpy(hi->ob_digit, n->ob_digit + size_lo, size_hi * sizeof(digit));
+
+	*high = long_normalize(hi);
+	*low = long_normalize(lo);
+	return 0;
+}
+
+/* Karatsuba multiplication.  Ignores the input signs, and returns the
+ * absolute value of the product (or NULL if error).
+ * See Knuth Vol. 2 Chapter 4.3.3 (Pp. 294-295).
+ */
+static PyLongObject *
+k_mul(PyLongObject *a, PyLongObject *b)
+{
+	PyLongObject *ah = NULL;
+	PyLongObject *al = NULL;
+	PyLongObject *bh = NULL;
+	PyLongObject *bl = NULL;
+	PyLongObject *albl = NULL;
+	PyLongObject *ahbh = NULL;
+	PyLongObject *k = NULL;
+	PyLongObject *ret = NULL;
+	PyLongObject *t1, *t2;
+	int shift;	/* the number of digits we split off */
+	int i;
+
+	/* (ah*X+al)(bh*X+bl) = ah*bh*X*X + (ah*bl + al*bh)*X + al*bl
+	 * Let k = (ah+al)*(bh+bl) = ah*bl + al*bh  + ah*bh + al*bl
+	 * Then the original product is
+	 *     ah*bh*X*X + (k - ah*bh - ah*bl)*X + al*bl
+	 * By picking X to be a power of 2, "*X" is just shifting, and it's
+	 * been reduced to 3 multiplies on numbers half the size.
+	 */
+
+	/* We want to split based on the larger number; fiddle so that a
+	 * is largest.
+	 */
+	if (ABS(a->ob_size) > ABS(b->ob_size)) {
+		t1 = a;
+		a = b;
+		b = t1;
+	}
+
+	/* Use gradeschool math when either number is too small. */
+	if (ABS(a->ob_size) <= KARATSUBA_CUTOFF)
+		return x_mul(a, b);
+
+	shift = ABS(b->ob_size) >> 1;
+	if (kmul_split(a, shift, &ah, &al) < 0) goto fail;
+	if (kmul_split(b, shift, &bh, &bl) < 0) goto fail;
+
+	if ((ahbh = k_mul(ah, bh)) == NULL) goto fail;
+	assert(ahbh->ob_size >= 0);
+
+	/* Allocate result space, and copy ahbh into the high digits. */
+	ret = _PyLong_New(ahbh->ob_size + 2*shift + 1);
+	if (ret == NULL) goto fail;
+#ifdef Py_DEBUG
+	/* Fill with trash, to catch reference to uninitialized digits. */
+	memset(ret->ob_digit, 0xDF, ret->ob_size * sizeof(digit));
+#endif
+	memcpy(ret->ob_digit + 2*shift, ahbh->ob_digit,
+	       ahbh->ob_size * sizeof(digit));
+	/* That didn't copy into the most-significant (overflow) digit. */
+	ret->ob_digit[ret->ob_size - 1] = 0;
+
+	/* Compute al*bl, and copy into the low digits. */
+	if ((albl = k_mul(al, bl)) == NULL) goto fail;
+	assert(albl->ob_size >= 0);
+	assert(albl->ob_size <= 2*shift); /* no overlap with high digits */
+	memcpy(ret->ob_digit, albl->ob_digit, albl->ob_size * sizeof(digit));
+
+	/* Zero out remaining digits. */
+	i = 2*shift - albl->ob_size;	/* number of uninitialized digits */
+	if (i)
+		memset(ret->ob_digit + albl->ob_size, 0, i * sizeof(digit));
+
+	/* k = (ah+al)(bh+bl) */
+	if ((t1 = x_add(ah, al)) == NULL) goto fail;
+	Py_DECREF(ah);
+	Py_DECREF(al);
+	ah = al = NULL;
+
+	if ((t2 = x_add(bh, bl)) == NULL) {
+		Py_DECREF(t1);
+		goto fail;
+	}
+	Py_DECREF(bh);
+	Py_DECREF(bl);
+	bh = bl = NULL;
+
+	k = k_mul(t1, t2);
+	Py_DECREF(t1);
+	Py_DECREF(t2);
+	if (k == NULL) goto fail;
+
+	/* Subtract ahbh and albl from k.  Note that this can't become
+	 * negative, since k = ahbh + albl + other stuff.
+	 */
+	if ((t1 = x_sub(k, ahbh)) == NULL) goto fail;
+	Py_DECREF(k);
+	k = t1;
+	Py_DECREF(ahbh);
+	ahbh = NULL;
+
+	if ((t1 = x_sub(k, albl)) == NULL) goto fail;
+	Py_DECREF(k);
+	k = t1;
+	Py_DECREF(albl);
+	albl = NULL;
+
+	/* Add k into the result, at the shift-th least-significant digit. */
+	{
+		int j;		/* index into k */
+		digit carry = 0;
+
+		for (i = shift, j = 0; j < k->ob_size; ++i, ++j) {
+			carry += ret->ob_digit[i] + k->ob_digit[j];
+			ret->ob_digit[i] = carry & MASK;
+			carry >>= SHIFT;
+		}
+		for (; carry && i < ret->ob_size; ++i) {
+			carry += ret->ob_digit[i];
+			ret->ob_digit[i] = carry & MASK;
+			carry >>= SHIFT;
+		}
+		assert(carry == 0);
+	}
+	Py_DECREF(k);
+	return long_normalize(ret);
+
+ fail:
+ 	Py_XDECREF(ret);
+	Py_XDECREF(ah);
+	Py_XDECREF(al);
+	Py_XDECREF(bh);
+	Py_XDECREF(bl);
+	Py_XDECREF(ahbh);
+	Py_XDECREF(albl);
+	Py_XDECREF(k);
+	return NULL;
+}
+
+
+static PyObject *
+long_mul(PyLongObject *v, PyLongObject *w)
+{
+	PyLongObject *a, *b, *z;
+
+	if (!convert_binop((PyObject *)v, (PyObject *)w, &a, &b)) {
+		if (!PyLong_Check(v) &&
+		    v->ob_type->tp_as_sequence &&
+		    v->ob_type->tp_as_sequence->sq_repeat)
+			return long_repeat((PyObject *)v, w);
+		if (!PyLong_Check(w) &&
+		    w->ob_type->tp_as_sequence &&
+		    w->ob_type->tp_as_sequence->sq_repeat)
+			return long_repeat((PyObject *)w, v);
+		Py_INCREF(Py_NotImplemented);
+		return Py_NotImplemented;
+	}
+
+	if (Py_GETENV("KARAT") != NULL)
+		z = k_mul(a, b);
+	else
+		z = x_mul(a, b);
+	if(z == NULL) {
+		Py_DECREF(a);
+		Py_DECREF(b);
+		return NULL;
+	}
 	if (a->ob_size < 0)
 		z->ob_size = -(z->ob_size);
 	if (b->ob_size < 0)
@ -2003,11 +2187,6 @@ lshift_error:

 /* Bitwise and/xor/or operations */

-#undef MIN
-#undef MAX
-#define MAX(x, y) ((x) < (y) ? (y) : (x))
-#define MIN(x, y) ((x) > (y) ? (y) : (x))
-
 static PyObject *
 long_bitwise(PyLongObject *a,
 	     int op,  /* '&', '|', '^' */