Index: Objects/longobject.c
===================================================================
--- Objects/longobject.c	(revision 70547)
+++ Objects/longobject.c	(working copy)
@@ -1495,6 +1495,27 @@
 	return (digit)rem;
 }
 
+/* Multiply by a single digit, ignoring the sign. */
+
+static PyLongObject *
+mul1(PyLongObject *a, digit n)
+{
+	Py_ssize_t size_a = ABS(Py_SIZE(a));
+	PyLongObject *z = _PyLong_New(size_a+1);
+	twodigits carry = 0;
+	Py_ssize_t i;
+
+	if (z == NULL)
+		return NULL;
+	for (i = 0; i < size_a; ++i) {
+		carry += (twodigits)a->ob_digit[i] * n;
+		z->ob_digit[i] = (digit) (carry & PyLong_MASK);
+		carry >>= PyLong_SHIFT;
+	}
+	z->ob_digit[i] = (digit) carry;
+	return long_normalize(z);
+}
+
 /* Divide a long integer by a digit, returning both the quotient
    (as function result) and the remainder (through *prem).
    The sign of a is ignored; n should not be zero. */
@@ -2523,6 +2544,122 @@
 	return (PyObject *)z;
 }
 
+/* Here's a simple optimization for basecase multiplication that can achieve
+   speedups of up to 400% on some 64-bit platforms.  It uses the fact that
+   twodigits can represent values up to 16*PyLong_BASE*PyLong_BASE (assuming
+   that PyLong_SHIFT is 30), so up to 16 partial products can be accumulated
+   at once.  The innermost loop then generally contains fewer instructions
+   than in the usual long multiplication algorithm. */
+
+#if PyLong_SHIFT == 15
+#define BLOCK_MUL_SIZE 4
+#elif PyLong_SHIFT == 30
+#define BLOCK_MUL_SIZE 16
+#else
+#error "expected PyLong_SHIFT to be 15 or 30"
+#endif
+
+/* res[0:a_size+b_size] := a*b, assuming that b_size <= BLOCK_MUL_SIZE,
+   b_size <= a_size. */
+
+static void
+digits_multiply_init(digit *res, const digit *a, Py_ssize_t a_size,
+		     const digit *b, Py_ssize_t b_size)
+{
+	twodigits acc = 0;
+	Py_ssize_t j, k;
+	assert(b_size <= BLOCK_MUL_SIZE && b_size <= a_size);
+	for (k=0; k<b_size; k++) {
+		for (j = 0; j <= k; j++)
+			acc += (twodigits)b[j] * a[k-j];
+		res[k] = (digit)(acc & PyLong_MASK);
+		acc >>= PyLong_SHIFT;
+	}
+	for (; k<a_size; k++) {
+		for (j = 0; j < b_size; j++)
+			acc += (twodigits)b[j] * a[k-j];
+		res[k] = (digit)(acc & PyLong_MASK);
+		acc >>= PyLong_SHIFT;
+	}
+	for (; k<a_size+b_size; k++) {
+		for (j = k-a_size+1; j < b_size; j++)
+			acc += (twodigits)b[j] * a[k-j];
+		res[k] = (digit)(acc & PyLong_MASK);
+		acc >>= PyLong_SHIFT;
+	}
+	assert(acc == 0);
+}
+
+/* variant of the above: res[0:a_size+BLOCK_MUL_SIZE] :=
+   a[0:a_size]*b[0:BLOCK_MUL_SIZE] + res[0:a_size], assuming that
+   BLOCK_MUL_SIZE <= a_size */
+
+static void
+digits_multiply_add(digit *res, const digit *a, Py_ssize_t a_size,
+		    const digit *b)
+{
+	twodigits acc = 0;
+	Py_ssize_t j, k;
+	assert(BLOCK_MUL_SIZE <= a_size);
+	for (k=0; k<BLOCK_MUL_SIZE; k++) {
+		acc += res[k];
+		for (j = 0; j <= k; j++)
+			acc += (twodigits)b[j] * a[k-j];
+		res[k] = (digit)(acc & PyLong_MASK);
+		acc >>= PyLong_SHIFT;
+	}
+	for (k=BLOCK_MUL_SIZE; k<a_size; k++) {
+		acc += res[k];
+		for (j = 0; j < BLOCK_MUL_SIZE; j++)
+			acc += (twodigits)b[j] * a[k-j];
+		res[k] = (digit)(acc & PyLong_MASK);
+		acc >>= PyLong_SHIFT;
+	}
+	for (k=0; k<BLOCK_MUL_SIZE; k++) {
+		for (j = k+1; j < BLOCK_MUL_SIZE; j++)
+			acc += (twodigits)b[j] * a[k+a_size-j];
+		res[k+a_size] = (digit)(acc & PyLong_MASK);
+		acc >>= PyLong_SHIFT;
+	}
+	assert(acc == 0);
+}
+
+/* res[0:a_size+b_size] := a * b */
+
+static void
+digits_multiply(digit *res, const digit *a, Py_ssize_t a_size,
+		const digit *b, Py_ssize_t b_size)
+{
+	Py_ssize_t init_size;
+
+	/* force b_size <= a_size */
+	if (a_size < b_size) {
+		const digit *temp;
+		Py_ssize_t temp_size;
+		temp = a; a = b; b = temp;
+		temp_size = a_size; a_size = b_size; b_size = temp_size;
+	}
+
+	assert(b_size > 0);
+
+	/* split b into pieces, the first of size at most BLOCK_MUL_SIZE and
+	   all others of size exactly BLOCK_MUL_SIZE.  digits_multiply_init
+	   and digits_multiply_add do the actual multiplication. */
+	init_size = (b_size - 1) % BLOCK_MUL_SIZE + 1;
+	digits_multiply_init(res, a, a_size, b, init_size);
+	b += init_size;
+	res += init_size;
+	b_size -= init_size;
+
+	while (b_size >= BLOCK_MUL_SIZE) {
+		digits_multiply_add(res, a, a_size, b);
+		b_size -= BLOCK_MUL_SIZE;
+		b += BLOCK_MUL_SIZE;
+		res += BLOCK_MUL_SIZE;
+	}
+	assert(b_size == 0);
+}
+
 /* Grade school multiplication, ignoring the signs.
  * Returns the absolute value of the product, or NULL if error.
  */
@@ -2538,8 +2675,8 @@
 	if (z == NULL)
 		return NULL;
 
-	memset(z->ob_digit, 0, Py_SIZE(z) * sizeof(digit));
 	if (a == b) {
+		memset(z->ob_digit, 0, Py_SIZE(z) * sizeof(digit));
 		/* Efficient squaring per HAC, Algorithm 14.16:
 		 * http://www.cacr.math.uwaterloo.ca/hac/about/chap14.pdf
 		 * Gives slightly less than a 2x speedup when a == b,
@@ -2584,28 +2721,8 @@
 		}
 	}
 	else {	/* a is not the same as b -- gradeschool long mult */
-		for (i = 0; i < size_a; ++i) {
-			twodigits carry = 0;
-			twodigits f = a->ob_digit[i];
-			digit *pz = z->ob_digit + i;
-			digit *pb = b->ob_digit;
-			digit *pbend = b->ob_digit + size_b;
-
-			SIGCHECK({
-				Py_DECREF(z);
-				return NULL;
-			})
-
-			while (pb < pbend) {
-				carry += *pz + *pb++ * f;
-				*pz++ = (digit)(carry & PyLong_MASK);
-				carry >>= PyLong_SHIFT;
-				assert(carry <= PyLong_MASK);
-			}
-			if (carry)
-				*pz += (digit)(carry & PyLong_MASK);
-			assert((carry >> PyLong_SHIFT) == 0);
-		}
+		digits_multiply(z->ob_digit, a->ob_digit, size_a,
+				b->ob_digit, size_b);
 	}
 	return long_normalize(z);
 }
@@ -2927,30 +3044,82 @@
 long_mul(PyLongObject *a, PyLongObject *b)
 {
 	PyLongObject *z;
+	Py_ssize_t a_size, b_size;
 
 	CHECK_BINOP(a, b);
 
 	/* fast path for single-digit multiplication */
 	if (ABS(Py_SIZE(a)) <= 1 && ABS(Py_SIZE(b)) <= 1) {
-		stwodigits v = (stwodigits)(MEDIUM_VALUE(a)) * MEDIUM_VALUE(b);
-#ifdef HAVE_LONG_LONG
-		return PyLong_FromLongLong((PY_LONG_LONG)v);
-#else
-		/* if we don't have long long then we're almost certainly
-		   using 15-bit digits, so v will fit in a long.  In the
-		   unlikely event that we're using 30-bit digits on a platform
-		   without long long, a large v will just cause us to fall
-		   through to the general multiplication code below. */
-		if (v >= LONG_MIN && v <= LONG_MAX)
-			return PyLong_FromLong((long)v);
-#endif
+		/* XXX benchmark this!  Is it worth keeping? */
+		twodigits absz;
+		int sign;
+		sign = Py_SIZE(a) * Py_SIZE(b);
+		if (sign == 0)
+			return PyLong_FromLong(0L);
+		absz = (twodigits)a->ob_digit[0] * b->ob_digit[0];
+		if (absz < PyLong_BASE) {
+			CHECK_SMALL_INT((sdigit)(sign*absz));
+			z = _PyLong_New(1);
+			if (z != NULL) {
+				Py_SIZE(z) = sign;
+				z->ob_digit[0] = (digit)absz;
+			}
+		}
+		else {
+			z = _PyLong_New(2);
+			if (z != NULL) {
+				Py_SIZE(z) = 2*sign;
+				z->ob_digit[0] = (digit)(absz & PyLong_MASK);
+				assert((absz >> 2*PyLong_SHIFT) == 0);
+				z->ob_digit[1] = (digit)(absz >> PyLong_SHIFT);
+			}
+		}
+		return (PyObject *)z;
 	}
 
-	z = k_mul(a, b);
+	a_size = ABS(Py_SIZE(a));
+	b_size = ABS(Py_SIZE(b));
+	if((a_size <= 1) || (b_size <= 1)){
+		if (b_size < a_size){
+			if(b_size)
+				z = mul1(a, b->ob_digit[0]);
+			else
+				return PyLong_FromLong(0L);
+		}
+		else {
+			if(a_size)
+				z = mul1(b, a->ob_digit[0]);
+			else
+				return PyLong_FromLong(0L);
+		}
+		if ((Py_SIZE(a) < 0) ^ (Py_SIZE(b) < 0)){
+			NEGATE(z);
+		}
+		return (PyObject *)z;
+	}
+
+	/* use fast basecase multiplication if the smaller of a and b has at
+	   most 480 bits (60 bits if PyLong_SHIFT = 15) */
+	if (b_size <= BLOCK_MUL_SIZE || a_size <= BLOCK_MUL_SIZE) {
+		z = _PyLong_New(a_size + b_size);
+		if (z == NULL)
+			return NULL;
+		if (b_size <= a_size)
+			digits_multiply_init(z->ob_digit,
+					     a->ob_digit, a_size,
+					     b->ob_digit, b_size);
+		else
+			digits_multiply_init(z->ob_digit,
+					     b->ob_digit, b_size,
+					     a->ob_digit, a_size);
+	}
+	else
+		z = k_mul(a, b);
+
+	if ((Py_SIZE(a) < 0) ^ (Py_SIZE(b) < 0))
 		/* Negate if exactly one of the inputs is negative. */
-	if (((Py_SIZE(a) ^ Py_SIZE(b)) < 0) && z)
 		NEGATE(z);
-	return (PyObject *)z;
+	return (PyObject *)(long_normalize(z));
 }
 
 /* The / and % operators are now defined in terms of divmod().