Index: longobject.c
===================================================================
--- longobject.c	(revision 69738)
+++ longobject.c	(working copy)
@@ -4,6 +4,7 @@

 #include "Python.h"
 #include "longintrepr.h"
+#include "structseq.h"

 #include <ctype.h>
 #include <stddef.h>
@@ -204,19 +205,6 @@
         return (PyObject*)v;
     }

-    /* 2 digits */
-    if (!(abs_ival >> 2*PyLong_SHIFT)) {
-        v = _PyLong_New(2);
-        if (v) {
-            Py_SIZE(v) = 2*sign;
-            v->ob_digit[0] = Py_SAFE_DOWNCAST(
-                abs_ival & PyLong_MASK, unsigned long, digit);
-            v->ob_digit[1] = Py_SAFE_DOWNCAST(
-                abs_ival >> PyLong_SHIFT, unsigned long, digit);
-        }
-        return (PyObject*)v;
-    }
-
     /* Larger numbers: loop to determine number of digits */
     t = abs_ival;
     while (t) {
@@ -2097,6 +2085,7 @@
     PyLongObject *w = mul1(w1, d);
     PyLongObject *a;
     Py_ssize_t j, k;
+    digit wm1, wm2, carry;

     if (v == NULL || w == NULL) {
         Py_XDECREF(v);
@@ -2104,71 +2093,75 @@
         return NULL;
     }

-    assert(size_v >= size_w && size_w > 1); /* Assert checks by div() */
+    assert(size_v >= size_w && size_w >= 2); /* Assert checks by div() */
     assert(Py_REFCNT(v) == 1); /* Since v will be used as accumulator! */
     assert(size_w == ABS(Py_SIZE(w))); /* That's how d was calculated */

     size_v = ABS(Py_SIZE(v));
+    wm1 = w->ob_digit[size_w-1]; /* top digit of w */
+    wm2 = w->ob_digit[size_w-2]; /* and next one down */
+    /* we can often save an iteration */
+    if (v->ob_digit[size_v-1] < wm1)
+        size_v--;
+    else
+        assert(v->ob_digit[size_v] == 0);
+    assert(v->ob_digit[size_v] < wm1);
+
     k = size_v - size_w;
     a = _PyLong_New(k + 1);
+    carry = 1;

     for (j = size_v; a != NULL && k >= 0; --j, --k) {
-        digit vj = (j >= size_v) ? 0 : v->ob_digit[j];
-        twodigits q;
-        stwodigits carry = 0;
+        digit q, r, zz, vj;
+        twodigits z;
         Py_ssize_t i;
-
         SIGCHECK({
             Py_DECREF(a);
             a = NULL;
             break;
         })
-        if (vj == w->ob_digit[size_w-1])
-            q = PyLong_MASK;
-        else
-            q = (((twodigits)vj << PyLong_SHIFT) + v->ob_digit[j-1]) /
-                w->ob_digit[size_w-1];
+        vj = v->ob_digit[j];
+        assert(vj <= wm1);

-        while (w->ob_digit[size_w-2]*q >
-                ((
-                    ((twodigits)vj << PyLong_SHIFT)
-                    + v->ob_digit[j-1]
-                    - q*w->ob_digit[size_w-1]
-                                ) << PyLong_SHIFT)
-                + v->ob_digit[j-2])
+        /* estimate quotient q; may (rarely) overestimate by 1 */
+        z = ((twodigits)vj << PyLong_SHIFT) + v->ob_digit[j-1];
+        q = (digit)(z / wm1);
+        r = (digit)(z % wm1);
+        while (r <= PyLong_MASK && (twodigits)wm2 * q >
+               ((twodigits)r << PyLong_SHIFT) + v->ob_digit[j-2]) {
             --q;
-
-        for (i = 0; i < size_w && i+k < size_v; ++i) {
-            twodigits z = w->ob_digit[i] * q;
-            digit zz = (digit) (z >> PyLong_SHIFT);
-            carry += v->ob_digit[i+k] - z
-                + ((twodigits)zz << PyLong_SHIFT);
-            v->ob_digit[i+k] = (digit)(carry & PyLong_MASK);
-            carry = Py_ARITHMETIC_RIGHT_SHIFT(stwodigits,
-                                              carry, PyLong_SHIFT);
-            carry -= zz;
+            r += wm1;
         }

-        if (i+k < size_v) {
-            carry += v->ob_digit[i+k];
-            v->ob_digit[i+k] = 0;
+        /* subtract q*w from the remaining top limbs of v */
+        zz = 0;
+        for (i = 0; i < size_w; ++i) {
+            z = (twodigits)w->ob_digit[i] * q + zz;
+            zz = (digit)(z >> PyLong_SHIFT);
+            carry += v->ob_digit[i+k] +
+                (digit)((z & PyLong_MASK) ^ PyLong_MASK);
+            v->ob_digit[i+k] = carry & PyLong_MASK;
+            carry >>= PyLong_SHIFT;
+            assert(carry == 0 || carry == 1);
         }
+        carry += v->ob_digit[i+k] + (zz ^ PyLong_MASK);
+        assert(carry == PyLong_MASK || carry == PyLong_BASE);
+        carry >>= PyLong_SHIFT;
+        assert(carry == 0 || carry == 1);

-        if (carry == 0)
-            a->ob_digit[k] = (digit) q;
-        else {
-            assert(carry == -1);
-            a->ob_digit[k] = (digit) q-1;
-            carry = 0;
-            for (i = 0; i < size_w && i+k < size_v; ++i) {
+        if (carry == 0) {
+            /* this branch taken only if q was too large (rare) */
+            for (i = 0; i < size_w; ++i) {
                 carry += v->ob_digit[i+k] + w->ob_digit[i];
                 v->ob_digit[i+k] = (digit)(carry & PyLong_MASK);
-                carry = Py_ARITHMETIC_RIGHT_SHIFT(
-                        stwodigits,
-                        carry, PyLong_SHIFT);
+                carry >>= PyLong_SHIFT;
             }
+            assert(carry == 1);
+            q--;
         }
+        a->ob_digit[k] = q;
     } /* for j, k */

+    Py_SIZE(v) = size_w;
     if (a == NULL)
         *prem = NULL;
@@ -2457,6 +2450,118 @@
     return (PyObject *)z;
 }

+/* Here's a simple optimization for basecase multiplication that can achieve
+   speedups of up to 400% on some 64-bit platforms.  It uses the fact that
+   twodigits can represent values up to 16*PyLong_BASE*PyLong_BASE (assuming
+   that PyLong_SHIFT is 30), so up to 16 partial products can be accumulated
+   at once.  The innermost loop then generally contains fewer instructions
+   than in the usual long multiplication algorithm. */
+
+#if PyLong_SHIFT == 15
+#define MAX_PARTIALS 4
+#elif PyLong_SHIFT == 30
+#define MAX_PARTIALS 16
+#else
+#error "expected PyLong_SHIFT to be 15 or 30"
+#endif
+
+/* res[0:a_size+b_size] := a*b, assuming that b_size <= MAX_PARTIALS,
+   b_size <= a_size. */
+
+static void
+digits_multiply_init(digit *res, const digit *a, Py_ssize_t a_size,
+                     const digit *b, Py_ssize_t b_size)
+{
+    twodigits acc = 0;
+    Py_ssize_t j, k;
+    assert(b_size <= MAX_PARTIALS && b_size <= a_size);
+    for (k = 0; k < b_size; k++) {
+        for (j = 0; j <= k; j++)
+            acc += (twodigits)a[k-j] * b[j];
+        res[k] = (digit)(acc & PyLong_MASK);
+        acc >>= PyLong_SHIFT;
+    }
+    for (; k < a_size; k++) {
+        for (j = 0; j < b_size; j++)
+            acc += (twodigits)a[k-j] * b[j];
+        res[k] = (digit)(acc & PyLong_MASK);
+        acc >>= PyLong_SHIFT;
+    }
+    for (; k < a_size + b_size; k++) {
+        for (j = k - a_size + 1; j < b_size; j++)
+            acc += (twodigits)a[k-j] * b[j];
+        res[k] = (digit)(acc & PyLong_MASK);
+        acc >>= PyLong_SHIFT;
+    }
+    assert(acc == 0);
+}
+
+/* variant of the above: res[0:a_size+b_size] := a*b + res[0:a_size], assuming
+   that b_size <= MAX_PARTIALS, b_size <= a_size */
+
+static void
+digits_multiply_add(digit *res, const digit *a, Py_ssize_t a_size,
+                    const digit *b, Py_ssize_t b_size)
+{
+    twodigits acc = 0;
+    Py_ssize_t j, k;
+    assert(b_size <= MAX_PARTIALS && b_size <= a_size);
+    for (k = 0; k < b_size; k++) {
+        acc += res[k];
+        for (j = 0; j <= k; j++)
+            acc += (twodigits)a[k-j] * b[j];
+        res[k] = (digit)(acc & PyLong_MASK);
+        acc >>= PyLong_SHIFT;
+    }
+    for (; k < a_size; k++) {
+        acc += res[k];
+        for (j = 0; j < b_size; j++)
+            acc += (twodigits)a[k-j] * b[j];
+        res[k] = (digit)(acc & PyLong_MASK);
+        acc >>= PyLong_SHIFT;
+    }
+    for (; k < a_size + b_size; k++) {
+        for (j = k - a_size + 1; j < b_size; j++)
+            acc += (twodigits)a[k-j] * b[j];
+        res[k] = (digit)(acc & PyLong_MASK);
+        acc >>= PyLong_SHIFT;
+    }
+    assert(acc == 0);
+}
+
+/* res[0:a_size+b_size] := a * b */
+
+static void
+digits_multiply(digit *res, const digit *a, Py_ssize_t a_size,
+                const digit *b, Py_ssize_t b_size)
+{
+    if (a_size < b_size) {
+        const digit *temp;
+        Py_ssize_t temp_size;
+        temp = a; a = b; b = temp;
+        temp_size = a_size; a_size = b_size; b_size = temp_size;
+    }
+
+    /* split b up into pieces, each piece having <= MAX_PARTIALS limbs.  Then
+       use digits_multiply_init and digits_multiply_add to do the real work. */
+    if (b_size < MAX_PARTIALS)
+        digits_multiply_init(res, a, a_size, b, b_size);
+    else {
+        digits_multiply_init(res, a, a_size, b, MAX_PARTIALS);
+        b_size -= MAX_PARTIALS;
+        b += MAX_PARTIALS;
+        res += MAX_PARTIALS;
+        while (b_size >= MAX_PARTIALS) {
+            digits_multiply_add(res, a, a_size, b, MAX_PARTIALS);
+            b_size -= MAX_PARTIALS;
+            b += MAX_PARTIALS;
+            res += MAX_PARTIALS;
        }
+        digits_multiply_add(res, a, a_size, b, b_size);
+    }
+}
+
 /* Grade school multiplication, ignoring the signs.
  * Returns the absolute value of the product, or NULL if error.
  */
@@ -2472,8 +2577,8 @@
     if (z == NULL)
         return NULL;

-    memset(z->ob_digit, 0, Py_SIZE(z) * sizeof(digit));
     if (a == b) {
+        memset(z->ob_digit, 0, Py_SIZE(z) * sizeof(digit));
         /* Efficient squaring per HAC, Algorithm 14.16:
          * http://www.cacr.math.uwaterloo.ca/hac/about/chap14.pdf
          * Gives slightly less than a 2x speedup when a == b,
@@ -2518,28 +2623,8 @@
         }
     }
     else {    /* a is not the same as b -- gradeschool long mult */
-        for (i = 0; i < size_a; ++i) {
-            twodigits carry = 0;
-            twodigits f = a->ob_digit[i];
-            digit *pz = z->ob_digit + i;
-            digit *pb = b->ob_digit;
-            digit *pbend = b->ob_digit + size_b;
-
-            SIGCHECK({
-                Py_DECREF(z);
-                return NULL;
-            })
-
-            while (pb < pbend) {
-                carry += *pz + *pb++ * f;
-                *pz++ = (digit)(carry & PyLong_MASK);
-                carry >>= PyLong_SHIFT;
-                assert(carry <= PyLong_MASK);
-            }
-            if (carry)
-                *pz += (digit)(carry & PyLong_MASK);
-            assert((carry >> PyLong_SHIFT) == 0);
-        }
+        digits_multiply(z->ob_digit, a->ob_digit, size_a,
+                        b->ob_digit, size_b);
     }
     return long_normalize(z);
 }
@@ -2861,20 +2946,75 @@
 long_mul(PyLongObject *a, PyLongObject *b)
 {
     PyLongObject *z;
+    Py_ssize_t a_size, b_size;

     CHECK_BINOP(a, b);

+    /* fast path for single-digit multiplication */
     if (ABS(Py_SIZE(a)) <= 1 && ABS(Py_SIZE(b)) <= 1) {
-        PyObject *r;
-        r = PyLong_FromLong(MEDIUM_VALUE(a)*MEDIUM_VALUE(b));
-        return r;
+        /* XXX benchmark this!  Is it worth keeping? */
+        twodigits absz;
+        int sign;
+        sign = Py_SIZE(a) * Py_SIZE(b);
+        if (sign == 0)
+            return PyLong_FromLong(0L);
+        absz = (twodigits)a->ob_digit[0] * b->ob_digit[0];
+        if (absz < PyLong_BASE) {
+            CHECK_SMALL_INT(sign * (sdigit)absz);
+            z = _PyLong_New(1);
+            if (z != NULL) {
+                Py_SIZE(z) = sign;
+                z->ob_digit[0] = (digit)absz;
+            }
+        }
+        else {
+            z = _PyLong_New(2);
+            if (z != NULL) {
+                Py_SIZE(z) = 2*sign;
+                z->ob_digit[0] = (digit)(absz & PyLong_MASK);
+                assert((absz >> 2*PyLong_SHIFT) == 0);
+                z->ob_digit[1] = (digit)(absz >> PyLong_SHIFT);
+            }
+        }
+        return (PyObject *)z;
     }

-    z = k_mul(a, b);
+    a_size = ABS(Py_SIZE(a));
+    b_size = ABS(Py_SIZE(b));
+    if ((a_size == 1) || (b_size == 1)) {
+        if (b_size < a_size)
+            z = mul1(a, b->ob_digit[0]);
+        else
+            z = mul1(b, a->ob_digit[0]);
+        if (z != NULL && ((Py_SIZE(a) < 0) ^ (Py_SIZE(b) < 0)))
+            NEGATE(z);
+        return (PyObject *)z;
+    }
+    /* use fast basecase multiplication if the smaller of a and b has at
+       most 480 bits (60 bits if PyLong_SHIFT = 15) */
+    if (b_size <= MAX_PARTIALS || a_size <= MAX_PARTIALS) {
+        z = _PyLong_New(a_size + b_size);
+        if (z == NULL)
+            return NULL;
+        if (b_size <= a_size)
+            digits_multiply_init(z->ob_digit,
+                                 a->ob_digit, a_size,
+                                 b->ob_digit, b_size);
+        else
+            digits_multiply_init(z->ob_digit,
+                                 b->ob_digit, b_size,
+                                 a->ob_digit, a_size);
+    }
+    else
+        z = k_mul(a, b);
+
+    if (z == NULL)
+        return NULL;
     /* Negate if exactly one of the inputs is negative. */
-    if (((Py_SIZE(a) ^ Py_SIZE(b)) < 0) && z)
+    if ((Py_SIZE(a) < 0) ^ (Py_SIZE(b) < 0))
         NEGATE(z);
-    return (PyObject *)z;
+    return (PyObject *)long_normalize(z);
 }

 /* The / and % operators are now defined in terms of divmod().
@@ -3991,6 +4131,45 @@
     PyObject_Del,                               /* tp_free */
 };

+static PyTypeObject Int_InfoType;
+
+PyDoc_STRVAR(int_info__doc__,
+"sys.int_info\n\
+\n\
+A struct sequence that holds information about Python's\n\
+internal representation of integers.  The attributes are read only.");
+
+static PyStructSequence_Field int_info_fields[] = {
+    {"bits_per_digit", "size of a digit in bits"},
+    {"sizeof_digit", "size in bytes of the C type used to "
+                     "represent a digit"},
+    {NULL, NULL}
+};
+
+static PyStructSequence_Desc int_info_desc = {
+    "sys.int_info",   /* name */
+    int_info__doc__,  /* doc */
+    int_info_fields,  /* fields */
+    2                 /* number of fields */
+};
+
+PyObject *
+PyLong_GetInfo(void)
+{
+    PyObject* int_info;
+    int field = 0;
+    int_info = PyStructSequence_New(&Int_InfoType);
+    if (int_info == NULL)
+        return NULL;
+    PyStructSequence_SET_ITEM(int_info, field++, PyLong_FromLong(PyLong_SHIFT));
+    PyStructSequence_SET_ITEM(int_info, field++, PyLong_FromLong(sizeof(digit)));
+    if (PyErr_Occurred()) {
+        Py_CLEAR(int_info);
+        return NULL;
+    }
+    return int_info;
+}
+
 int
 _PyLong_Init(void)
 {
@@ -4023,6 +4202,10 @@
         v->ob_digit[0] = abs(ival);
     }
 #endif
+    /* initialize int_info */
+    if (Int_InfoType.tp_name == 0)
+        PyStructSequence_InitType(&Int_InfoType, &int_info_desc);
+
     return 1;
 }
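
Reviewer note, not part of the patch: the toy program below (the names toy_mul, TOY_SHIFT and TOY_MASK are made up for illustration) shows the column-wise accumulation that digits_multiply_init is built around, using 15-bit digits and a 32-bit accumulator. Where the patch splits the column bounds into three specialised loops so the inner loop stays simple, this sketch uses one loop with explicit bounds, which makes the overflow argument easy to check: at most 4 partial products per column plus the carried-in high part cannot wrap a 32-bit accumulator.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define TOY_SHIFT 15
#define TOY_MASK  ((1U << TOY_SHIFT) - 1)

/* res[0..na+nb-1] = a*b.  Requires 1 <= nb <= 4 and nb <= na, so that the
   32-bit accumulator (playing the role of twodigits) never overflows. */
static void
toy_mul(uint16_t *res, const uint16_t *a, int na, const uint16_t *b, int nb)
{
    uint32_t acc = 0;
    int j, k;
    assert(1 <= nb && nb <= 4 && nb <= na);
    for (k = 0; k < na + nb - 1; k++) {
        /* column k: add a[j]*b[k-j] for every valid j on top of the carry */
        int lo = k < nb ? 0 : k - nb + 1;
        int hi = k < na ? k : na - 1;
        for (j = lo; j <= hi; j++)
            acc += (uint32_t)a[j] * b[k - j];
        res[k] = (uint16_t)(acc & TOY_MASK);
        acc >>= TOY_SHIFT;
    }
    res[na + nb - 1] = (uint16_t)acc;   /* final carry is the top digit */
}

int
main(void)
{
    /* 0x12345 in base 2**15 is {0x2345, 0x2}; multiply by the single digit
       0x6789 and reassemble the three result digits */
    uint16_t a[2] = { 0x12345 & TOY_MASK, 0x12345 >> TOY_SHIFT };
    uint16_t b[1] = { 0x6789 };
    uint16_t res[3];
    toy_mul(res, a, 2, b, 1);
    printf("%lx\n", ((unsigned long)res[2] << (2 * TOY_SHIFT)) |
                    ((unsigned long)res[1] << TOY_SHIFT) | res[0]);
    /* prints 75cca2ed, i.e. 0x12345 * 0x6789 */
    return 0;
}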
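
Reviewer note, again not part of the patch: the quotient-digit estimate in the rewritten x_divrem is the classic step from Knuth's Algorithm D (TAOCP vol. 2, section 4.3.1): divide the top two remainder digits by the top divisor digit, then walk the estimate down while the second divisor digit proves it too large; after that the estimate can still be at most one too large, which is what the carry == 0 add-back branch in the patch repairs. The sketch below (estimate_q and the TOY_* names are illustrative only) runs one such estimate on 15-bit digits; the raw estimate 0x4000 is corrected once, giving the true quotient digit 0x3fff.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define TOY_SHIFT 15
#define TOY_BASE  (1U << TOY_SHIFT)
#define TOY_MASK  (TOY_BASE - 1)

/* One quotient-digit estimate.  u0:u1:u2 are the top three digits of the
   current remainder, w1:w2 the top two digits of the normalized divisor
   (w1 >= TOY_BASE/2 and u0 <= w1, mirroring the patch's preconditions). */
static uint32_t
estimate_q(uint32_t u0, uint32_t u1, uint32_t u2, uint32_t w1, uint32_t w2)
{
    uint32_t z, q, r;
    assert(w1 >= TOY_BASE / 2 && u0 <= w1);
    z = (u0 << TOY_SHIFT) + u1;
    q = z / w1;
    r = z % w1;
    while (r <= TOY_MASK && w2 * q > (r << TOY_SHIFT) + u2) {
        --q;        /* estimate provably too big: adjust q and the remainder */
        r += w1;
    }
    return q;
}

int
main(void)
{
    /* divisor digits 0x4000:0x7fff, remainder digits 0x2000:0x0000:0x0000 */
    printf("q = 0x%x\n", (unsigned)estimate_q(0x2000, 0, 0, 0x4000, 0x7fff));
    return 0;
}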