Index: Objects/longobject.c
===================================================================
--- Objects/longobject.c	(revision 77142)
+++ Objects/longobject.c	(working copy)
@@ -26,6 +26,15 @@
  */
 #define FIVEARY_CUTOFF 8
 
+/* For modular exponentiation, use normal reduction unless the
+ * exponent is greater than MONTGOMERY_CUTOFF.  In that case, switch
+ * to Montgomery Reduction.  The potential drawback is that values
+ * need to be converted into Montgomery representation and then
+ * back again.
+ */
+#define MONTGOMERY_CUTOFF 64
+
+
 #define ABS(x) ((x) < 0 ? -(x) : (x))
 
 #undef MIN
@@ -3345,6 +3354,160 @@
 	return z;
 }
 
+/*
+Helper for long_pow
+--------------------
+Use the Extended Euclidean algorithm to calculate the inverse of a modulo b.
+I.e., calculate x such that (a * x) % b == 1.
+
+This is translated from pseudocode in "Practical Cryptography" section
+11.3.5, Ferguson & Schneier.
+
+Below is the algorithm in Python:
+
+def invMod(a, b):
+    c, d = a, b
+    uc, ud = 1, 0
+    while c != 0:
+        q = d // c
+        c, d = d - (q * c), c
+        uc, ud = ud - (q * uc), uc
+    if d == 1:
+        return ud % b
+    else:
+        raise ValueError("argument has no inverse")
+*/
+PyLongObject* l_invmod(PyLongObject *a, PyLongObject *b)
+{
+	PyLongObject *c, *d, *uc, *ud;
+	PyLongObject *retval = NULL;
+
+	uc = (PyLongObject *)PyLong_FromLong(1L);
+	if (uc == NULL)
+		return NULL;
+	ud = (PyLongObject *)PyLong_FromLong(0L);
+	if (ud == NULL) {
+		Py_DECREF(uc);
+		return NULL;
+	}
+	c = a;
+	Py_INCREF(c);
+	d = b;
+	Py_INCREF(d);
+
+	while (c->ob_size != 0) {
+		PyLongObject *temp, *temp2, *q;
+		/* q = d // c;  c, d = d % c, c */
+		if (l_divmod(d, c, &q, &temp) < 0)
+			goto error;
+		Py_DECREF(d);
+		d = c;
+		c = temp;
+
+		/* uc, ud = ud - q * uc, uc */
+		temp = (PyLongObject *)long_mul(q, uc);
+		Py_DECREF(q);
+		if (temp == NULL)
+			goto error;
+		temp2 = (PyLongObject *)long_sub(ud, temp);
+		Py_DECREF(temp);
+		if (temp2 == NULL)
+			goto error;
+		Py_DECREF(ud);
+		ud = uc;
+		uc = temp2;
+	}
+
+	if ((d->ob_size == 1) && (d->ob_digit[0] == 1))
+		l_divmod(ud, b, NULL, &retval);
+	else
+		PyErr_SetString(PyExc_ValueError, "argument has no inverse");
+
+error:
+	Py_DECREF(c);
+	Py_DECREF(d);
+	Py_DECREF(uc);
+	Py_DECREF(ud);
+	return retval;
+}
+
+/*
+Helper for long_pow
+--------------------
+Implement Montgomery Reduction per HAC, Algorithm 14.32:
+http://www.cacr.math.uwaterloo.ca/hac/about/chap14.pdf
+
+INPUT:  T, m, m'
+	PyLong_BASE = 2**15
+	n = len(m)   (# of digits in m)
+	R = PyLong_BASE**n  (next power of PyLong_BASE greater than m)
+
+OUTPUT: T*(R**-1) % m
+*/
+PyLongObject* mont_reduce(PyLongObject *T, PyLongObject *m, digit mPrime)
+{
+	int n, i;
+	PyLongObject* A = NULL;
+	PyLongObject* retval = NULL;
+
+	n = m->ob_size;
+
+	/* A = T */
+	A = (PyLongObject *)_PyLong_New(2*n+1);
+	if (A == NULL)
+		return NULL;
+	if (T->ob_size > 2*n)
+		goto Error;
+	memset(A->ob_digit, 0, A->ob_size * sizeof(digit));
+	memcpy(A->ob_digit, T->ob_digit, T->ob_size * sizeof(digit));
+
+	for (i=0; i < n; i++) {
+		digit *aptr, *mptr, *mendptr;
+		twodigits carry = 0;
+		digit u;
+
+		/* u = a[i] * m' mod base  */
+		/* the * 1U forces the multiplication to be done using
+		   unsigned types; else we could end up with both
+		   arguments being promoted to int, and the
+		   multplication overflowing to give undefined
+		   behaviour. */
+		u = (digit) ((A->ob_digit[i] * 1U * mPrime) & PyLong_MASK);
+
+		/* A += u * m * base**i
+		   Multiply and add in place:
+		   A[i+j] += (u * m[j]) + carry */
+		aptr = A->ob_digit + i;
+		mptr = m->ob_digit;
+		mendptr = mptr + n;
+		while (mptr < mendptr) {
+			carry += *aptr + (twodigits)u * *mptr++;
+			*aptr++ = (digit) (carry & PyLong_MASK);
+			carry >>= PyLong_SHIFT;
+		}
+		while (carry) {
+			carry += (twodigits) (*aptr);
+			*aptr++ = (digit) (carry & PyLong_MASK);
+			carry >>= PyLong_SHIFT;
+		}
+	}
+
+	/*  A = A/base**n  (i.e. A >>= n)  */
+	memmove(A->ob_digit, A->ob_digit + n, (n+1) * sizeof(digit));
+	A->ob_size = n+1;
+	long_normalize(A);
+
+	/*  If A >= m then A = A - m  */
+	if (long_compare(A, m) >= 0)
+		retval = (PyLongObject *)long_sub(A, m);		
+	else
+		return A;
+		
+Error:
+	Py_DECREF(A);
+	return retval;
+}
+
 /* pow(v, w, x) */
 static PyObject *
 long_pow(PyObject *v, PyObject *w, PyObject *x)
@@ -3362,6 +3525,12 @@
 	PyLongObject *table[32] = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 				   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
 
+	/* Montgomery values */
+	int doMontgomery = 0;
+	PyLongObject *R = NULL;
+	PyLongObject *base = NULL;
+	digit cPrime = 0;
+
 	/* a, b, c = v, w, x */
 	CONVERT_BINOP(v, w, &a, &b);
 	if (PyLong_Check(x)) {
@@ -3443,20 +3612,79 @@
 	/* At this point a, b, and c are guaranteed non-negative UNLESS
 	   c is NULL, in which case a may be negative. */
 
-	z = (PyLongObject *)PyLong_FromLong(1L);
-	if (z == NULL)
-		goto Error;
+	/* If modulus exists and is odd, we can use Montgomery Reduction 
+	   If exponent is greater than MONTGOMERY_CUTOFF, it's worth it */
+	if (c && (c->ob_digit[0] & 1) && 
+	   (b->ob_digit[0] > MONTGOMERY_CUTOFF)) {
+		doMontgomery = 1;
 
+		/*First, calculate the Montgomery parameters (R, m')
+		  (or in our case, (R, c')).
+
+		  Then set (a, z) to the Montgomery representations
+		  of (a, 1), in preparation for the exponentiation loop.
+		*/
+
+		/* R = PyLong_BASE ** n for smallest n such that R > c, (n == c->ob_size) */
+		R = _PyLong_New(c->ob_size + 1);
+		if (R == NULL)
+			goto Error;
+		memset(R->ob_digit, 0, R->ob_size * sizeof(digit));
+		R->ob_digit[R->ob_size-1] = 1;
+
+		/* c' = (-pow(c, -1, PyLong_BASE)) % PyLong_BASE */
+		base = (PyLongObject *)PyLong_FromLong(PyLong_BASE);
+		if (base == NULL)
+			goto Error;
+		temp = (PyLongObject *)l_invmod(c, base);
+		if (temp == NULL)
+			goto Error;
+		cPrime = PyLong_BASE - temp->ob_digit[0];
+		Py_DECREF(temp);
+		temp = NULL;
+
+		/* a = a * R % c  # keep in mind a * R == (a << size_c) */
+		temp = _PyLong_New(c->ob_size + a->ob_size);
+		if (temp == NULL)
+			goto Error;
+		memset(temp->ob_digit, 0, c->ob_size * sizeof(digit));
+		memcpy(temp->ob_digit + c->ob_size, a->ob_digit,
+			a->ob_size * sizeof(digit));
+
+		Py_DECREF(a);
+		a = NULL;
+		if (l_divmod(temp, c, NULL, &a) < 0)
+			goto Error;
+		Py_DECREF(temp);
+		temp = NULL;
+
+		/* z = 1 * R % c */
+		if (l_divmod(R, c, NULL, &z) < 0)
+			goto Error;
+	}
+	else {
+		z = (PyLongObject *)PyLong_FromLong(1L);
+		if (z == NULL)
+			goto Error;
+	}
+
 	/* Perform a modular reduction, X = X % c, but leave X alone if c
-	 * is NULL.
+	 * is NULL.  Use Montgomery Reduction if applicable.
 	 */
-#define REDUCE(X)					\
-	if (c != NULL) {				\
-		if (l_divmod(X, c, NULL, &temp) < 0)	\
-			goto Error;			\
-		Py_XDECREF(X);				\
-		X = temp;				\
-		temp = NULL;				\
+#define REDUCE(X)						\
+	if (c != NULL) {					\
+		if (doMontgomery) {				\
+			temp = mont_reduce(X, c, cPrime);	\
+			if (temp == NULL)			\
+				goto Error;			\
+		}						\
+		else {						\
+			if (l_divmod(X, c, NULL, &temp) < 0)	\
+				goto Error;			\
+		}						\
+		Py_XDECREF(X);					\
+		X = temp;					\
+		temp = NULL;					\
 	}
 
 	/* Multiply two values, then reduce the result:
@@ -3487,7 +3715,7 @@
 	}
 	else {
 		/* Left-to-right 5-ary exponentiation (HAC Algorithm 14.82) */
-		Py_INCREF(z);	/* still holds 1L */
+		Py_INCREF(z);	/* still holds 1L (or Montgomery rep. of 1L) */
 		table[0] = z;
 		for (i = 1; i < 32; ++i)
 			MULT(table[i-1], a, table[i])
@@ -3505,6 +3733,9 @@
 		}
 	}
 
+	if (doMontgomery)
+		REDUCE(z);
+
 	if (negativeOutput && (Py_SIZE(z) != 0)) {
 		temp = (PyLongObject *)long_sub(z, c);
 		if (temp == NULL)
@@ -3530,6 +3761,8 @@
 	Py_DECREF(b);
 	Py_XDECREF(c);
 	Py_XDECREF(temp);
+	Py_XDECREF(R);
+	Py_XDECREF(base);
 	return (PyObject *)z;
 }