Index: Objects/longobject.c
===================================================================
--- Objects/longobject.c	(revision 77157)
+++ Objects/longobject.c	(working copy)
@@ -26,6 +26,15 @@
  */
 #define FIVEARY_CUTOFF 8
 
+/* For modular exponentiation, use normal reduction unless the
+ * exponent is greater than MONTGOMERY_CUTOFF.  In that case, switch
+ * to Montgomery Reduction.  The potential drawback is that values
+ * need to be converted into Montgomery representation and then
+ * back again.
+ */
+#define MONTGOMERY_CUTOFF 64
+
+
 #define ABS(x) ((x) < 0 ? -(x) : (x))
 
 #undef MIN
@@ -3345,6 +3354,155 @@
 	return z;
 }
 
+/*
+Helper for long_pow
+-------------------
+Compute negative inverse of an odd digit modulo PyLong_BASE: given an odd
+digit x, return a digit y such that y * x is congruent to -1 modulo
+PyLong_BASE.  */
+
+static digit
+digit_neginv(digit x)
+{
+	/* We produce successive negated inverses modulo 16, 256, 65536, ...
+	   for x, using the following trick: Suppose y satisfies
+
+	      x*y = -1 (mod N)
+
+	   for some N.  Then x*y = k*N - 1 for some integer k, so x*y*(2 +
+	   x*y) = (k*N - 1) * (k*N + 1) = k*k*N*N - 1, hence
+
+	     x*(y*(2+x*y)) = -1 (mod N*N)
+
+	   so y*(2+x*y) gives the desired negated inverse modulo N*N.  To get
+	   things started, note that -x*x*x gives a negated inverse of x
+	   modulo 16, so it requires only 2 steps of this iteration to give an
+	   inverse modulo 2**16, and one further step to give an inverse
+	   modulo 2**32.
+	*/
+	int z;
+	digit y;
+	/* Take extra care to avoid undefined behaviour below: if 'digit' is
+	   unsigned short, then x will typically be promoted to (signed!) int
+	   in the line below (C99 6.3.1.1p2).  Then e.g., 'x*x*x' can overflow
+	   an int, giving undefined behaviour.  The extra '1U * ' avoids this
+	   by forcing all multiplications to be of unsigned types.  The '0U'
+	   in the next line isn't necessary for correctness, but silences
+	   compilers that like to warn about unary negation applied to an
+	   unsigned type. */
+	y = 0U - 1U * x * x * x;
+	/* k iterations gives 4*2**k correct bits; 4*2**k >= PyLong_SHIFT iff
+	   (PyLong_SHIFT-1)/(4*2**k) == 0. */
+	for (z = (PyLong_SHIFT - 1)/4; z > 0; z >>= 1)
+		y *= 2U + 1U * x * y;
+	return y & PyLong_MASK;
+}
+
+/*
+Helper for long_pow
+--------------------
+Return the Montgomery representative of a PyLong a with respect to the modulus
+m.  Given a nonnegative integer a and odd positive modulus m, return a *
+PyLong_BASE**n % m, where n is the number of digits in the modulus.
+
+*/
+static PyLongObject *
+mont_rep(PyLongObject *a, PyLongObject *m)
+{
+	Py_ssize_t n;
+	PyLongObject *x, *y;
+	int retcode;
+	if (a->ob_size < 0 || m->ob_size <= 0 || (m->ob_digit[0] & 1) == 0) {
+		PyErr_BadInternalCall();
+		return NULL;
+	}
+	n = m->ob_size;
+	x = (PyLongObject *)_PyLong_New(a->ob_size + n);
+	if (x == NULL)
+		return NULL;
+	/* x = a * PyLong_BASE**n % m */
+	memset(x->ob_digit, 0, n * sizeof(digit));
+	memcpy(x->ob_digit + n, a->ob_digit, a->ob_size * sizeof(digit));
+
+	retcode = l_divmod(x, m, NULL, &y);
+	Py_DECREF(x);
+	if (retcode < 0)
+		return NULL;
+	return y;
+}
+
+/*
+Helper for long_pow
+--------------------
+Implement Montgomery Reduction per HAC, Algorithm 14.32:
+http://www.cacr.math.uwaterloo.ca/hac/about/chap14.pdf
+
+INPUT:  T, m, m'
+	PyLong_BASE = 2**15
+	n = len(m)   (# of digits in m)
+	R = PyLong_BASE**n  (next power of PyLong_BASE greater than m)
+
+OUTPUT: T*(R**-1) % m
+*/
+static PyLongObject*
+mont_reduce(PyLongObject *T, PyLongObject *m, digit mPrime)
+{
+	Py_ssize_t n, i;
+	PyLongObject *A;
+
+	if (T->ob_size > 2*m->ob_size) {
+		PyErr_BadInternalCall();
+		return NULL;
+	}
+
+	n = m->ob_size;
+	/* A = T */
+	A = (PyLongObject *)_PyLong_New(2*n+1);
+	if (A == NULL)
+		return NULL;
+	memset(A->ob_digit, 0, A->ob_size * sizeof(digit));
+	memcpy(A->ob_digit, T->ob_digit, T->ob_size * sizeof(digit));
+
+	for (i=0; i < n; i++) {
+		digit *aptr, *mptr, *mendptr;
+		twodigits carry = 0;
+		digit u;
+
+		/* u = a[i] * m' mod base  */
+		/* the * 1U forces the multiplication to be done using
+		   unsigned types; see comments in digit_neginv above. */
+		u = ((1U * A->ob_digit[i] * mPrime) & PyLong_MASK);
+
+		/* A += u * m * base**i
+		   Multiply and add in place:
+		   A[i+j] += (u * m[j]) + carry */
+		aptr = A->ob_digit + i;
+		mptr = m->ob_digit;
+		mendptr = mptr + n;
+		while (mptr < mendptr) {
+			carry += *aptr + (twodigits)u * *mptr++;
+			*aptr++ = (digit) (carry & PyLong_MASK);
+			carry >>= PyLong_SHIFT;
+		}
+		while (carry) {
+			carry += (twodigits) (*aptr);
+			*aptr++ = (digit) (carry & PyLong_MASK);
+			carry >>= PyLong_SHIFT;
+		}
+	}
+
+	/*  A = A/base**n  (i.e. A >>= n)  */
+	memmove(A->ob_digit, A->ob_digit + n, (n+1) * sizeof(digit));
+	A->ob_size = n+1;
+	long_normalize(A);
+
+	/*  If A >= m then A = A - m  */
+	if (long_compare(A, m) >= 0)
+		return (PyLongObject *)long_sub(A, m);
+	else
+		return A;
+}
+
 /* pow(v, w, x) */
 static PyObject *
 long_pow(PyObject *v, PyObject *w, PyObject *x)
@@ -3362,6 +3520,10 @@
 	PyLongObject *table[32] = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 				   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
 
+	/* Montgomery values */
+	int doMontgomery = 0;
+	digit cPrime = 0;
+
 	/* a, b, c = v, w, x */
 	CONVERT_BINOP(v, w, &a, &b);
 	if (PyLong_Check(x)) {
@@ -3447,16 +3609,49 @@
 	if (z == NULL)
 		goto Error;
 
+	/* If modulus exists and is odd, we can use Montgomery Reduction
+	   If exponent is greater than MONTGOMERY_CUTOFF, it's worth it */
+	doMontgomery = c && (c->ob_digit[0] & 1) &&
+		(b->ob_size > 1 || b->ob_digit[0] > MONTGOMERY_CUTOFF);
+
+	if (doMontgomery) {
+		/* Calculate parameter c' = -pow(-c, -1, PyLong_BASE)
+		   for use in the Montgomery reduction step. */
+		cPrime = digit_neginv(c->ob_digit[0]);
+
+		/* Replace a and z by their Montgomery representations. */
+		temp = mont_rep(a, c);
+		if (temp == NULL)
+			goto Error;
+		Py_DECREF(a);
+		a = temp;
+		temp = NULL;
+
+		temp = mont_rep(z, c);
+		if (temp == NULL)
+			goto Error;
+		Py_DECREF(z);
+		z = temp;
+		temp = NULL;
+	}
+
 	/* Perform a modular reduction, X = X % c, but leave X alone if c
-	 * is NULL.
+	 * is NULL.  Use Montgomery Reduction if applicable.
 	 */
-#define REDUCE(X)					\
-	if (c != NULL) {				\
-		if (l_divmod(X, c, NULL, &temp) < 0)	\
-			goto Error;			\
-		Py_XDECREF(X);				\
-		X = temp;				\
-		temp = NULL;				\
+#define REDUCE(X)						\
+	if (c != NULL) {					\
+		if (doMontgomery) {				\
+			temp = mont_reduce(X, c, cPrime);	\
+			if (temp == NULL)			\
+				goto Error;			\
+		}						\
+		else {						\
+			if (l_divmod(X, c, NULL, &temp) < 0)	\
+				goto Error;			\
+		}						\
+		Py_XDECREF(X);					\
+		X = temp;					\
+		temp = NULL;					\
 	}
 
 	/* Multiply two values, then reduce the result:
@@ -3487,7 +3682,7 @@
 	}
 	else {
 		/* Left-to-right 5-ary exponentiation (HAC Algorithm 14.82) */
-		Py_INCREF(z);	/* still holds 1L */
+		Py_INCREF(z);	/* still holds 1L (or Montgomery rep. of 1L) */
 		table[0] = z;
 		for (i = 1; i < 32; ++i)
 			MULT(table[i-1], a, table[i])
@@ -3505,6 +3700,9 @@
 		}
 	}
 
+	if (doMontgomery)
+		REDUCE(z);
+
 	if (negativeOutput && (Py_SIZE(z) != 0)) {
 		temp = (PyLongObject *)long_sub(z, c);
 		if (temp == NULL)