Index: Objects/longobject.c =================================================================== --- Objects/longobject.c (revision 77157) +++ Objects/longobject.c (working copy) @@ -26,6 +26,15 @@ */ #define FIVEARY_CUTOFF 8 +/* For modular exponentiation, use normal reduction unless the + * exponent is greater than MONTGOMERY_CUTOFF. In that case, switch + * to Montgomery Reduction. The potential drawback is that values + * need to be converted into Montgomery representation and then + * back again. + */ +#define MONTGOMERY_CUTOFF 64 + + #define ABS(x) ((x) < 0 ? -(x) : (x)) #undef MIN @@ -3345,6 +3354,155 @@ return z; } +/* +Helper for long_pow +------------------- +Compute negative inverse of an odd digit modulo PyLong_BASE: given an odd +digit x, return a digit y such that y * x is congruent to -1 modulo +PyLong_BASE. */ + +static digit +digit_neginv(digit x) +{ + /* We produce successive negated inverses modulo 16, 256, 65536, ... + for x, using the following trick: Suppose y satisfies + + x*y = -1 (mod N) + + for some N. Then x*y = k*N - 1 for some integer k, so x*y*(2 + + x*y) = (k*N - 1) * (k*N + 1) = k*k*N*N - 1, hence + + x*(y*(2+x*y)) = -1 (mod N*N) + + so y*(2+x*y) gives the desired negated inverse modulo N*N. To get + things started, note that -x*x*x gives a negated inverse of x + modulo 16, so it requires only 2 steps of this iteration to give an + inverse modulo 2**16, and one further step to give an inverse + modulo 2**32. + */ + int z; + digit y; + /* Take extra care to avoid undefined behaviour below: if 'digit' is + unsigned short, then x will typically be promoted to (signed!) int + in the line below (C99 6.3.1.1p2). Then e.g., 'x*x*x' can overflow + an int, giving undefined behaviour. The extra '1U * ' avoids this + by forcing all multiplications to be of unsigned types. The '0U' + in the next line isn't necessary for correctness, but silences + compilers that like to warn about unary negation applied to an + unsigned type. */ + y = 0U - 1U * x * x * x; + /* k iterations gives 4*2**k correct bits; 4*2**k >= PyLong_SHIFT iff + (PyLong_SHIFT-1)/(4*2**k) == 0. */ + for (z = (PyLong_SHIFT - 1)/4; z > 0; z >>= 1) + y *= 2U + 1U * x * y; + return y & PyLong_MASK; +} + +/* +Helper for long_pow +-------------------- +Return the Montgomery representative of a PyLong a with respect to the modulus +m. Given a nonnegative integer a and odd positive modulus m, return a * +PyLong_BASE**n % m, where n is the number of digits in the modulus. + +*/ +static PyLongObject * +mont_rep(PyLongObject *a, PyLongObject *m) +{ + Py_ssize_t n; + PyLongObject *x, *y; + int retcode; + if (a->ob_size < 0 || m->ob_size <= 0 || (m->ob_digit[0] & 1) == 0) { + PyErr_BadInternalCall(); + return NULL; + } + n = m->ob_size; + x = (PyLongObject *)_PyLong_New(a->ob_size + n); + if (x == NULL) + return NULL; + /* x = a * PyLong_BASE**n % m */ + memset(x->ob_digit, 0, n * sizeof(digit)); + memcpy(x->ob_digit + n, a->ob_digit, a->ob_size * sizeof(digit)); + + retcode = l_divmod(x, m, NULL, &y); + Py_DECREF(x); + if (retcode < 0) + return NULL; + return y; +} + +/* +Helper for long_pow +-------------------- +Implement Montgomery Reduction per HAC, Algorithm 14.32: +http://www.cacr.math.uwaterloo.ca/hac/about/chap14.pdf + +INPUT: T, m, m' + PyLong_BASE = 2**15 + n = len(m) (# of digits in m) + R = PyLong_BASE**n (next power of PyLong_BASE greater than m) + +OUTPUT: T*(R**-1) % m +*/ +static PyLongObject* +mont_reduce(PyLongObject *T, PyLongObject *m, digit mPrime) +{ + Py_ssize_t n, i; + PyLongObject *A; + + if (T->ob_size > 2*m->ob_size) { + PyErr_BadInternalCall(); + return NULL; + } + + n = m->ob_size; + /* A = T */ + A = (PyLongObject *)_PyLong_New(2*n+1); + if (A == NULL) + return NULL; + memset(A->ob_digit, 0, A->ob_size * sizeof(digit)); + memcpy(A->ob_digit, T->ob_digit, T->ob_size * sizeof(digit)); + + for (i=0; i < n; i++) { + digit *aptr, *mptr, *mendptr; + twodigits carry = 0; + digit u; + + /* u = a[i] * m' mod base */ + /* the * 1U forces the multiplication to be done using + unsigned types; see comments in digit_neginv above. */ + u = ((1U * A->ob_digit[i] * mPrime) & PyLong_MASK); + + /* A += u * m * base**i + Multiply and add in place: + A[i+j] += (u * m[j]) + carry */ + aptr = A->ob_digit + i; + mptr = m->ob_digit; + mendptr = mptr + n; + while (mptr < mendptr) { + carry += *aptr + (twodigits)u * *mptr++; + *aptr++ = (digit) (carry & PyLong_MASK); + carry >>= PyLong_SHIFT; + } + while (carry) { + carry += (twodigits) (*aptr); + *aptr++ = (digit) (carry & PyLong_MASK); + carry >>= PyLong_SHIFT; + } + } + + /* A = A/base**n (i.e. A >>= n) */ + memmove(A->ob_digit, A->ob_digit + n, (n+1) * sizeof(digit)); + A->ob_size = n+1; + long_normalize(A); + + /* If A >= m then A = A - m */ + if (long_compare(A, m) >= 0) + return (PyLongObject *)long_sub(A, m); + else + return A; +} + /* pow(v, w, x) */ static PyObject * long_pow(PyObject *v, PyObject *w, PyObject *x) @@ -3362,6 +3520,10 @@ PyLongObject *table[32] = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; + /* Montgomery values */ + int doMontgomery = 0; + digit cPrime = 0; + /* a, b, c = v, w, x */ CONVERT_BINOP(v, w, &a, &b); if (PyLong_Check(x)) { @@ -3447,16 +3609,49 @@ if (z == NULL) goto Error; + /* If modulus exists and is odd, we can use Montgomery Reduction + If exponent is greater than MONTGOMERY_CUTOFF, it's worth it */ + doMontgomery = c && (c->ob_digit[0] & 1) && + (b->ob_size > 1 || b->ob_digit[0] > MONTGOMERY_CUTOFF); + + if (doMontgomery) { + /* Calculate parameter c' = -pow(-c, -1, PyLong_BASE) + for use in the Montgomery reduction step. */ + cPrime = digit_neginv(c->ob_digit[0]); + + /* Replace a and z by their Montgomery representations. */ + temp = mont_rep(a, c); + if (temp == NULL) + goto Error; + Py_DECREF(a); + a = temp; + temp = NULL; + + temp = mont_rep(z, c); + if (temp == NULL) + goto Error; + Py_DECREF(z); + z = temp; + temp = NULL; + } + /* Perform a modular reduction, X = X % c, but leave X alone if c - * is NULL. + * is NULL. Use Montgomery Reduction if applicable. */ -#define REDUCE(X) \ - if (c != NULL) { \ - if (l_divmod(X, c, NULL, &temp) < 0) \ - goto Error; \ - Py_XDECREF(X); \ - X = temp; \ - temp = NULL; \ +#define REDUCE(X) \ + if (c != NULL) { \ + if (doMontgomery) { \ + temp = mont_reduce(X, c, cPrime); \ + if (temp == NULL) \ + goto Error; \ + } \ + else { \ + if (l_divmod(X, c, NULL, &temp) < 0) \ + goto Error; \ + } \ + Py_XDECREF(X); \ + X = temp; \ + temp = NULL; \ } /* Multiply two values, then reduce the result: @@ -3487,7 +3682,7 @@ } else { /* Left-to-right 5-ary exponentiation (HAC Algorithm 14.82) */ - Py_INCREF(z); /* still holds 1L */ + Py_INCREF(z); /* still holds 1L (or Montgomery rep. of 1L) */ table[0] = z; for (i = 1; i < 32; ++i) MULT(table[i-1], a, table[i]) @@ -3505,6 +3700,9 @@ } } + if (doMontgomery) + REDUCE(z); + if (negativeOutput && (Py_SIZE(z) != 0)) { temp = (PyLongObject *)long_sub(z, c); if (temp == NULL)