Index: longobject.c =================================================================== --- longobject.c (revision 66590) +++ longobject.c (working copy) @@ -2495,18 +2495,94 @@ } } else { /* a is not the same as b -- gradeschool long mult */ - for (i = 0; i < size_a; ++i) { - twodigits carry = 0; - twodigits f = a->ob_digit[i]; + /* To reduce the number of shift and mask operations + * collect terms in the following way + * (a[0] + a[1]*BASE + a[2]*BASE**2 + ...) * + * (b[0] + b[1]*BASE + b[2]*BASE**2 + ...) = + * a[0]*b[0] + (a[0]*b[1] + a[1]*b[0])*BASE + + * (a[0]*b[2] + a[1]*b[1])*BASE**2 + ... + */ + for (i = 0; i < size_a - 1; i += 2) { + twodigits f0 = a->ob_digit[i]; + twodigits f1 = a->ob_digit[i+1]; digit *pz = z->ob_digit + i; digit *pb = b->ob_digit; + twodigits carry = *pz + pb[0] * f0; + *pz++ = (digit)(carry & PyLong_MASK); + /* carry <= MASK*(MASK+1) = BASE**2 - BASE */ + carry >>= PyLong_SHIFT; + /* carry <= BASE - 1 = MASK */ + SIGCHECK({ + Py_DECREF(z); + return NULL; + }) + int j; + /* Bounds on carry in the loop to guarantee + * that it does not overflow: + * For j = 0: + * carry += *pz + pb[j+1] * f0 + pb[j] * f1; + * carry <= MASK + MASK + 2*MASK**2 + * = 2*MASK**2 + 2*MASK + * = 2*BASE**2 - 2*BASE + * carry >>= PyLong_SHIFT; + * carry <= 2*BASE - 2 = 2*MASK + * For j = 1: + * carry += *pz + pb[j+1] * f0 + pb[j] * f1; + * carry <= 2*MASK + MASK + 2*MASK**2 + * = 2*MASK**2 + 3*MASK + * = 2*BASE**2 - BASE - 1 + * carry >>= PyLong_SHIFT; + * carry <= 2*BASE - 2 = 2*MASK + * For j = 2: + * carry += *pz + pb[j+1] * f0 + pb[j] * f1; + * carry <= 2*MASK + MASK + 2*MASK**2 = + * = 2*MASK**2 + 3*MASK + * = 2*BASE**2 - BASE - 1 + * carry >>= PyLong_SHIFT; + * carry <= 2*BASE - 2 = 2*MASK; + * as in the case j=1, so that the bounds remain + * the same for the rest of the loop; therefore + * in this loop one has always + * carry <= 2*MASK**2 + 3*MASK + * which fits in a twodigits, see longintrepr.h + * + */ + for(j=0; j < 
size_b-1; j++) { + carry += *pz + pb[j+1] * f0 + pb[j] * f1; + *pz++ = (digit)(carry & PyLong_MASK); + carry >>= PyLong_SHIFT; + assert(carry <= 2*PyLong_MASK); + } + carry += *pz + pb[size_b-1] * f1; + /* carry <= 2*MASK + MASK + MASK**2 = + * = MASK**2 + 3*MASK = BASE**2 + BASE - 2 + */ + *pz++ = (digit)(carry & PyLong_MASK); + carry >>= PyLong_SHIFT; + /* carry <= BASE */ + if (carry) + *pz += (digit)(carry & PyLong_MASK); + /* according to the above bound one has + * (carry >> PyLong_SHIFT) <= 1 + * However it must be strictly + * (carry >> PyLong_SHIFT) == 0 + * otherwise z would exceed size_a + size_b, which + * is impossible. + */ + assert((carry >> PyLong_SHIFT) == 0); + } + if (size_a&1) { + twodigits carry = 0; + twodigits f = a->ob_digit[size_a - 1]; + digit *pz = z->ob_digit + size_a - 1; + digit *pb = b->ob_digit; digit *pbend = b->ob_digit + size_b; - + SIGCHECK({ Py_DECREF(z); return NULL; }) - + while (pb < pbend) { carry += *pz + *pb++ * f; *pz++ = (digit)(carry & PyLong_MASK);