Index: Objects/longobject.c
===================================================================
--- Objects/longobject.c	(revision 70547)
+++ Objects/longobject.c	(working copy)
@@ -1495,6 +1495,27 @@
 	return (digit)rem;
 }
 
+/* Multiply by a single digit, ignoring the sign. */
+
+static PyLongObject *
+mul1(PyLongObject *a, digit n)
+{
+	Py_ssize_t size_a = ABS(Py_SIZE(a));
+	PyLongObject *z = _PyLong_New(size_a+1);
+	twodigits carry = 0;
+	Py_ssize_t i;
+
+	if (z == NULL)
+		return NULL;
+	for (i = 0; i < size_a; ++i) {
+		carry += (twodigits)a->ob_digit[i] * n;
+		z->ob_digit[i] = (digit) (carry & PyLong_MASK);
+		carry >>= PyLong_SHIFT;
+	}
+	z->ob_digit[i] = (digit) carry;
+	return long_normalize(z);
+}
+
 /* Divide a long integer by a digit, returning both the quotient
    (as function result) and the remainder (through *prem).
    The sign of a is ignored; n should not be zero. */
@@ -2523,6 +2544,122 @@
 	return (PyObject *)z;
 }
 
+/* Here's a simple optimization for basecase multiplication that can achieve
+   speedups of up to 400% on some 64-bit platforms.  It uses the fact that
+   twodigits can represent values up to 16*PyLong_BASE*PyLong_BASE (assuming
+   that PyLong_SHIFT is 30), so up to 16 partial products can be accumulated
+   at once.  The innermost loop then generally contains fewer instructions
+   than in the usual long multiplication algorithm. */
+
+#if PyLong_SHIFT == 15
+#define BLOCK_MUL_SIZE 4
+#elif PyLong_SHIFT == 30
+#define BLOCK_MUL_SIZE 16
+#else
+#error "expected PyLong_SHIFT to be 15 or 30"
+#endif
+
+/* res[0:a_size+b_size] := a*b, assuming that b_size <= BLOCK_MUL_SIZE,
+   b_size <= a_size. */
+
+static void
+digits_multiply_init(digit *res, const digit *a, Py_ssize_t a_size,
+		     const digit *b, Py_ssize_t b_size)
+{
+	twodigits acc = 0;
+	Py_ssize_t j, k;
+	assert(b_size <= BLOCK_MUL_SIZE && b_size <= a_size);
+	for (k=0; k<b_size; k++) {
+		for (j = 0; j <= k; j++)
+			acc += (twodigits)b[j] * a[k-j];
+		res[k] = (digit)(acc & PyLong_MASK);
+		acc >>= PyLong_SHIFT;
+	}
+	for (; k<a_size; k++) {
+		for (j = 0; j < b_size; j++)
+			acc += (twodigits)b[j] * a[k-j];
+		res[k] = (digit)(acc & PyLong_MASK);
+		acc >>= PyLong_SHIFT;
+	}
+	for (; k<a_size+b_size; k++) {
+		for (j = k-a_size+1; j < b_size; j++)
+			acc += (twodigits)b[j] * a[k-j];
+		res[k] = (digit)(acc & PyLong_MASK);
+		acc >>= PyLong_SHIFT;
+	}
+	assert(acc == 0);
+}
+
+/* variant of the above: res[0:a_size+BLOCK_MUL_SIZE] :=
+   a[0:a_size]*b[0:BLOCK_MUL_SIZE] + res[0:a_size], assuming that
+   BLOCK_MUL_SIZE <= a_size */
+
+static void
+digits_multiply_add(digit *res, const digit *a, Py_ssize_t a_size,
+		    const digit *b)
+{
+	twodigits acc = 0;
+	Py_ssize_t j, k;
+	assert(BLOCK_MUL_SIZE <= a_size);
+	for (k=0; k<BLOCK_MUL_SIZE; k++) {
+		acc += res[k];
+		for (j = 0; j <= k; j++)
+			acc += (twodigits)b[j] * a[k-j];
+		res[k] = (digit)(acc & PyLong_MASK);
+		acc >>= PyLong_SHIFT;
+	}
+	for (k=BLOCK_MUL_SIZE; k<a_size; k++) {
+		acc += res[k];
+		for (j = 0; j < BLOCK_MUL_SIZE; j++)
+			acc += (twodigits)b[j] * a[k-j];
+		res[k] = (digit)(acc & PyLong_MASK);
+		acc >>= PyLong_SHIFT;
+	}
+	for (k=0; k<BLOCK_MUL_SIZE; k++) {
+		for (j = k+1; j < BLOCK_MUL_SIZE; j++)
+			acc += (twodigits)b[j] * a[k+a_size-j];
+		res[k+a_size] = (digit)(acc & PyLong_MASK);
+		acc >>= PyLong_SHIFT;
+	}
+	assert(acc == 0);
+}
+
+/* res[0:a_size+b_size] := a * b */
+
+static void
+digits_multiply(digit *res, const digit *a, Py_ssize_t a_size,
+		const digit *b, Py_ssize_t b_size)
+{
+	Py_ssize_t init_size;
+
+	/* force b_size <= a_size */
+	if (a_size < b_size) {
+		const digit *temp;
+		Py_ssize_t temp_size;
+		temp = a; a = b; b = temp;
+		temp_size = a_size; a_size = b_size; b_size = temp_size;
+	}
+
+	assert(b_size > 0);
+
+	/* split b into pieces, the first of size at most BLOCK_MUL_SIZE and
+	   all others of size exactly BLOCK_MUL_SIZE.  digits_multiply_init
+	   and digits_multiply_add do the actual multiplication. */
+	init_size = (b_size - 1) % BLOCK_MUL_SIZE + 1;
+	digits_multiply_init(res, a, a_size, b, init_size);
+	b += init_size;
+	res += init_size;
+	b_size -= init_size;
+
+	while (b_size >= BLOCK_MUL_SIZE) {
+		digits_multiply_add(res, a, a_size, b);
+		b_size -= BLOCK_MUL_SIZE;
+		b += BLOCK_MUL_SIZE;
+		res += BLOCK_MUL_SIZE;
+	}
+	assert(b_size == 0);
+}
+
 /* Grade school multiplication, ignoring the signs.
  * Returns the absolute value of the product, or NULL if error.
  */
@@ -2538,8 +2675,8 @@
 	if (z == NULL)
 		return NULL;
 
-	memset(z->ob_digit, 0, Py_SIZE(z) * sizeof(digit));
 	if (a == b) {
+		memset(z->ob_digit, 0, Py_SIZE(z) * sizeof(digit));
 		/* Efficient squaring per HAC, Algorithm 14.16:
 		 * http://www.cacr.math.uwaterloo.ca/hac/about/chap14.pdf
 		 * Gives slightly less than a 2x speedup when a == b,
@@ -2584,28 +2721,8 @@
 		}
 	}
 	else {	/* a is not the same as b -- gradeschool long mult */
-		for (i = 0; i < size_a; ++i) {
-			twodigits carry = 0;
-			twodigits f = a->ob_digit[i];
-			digit *pz = z->ob_digit + i;
-			digit *pb = b->ob_digit;
-			digit *pbend = b->ob_digit + size_b;
-
-			SIGCHECK({
-				Py_DECREF(z);
-				return NULL;
-			})
-
-			while (pb < pbend) {
-				carry += *pz + *pb++ * f;
-				*pz++ = (digit)(carry & PyLong_MASK);
-				carry >>= PyLong_SHIFT;
-				assert(carry <= PyLong_MASK);
-			}
-			if (carry)
-				*pz += (digit)(carry & PyLong_MASK);
-			assert((carry >> PyLong_SHIFT) == 0);
-		}
+		digits_multiply(z->ob_digit, a->ob_digit, size_a,
+				b->ob_digit, size_b);
 	}
 	return long_normalize(z);
 }
@@ -2927,30 +3044,82 @@
 long_mul(PyLongObject *a, PyLongObject *b)
 {
 	PyLongObject *z;
+	Py_ssize_t a_size, b_size;
 
 	CHECK_BINOP(a, b);
 
 	/* fast path for single-digit multiplication */
 	if (ABS(Py_SIZE(a)) <= 1 && ABS(Py_SIZE(b)) <= 1) {
-		stwodigits v = (stwodigits)(MEDIUM_VALUE(a)) * MEDIUM_VALUE(b);
-#ifdef HAVE_LONG_LONG
-		return PyLong_FromLongLong((PY_LONG_LONG)v);
-#else
-		/* if we don't have long long then we're almost certainly
-		   using 15-bit digits, so v will fit in a long.  In the
-		   unlikely event that we're using 30-bit digits on a platform
-		   without long long, a large v will just cause us to fall
-		   through to the general multiplication code below. */
-		if (v >= LONG_MIN && v <= LONG_MAX)
-			return PyLong_FromLong((long)v);
-#endif
+		/* XXX benchmark this!  Is it worth keeping? */
+		twodigits absz;
+		int sign;
+		sign = Py_SIZE(a) * Py_SIZE(b);
+		if (sign == 0)
+			return PyLong_FromLong(0L);
+		absz = (twodigits)a->ob_digit[0] * b->ob_digit[0];
+		if (absz < PyLong_BASE) {
+			CHECK_SMALL_INT((sdigit)(sign*absz));
+			z = _PyLong_New(1);
+			if (z != NULL) {
+				Py_SIZE(z) = sign;
+				z->ob_digit[0] = (digit)absz;
+			}
+		}
+		else {
+			z = _PyLong_New(2);
+			if (z != NULL) {
+				Py_SIZE(z) = 2*sign;
+				z->ob_digit[0] = (digit)(absz & PyLong_MASK);
+				assert((absz >> 2*PyLong_SHIFT) == 0);
+				z->ob_digit[1] = (digit)(absz >> PyLong_SHIFT);
+			}
+		}
+		return (PyObject *)z;
 	}
 
-	z = k_mul(a, b);
+	a_size = ABS(Py_SIZE(a));
+	b_size = ABS(Py_SIZE(b));
+	if((a_size <= 1) || (b_size <= 1)){
+		if (b_size < a_size){
+			if(b_size)
+				z = mul1(a, b->ob_digit[0]);
+			else
+				return PyLong_FromLong(0L);
+		}
+		else {
+			if(a_size)
+				z = mul1(b, a->ob_digit[0]);
+			else
+				return PyLong_FromLong(0L);
+		}
+		if ((Py_SIZE(a) < 0) ^ (Py_SIZE(b) < 0)){
+			NEGATE(z);
+		}
+		return (PyObject *)z;
+	}
+
+	/* use fast basecase multiplication if the smaller of a and b has at
+	   most 480 bits (60 bits if PyLong_SHIFT = 15) */
+	if (b_size <= BLOCK_MUL_SIZE || a_size <= BLOCK_MUL_SIZE) {
+		z = _PyLong_New(a_size + b_size);
+		if (z == NULL)
+			return NULL;
+		if (b_size <= a_size)
+			digits_multiply_init(z->ob_digit,
+					     a->ob_digit, a_size,
+					     b->ob_digit, b_size);
+		else
+			digits_multiply_init(z->ob_digit,
+					     b->ob_digit, b_size,
+					     a->ob_digit, a_size);
+	}
+	else
+		z = k_mul(a, b);
+
+	if ((Py_SIZE(a) < 0) ^ (Py_SIZE(b) < 0))
 		/* Negate if exactly one of the inputs is negative. */
-	if (((Py_SIZE(a) ^ Py_SIZE(b)) < 0) && z)
 		NEGATE(z);
-	return (PyObject *)z;
+	return (PyObject *)(long_normalize(z));
 }
 
 /* The / and % operators are now defined in terms of divmod().