Index: Doc/library/math.rst =================================================================== --- Doc/library/math.rst (revision 65315) +++ Doc/library/math.rst (working copy) @@ -90,32 +90,6 @@ algorithm's accuracy depends on IEEE-754 arithmetic guarantees and the typical case where the rounding mode is half-even. - .. note:: - - On platforms where arithmetic results are not correctly rounded, - :func:`fsum` may occasionally produce incorrect results; these - results should be no less accurate than those from the builtin - :func:`sum` function, but nevertheless may have arbitrarily - large relative error. - - In particular, this affects some older Intel hardware (for - example Pentium and earlier x86 processors) that makes use of - 'extended precision' floating-point registers with 64 bits of - precision instead of the 53 bits of precision provided by a C - double. Arithmetic operations using these registers may be - doubly rounded (rounded first to 64 bits, and then rerounded to - 53 bits), leading to incorrectly rounded results. To test - whether your machine is one of those affected, try the following - at a Python prompt:: - - >>> 1e16 + 2.9999 - 10000000000000002.0 - - Machines subject to the double-rounding problem described above - are likely to print ``10000000000000004.0`` instead of - ``10000000000000002.0``. - - .. versionadded:: 2.6 Index: Modules/mathmodule.c =================================================================== --- Modules/mathmodule.c (revision 65309) +++ Modules/mathmodule.c (working copy) @@ -54,6 +54,7 @@ #include "Python.h" #include "longintrepr.h" /* just for SHIFT */ +#include "float.h" #ifdef _OSF_SOURCE /* OSF1 5.1 doesn't make this available with XOPEN_SOURCE_EXTENDED defined */ @@ -312,226 +313,265 @@ FUNC1(tanh, tanh, 0, "tanh(x)\n\nReturn the hyperbolic tangent of x.") -/* Precision summation function as msum() by Raymond Hettinger in +/* Full precision summation of a sequence of floats. 
Based on the + function lsum by Raymond Hettinger at <http://code.activestate.com/recipes/393090/>, - enhanced with the exact partials sum and roundoff from Mark - Dickinson's post at <http://bugs.python.org/file10357/msum4.py>. - See those links for more details, proofs and other references. + and following implementation suggestions from Tim Peters. - Note 1: IEEE 754R floating point semantics are assumed, - but the current implementation does not re-establish special - value semantics across iterations (i.e. handling -Inf + Inf). + Method: maintain the sum so far in the form - Note 2: No provision is made for intermediate overflow handling; - therefore, sum([1e+308, 1e-308, 1e+308]) returns 1e+308 while - sum([1e+308, 1e+308, 1e-308]) raises an OverflowError due to the - overflow of the first partial sum. + huge_integer * 2**FSUM_MIN_EXP - Note 3: The intermediate values lo, yr, and hi are declared volatile so - aggressive compilers won't algebraically reduce lo to always be exactly 0.0. - Also, the volatile declaration forces the values to be stored in memory as - regular doubles instead of extended long precision (80-bit) values. This - prevents double rounding because any addition or subtraction of two doubles - can be resolved exactly into double-sized hi and lo values. As long as the - hi value gets forced into a double before yr and lo are computed, the extra - bits in downstream extended precision operations (x87 for example) will be - exactly zero and therefore can be losslessly stored back into a double, - thereby preventing double rounding. + where FSUM_MIN_EXP is such that every finite double can be + expressed as an integral multiple of 2**FSUM_MIN_EXP. huge_integer + is stored in base FSUM_BASE. Following a suggestion from Tim + Peters, we use a signed-digit representation: the digits for the + base FSUM_BASE representation all lie in the range [-FSUM_BASE/2, + FSUM_BASE/2). 
It's not hard to show that every integer (positive + or negative), can be expressed uniquely in the form - Note 4: A similar implementation is in Modules/cmathmodule.c. - Be sure to update both when making changes. + a0 + a1*FSUM_BASE + a2*FSUM_BASE**2 + ... - Note 5: The signature of math.fsum() differs from __builtin__.sum() + This choice of representation makes it easy to deal with both + positive and negative summands. + + Note: The signature of math.fsum() differs from __builtin__.sum() because the start argument doesn't make sense in the context of - accurate summation. Since the partials table is collapsed before - returning a result, sum(seq2, start=sum(seq1)) may not equal the + accurate summation. sum(seq2, start=sum(seq1)) may not equal the accurate result returned by sum(itertools.chain(seq1, seq2)). */ -#define NUM_PARTIALS 32 /* initial partials array size, on stack */ +#define FSUM_SIZE 30 +#define FSUM_BASE (1 << FSUM_SIZE) +#define FSUM_MIN_EXP (DBL_MIN_EXP - DBL_MANT_DIG) +/* allow at least 60 extra bits at the top end, to cope with intermediate + overflow. On an IEEE 754 machine, FSUM_ACC_SIZE is 72. */ +#define FSUM_ACC_SIZE ((DBL_MAX_EXP - FSUM_MIN_EXP + 60)/FSUM_SIZE + 1) -/* Extend the partials array p[] by doubling its size. */ -static int /* non-zero on error */ -_fsum_realloc(double **p_ptr, Py_ssize_t n, - double *ps, Py_ssize_t *m_ptr) +/* Express digit + carry in the form r + FSUM_BASE*carry, with -FSUM_BASE/2 <= + r < FSUM_BASE/2, and -1 <= carry <= 1. Return r and update carry. 
*/ + +static long +fsum_reduce_digit(long digit, long *carry) { - void *v = NULL; - Py_ssize_t m = *m_ptr; - - m += m; /* double */ - if (n < m && m < (PY_SSIZE_T_MAX / sizeof(double))) { - double *p = *p_ptr; - if (p == ps) { - v = PyMem_Malloc(sizeof(double) * m); - if (v != NULL) - memcpy(v, ps, sizeof(double) * n); - } - else - v = PyMem_Realloc(p, sizeof(double) * m); + assert(-3*(FSUM_BASE/2)+1 <= digit && digit < 3*(FSUM_BASE/2) - 1); + digit += *carry; + if (digit < -FSUM_BASE/2) { + *carry = -1; + digit += FSUM_BASE; + } else if (digit >= FSUM_BASE/2) { + *carry = 1; + digit -= FSUM_BASE; + } else { + *carry = 0; } - if (v == NULL) { /* size overflow or no memory */ - PyErr_SetString(PyExc_MemoryError, "math.fsum partials"); - return 1; - } - *p_ptr = (double*) v; - *m_ptr = m; - return 0; + assert(-FSUM_BASE/2 <= digit && digit < FSUM_BASE/2); + return digit; } -/* Full precision summation of a sequence of floats. - - def msum(iterable): - partials = [] # sorted, non-overlapping partial sums - for x in iterable: - i = 0 - for y in partials: - if abs(x) < abs(y): - x, y = y, x - hi = x + y - lo = y - (hi - x) - if lo: - partials[i] = lo - i += 1 - x = hi - partials[i:] = [x] - return sum_exact(partials) - - Rounded x+y stored in hi with the roundoff stored in lo. Together hi+lo - are exactly equal to x+y. The inner loop applies hi/lo summation to each - partial so that the list of partial sums remains exact. - - Sum_exact() adds the partial sums exactly and correctly rounds the final - result (using the round-half-to-even rule). The items in partials remain - non-zero, non-special, non-overlapping and strictly increasing in - magnitude, but possibly not all having the same sign. - - Depends on IEEE 754 arithmetic guarantees and half-even rounding. 
-*/ - static PyObject* math_fsum(PyObject *self, PyObject *seq) { - PyObject *item, *iter, *sum = NULL; - Py_ssize_t i, j, n = 0, m = NUM_PARTIALS; - double x, y, t, ps[NUM_PARTIALS], *p = ps; - double xsave, special_sum = 0.0, inf_sum = 0.0; - volatile double hi, yr, lo; + PyObject *item, *iter; + double x, m, mr, inf_sum = 0.0, special_sum = 0.0; + int e, q, i, round_up, ndigits, top_exp; + long digit, half_eps, acc[FSUM_ACC_SIZE], carry, sign; iter = PyObject_GetIter(seq); if (iter == NULL) return NULL; - PyFPE_START_PROTECT("fsum", Py_DECREF(iter); return NULL) + /* initialize accumulator */ + for (i = 0; i < FSUM_ACC_SIZE; i++) + acc[i] = 0L; - for(;;) { /* for x in iterable */ - assert(0 <= n && n <= m); - assert((m == NUM_PARTIALS && p == ps) || - (m > NUM_PARTIALS && p != NULL)); + /******************************************** + * Stage 1: accumulate values from iterable * + ********************************************/ + for(;;) { item = PyIter_Next(iter); if (item == NULL) { + Py_DECREF(iter); if (PyErr_Occurred()) - goto _fsum_error; + return NULL; break; } x = PyFloat_AsDouble(item); Py_DECREF(item); - if (PyErr_Occurred()) - goto _fsum_error; + if (PyErr_Occurred()){ + Py_DECREF(iter); + return NULL; + } - xsave = x; - for (i = j = 0; j < n; j++) { /* for y in partials */ - y = p[j]; - if (fabs(x) < fabs(y)) { - t = x; x = y; y = t; - } - hi = x + y; - yr = hi - x; - lo = y - yr; - if (lo != 0.0) - p[i++] = lo; - x = hi; + if (!Py_IS_FINITE(x)) { + /* accumulate specials in special_sum. At the end + of the summation, special_sum is: + + (1) 0.0 if no specials occurred in the sum + (2) inf if inf occurred, but -inf and nan did not + (3) -inf if -inf occurred, but inf and nan did not + (4) nan if either: + nan occurred in the sum, and/or: + both inf and -inf occurred in the sum. + + In case (2) or (3) we want to return the infinity. + In case (4) we want to raise ValueError if both + infinities occurred, else return nan. 
To + disambiguate, we also accumulate infinities (but + not nans) in inf_sum. inf_sum is then equal to nan + at the end of the calculation if and only if both + signs of infinity occurred in the sum. + */ + if (Py_IS_INFINITY(x)) + inf_sum += x; + special_sum += x; + continue; } - n = i; /* ps[i:] = [x] */ - if (x != 0.0) { - if (! Py_IS_FINITE(x)) { - /* a nonfinite x could arise either as - a result of intermediate overflow, or - as a result of a nan or inf in the - summands */ - if (Py_IS_FINITE(xsave)) { - PyErr_SetString(PyExc_OverflowError, - "intermediate overflow in fsum"); - goto _fsum_error; - } - if (Py_IS_INFINITY(xsave)) - inf_sum += xsave; - special_sum += xsave; - /* reset partials */ - n = 0; + if (x == 0.0) + continue; + + /* write x/2**FSUM_MIN_EXP as m*FSUM_BASE**q, m integral. */ + m = frexp(x, &e); + e -= DBL_MIN_EXP; + if (e < 0) { + m = ldexp(m, e + DBL_MANT_DIG); + q = 0; + } + else { + m = ldexp(m, DBL_MANT_DIG + e % FSUM_SIZE); + q = e / FSUM_SIZE; + } + assert(m == floor(m)); + + /* add m*FSUM_BASE**q into the accumulator */ + carry = 0; + while (m != 0.0) { + mr = fmod(m, (double)FSUM_BASE); + m = (m-mr)/(double)FSUM_BASE; + acc[q] = fsum_reduce_digit(acc[q] + (long)mr, &carry); + q++; + } + /* finished adding m, but there may still be a carry */ + while (carry != 0) { + if (q == FSUM_ACC_SIZE) { + /* extremely unlikely to get here */ + Py_DECREF(iter); + PyErr_SetString(PyExc_OverflowError, + "Intermediate overflow in fsum"); + return NULL; } - else if (n >= m && _fsum_realloc(&p, n, ps, &m)) - goto _fsum_error; - else - p[n++] = x; + acc[q] = fsum_reduce_digit(acc[q], &carry); + q++; } } + /*************************************************** + * Stage 2: compute correctly rounded return value * + ***************************************************/ + + /* deal with any special values that occurred. See note above. 
*/ if (special_sum != 0.0) { - if (Py_IS_NAN(inf_sum)) + if (Py_IS_NAN(inf_sum)) { PyErr_SetString(PyExc_ValueError, "-inf + inf in fsum"); - else - sum = PyFloat_FromDouble(special_sum); - goto _fsum_error; + return NULL; + } + return PyFloat_FromDouble(special_sum); } - hi = 0.0; - if (n > 0) { - hi = p[--n]; - /* sum_exact(ps, hi) from the top, stop when the sum becomes - inexact. */ - while (n > 0) { - x = hi; - y = p[--n]; - assert(fabs(y) < fabs(x)); - hi = x + y; - yr = hi - x; - lo = y - yr; - if (lo != 0.0) - break; + /* strip zero limbs from top */ + ndigits = FSUM_ACC_SIZE; + while (ndigits > 0 && acc[ndigits-1] == 0) + ndigits--; + if (ndigits == 0) + return PyFloat_FromDouble(0.0); + + /* sign of result == sign of topmost nonzero limb */ + sign = acc[ndigits-1] > 0 ? 1 : -1; + + /* take absolute value of accumulator, renormalizing digits + to lie in the range [0, FSUM_BASE) */ + carry = 0; + for (i = 0; i < ndigits; i++) { + /* FSUM_BASE/2-1 <= acc[i]*sign + carry <= FSUM_BASE/2 */ + digit = acc[i] * sign + carry; + if (digit < 0) { + acc[i] = digit + FSUM_BASE; + carry = -1; } - /* Make half-even rounding work across multiple partials. - Needed so that sum([1e-16, 1, 1e16]) will round-up the last - digit to two instead of down to zero (the 1e-16 makes the 1 - slightly closer to two). With a potential 1 ULP rounding - error fixed-up, math.fsum() can guarantee commutativity. */ - if (n > 0 && ((lo < 0.0 && p[n-1] < 0.0) || - (lo > 0.0 && p[n-1] > 0.0))) { - y = lo * 2.0; - x = hi + y; - yr = x - hi; - if (y == yr) - hi = x; + else { + acc[i] = digit; + carry = 0; } } - sum = PyFloat_FromDouble(hi); + assert(carry == 0); + /* it's possible that the normalization leads to a zero top digit */ + if (acc[ndigits-1] == 0) + ndigits--; + assert(ndigits > 0 && acc[ndigits-1] != 0); -_fsum_error: - PyFPE_END_PROTECT(hi) - Py_DECREF(iter); - if (p != ps) - PyMem_Free(p); - return sum; + /* Round acc * 2**FSUM_MIN_EXP to the nearest double. 
*/ + + /* 2**(top_exp-1) <= |sum| < 2**top_exp */ + top_exp = FSUM_MIN_EXP + FSUM_SIZE*(ndigits-1); + for (digit = acc[ndigits-1]; digit != 0; digit /= 2) + top_exp++; + /* catch almost all overflows here */ + if (top_exp > DBL_MAX_EXP) + goto overflow_error; + + m = 0.0; + if (top_exp <= DBL_MIN_EXP) { + /* no need for rounding */ + for (i = ndigits-1; i >= 0; i--) + m = FSUM_BASE*m + acc[i]; + return PyFloat_FromDouble((double)sign * + ldexp(m, FSUM_MIN_EXP)); + } + + /* round: e is the position of the first bit to be rounded away. */ + e = top_exp - DBL_MIN_EXP - 1; + half_eps = 1 << (e % FSUM_SIZE); + q = e / FSUM_SIZE; + for (i = ndigits-1; i > q; i--) + m = FSUM_BASE*m + acc[i]; + digit = acc[q]; + m = FSUM_BASE*m + (digit & (FSUM_BASE-2*half_eps)); + if ((digit & half_eps) != 0) { + round_up = 0; + if ((digit & (3*half_eps-1)) != 0 || + (half_eps == FSUM_BASE/2 && (acc[q+1] & 1) != 0)) + round_up = 1; + else + for (i = q-1; i >= 0; i--) + if (acc[i] != 0) { + round_up = 1; + break; + } + if (round_up == 1) { + m += 2*half_eps; + if (top_exp == DBL_MAX_EXP && + m == ldexp((double)(2*half_eps), DBL_MANT_DIG)) + /* overflow corner case: result before + rounding is within range, but rounded + result overflows. */ + goto overflow_error; + } + } + return PyFloat_FromDouble((double)sign * + ldexp(m, FSUM_MIN_EXP+FSUM_SIZE*q)); + + overflow_error: + PyErr_SetString(PyExc_OverflowError, + "fsum result too large to represent as a float"); + return NULL; } -#undef NUM_PARTIALS - PyDoc_STRVAR(math_fsum_doc, -"sum(iterable)\n\n\ -Return an accurate floating point sum of values in the iterable.\n\ -Assumes IEEE-754 floating point arithmetic."); +"fsum(iterable)\n\n\ +Return an accurate floating point sum of values in the iterable."); static PyObject * math_factorial(PyObject *self, PyObject *arg)