## Module statistics.py ## ## Copyright (c) 2013 Steven D'Aprano. ## ## Permission is hereby granted, free of charge, to any person obtaining ## a copy of this software and associated documentation files (the ## "Software"), to deal in the Software without restriction, including ## without limitation the rights to use, copy, modify, merge, publish, ## distribute, sublicense, and/or sell copies of the Software, and to ## permit persons to whom the Software is furnished to do so, subject to ## the following conditions: ## ## The above copyright notice and this permission notice shall be ## included in all copies or substantial portions of the Software. ## ## THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, ## EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF ## MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. ## IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY ## CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, ## TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE ## SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ Statistics module for Python 3.3 and better. Calculate statistics of data. This module provides the following functions and classes: Summary: ================== ============================================= Function Description ================== ============================================= add_partial Utility for performing high-precision sums. mean Arithmetic mean (average) of data. median Median (middle value) of data. mode Mode (most common value) of data. pstdev Population standard deviation of data. pvariance Population variance of data. StatisticsError Exception for statistics errors. stdev Sample standard deviation of data. sum High-precision sum of data. variance Sample variance of data. ================== ============================================= Examples -------- >>> mean([-1.0, 2.5, 3.25, 5.75]) 2.625 >>> stdev([2.5, 3.25, 5.5, 11.25, 11.75]) #doctest: +ELLIPSIS 4.38961843444... """ # Module metadata. __version__ = "0.1a" __date__ = "2013-07-31" __author__ = "Steven D'Aprano" __author_email__ = "steve+python@pearwood.info" __all__ = [ 'add_partial', 'sum', 'StatisticsError', 'pstdev', 'pvariance', 'stdev', 'variance', 'mean', 'median', 'mode', ] import collections import math import numbers import operator from builtins import sum as _sum # === Exceptions === class StatisticsError(ValueError): pass # === Public utilities === # Thanks to Raymond Hettinger for his recipe: # http://code.activestate.com/recipes/393090/ def add_partial(x, partials): """Helper function for full-precision summation of binary floats. Add float x in place to the list partials, keeping the sum exact with no rounding error. Arguments --------- x Must be a float. partials A list containing the partial sums. Description ----------- Initialise partials to be an empty list. Then for each float value ``x`` you wish to add, call ``add_partial(x, partials)``. When you are done, call the built-in ``sum(partials)`` to round the result to the standard float precision. If any x is not a float, or partials is not initialised to an empty list, results are undefined. Examples -------- >>> partials = [] >>> for x in (0.125, 1e100, 1e-50, 0.125, 1e100): ... add_partial(x, partials) >>> partials [0.0, 1e-50, 0.25, 2e+100] """ # Keep these as assertions so they can be optimized away. 
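    # (Running the interpreter with -O strips assert statements, and
    # with them this argument checking.)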
assert isinstance(x, float) and isinstance(partials, list) if not partials: partials.append(0.0) # Holder for NAN/INF values. if not math.isfinite(x): partials[0] += x return # Rounded x+y stored in hi with the round-off stored in lo. Together # hi+lo are exactly equal to x+y. The inner loop applies hi/lo summation # to each partial so that the list of partial sums remains exact. # Depends on IEEE-754 arithmetic guarantees. See proof of correctness at: # www-2.cs.cmu.edu/afs/cs/project/quake/public/papers/robust-arithmetic.ps i = 1 for y in partials[1:]: if abs(x) < abs(y): x, y = y, x hi = x + y lo = y - (hi - x) if lo: partials[i] = lo i += 1 x = hi assert i > 0 partials[i:] = [x] def sum(data, start=0): """sum(data [, start]) -> value Return a high-precision sum of the given numeric data. If optional argument ``start`` is given, it is added to the total. If ``data`` is empty, ``start`` (defaulting to 0) is returned. Examples -------- >>> sum([3, 2.25, 4.5, -0.5, 1.0], 0.75) 11.0 Float sums are calculated using high-precision floating point arithmetic that can avoid some sources of round-off error: >>> sum([1e50, 1, -1e50] * 1000) # Built-in sum returns zero. 1000.0 Fractions and Decimals are also supported: >>> from fractions import Fraction as F >>> sum([F(2, 3), F(7, 5), F(1, 4), F(5, 6)]) Fraction(63, 20) Decimal sums honour the context: >>> import decimal >>> D = decimal.Decimal >>> data = [D("0.1375"), D("0.2108"), D("0.3061"), D("0.0419")] >>> sum(data) Decimal('0.6963') >>> with decimal.localcontext( ... decimal.Context(prec=2, rounding=decimal.ROUND_DOWN)): ... sum(data) Decimal('0.68') Limitations ----------- ``sum`` supports mixed arithmetic with the following limitations: - mixing Fractions and Decimals raises TypeError; - mixing floats with either Fractions or Decimals coerces to float, which may lose precision; - complex numbers are not supported. These limitations may change without notice in future versions. """ if not isinstance(start, numbers.Number): raise TypeError('sum only accepts numbers') total = start data = iter(data) x = None if not isinstance(total, float): # Non-float sum. If we find a float, we exit this loop and continue # with the float code below. Until that happens, we keep adding. for x in data: if isinstance(x, float): # Convert running total to a float. See comment below for # why we do it this way. total = type(total).__float__(total) break total += x else: # No break, so we're done. return total # High-precision float sum. assert isinstance(total, float) partials = [] add_partial(total, partials) if x is not None: add_partial(x, partials) for x in data: try: # Don't call float() directly, as that converts strings and we # don't want that. Also, like all dunder methods, we should call # __float__ on the class, not the instance. x = type(x).__float__(x) except OverflowError: x = float('inf') if x > 0 else float('-inf') add_partial(x, partials) return _sum(partials) # === Private utilities === class _countiter: """Iterator that counts how many elements it has seen. >>> c = _countiter(['a', 1, None, 'c']) >>> _ = list(c) >>> c.count 4 """ def __init__(self, iterable): self.it = iter(iterable) self.count = 0 def __next__(self): x = next(self.it) # This must occur before incrementing the count. self.count += 1 return x def __iter__(self): return self def _welford(data): """Welford's one-pass method for calculating variance and mean. 
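
    For example, the data 2.0, 4.0, 6.0 has mean 4 and sum of squared
    deviations (-2)**2 + 0**2 + 2**2 == 8, and all of the arithmetic
    happens to be exact in binary floats:

    >>> _welford(_countiter([2.0, 4.0, 6.0]))
    (8.0, 4.0, 3)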
    Expects ``data`` to be a _countiter, and returns a three-tuple of:

    - the sum of square deviations from the mean;
    - the mean;
    - the number of items.

    """
    assert type(data) is _countiter
    n = 0
    m = 0
    ss = 0
    for x in data:
        n = n + 1
        delta = x - m
        m = m + delta/n
        ss = ss + delta*(x - m)
    assert n == data.count
    return (ss, m, n)


def _direct(data, m, n):
    """Direct method for calculating variance (compensated version).

    Expects ``data`` to be a sequence, and ``m`` to be the mean of the
    data. ``n`` should be the number of items, or None. Returns the sum
    of squared deviations from the mean.
    """
    assert m is not None
    ss = sum((x-m)**2 for x in data)
    if n:
        # The following sum should mathematically equal zero, but due
        # to rounding error it may not.
        ss -= sum((x-m) for x in data)**2/n
    return ss


def _var_helper(data, m):
    """Return (sum of square deviations, mean, count) of data."""
    # Under no circumstances use the so-called "computational formula
    # for variance", as that is only suitable for hand calculations
    # with a small amount of low-precision data. It has terrible
    # numeric properties.
    #
    # See a comparison of three computational methods here:
    # http://www.johndcook.com/blog/2008/09/26/comparing-three-methods-of-computing-standard-deviation/
    try:
        n = len(data)
    except TypeError:
        n = None
        data = _countiter(data)
    if m is None:
        if n is None:
            # data must be an iterator.
            ss, m, n = _welford(data)
        else:
            m = mean(data)
            ss = _direct(data, m, n)
    else:
        ss = _direct(data, m, n)
    if n is None:
        n = data.count
    assert not ss < 0, 'sum of square deviations is negative'
    return (ss, m, n)


# === Measures of central tendency (averages) ===

def mean(data):
    """mean(data) -> arithmetic mean of data

    Return the sample arithmetic mean of ``data``, a sequence or
    iterator of real-valued numbers.

    The arithmetic mean is the sum of the data divided by the number of
    data points. It is commonly called "the average", although it is
    only one of many different mathematical averages. It is a measure
    of the central location of the data.

    Examples
    --------

    >>> mean([1, 2, 3, 4, 4])
    2.8

    >>> from fractions import Fraction as F
    >>> mean([F(3, 7), F(1, 21), F(5, 3), F(1, 3)])
    Fraction(13, 21)

    >>> from decimal import Decimal as D
    >>> mean([D("0.5"), D("0.75"), D("0.625"), D("0.375")])
    Decimal('0.5625')

    Errors
    ------
    If ``data`` is empty, StatisticsError will be raised.

    Additional Information
    ----------------------
    The mean is strongly affected by outliers and is not a robust
    estimator for central location: the mean is not necessarily a
    typical example of the data points. For more robust, although less
    efficient, measures of central location, see ``median`` and
    ``mode``.

    The sample mean gives an unbiased estimate of the true population
    mean, which means that, on average, ``mean(sample)`` will equal the
    mean of the entire population. If you call ``mean`` with the entire
    population, the result returned is the population mean
    \N{GREEK SMALL LETTER MU}.
    """
    try:
        n = len(data)
    except TypeError:
        n = None
        data = _countiter(data)
    total = sum(data)
    if n is None:
        n = data.count
    if n:
        return total/n
    else:
        raise StatisticsError('mean of empty data is not defined')


class median:
    """Return the median (middle value) of numeric data.

    There are (at least) four different methods for calculating the
    median, depending on whether or not you allow interpolation between
    data points:

    1. The most common method returns the middle value if there are an
       odd number of values, or the average of the two middle values
       when there are an even number of values.

    2. The "low median" returns the middle value, or the smaller of the
       two middle values.

    3. The "high median" returns the middle value, or the larger of the
       two middle values.

    4. For grouped continuous data, it is common to treat the median as
       the 50th percentile and interpolate when there are duplicate
       values.

    Examples
    --------
    To get the regular median (#1 above), call ``median`` directly:

    >>> median([2, 3, 4, 5])
    3.5

    This is best suited when your data is discrete, and you don't mind
    that the median may not be an actual data point.

    The other calculation methods are provided as methods on ``median``:

    >>> median.low([2, 3, 4, 5])
    3
    >>> median.high([2, 3, 4, 5])
    4
    >>> median.grouped([2, 2, 3, 3, 3, 4])  #doctest: +ELLIPSIS
    2.8333333333...

    For further details, see the individual methods.
    """
    # If you think that having four definitions of median is annoying,
    # you ought to see the FIFTEEN definitions for quartiles!

    _ERROR = StatisticsError("no median for empty data")

    # FIXME: investigate ways to calculate medians without sorting?

    def __new__(cls, data):
        """Return median of data using the common mean-of-middle-two method.

        If the number of data points is odd, the middle data point is
        returned:

        >>> median([1, 3, 5])
        3

        When the number of data points is even, the average of the two
        middle values is returned:

        >>> median([1, 3, 5, 7])
        4.0

        """
        data = sorted(data)
        n = len(data)
        if n == 0:
            raise cls._ERROR
        if n%2 == 1:
            return data[n//2]
        else:
            i = n//2
            return (data[i - 1] + data[i])/2

    @classmethod
    def low(cls, data):
        """Return the low median of data.

        The low median is always a member of the data set. When the
        number of data points is odd, the middle value is returned.
        When it is even, the smaller of the two middle values is
        returned.

        >>> median.low([1, 3, 5])
        3
        >>> median.low([1, 3, 5, 7])
        3

        """
        data = sorted(data)
        n = len(data)
        if n == 0:
            raise cls._ERROR
        if n%2 == 1:
            return data[n//2]
        else:
            return data[n//2 - 1]

    @classmethod
    def high(cls, data):
        """Return the high median of data.

        The high median is always a member of the data set. When the
        number of data points is odd, the middle value is returned.
        When it is even, the larger of the two middle values is
        returned.

        >>> median.high([1, 3, 5])
        3
        >>> median.high([1, 3, 5, 7])
        5

        """
        data = sorted(data)
        n = len(data)
        if n == 0:
            raise cls._ERROR
        return data[n//2]

    @classmethod
    def grouped(cls, data, interval=1):
        """Return the median of grouped continuous data.

        >>> median.grouped([1, 2, 2, 3, 4, 4, 4, 4, 4, 5])
        3.7

        This calculates the median as the 50th percentile, and should
        be used when your data is continuous and grouped. In the above
        example, the values 1, 2, 3, etc. actually represent the
        midpoint of classes 0.5-1.5, 1.5-2.5, 2.5-3.5, etc. The middle
        value falls somewhere in class 3.5-4.5, and interpolation is
        used to estimate it.

        Optional argument ``interval`` represents the class interval,
        and defaults to 1.

        >>> median.grouped([52, 52, 53, 54])
        52.5
        >>> median.grouped([1, 3, 3, 5, 7])
        3.25
        >>> median.grouped([1, 3, 3, 5, 7], 2)
        3.5

        This function does not check whether the data points are at
        least ``interval`` apart, and is equivalent to the Gnumeric
        spreadsheet function "ssmedian".
        """
        # References:
        # http://www.ualberta.ca/~opscan/median.html
        # https://mail.gnome.org/archives/gnumeric-list/2011-April/msg00018.html
        # https://projects.gnome.org/gnumeric/doc/gnumeric-function-SSMEDIAN.shtml
        data = sorted(data)
        n = len(data)
        if n == 0:
            raise cls._ERROR
        elif n == 1:
            return data[0]
        # Find the value at the midpoint. Remember this corresponds to
        # the centre of the class interval.
        x = data[n//2]
        L = x - interval/2  # The lower limit of the median interval.
        cf = data.index(x)  # Number of values below the median interval.
        # FIXME The following line could be more efficient for big lists.
        f = data.count(x)  # Number of data points in the median interval.
        return L + interval*(n/2 - cf)/f


class mode:
    """mode(data [, window [, max_modes [, delta]]]) -> mode(s)

    Return the most common data point, or points, from ``data``. The
    mode (when it exists) is the most typical value, and is a robust
    measure of central location.

    Arguments
    ---------
    data
        Non-empty iterable of data points, not necessarily numeric.

    window
        Optional window size for estimating the mode when data is
        numeric and continuous. For discrete data (numeric or not),
        use the default value of 0. Otherwise, ``window`` must be an
        integer 3 or larger. See ``mode.collate`` for more details.

    max_modes
        The maximum number of modes to return. Defaults to 1.

    delta
        None or a number specifying the difference in scores that
        distinguishes a mode from a non-mode. Defaults to 0. See
        ``mode.extract`` for more details.

    Examples
    --------
    By default, mode assumes discrete data, and returns a single value.
    This is the standard treatment of the mode as commonly taught in
    schools:

    >>> mode([1, 1, 2, 3, 3, 3, 3, 4])
    3

    This also works with nominal (non-numeric) data:

    >>> mode(["red", "blue", "blue", "red", "green", "red", "red"])
    'red'

    If your data is continuous (and not grouped), then we expect most
    values to be unique, and ``mode`` to raise an exception:

    >>> data = [1.1, 1.8, 2.4, 3.3, 3.4, 3.5, 4.6]
    >>> mode(data)  #doctest: +IGNORE_EXCEPTION_DETAIL
    Traceback (most recent call last):
        ...
    StatisticsError: no mode

    In this case, the mode represents the peak in the distribution, and
    we can estimate it by looking at multiple values at once, using a
    sliding window. Pass a non-zero int value, 3 or higher, for the
    ``window`` argument:

    >>> mode(data, window=3)
    3.4

    If you suspect that your data has more than one mode, pass a
    positive int as the ``max_modes`` argument, and no more than that
    many modes will be returned as a list:

    >>> mode([5, 3, 2, 1, 5, 4, 2, 2, 5], max_modes=3)
    [2, 5]

    By default, peaks must have exactly the same height to count as
    multiple modes. To relax that restriction, supply argument
    ``delta``. See the ``mode.extract`` method for details.

    Additional Methods
    ------------------
    ``mode`` provides two methods to assist in determining how many
    peaks a data set actually has:

    mode.collate
        Returns the data set collated into sorted (value, frequency)
        pairs.

    mode.extract
        Decide which value or values should be considered a peak of the
        sample, and return only those values.

    See the individual methods for more details.

    Errors
    ------
    If your data is empty, or if it has more modes than you specified
    as ``max_modes`` (default of 1), then ``mode`` will raise
    StatisticsError.

    If you specify a non-zero ``window`` less than 3, or greater than
    the number of data points, ``mode`` will raise ValueError.

    Additional Information
    ----------------------
    The mode is the only measure of location available for nominal
    data.

    Smaller window sizes have better resolution, and a chance to find
    narrow but tall peaks, but also increase the chance of missing the
    true mode and finding a chance fluctuation in the data instead. In
    general you should pick the largest window size your data will
    stand.

    The mode is a robust measure of the centre of a distribution even
    when the data contains outliers.
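
    For example, with ``delta=2``, a count of 2 is close enough to the
    top count of 3 to be treated as a second mode:

    >>> mode([1, 1, 1, 2, 2, 3], max_modes=2, delta=2)
    [1, 2]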
""" def __new__(cls, data, window=0, max_modes=1, delta=0): c = cls.collate(data, window) if not c: raise StatisticsError('data must not be empty (no mode)') elif c[0][1] and window == 1: # Discrete data and every item is unique. raise StatisticsError('data has no mode') modes = cls.extract(c, window, delta) if len(modes) > max_modes: raise StatisticsError('too many modes: %d' % len(modes)) elif max_modes == 1: assert len(modes) == 1 return modes[0] return modes @classmethod def collate(cls, data, window): """Collate values from data using window-sized intervals. Arguments --------- data Non-empty iterable of data; must be numeric unless ``window`` is zero. window for discrete or nominal data, 0; otherwise for continuous data, an int equal to or greater than 3. Returns a list of (value, score), sorted into descending order by score, where the values are: - data points, for discrete and nominal data; - an interpolated mid-point of the sliding window into the data, for continuous data (``window`` != 0). """ if window == 0: # Calculate scores for discrete data by counting. return collections.Counter(data).most_common() # Otherwise we estimate scores for continuous data using a # technique called "Estimating the rate of an inhomogeneous # Poisson process from Jth waiting times", using the algorithm # from "Numerical Recipes In Pascal", Press et al., # Cambridge University Press, 1992, p.508. if window < 3: raise ValueError('window size must be at least 3') data = sorted(data) n = len(data) if window > n: raise ValueError('too few data points for window size') collation = [] for i in range(n-window+1): a, b = data[i], data[i+window-1] x = (a+b)/2 score = window/(n*(b-a)) if b!= a else float('inf') collation.append((x, score)) collation.sort(key=operator.itemgetter(1), reverse=True) return collation @classmethod def extract(cls, collation, window, delta): """Extract modal values from collated (value, score) pairs. ``extract`` takes a sorted, collated list, and determines which elements should be considered modes by comparing the score of each element with the score of the first, then returns those values in a list in ascending order. Scores can be interpreted in two ways: - for discrete data, the score is the frequency of the value; - for continuous data, the score is an estimate of the frequency within the given window size. Arguments --------- collation List of (value, score) pairs, sorted in order of increasing frequency, as generated by the ``collate`` method. window 0, or window size as used by ``mode.collate``. delta None, or a numeric difference in score required to distinguish modes from non-modes in your data. If ``delta`` is a non-zero number, then two scores must differ by at least that amount to be distinguished. If ``delta`` is zero, (the default), then scores are compared using the ``!=`` operator. If ``delta`` is None, and ``window`` is zero (i.e. as used for discrete data) then scores are compared using ``!=``. Otherwise scores are distinguished if the relative error between them is greater than twice the square root of the window size. """ if delta is None: if window == 0: diff = operator.ne else: # See Press et al. above. k = math.sqrt(window) # Factor of 2 below gives a 95% confidence; use 1* for # a 68% confidence or 3* for a 98.5% confidence (approx). 
                diff = lambda a, b: k*abs(a - b) >= 2*abs(max(a, b))
        elif delta == 0:
            diff = operator.ne
        else:
            diff = lambda a, b: abs(a - b) >= delta
        a = collation[0][1]
        for i in range(1, len(collation)):
            b = collation[i][1]
            if diff(a, b):
                collation = collation[:i]
                break
        # Sort the modal values into ascending order, as promised above.
        return sorted(t[0] for t in collation)


# === Measures of spread ===

# See http://mathworld.wolfram.com/Variance.html
#     http://mathworld.wolfram.com/SampleVariance.html
#     http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance

def variance(data, m=None):
    """variance(data [, m]) -> sample variance of numeric data

    Return the sample variance of ``data``, a sequence or iterator of
    real-valued numbers.

    Variance is a measure of the variability (spread or dispersion) of
    data. A large variance indicates that the data is spread out; a
    small variance indicates it is clustered closely around the central
    location.

    Arguments
    ---------
    data
        Iterable of numeric (non-complex) data with at least two
        values.

    m (optional)
        Mean of data, or None.

    Examples
    --------

    >>> data = [2.75, 1.75, 1.25, 0.25, 0.5, 1.25, 3.5]
    >>> variance(data)
    1.3720238095238095

    If you have already calculated the mean of your data, you can pass
    it as the optional second argument ``m`` to avoid recalculating it:

    >>> m = mean(data)
    >>> variance(data, m)
    1.3720238095238095

    .. CAUTION:: Using arbitrary values for ``m`` which are not the
       actual mean may lead to invalid or impossible results.

    Decimals and Fractions are supported:

    >>> from decimal import Decimal as D
    >>> variance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
    Decimal('31.01875')

    >>> from fractions import Fraction as F
    >>> variance([F(1, 6), F(1, 2), F(5, 3)])
    Fraction(67, 108)

    Additional Information
    ----------------------
    This is the unbiased sample variance s\N{SUPERSCRIPT TWO} with
    Bessel's correction, also known as variance with N-1 degrees of
    freedom. If you know the true population mean
    \N{GREEK SMALL LETTER MU} you should use the ``pvariance`` function
    instead.
    """
    ss, m, n = _var_helper(data, m)
    if n < 2:
        raise StatisticsError('variance requires at least two data points')
    return ss/(n-1)


def pvariance(data, m=None):
    """pvariance(data [, m]) -> population variance of numeric data

    Return the population variance of ``data``, a sequence or iterator
    of real-valued numbers.

    Variance is a measure of the variability (spread or dispersion) of
    data. A large variance indicates that the data is spread out; a
    small variance indicates it is clustered closely around the central
    location.

    Arguments
    ---------
    data
        Non-empty iterable of numeric (non-complex) data.

    m (optional)
        Mean of data, or None.

    If your data represents the entire population, you should use this
    function; otherwise you should normally use ``variance`` instead.

    Examples
    --------

    >>> data = [0.0, 0.25, 0.25, 1.25, 1.5, 1.75, 2.75, 3.25]
    >>> pvariance(data)
    1.25

    If you have already calculated the mean of your data, you can pass
    it as the optional second argument ``m`` to avoid recalculating it:

    >>> m = mean(data)
    >>> pvariance(data, m)
    1.25

    .. CAUTION:: Using arbitrary values for ``m`` which are not the
       actual mean may lead to invalid or impossible results.
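
    The data may be given as an iterator rather than a sequence:

    >>> pvariance(iter([1.0, 2.0, 3.0]))
    0.6666666666666666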
    Decimals and Fractions are supported:

    >>> from decimal import Decimal as D
    >>> pvariance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
    Decimal('24.8150')

    >>> from fractions import Fraction as F
    >>> pvariance([F(1, 4), F(5, 4), F(1, 2)])
    Fraction(13, 72)

    Additional Information
    ----------------------
    When called with the entire population, this gives the population
    variance \N{GREEK SMALL LETTER SIGMA}\N{SUPERSCRIPT TWO}. When
    called on a sample instead, this is the biased sample variance
    s\N{SUPERSCRIPT TWO}, also known as variance with N degrees of
    freedom.

    If you somehow know the true population mean
    \N{GREEK SMALL LETTER MU}, you should use this function to
    calculate the sample variance instead of the ``variance`` function,
    giving the known population mean as argument ``m``. In that case,
    the result will be an unbiased estimate of the population variance.
    """
    ss, m, n = _var_helper(data, m)
    if n < 1:
        raise StatisticsError('pvariance requires at least one data point')
    return ss/n


def stdev(data, m=None):
    """stdev(data [, m]) -> sample standard deviation of numeric data

    Return the square root of the sample variance. See ``variance`` for
    arguments and other details.

    >>> stdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
    1.0810874155219827

    """
    var = variance(data, m)
    try:
        return var.sqrt()
    except AttributeError:
        return math.sqrt(var)


def pstdev(data, m=None):
    """pstdev(data [, m]) -> population standard deviation of numeric data

    Return the square root of the population variance. See ``pvariance``
    for arguments and other details.

    >>> pstdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
    0.986893273527251

    """
    var = pvariance(data, m)
    try:
        return var.sqrt()
    except AttributeError:
        return math.sqrt(var)
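

# All of the public functions above carry doctests, so the module can
# conveniently check itself when run as a script:

if __name__ == '__main__':
    import doctest
    doctest.testmod()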