diff --git a/Lib/heapq.py b/Lib/heapq.py --- a/Lib/heapq.py +++ b/Lib/heapq.py @@ -192,81 +192,6 @@ for i in reversed(range(n//2)): _siftup_max(x, i) - -# Algorithm notes for nlargest() and nsmallest() -# ============================================== -# -# Makes just one pass over the data while keeping the n most extreme values -# in a heap. Memory consumption is limited to keeping n values in a list. -# -# Number of comparisons for n random inputs, keeping the k smallest values: -# ----------------------------------------------------------- -# Step Comparisons Action -# 1 1.66*k heapify the first k-inputs -# 2 n - k compare new input elements to top of heap -# 3 k*lg2(k)*(ln(n)-ln(k)) add new extreme values to the heap -# 4 k*lg2(k) final sort of the k most extreme values -# -# number of comparisons -# n-random inputs k-extreme values average of 5 trials % more than min() -# --------------- ---------------- ------------------- ----------------- -# 10,000 100 14,046 40.5% -# 100,000 100 105,749 5.7% -# 1,000,000 100 1,007,751 0.8% -# -# Computing the number of comparisons for step 3: -# ----------------------------------------------- -# * For the i-th new value from the iterable, the probability of being in the -# k most extreme values is k/i. For example, the probability of the 101st -# value seen being in the 100 most extreme values is 100/101. -# * If the value is a new extreme value, the cost of inserting it into the -# heap is log(k, 2). 
-# * The probabilty times the cost gives: -# (k/i) * log(k, 2) -# * Summing across the remaining n-k elements gives: -# sum((k/i) * log(k, 2) for xrange(k+1, n+1)) -# * This reduces to: -# (H(n) - H(k)) * k * log(k, 2) -# * Where H(n) is the n-th harmonic number estimated by: -# H(n) = log(n, e) + gamma + 1.0 / (2.0 * n) -# gamma = 0.5772156649 -# http://en.wikipedia.org/wiki/Harmonic_series_(mathematics)#Rate_of_divergence -# * Substituting the H(n) formula and ignoring the (1/2*n) fraction gives: -# comparisons = k * log(k, 2) * (log(n,e) - log(k, e)) -# -# Worst-case for step 3: -# ---------------------- -# In the worst case, the input data is reversed sorted so that every new element -# must be inserted in the heap: -# comparisons = log(k, 2) * (n - k) -# -# Alternative Algorithms -# ---------------------- -# Other algorithms were not used because they: -# 1) Took much more auxiliary memory, -# 2) Made multiple passes over the data. -# 3) Made more comparisons in common cases (small k, large n, semi-random input). -# See detailed comparisons at: -# http://code.activestate.com/recipes/577573-compare-algorithms-for-heapqsmallest - -def nlargest(n, iterable): - """Find the n largest elements in a dataset. - - Equivalent to: sorted(iterable, reverse=True)[:n] - """ - if n <= 0: - return [] - it = iter(iterable) - result = list(islice(it, n)) - if not result: - return result - heapify(result) - _heappushpop = heappushpop - for elem in it: - _heappushpop(result, elem) - result.sort(reverse=True) - return result - def nsmallest(n, iterable): """Find the n smallest elements in a dataset. @@ -442,7 +367,70 @@ yield v yield from next.__self__ -# Extend the implementations of nsmallest and nlargest to use a key= argument +# Algorithm notes for nlargest() and nsmallest() +# ============================================== +# +# Makes just a single pass over the data while keeping the k most extreme values +# in a heap. 
Memory consumption is limited to keeping k values in a list. +# +# Measured performance for random inputs +# --------------------------------------------------------------------------- +# number of comparisons +# n inputs k-extreme values (average of 5 trials) % more than min() +# ------------- ---------------- - ------------------- ----------------- +# 1,000 100 3,317 133.2% +# 10,000 100 14,046 40.5% +# 100,000 100 105,749 5.7% +# 1,000,000 100 1,007,751 0.8% +# +# Theoretical number of comparisons for k smallest of n random inputs +# +# Step Comparisons Action +# ---- -------------------------- --------------------------- +# 1 1.66 * k heapify the first k-inputs +# 2 n - k compare remaining elements to top of heap +# 3 k * (1 + lg2(k)) * ln(n/k) replace the topmost value on the heap +# 4 k * lg2(k) - (k/2) final sort of the k most extreme values +# Combining and simplifying for a rough estimate gives: +# comparisons = n + k * (1 + log(n/k)) * (1 + log(k, 2)) +# +# Computing the number of comparisons for step 3: +# ----------------------------------------------- +# * For the i-th new value from the iterable, the probability of being in the +# k most extreme values is k/i. For example, the probability of the 101st +# value seen being in the 100 most extreme values is 100/101. +# * If the value is a new extreme value, the cost of inserting it into the +# heap is 1 + log(k, 2). 
+# * The probability times the cost gives:
+#            (k/i) * (1 + log(k, 2))
+# * Summing across the remaining n-k elements gives:
+#            sum((k/i) * (1 + log(k, 2)) for i in range(k+1, n+1))
+# * This reduces to:
+#            (H(n) - H(k)) * k * (1 + log(k, 2))
+# * Where H(n) is the n-th harmonic number estimated by:
+#            H(n) = log(n, e) + gamma + 1.0 / (2.0 * n)
+#            gamma = 0.5772156649
+#            http://en.wikipedia.org/wiki/Harmonic_series_(mathematics)#Rate_of_divergence
+# * Substituting the H(n) formula:
+#            comparisons = k * (1 + log(k, 2)) * (log(n/k, e) + (1/n - 1/k) / 2)
+#
+# Worst-case for step 3:
+# ----------------------
+# In the worst case, the input data is reverse-sorted, so every new element
+# must be inserted in the heap:
+#
+#             comparisons = 1.66 * k + log(k, 2) * (n - k)
+#
+# Alternative Algorithms
+# ----------------------
+# Other algorithms were not used because they:
+#   1) Took much more auxiliary memory,
+#   2) Made multiple passes over the data.
+#   3) Made more comparisons in common cases (small k, large n, semi-random input).
+# See detailed comparisons at:
+# http://code.activestate.com/recipes/577573-compare-algorithms-for-heapqsmallest
+
+# Extend the implementation of nsmallest to use a key= argument
 _nsmallest = nsmallest
 def nsmallest(n, iterable, key=None):
     """Find the n smallest elements in a dataset.
 
@@ -480,7 +468,6 @@
         result = _nsmallest(n, it)
         return [r[2] for r in result]                       # undecorate
 
-_nlargest = nlargest
 def nlargest(n, iterable, key=None):
     """Find the n largest elements in a dataset.
@@ -508,15 +495,39 @@ # When key is none, use simpler decoration if key is None: - it = zip(iterable, count(0,-1)) # decorate - result = _nlargest(n, it) - return [r[0] for r in result] # undecorate + it = iter(iterable) + result = list(islice(zip(it, count(0, -1)), n)) + if not result: + return result + heapify(result) + order = -n + top = result[0][0] + for elem in it: + if top < elem: + order -= 1 + heapreplace(result, (elem, order)) + top = result[0][0] + result.sort(reverse=True) + return [r[0] for r in result] # General case, slowest method - in1, in2 = tee(iterable) - it = zip(map(key, in1), count(0,-1), in2) # decorate - result = _nlargest(n, it) - return [r[2] for r in result] # undecorate + it = iter(iterable) + result = [(key(elem), i, elem) + for i, elem in zip(count(0, -1), islice(it, n))] + if not result: + return result + heapify(result) + order = -n + top = result[0][0] + for elem in it: + k = key(elem) + if top < k: + order -= 1 + heapreplace(result, (k, order, elem)) + top = result[0][0] + result.sort(reverse=True) + return [r[2] for r in result] + if __name__ == "__main__": # Simple sanity test diff --git a/Lib/test/test_heapq.py b/Lib/test/test_heapq.py --- a/Lib/test/test_heapq.py +++ b/Lib/test/test_heapq.py @@ -13,7 +13,7 @@ # _heapq.nlargest/nsmallest are saved in heapq._nlargest/_smallest when # _heapq is imported, so check them there func_names = ['heapify', 'heappop', 'heappush', 'heappushpop', - 'heapreplace', '_nlargest', '_nsmallest'] + 'heapreplace', '_nsmallest'] class TestModules(TestCase): def test_py_functions(self): diff --git a/Modules/_heapqmodule.c b/Modules/_heapqmodule.c --- a/Modules/_heapqmodule.c +++ b/Modules/_heapqmodule.c @@ -267,89 +267,6 @@ PyDoc_STRVAR(heapify_doc, "Transform list into a heap, in-place, in O(len(heap)) time."); -static PyObject * -nlargest(PyObject *self, PyObject *args) -{ - PyObject *heap=NULL, *elem, *iterable, *sol, *it, *oldelem; - Py_ssize_t i, n; - int cmp; - - if (!PyArg_ParseTuple(args, 
"nO:nlargest", &n, &iterable)) - return NULL; - - it = PyObject_GetIter(iterable); - if (it == NULL) - return NULL; - - heap = PyList_New(0); - if (heap == NULL) - goto fail; - - for (i=0 ; i=0 ; i--) - if(_siftup((PyListObject *)heap, i) == -1) - goto fail; - - sol = PyList_GET_ITEM(heap, 0); - while (1) { - elem = PyIter_Next(it); - if (elem == NULL) { - if (PyErr_Occurred()) - goto fail; - else - goto sortit; - } - cmp = PyObject_RichCompareBool(sol, elem, Py_LT); - if (cmp == -1) { - Py_DECREF(elem); - goto fail; - } - if (cmp == 0) { - Py_DECREF(elem); - continue; - } - oldelem = PyList_GET_ITEM(heap, 0); - PyList_SET_ITEM(heap, 0, elem); - Py_DECREF(oldelem); - if (_siftup((PyListObject *)heap, 0) == -1) - goto fail; - sol = PyList_GET_ITEM(heap, 0); - } -sortit: - if (PyList_Sort(heap) == -1) - goto fail; - if (PyList_Reverse(heap) == -1) - goto fail; - Py_DECREF(it); - return heap; - -fail: - Py_DECREF(it); - Py_XDECREF(heap); - return NULL; -} - -PyDoc_STRVAR(nlargest_doc, -"Find the n largest elements in a dataset.\n\ -\n\ -Equivalent to: sorted(iterable, reverse=True)[:n]\n"); - static int _siftdownmax(PyListObject *heap, Py_ssize_t startpos, Py_ssize_t pos) { @@ -531,8 +448,6 @@ METH_VARARGS, heapreplace_doc}, {"heapify", (PyCFunction)heapify, METH_O, heapify_doc}, - {"nlargest", (PyCFunction)nlargest, - METH_VARARGS, nlargest_doc}, {"nsmallest", (PyCFunction)nsmallest, METH_VARARGS, nsmallest_doc}, {NULL, NULL} /* sentinel */