Issue 21424: Apply the nlargest() optimizations to nsmallest() as well.

This commit is contained in:
Raymond Hettinger 2014-05-11 14:21:23 -07:00
parent 3a17e21755
commit 234fb2d503
4 changed files with 138 additions and 118 deletions

View File

@ -127,7 +127,7 @@ From all times, sorting has always been a Great Art! :-)
__all__ = ['heappush', 'heappop', 'heapify', 'heapreplace', 'merge', __all__ = ['heappush', 'heappop', 'heapify', 'heapreplace', 'merge',
'nlargest', 'nsmallest', 'heappushpop'] 'nlargest', 'nsmallest', 'heappushpop']
from itertools import islice, count, tee, chain from itertools import islice, count
def heappush(heap, item): def heappush(heap, item):
"""Push item onto heap, maintaining the heap invariant.""" """Push item onto heap, maintaining the heap invariant."""
@ -179,12 +179,12 @@ def heapify(x):
for i in reversed(range(n//2)): for i in reversed(range(n//2)):
_siftup(x, i) _siftup(x, i)
def _heappushpop_max(heap, item): def _heapreplace_max(heap, item):
"""Maxheap version of a heappush followed by a heappop.""" """Maxheap version of a heappop followed by a heappush."""
if heap and item < heap[0]: returnitem = heap[0] # raises appropriate IndexError if heap is empty
item, heap[0] = heap[0], item heap[0] = item
_siftup_max(heap, 0) _siftup_max(heap, 0)
return item return returnitem
def _heapify_max(x): def _heapify_max(x):
"""Transform list into a maxheap, in-place, in O(len(x)) time.""" """Transform list into a maxheap, in-place, in O(len(x)) time."""
@ -192,24 +192,6 @@ def _heapify_max(x):
for i in reversed(range(n//2)): for i in reversed(range(n//2)):
_siftup_max(x, i) _siftup_max(x, i)
def nsmallest(n, iterable):
"""Find the n smallest elements in a dataset.
Equivalent to: sorted(iterable)[:n]
"""
if n <= 0:
return []
it = iter(iterable)
result = list(islice(it, n))
if not result:
return result
_heapify_max(result)
_heappushpop = _heappushpop_max
for elem in it:
_heappushpop(result, elem)
result.sort()
return result
# 'heap' is a heap at all indices >= startpos, except possibly for pos. pos # 'heap' is a heap at all indices >= startpos, except possibly for pos. pos
# is the index of a leaf with a possibly out-of-order value. Restore the # is the index of a leaf with a possibly out-of-order value. Restore the
# heap invariant. # heap invariant.
@ -327,6 +309,10 @@ try:
from _heapq import * from _heapq import *
except ImportError: except ImportError:
pass pass
try:
from _heapq import _heapreplace_max
except ImportError:
pass
def merge(*iterables): def merge(*iterables):
'''Merge multiple sorted inputs into a single sorted output. '''Merge multiple sorted inputs into a single sorted output.
@ -367,22 +353,86 @@ def merge(*iterables):
yield v yield v
yield from next.__self__ yield from next.__self__
# Extend the implementations of nsmallest and nlargest to use a key= argument
_nsmallest = nsmallest # Algorithm notes for nlargest() and nsmallest()
# ==============================================
#
# Makes just a single pass over the data while keeping the k most extreme values
# in a heap. Memory consumption is limited to keeping k values in a list.
#
# Measured performance for random inputs:
#
# number of comparisons
# n inputs k-extreme values (average of 5 trials) % more than min()
# ------------- ---------------- - ------------------- -----------------
# 1,000 100 3,317 133.2%
# 10,000 100 14,046 40.5%
# 100,000 100 105,749 5.7%
# 1,000,000 100 1,007,751 0.8%
# 10,000,000 100 10,009,401 0.1%
#
# Theoretical number of comparisons for k smallest of n random inputs:
#
# Step Comparisons Action
# ---- -------------------------- ---------------------------
# 1 1.66 * k heapify the first k-inputs
# 2 n - k compare remaining elements to top of heap
# 3 k * (1 + lg2(k)) * ln(n/k) replace the topmost value on the heap
# 4 k * lg2(k) - (k/2) final sort of the k most extreme values
# Combining and simplifying for a rough estimate gives:
# comparisons = n + k * (1 + log(n/k)) * (1 + log(k, 2))
#
# Computing the number of comparisons for step 3:
# -----------------------------------------------
# * For the i-th new value from the iterable, the probability of being in the
# k most extreme values is k/i. For example, the probability of the 101st
# value seen being in the 100 most extreme values is 100/101.
# * If the value is a new extreme value, the cost of inserting it into the
# heap is 1 + log(k, 2).
# * The probability times the cost gives:
# (k/i) * (1 + log(k, 2))
# * Summing across the remaining n-k elements gives:
# sum((k/i) * (1 + log(k, 2)) for i in range(k+1, n+1))
# * This reduces to:
# (H(n) - H(k)) * k * (1 + log(k, 2))
# * Where H(n) is the n-th harmonic number estimated by:
# gamma = 0.5772156649
# H(n) = log(n, e) + gamma + 1.0 / (2.0 * n)
# http://en.wikipedia.org/wiki/Harmonic_series_(mathematics)#Rate_of_divergence
# * Substituting the H(n) formula:
# comparisons = k * (1 + log(k, 2)) * (log(n/k, e) + (1/n - 1/k) / 2)
#
# Worst-case for step 3:
# ----------------------
# In the worst case, the input data is reverse-sorted so that every new element
# must be inserted in the heap:
#
# comparisons = 1.66 * k + log(k, 2) * (n - k)
#
# Alternative Algorithms
# ----------------------
# Other algorithms were not used because they:
# 1) Took much more auxiliary memory,
# 2) Made multiple passes over the data.
# 3) Made more comparisons in common cases (small k, large n, semi-random input).
# See the more detailed comparison of approaches at:
# http://code.activestate.com/recipes/577573-compare-algorithms-for-heapqsmallest
def nsmallest(n, iterable, key=None): def nsmallest(n, iterable, key=None):
"""Find the n smallest elements in a dataset. """Find the n smallest elements in a dataset.
Equivalent to: sorted(iterable, key=key)[:n] Equivalent to: sorted(iterable, key=key)[:n]
""" """
# Short-cut for n==1 is to use min() when len(iterable)>0 # Short-cut for n==1 is to use min() when len(iterable)>0
if n == 1: if n == 1:
it = iter(iterable) it = iter(iterable)
head = list(islice(it, 1)) sentinel = object()
if not head:
return []
if key is None: if key is None:
return [min(chain(head, it))] result = min(it, default=sentinel)
return [min(chain(head, it), key=key)] else:
result = min(it, default=sentinel, key=key)
return [] if result is sentinel else [result]
# When n>=size, it's faster to use sorted() # When n>=size, it's faster to use sorted()
try: try:
@ -395,15 +445,39 @@ def nsmallest(n, iterable, key=None):
# When key is none, use simpler decoration # When key is none, use simpler decoration
if key is None: if key is None:
it = zip(iterable, count()) # decorate it = iter(iterable)
result = _nsmallest(n, it) result = list(islice(zip(it, count()), n))
return [r[0] for r in result] # undecorate if not result:
return result
_heapify_max(result)
order = n
top = result[0][0]
_heapreplace = _heapreplace_max
for elem in it:
if elem < top:
_heapreplace(result, (elem, order))
top = result[0][0]
order += 1
result.sort()
return [r[0] for r in result]
# General case, slowest method # General case, slowest method
in1, in2 = tee(iterable) it = iter(iterable)
it = zip(map(key, in1), count(), in2) # decorate result = [(key(elem), i, elem) for i, elem in zip(range(n), it)]
result = _nsmallest(n, it) if not result:
return [r[2] for r in result] # undecorate return result
_heapify_max(result)
order = n
top = result[0][0]
_heapreplace = _heapreplace_max
for elem in it:
k = key(elem)
if k < top:
_heapreplace(result, (k, order, elem))
top = result[0][0]
order += 1
result.sort()
return [r[2] for r in result]
def nlargest(n, iterable, key=None): def nlargest(n, iterable, key=None):
"""Find the n largest elements in a dataset. """Find the n largest elements in a dataset.
@ -442,9 +516,9 @@ def nlargest(n, iterable, key=None):
_heapreplace = heapreplace _heapreplace = heapreplace
for elem in it: for elem in it:
if top < elem: if top < elem:
order -= 1
_heapreplace(result, (elem, order)) _heapreplace(result, (elem, order))
top = result[0][0] top = result[0][0]
order -= 1
result.sort(reverse=True) result.sort(reverse=True)
return [r[0] for r in result] return [r[0] for r in result]
@ -460,9 +534,9 @@ def nlargest(n, iterable, key=None):
for elem in it: for elem in it:
k = key(elem) k = key(elem)
if top < k: if top < k:
order -= 1
_heapreplace(result, (k, order, elem)) _heapreplace(result, (k, order, elem))
top = result[0][0] top = result[0][0]
order -= 1
result.sort(reverse=True) result.sort(reverse=True)
return [r[2] for r in result] return [r[2] for r in result]

View File

@ -13,7 +13,7 @@ c_heapq = support.import_fresh_module('heapq', fresh=['_heapq'])
# _heapq.nlargest/nsmallest are saved in heapq._nlargest/_nsmallest when # _heapq.nlargest/nsmallest are saved in heapq._nlargest/_nsmallest when
# _heapq is imported, so check them there # _heapq is imported, so check them there
func_names = ['heapify', 'heappop', 'heappush', 'heappushpop', func_names = ['heapify', 'heappop', 'heappush', 'heappushpop',
'heapreplace', '_nsmallest'] 'heapreplace', '_heapreplace_max']
class TestModules(TestCase): class TestModules(TestCase):
def test_py_functions(self): def test_py_functions(self):

View File

@ -84,8 +84,8 @@ Library
- Issue #21156: importlib.abc.InspectLoader.source_to_code() is now a - Issue #21156: importlib.abc.InspectLoader.source_to_code() is now a
staticmethod. staticmethod.
- Issue #21424: Simplified and optimized heapq.nlargest() to make fewer - Issue #21424: Simplified and optimized heapq.nlargest() and nsmallest()
tuple comparisons. to make fewer tuple comparisons.
- Issue #21396: Fix TextIOWrapper(..., write_through=True) to not force a - Issue #21396: Fix TextIOWrapper(..., write_through=True) to not force a
flush() on the underlying binary stream. Patch by akira. flush() on the underlying binary stream. Patch by akira.

View File

@ -354,88 +354,34 @@ _siftupmax(PyListObject *heap, Py_ssize_t pos)
} }
static PyObject * static PyObject *
nsmallest(PyObject *self, PyObject *args) _heapreplace_max(PyObject *self, PyObject *args)
{ {
PyObject *heap=NULL, *elem, *iterable, *los, *it, *oldelem; PyObject *heap, *item, *returnitem;
Py_ssize_t i, n;
int cmp;
if (!PyArg_ParseTuple(args, "nO:nsmallest", &n, &iterable)) if (!PyArg_UnpackTuple(args, "_heapreplace_max", 2, 2, &heap, &item))
return NULL; return NULL;
it = PyObject_GetIter(iterable); if (!PyList_Check(heap)) {
if (it == NULL) PyErr_SetString(PyExc_TypeError, "heap argument must be a list");
return NULL; return NULL;
heap = PyList_New(0);
if (heap == NULL)
goto fail;
for (i=0 ; i<n ; i++ ){
elem = PyIter_Next(it);
if (elem == NULL) {
if (PyErr_Occurred())
goto fail;
else
goto sortit;
}
if (PyList_Append(heap, elem) == -1) {
Py_DECREF(elem);
goto fail;
}
Py_DECREF(elem);
}
n = PyList_GET_SIZE(heap);
if (n == 0)
goto sortit;
for (i=n/2-1 ; i>=0 ; i--)
if(_siftupmax((PyListObject *)heap, i) == -1)
goto fail;
los = PyList_GET_ITEM(heap, 0);
while (1) {
elem = PyIter_Next(it);
if (elem == NULL) {
if (PyErr_Occurred())
goto fail;
else
goto sortit;
}
cmp = PyObject_RichCompareBool(elem, los, Py_LT);
if (cmp == -1) {
Py_DECREF(elem);
goto fail;
}
if (cmp == 0) {
Py_DECREF(elem);
continue;
}
oldelem = PyList_GET_ITEM(heap, 0);
PyList_SET_ITEM(heap, 0, elem);
Py_DECREF(oldelem);
if (_siftupmax((PyListObject *)heap, 0) == -1)
goto fail;
los = PyList_GET_ITEM(heap, 0);
} }
sortit: if (PyList_GET_SIZE(heap) < 1) {
if (PyList_Sort(heap) == -1) PyErr_SetString(PyExc_IndexError, "index out of range");
goto fail; return NULL;
Py_DECREF(it); }
return heap;
fail: returnitem = PyList_GET_ITEM(heap, 0);
Py_DECREF(it); Py_INCREF(item);
Py_XDECREF(heap); PyList_SET_ITEM(heap, 0, item);
return NULL; if (_siftupmax((PyListObject *)heap, 0) == -1) {
Py_DECREF(returnitem);
return NULL;
}
return returnitem;
} }
PyDoc_STRVAR(nsmallest_doc, PyDoc_STRVAR(heapreplace_max_doc, "Maxheap variant of heapreplace");
"Find the n smallest elements in a dataset.\n\
\n\
Equivalent to: sorted(iterable)[:n]\n");
static PyMethodDef heapq_methods[] = { static PyMethodDef heapq_methods[] = {
{"heappush", (PyCFunction)heappush, {"heappush", (PyCFunction)heappush,
@ -448,8 +394,8 @@ static PyMethodDef heapq_methods[] = {
METH_VARARGS, heapreplace_doc}, METH_VARARGS, heapreplace_doc},
{"heapify", (PyCFunction)heapify, {"heapify", (PyCFunction)heapify,
METH_O, heapify_doc}, METH_O, heapify_doc},
{"nsmallest", (PyCFunction)nsmallest, {"_heapreplace_max",(PyCFunction)_heapreplace_max,
METH_VARARGS, nsmallest_doc}, METH_VARARGS, heapreplace_max_doc},
{NULL, NULL} /* sentinel */ {NULL, NULL} /* sentinel */
}; };