SF patch #969791: Add nlargest() and nsmallest() to heapq.

This commit is contained in:
Raymond Hettinger 2004-06-10 05:03:17 +00:00
parent 7d019664d7
commit 33ecffb65a
5 changed files with 75 additions and 5 deletions

View File

@ -83,6 +83,30 @@ True
>>>
\end{verbatim}
The module also offers two general purpose functions based on heaps.
\begin{funcdesc}{nlargest}{iterable, n}
Return a list with the \var{n} largest elements from the dataset defined
by \var{iterable}. Equivalent to: \code{sorted(iterable, reverse=True)[:n]}
\versionadded{2.4}
\end{funcdesc}
\begin{funcdesc}{nsmallest}{iterable, n}
Return a list with the \var{n} smallest elements from the dataset defined
by \var{iterable}. Equivalent to: \code{sorted(iterable)[:n]}
\versionadded{2.4}
\end{funcdesc}
Though the above functions appear symmetrical, they each have different
speed and space requirements. In particular, \function{nsmallest()}
operates on a full copy of the dataset. In contrast, \function{nlargest()}
only requires storage space for \var{n} elements.
Both functions perform best for smaller values of \var{n}. For larger
values, it is more efficient to use the \function{sorted()} function. Also,
when \code{n==1}, it is more efficient to use the builtin \function{min()}
and \function{max()} functions.
\subsection{Theory}

View File

@ -449,7 +449,10 @@ improved performance: \module{Queue}, \module{mutex}, \module{shlex}
\item The \module{heapq} module has been converted to C. The resulting
tenfold improvement in speed makes the module suitable for handling
high volumes of data.
high volumes of data. In addition, the module has two new functions
\function{nlargest()} and \function{nsmallest()} that use heaps to
find the largest or smallest n values in a dataset without the
expense of a full sort.
\item The \module{imaplib} module now supports IMAP's THREAD command.
(Contributed by Yves Dionne.)

View File

@ -30,7 +30,7 @@ without surprises: heap[0] is the smallest item, and heap.sort()
maintains the heap invariant!
"""
# Original code by Kevin O'Connor, augmented by Tim Peters
# Original code by Kevin O'Connor, augmented by Tim Peters and Raymond Hettinger
__about__ = """Heap queues
@ -126,7 +126,10 @@ Believe me, real good tape sorts were quite spectacular to watch!
From all times, sorting has always been a Great Art! :-)
"""
__all__ = ['heappush', 'heappop', 'heapify', 'heapreplace']
__all__ = ['heappush', 'heappop', 'heapify', 'heapreplace', 'nlargest',
'nsmallest']
from itertools import islice, repeat
def heappush(heap, item):
"""Push item onto heap, maintaining the heap invariant."""
@ -168,6 +171,35 @@ def heapify(x):
for i in reversed(xrange(n//2)):
_siftup(x, i)
def nlargest(iterable, n):
"""Find the n largest elements in a dataset.
Equivalent to: sorted(iterable, reverse=True)[:n]
"""
it = iter(iterable)
result = list(islice(it, n))
if not result:
return result
heapify(result)
_heapreplace = heapreplace
sol = result[0] # sol --> smallest of the nlargest
for elem in it:
if elem <= sol:
continue
_heapreplace(result, elem)
sol = result[0]
result.sort(reverse=True)
return result
def nsmallest(iterable, n):
"""Find the n smallest elements in a dataset.
Equivalent to: sorted(iterable)[:n]
"""
h = list(iterable)
heapify(h)
return map(heappop, repeat(h, min(n, len(h))))
# 'heap' is a heap at all indices >= startpos, except possibly for pos. pos
# is the index of a leaf with a possibly out-of-order value. Restore the
# heap invariant.

View File

@ -2,7 +2,7 @@
from test.test_support import verify, vereq, verbose, TestFailed
from heapq import heappush, heappop, heapify, heapreplace
from heapq import heappush, heappop, heapify, heapreplace, nlargest, nsmallest
import random
def check_invariant(heap):
@ -84,6 +84,15 @@ def test_main():
data.sort()
sorted = [heappop(heap) for i in range(size)]
vereq(data, sorted)
# 7) Check nlargest() and nsmallest()
data = [random.randrange(2000) for i in range(1000)]
copy = data[:]
copy.sort(reverse=True)
vereq(nlargest(data, 400), copy[:400])
copy.sort()
vereq(nsmallest(data, 400), copy[:400])
# Make user happy
if verbose:
print "All OK"

View File

@ -416,7 +416,9 @@ Library
os.path.exists(), switched to using os.lstat() directly if possible.
- bisect.py and heapq.py now have underlying C implementations
for better performance
for better performance.
- heapq.py has two new functions, nsmallest() and nlargest().
- traceback.format_exc has been added (similar to print_exc but it returns
a string).