From f6b26676bc97f2f023cbfeabec5583895e9e054f Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Tue, 5 Mar 2013 01:36:30 -0500 Subject: [PATCH] Issue #16098: Update heapq.nsmallest to use the same algorithm as nlargest. This removes the dependency on bisect and it bring the pure Python code in-sync with the C code. --- Lib/heapq.py | 84 ++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 59 insertions(+), 25 deletions(-) diff --git a/Lib/heapq.py b/Lib/heapq.py index dec15aec9ce..6859821feee 100644 --- a/Lib/heapq.py +++ b/Lib/heapq.py @@ -127,8 +127,7 @@ From all times, sorting has always been a Great Art! :-) __all__ = ['heappush', 'heappop', 'heapify', 'heapreplace', 'merge', 'nlargest', 'nsmallest', 'heappushpop'] -from itertools import islice, repeat, count, tee, chain -import bisect +from itertools import islice, count, tee, chain def heappush(heap, item): """Push item onto heap, maintaining the heap invariant.""" @@ -180,6 +179,19 @@ def heapify(x): for i in reversed(range(n//2)): _siftup(x, i) +def _heappushpop_max(heap, item): + """Maxheap version of a heappush followed by a heappop.""" + if heap and item < heap[0]: + item, heap[0] = heap[0], item + _siftup_max(heap, 0) + return item + +def _heapify_max(x): + """Transform list into a maxheap, in-place, in O(len(x)) time.""" + n = len(x) + for i in reversed(range(n//2)): + _siftup_max(x, i) + def nlargest(n, iterable): """Find the n largest elements in a dataset. @@ -205,30 +217,16 @@ def nsmallest(n, iterable): """ if n < 0: return [] - if hasattr(iterable, '__len__') and n * 10 <= len(iterable): - # For smaller values of n, the bisect method is faster than a minheap. - # It is also memory efficient, consuming only n elements of space. - it = iter(iterable) - result = sorted(islice(it, 0, n)) - if not result: - return result - insort = bisect.insort - pop = result.pop - los = result[-1] # los --> Largest of the nsmallest - for elem in it: - if elem < los: - insort(result, elem) - pop() - los = result[-1] + it = iter(iterable) + result = list(islice(it, n)) + if not result: return result - # An alternative approach manifests the whole iterable in memory but - # saves comparisons by heapifying all at once. Also, saves time - # over bisect.insort() which has O(n) data movement time for every - # insertion. Finding the n smallest of an m length iterable requires - # O(m) + O(n log m) comparisons. - h = list(iterable) - heapify(h) - return list(map(heappop, repeat(h, min(n, len(h))))) + _heapify_max(result) + _heappushpop = _heappushpop_max + for elem in it: + _heappushpop(result, elem) + result.sort() + return result # 'heap' is a heap at all indices >= startpos, except possibly for pos. pos # is the index of a leaf with a possibly out-of-order value. Restore the @@ -306,6 +304,42 @@ def _siftup(heap, pos): heap[pos] = newitem _siftdown(heap, startpos, pos) +def _siftdown_max(heap, startpos, pos): + 'Maxheap variant of _siftdown' + newitem = heap[pos] + # Follow the path to the root, moving parents down until finding a place + # newitem fits. + while pos > startpos: + parentpos = (pos - 1) >> 1 + parent = heap[parentpos] + if parent < newitem: + heap[pos] = parent + pos = parentpos + continue + break + heap[pos] = newitem + +def _siftup_max(heap, pos): + 'Minheap variant of _siftup' + endpos = len(heap) + startpos = pos + newitem = heap[pos] + # Bubble up the larger child until hitting a leaf. + childpos = 2*pos + 1 # leftmost child position + while childpos < endpos: + # Set childpos to index of larger child. + rightpos = childpos + 1 + if rightpos < endpos and not heap[rightpos] < heap[childpos]: + childpos = rightpos + # Move the larger child up. + heap[pos] = heap[childpos] + pos = childpos + childpos = 2*pos + 1 + # The leaf at pos is empty now. Put newitem there, and bubble it up + # to its final resting place (by sifting its parents down). + heap[pos] = newitem + _siftdown_max(heap, startpos, pos) + # If available, use C implementation try: from _heapq import *