mirror of https://github.com/python/cpython
Add algorithmic notes for nsmallest() and nlargest().
This commit is contained in:
parent 25d9040cb6
commit 2aad6ef774
Lib/heapq.py: 56 additions
@@ -192,6 +192,62 @@ def _heapify_max(x):
    for i in reversed(range(n//2)):
        _siftup_max(x, i)

# Algorithm notes for nlargest() and nsmallest()
# ==============================================
#
# Makes just one pass over the data while keeping the n most extreme values
# in a heap. Memory consumption is limited to keeping n values in a list.
#
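
A rough sketch of that one-pass approach, keeping the k smallest of the inputs (this is not the patch's actual code; the real nsmallest() uses the module's internal max-heap helpers such as _heapify_max(), while this sketch simulates a max-heap by negating values):

import heapq
from itertools import islice

def nsmallest_sketch(k, iterable):
    # Illustration only -- a max-heap is simulated by storing negated values.
    it = iter(iterable)
    # Step 1: heapify the first k inputs (about 2*k comparisons).
    heap = [-x for x in islice(it, k)]
    heapq.heapify(heap)
    if not heap:
        return []
    top = -heap[0]                 # largest of the k values kept so far
    for x in it:
        # Step 2: one comparison of each remaining input against the top.
        if x < top:
            # Step 3: a new extreme value displaces the current largest
            # kept value (about lg2(k) comparisons).
            heapq.heapreplace(heap, -x)
            top = -heap[0]
    # Step 4: final sort of the k most extreme values.
    return sorted(-v for v in heap)
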
# Number of comparisons for n random inputs, keeping the k smallest values:
# --------------------------------------------------------------------------
# Step   Comparisons                 Action
#  1     2*k                         heapify the first k inputs
#  2     n - k                       compare each new input to the top of the heap
#  3     k*lg2(k) * (ln(n) - ln(k))  add new extreme values to the heap
#  4     k*lg2(k)                    final sort of the k most extreme values
#
# n random inputs   k extreme values   number of comparisons   % more than min()
# ---------------   ----------------   ---------------------   -----------------
#      10,000             100                  13,634                36.3%
#     100,000             100                 105,163                 5.2%
#   1,000,000             100               1,006,694                 0.7%
#
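
As a quick back-of-the-envelope check, simply adding the four step formulas (via a hypothetical helper that is not part of the module) lands within a few percent of the measured totals above; the table's figures are measured counts, so the match is only approximate:

from math import log

def predicted_comparisons(n, k):
    # Sum of the four steps from the table above.
    return (2 * k                                  # step 1: heapify first k inputs
            + (n - k)                              # step 2: compare against top of heap
            + k * log(k, 2) * (log(n) - log(k))    # step 3: insert new extreme values
            + k * log(k, 2))                       # step 4: final sort of k values

for n in (10000, 100000, 1000000):
    print(n, round(predicted_comparisons(n, 100)))
# Prints roughly 13,800 / 105,400 / 1,006,900 -- within a few percent of the
# measured 13,634 / 105,163 / 1,006,694 shown in the table.
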
# Computing the number of comparisons for step 3:
# -----------------------------------------------
# * For the i-th new value from the iterable, the probability of being in the
#   k most extreme values is k/i.  For example, the probability of the 101st
#   value seen being in the 100 most extreme values is 100/101.
# * If the value is a new extreme value, the cost of inserting it into the
#   heap is log(k, 2).
# * The probability times the cost gives:
#       (k/i) * log(k, 2)
# * Summing across the remaining n-k elements gives:
#       sum((k/i) * log(k, 2) for i in xrange(k+1, n+1))
# * This reduces to:
#       (H(n) - H(k)) * k * log(k, 2)
#   where H(n) is the n-th harmonic number estimated by:
#       H(n) = log(n, e) + gamma + 1.0 / (2.0 * n)
#       gamma = 0.5772156649
#       http://en.wikipedia.org/wiki/Harmonic_series_(mathematics)#Rate_of_divergence
# * Substituting the H(n) formula and ignoring the 1/(2*n) term gives:
#       comparisons = k * log(k, 2) * (log(n, e) - log(k, e))
#
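
That closed form can be sanity-checked numerically against the exact sum it approximates; for example, with n = 100,000 and k = 100 the two agree to within a fraction of a percent:

from math import log

n, k = 100000, 100

# Exact expected cost of step 3: sum the probability-times-cost terms.
exact = sum((k / i) * log(k, 2) for i in range(k + 1, n + 1))

# Closed form after substituting the harmonic-number estimate for H(n) - H(k).
approx = k * log(k, 2) * (log(n) - log(k))

print(round(exact), round(approx))   # both come out near 4,590 comparisons
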
# Worst-case for step 3:
# ----------------------
# In the worst case, the input data is reverse-sorted so that every new element
# must be inserted into the heap:
#       comparisons = log(k, 2) * (n - k)
#
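
For a sense of scale, with n = 10,000 and k = 100 this worst case is roughly twenty times the random-input estimate for step 3:

from math import log

print(log(100, 2) * (10000 - 100))   # ~65,800 worst-case comparisons for step 3,
                                     # versus roughly 3,100 expected for random input
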
# Alternative Algorithms
# ----------------------
# Other algorithms were not used because they:
# 1) Took much more auxiliary memory,
# 2) Made multiple passes over the data, or
# 3) Made more comparisons in common cases (small k, large n, semi-random input).
# See detailed comparisons at:
# http://code.activestate.com/recipes/577573-compare-algorithms-for-heapqsmallest

def nlargest(n, iterable):
    """Find the n largest elements in a dataset.
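
A minimal usage example of the public functions (nlargest(n, iterable) behaves like sorted(iterable, reverse=True)[:n], but only keeps n items in memory):

import heapq

data = [37, 5, 42, 19, 8, 23, 64, 11]
print(heapq.nlargest(3, data))    # [64, 42, 37]
print(heapq.nsmallest(3, data))   # [5, 8, 11]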