Improved clarity and thoroughness of docstring.

Added design notes in comments.
Used better variable names.
Eliminated the unsavory "pool[-k:]" which was an aspiring bug (for k==0).
Used if/else to show the two algorithms in parallel style.
Added one more test assertion.
This commit is contained in:
Raymond Hettinger 2002-11-13 15:26:37 +00:00
parent 674dae245a
commit c0b4034b81
1 changed files with 41 additions and 20 deletions

View File

@ -377,39 +377,59 @@ class Random:
def sample(self, population, k, random=None, int=int): def sample(self, population, k, random=None, int=int):
"""Chooses k unique random elements from a population sequence. """Chooses k unique random elements from a population sequence.
Returns a new list containing elements from the population. The Returns a new list containing elements from the population while
list itself is in random order so that all sub-slices are also leaving the original population unchanged. The resulting list is
random samples. The original sequence is left undisturbed. in selection order so that all sub-slices will also be valid random
samples. This allows raffle winners (the sample) to be partitioned
into grand prize and second place winners (the subslices).
If the population has repeated elements, then each occurrence is Members of the population need not be hashable or unique. If the
a possible selection in the sample. population contains repeats, then each occurrence is a possible
selection in the sample.
If indices are needed for a large population, use xrange as an To choose a sample in a range of integers, use xrange as an argument.
argument: sample(xrange(10000000), 60) This is especially fast and space efficient for sampling from a
large population: sample(xrange(10000000), 60)
Optional arg random is a 0-argument function returning a random Optional arg random is a 0-argument function returning a random
float in [0.0, 1.0); by default, the standard random.random. float in [0.0, 1.0); by default, the standard random.random.
""" """
# Sampling without replacement entails tracking either potential
# selections (the pool) or previous selections.
# Pools are stored in lists which provide __getitem__ for selection
# and provide a way to remove selections. But each list.remove()
# rebuilds the entire list, so it is better to rearrange the list,
# placing non-selected elements at the head of the list. Tracking
# the selection pool is only space efficient with small populations.
# Previous selections are stored in dictionaries which provide
# __contains__ for detecting repeat selections. Discarding repeats
# is efficient unless most of the population has already been chosen.
# So, tracking selections is useful when sample sizes are much
# smaller than the total population.
n = len(population) n = len(population)
if not 0 <= k <= n: if not 0 <= k <= n:
raise ValueError, "sample larger than population" raise ValueError, "sample larger than population"
if random is None: if random is None:
random = self.random random = self.random
result = [None] * k
if n < 6 * k: # if n len list takes less space than a k len dict if n < 6 * k: # if n len list takes less space than a k len dict
pool = list(population) pool = list(population) # track potential selections
for i in xrange(n-1, n-k-1, -1): for i in xrange(k):
j = int(random() * (i+1)) j = int(random() * (n-i)) # non-selected at [0,n-i)
pool[i], pool[j] = pool[j], pool[i] result[i] = pool[j] # save selected element
return pool[-k:] pool[j] = pool[n-i-1] # non-selected to head of list
inorder = [None] * k else:
selections = {} selected = {} # track previous selections
for i in xrange(k): for i in xrange(k):
j = int(random() * n) j = int(random() * n)
while j in selections: while j in selected: # discard and replace repeats
j = int(random() * n) j = int(random() * n)
selections[j] = inorder[i] = population[j] result[i] = selected[j] = population[j]
return inorder # return selections in the order they were picked return result # return selections in the order they were picked
## -------------------- real-valued distributions ------------------- ## -------------------- real-valued distributions -------------------
@ -756,6 +776,7 @@ def _test_sample(n):
for k in xrange(n+1): for k in xrange(n+1):
s = sample(population, k) s = sample(population, k)
assert len(dict([(elem,True) for elem in s])) == len(s) == k assert len(dict([(elem,True) for elem in s])) == len(s) == k
assert None not in s
def _sample_generator(n, k): def _sample_generator(n, k):
# Return a fixed element from the sample. Validates random ordering. # Return a fixed element from the sample. Validates random ordering.
@ -787,7 +808,7 @@ def _test(N=2000):
_test_generator(N, 'weibullvariate(1.0, 1.0)') _test_generator(N, 'weibullvariate(1.0, 1.0)')
_test_generator(N, '_sample_generator(50, 5)') # expected s.d.: 14.4 _test_generator(N, '_sample_generator(50, 5)') # expected s.d.: 14.4
_test_generator(N, '_sample_generator(50, 45)') # expected s.d.: 14.4 _test_generator(N, '_sample_generator(50, 45)') # expected s.d.: 14.4
_test_sample(1000) _test_sample(500)
# Test jumpahead. # Test jumpahead.
s = getstate() s = getstate()