Issue #18844: Add random.weighted_choices()

2016-09-06 17:15:29 -07:00 · 2016-09-06 17:15:29 -07:00 · e8f1e002c6
parent 63d98bcd4c
commit e8f1e002c6
4 changed files with 118 additions and 1 deletions
--- a/Doc/library/random.rst
+++ b/Doc/library/random.rst
@ -124,6 +124,27 @@ Functions for sequences:
   Return a random element from the non-empty sequence *seq*. If *seq* is empty,
   raises :exc:`IndexError`.

+.. function:: weighted_choices(k, population, weights=None, *, cum_weights=None)
+
+   Return a *k* sized list of elements chosen from the *population* with replacement.
+   If the *population* is empty, raises :exc:`IndexError`.
+
+   If a *weights* sequence is specified, selections are made according to the
+   relative weights.  Alternatively, if a *cum_weights* sequence is given, the
+   selections are made according to the cumulative weights.  For example, the
+   relative weights ``[10, 5, 30, 5]`` are equivalent to the cumulative
+   weights ``[10, 15, 45, 50]``.  Internally, the relative weights are
+   converted to cumulative weights before making selections, so supplying the
+   cumulative weights saves work.
+
+   If neither *weights* nor *cum_weights* are specified, selections are made
+   with equal probability.  If a weights sequence is supplied, it must be
+   the same length as the *population* sequence.  It is a :exc:`TypeError`
+   to specify both *weights* and *cum_weights*.
+
+   The *weights* or *cum_weights* can use any numeric type that interoperates
+   with the :class:`float` values returned by :func:`random` (that includes
+   integers, floats, and fractions but excludes decimals).

 .. function:: shuffle(x[, random])

--- a/Lib/random.py
+++ b/Lib/random.py
@ -8,6 +8,7 @@
    ---------
           pick random element
           pick random sample
+           pick weighted random sample
           generate random permutation

    distributions on the real line:
@ -43,12 +44,14 @@ from math import sqrt as _sqrt, acos as _acos, cos as _cos, sin as _sin
 from os import urandom as _urandom
 from _collections_abc import Set as _Set, Sequence as _Sequence
 from hashlib import sha512 as _sha512
+import itertools as _itertools
+import bisect as _bisect

 __all__ = ["Random","seed","random","uniform","randint","choice","sample",
           "randrange","shuffle","normalvariate","lognormvariate",
           "expovariate","vonmisesvariate","gammavariate","triangular",
           "gauss","betavariate","paretovariate","weibullvariate",
-           "getstate","setstate", "getrandbits",
+           "getstate","setstate", "getrandbits", "weighted_choices",
           "SystemRandom"]

 NV_MAGICCONST = 4 * _exp(-0.5)/_sqrt(2.0)
@ -334,6 +337,28 @@ class Random(_random.Random):
                result[i] = population[j]
        return result

+    def weighted_choices(self, k, population, weights=None, *, cum_weights=None):
+        """Return a k sized list of population elements chosen with replacement.
+
+        If the relative weights or cumulative weights are not specified,
+        the selections are made with equal probability.
+
+        """
+        if cum_weights is None:
+            if weights is None:
+                choice = self.choice
+                return [choice(population) for i in range(k)]
+            else:
+                cum_weights = list(_itertools.accumulate(weights))
+        elif weights is not None:
+            raise TypeError('Cannot specify both weights and cumulative_weights')
+        if len(cum_weights) != len(population):
+            raise ValueError('The number of weights does not match the population')
+        bisect = _bisect.bisect
+        random = self.random
+        total = cum_weights[-1]
+        return [population[bisect(cum_weights, random() * total)] for i in range(k)]
+
 ## -------------------- real-valued distributions  -------------------

 ## -------------------- uniform distribution -------------------
@ -724,6 +749,7 @@ choice = _inst.choice
 randrange = _inst.randrange
 sample = _inst.sample
 shuffle = _inst.shuffle
+weighted_choices = _inst.weighted_choices
 normalvariate = _inst.normalvariate
 lognormvariate = _inst.lognormvariate
 expovariate = _inst.expovariate
--- a/Lib/test/test_random.py
+++ b/Lib/test/test_random.py
@ -7,6 +7,7 @@ import warnings
 from functools import partial
 from math import log, exp, pi, fsum, sin
 from test import support
+from fractions import Fraction

 class TestBasicOps:
    # Superclass with tests common to all generators.
@ -141,6 +142,73 @@ class TestBasicOps:
    def test_sample_on_dicts(self):
        self.assertRaises(TypeError, self.gen.sample, dict.fromkeys('abcdef'), 2)

+    def test_weighted_choices(self):
+        weighted_choices = self.gen.weighted_choices
+        data = ['red', 'green', 'blue', 'yellow']
+        str_data = 'abcd'
+        range_data = range(4)
+        set_data = set(range(4))
+
+        # basic functionality
+        for sample in [
+            weighted_choices(5, data),
+            weighted_choices(5, data, range(4)),
+            weighted_choices(k=5, population=data, weights=range(4)),
+            weighted_choices(k=5, population=data, cum_weights=range(4)),
+        ]:
+            self.assertEqual(len(sample), 5)
+            self.assertEqual(type(sample), list)
+            self.assertTrue(set(sample) <= set(data))
+
+        # test argument handling
+        with self.assertRaises(TypeError):                                        # missing arguments
+            weighted_choices(2)
+
+        self.assertEqual(weighted_choices(0, data), [])                           # k == 0
+        self.assertEqual(weighted_choices(-1, data), [])                          # negative k behaves like ``[0] * -1``
+        with self.assertRaises(TypeError):
+            weighted_choices(2.5, data)                                           # k is a float
+
+        self.assertTrue(set(weighted_choices(5, str_data)) <= set(str_data))      # population is a string sequence
+        self.assertTrue(set(weighted_choices(5, range_data)) <= set(range_data))  # population is a range
+        with self.assertRaises(TypeError):
+            weighted_choices(2.5, set_data)                                       # population is not a sequence
+
+        self.assertTrue(set(weighted_choices(5, data, None)) <= set(data))        # weights is None
+        self.assertTrue(set(weighted_choices(5, data, weights=None)) <= set(data))
+        with self.assertRaises(ValueError):
+            weighted_choices(5, data, [1,2])                                      # len(weights) != len(population)
+        with self.assertRaises(IndexError):
+            weighted_choices(5, data, [0]*4)                                      # weights sum to zero
+        with self.assertRaises(TypeError):
+            weighted_choices(5, data, 10)                                         # non-iterable weights
+        with self.assertRaises(TypeError):
+            weighted_choices(5, data, [None]*4)                                   # non-numeric weights
+        for weights in [
+                [15, 10, 25, 30],                                                 # integer weights
+                [15.1, 10.2, 25.2, 30.3],                                         # float weights
+                [Fraction(1, 3), Fraction(2, 6), Fraction(3, 6), Fraction(4, 6)], # fractional weights
+                [True, False, True, False]                                        # booleans (include / exclude)
+        ]:
+            self.assertTrue(set(weighted_choices(5, data, weights)) <= set(data))
+
+        with self.assertRaises(ValueError):
+            weighted_choices(5, data, cum_weights=[1,2])                          # len(weights) != len(population)
+        with self.assertRaises(IndexError):
+            weighted_choices(5, data, cum_weights=[0]*4)                          # cum_weights sum to zero
+        with self.assertRaises(TypeError):
+            weighted_choices(5, data, cum_weights=10)                             # non-iterable cum_weights
+        with self.assertRaises(TypeError):
+            weighted_choices(5, data, cum_weights=[None]*4)                       # non-numeric cum_weights
+        with self.assertRaises(TypeError):
+            weighted_choices(5, data, range(4), cum_weights=range(4))             # both weights and cum_weights
+        for weights in [
+                [15, 10, 25, 30],                                                 # integer cum_weights
+                [15.1, 10.2, 25.2, 30.3],                                         # float cum_weights
+                [Fraction(1, 3), Fraction(2, 6), Fraction(3, 6), Fraction(4, 6)], # fractional cum_weights
+        ]:
+            self.assertTrue(set(weighted_choices(5, data, cum_weights=weights)) <= set(data))
+
    def test_gauss(self):
        # Ensure that the seed() method initializes all the hidden state.  In
        # particular, through 2.2.1 it failed to reset a piece of state used
--- a/Misc/NEWS
+++ b/Misc/NEWS
@ -101,6 +101,8 @@ Library
 - Issue #27691: Fix ssl module's parsing of GEN_RID subject alternative name
  fields in X.509 certs.

+- Issue #18844: Add random.weighted_choices().
+
 - Issue #25761: Improved error reporting about truncated pickle data in
  C implementation of unpickler.  UnpicklingError is now raised instead of
  AttributeError and ValueError in some cases.